diff --git a/src/xenia/app/xenia_main.cc b/src/xenia/app/xenia_main.cc index bc9b662c5..80ed35551 100644 --- a/src/xenia/app/xenia_main.cc +++ b/src/xenia/app/xenia_main.cc @@ -78,7 +78,7 @@ std::unique_ptr CreateGraphicsSystem() { std::unique_ptr best; best = std::unique_ptr( - new xe::gpu::gl4::GL4GraphicsSystem()); + new xe::gpu::vulkan::VulkanGraphicsSystem()); if (best) { return best; } diff --git a/src/xenia/cpu/mmio_handler.cc b/src/xenia/cpu/mmio_handler.cc index e5412d8e7..3edd9703e 100644 --- a/src/xenia/cpu/mmio_handler.cc +++ b/src/xenia/cpu/mmio_handler.cc @@ -87,13 +87,12 @@ bool MMIOHandler::CheckStore(uint32_t virtual_address, uint32_t value) { return false; } -uintptr_t MMIOHandler::AddPhysicalWriteWatch(uint32_t guest_address, - size_t length, - WriteWatchCallback callback, - void* callback_context, - void* callback_data) { - uint32_t base_address = guest_address; - assert_true(base_address < 0x1FFFFFFF); +uintptr_t MMIOHandler::AddPhysicalAccessWatch(uint32_t guest_address, + size_t length, WatchType type, + AccessWatchCallback callback, + void* callback_context, + void* callback_data) { + uint32_t base_address = guest_address & 0x1FFFFFFF; // Can only protect sizes matching system page size. // This means we need to round up, which will cause spurious access @@ -103,32 +102,45 @@ uintptr_t MMIOHandler::AddPhysicalWriteWatch(uint32_t guest_address, xe::memory::page_size()); base_address = base_address - (base_address % xe::memory::page_size()); + auto lock = global_critical_region_.Acquire(); + // Add to table. The slot reservation may evict a previous watch, which // could include our target, so we do it first. - auto entry = new WriteWatchEntry(); + auto entry = new AccessWatchEntry(); entry->address = base_address; entry->length = uint32_t(length); entry->callback = callback; entry->callback_context = callback_context; entry->callback_data = callback_data; - global_critical_region_.mutex().lock(); - write_watches_.push_back(entry); - global_critical_region_.mutex().unlock(); + access_watches_.push_back(entry); - // Make the desired range read only under all address spaces. 
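The watch granularity here is the host page: the base address is rounded down and the length up, which is what makes the "spurious access violations" the comment warns about possible. A minimal sketch of that rounding, assuming a power-of-two page size (names are illustrative, not Xenia's):

```cpp
#include <cstddef>
#include <cstdint>

// Page-align a physical watch range: base rounds down, end rounds up.
// Assumes page_size is a power of two (e.g. 4096).
struct AlignedRange {
  uint32_t base;
  uint32_t length;
};

AlignedRange AlignWatchRange(uint32_t guest_address, size_t length,
                             size_t page_size) {
  uint32_t base = guest_address & 0x1FFFFFFF;  // mask into physical space
  uint32_t aligned_base = base - (base % uint32_t(page_size));
  // Rounding the end up widens the protected range, so writes to unrelated
  // data in the same pages can fire the watch spuriously.
  uint32_t end = base + uint32_t(length);
  uint32_t aligned_end = uint32_t((end + page_size - 1) & ~(page_size - 1));
  return {aligned_base, aligned_end - aligned_base};
}
```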
+ auto page_access = memory::PageAccess::kNoAccess; + switch (type) { + case kWatchWrite: + page_access = memory::PageAccess::kReadOnly; + break; + case kWatchReadWrite: + page_access = memory::PageAccess::kNoAccess; + break; + default: + assert_unhandled_case(type); + break; + } + + // Protect the range under all address spaces memory::Protect(physical_membase_ + entry->address, entry->length, - xe::memory::PageAccess::kReadOnly, nullptr); + page_access, nullptr); memory::Protect(virtual_membase_ + 0xA0000000 + entry->address, entry->length, - xe::memory::PageAccess::kReadOnly, nullptr); + page_access, nullptr); memory::Protect(virtual_membase_ + 0xC0000000 + entry->address, entry->length, - xe::memory::PageAccess::kReadOnly, nullptr); + page_access, nullptr); memory::Protect(virtual_membase_ + 0xE0000000 + entry->address, entry->length, - xe::memory::PageAccess::kReadOnly, nullptr); + page_access, nullptr); return reinterpret_cast(entry); } -void MMIOHandler::ClearWriteWatch(WriteWatchEntry* entry) { +void MMIOHandler::ClearAccessWatch(AccessWatchEntry* entry) { memory::Protect(physical_membase_ + entry->address, entry->length, xe::memory::PageAccess::kReadWrite, nullptr); memory::Protect(virtual_membase_ + 0xA0000000 + entry->address, entry->length, @@ -139,19 +151,20 @@ void MMIOHandler::ClearWriteWatch(WriteWatchEntry* entry) { xe::memory::PageAccess::kReadWrite, nullptr); } -void MMIOHandler::CancelWriteWatch(uintptr_t watch_handle) { - auto entry = reinterpret_cast(watch_handle); +void MMIOHandler::CancelAccessWatch(uintptr_t watch_handle) { + auto entry = reinterpret_cast(watch_handle); + auto lock = global_critical_region_.Acquire(); // Allow access to the range again. - ClearWriteWatch(entry); + ClearAccessWatch(entry); // Remove from table. - global_critical_region_.mutex().lock(); - auto it = std::find(write_watches_.begin(), write_watches_.end(), entry); - if (it != write_watches_.end()) { - write_watches_.erase(it); + auto it = std::find(access_watches_.begin(), access_watches_.end(), entry); + assert_false(it == access_watches_.end()); + + if (it != access_watches_.end()) { + access_watches_.erase(it); } - global_critical_region_.mutex().unlock(); delete entry; } @@ -159,18 +172,19 @@ void MMIOHandler::CancelWriteWatch(uintptr_t watch_handle) { void MMIOHandler::InvalidateRange(uint32_t physical_address, size_t length) { auto lock = global_critical_region_.Acquire(); - for (auto it = write_watches_.begin(); it != write_watches_.end();) { + for (auto it = access_watches_.begin(); it != access_watches_.end();) { auto entry = *it; if ((entry->address <= physical_address && entry->address + entry->length > physical_address) || (entry->address >= physical_address && entry->address < physical_address + length)) { // This watch lies within the range. End it. - ClearWriteWatch(entry); + ClearAccessWatch(entry); entry->callback(entry->callback_context, entry->callback_data, entry->address); - it = write_watches_.erase(it); + it = access_watches_.erase(it); + delete entry; continue; } @@ -178,50 +192,49 @@ void MMIOHandler::InvalidateRange(uint32_t physical_address, size_t length) { } } -bool MMIOHandler::CheckWriteWatch(uint64_t fault_address) { - uint32_t physical_address = uint32_t(fault_address); - if (physical_address > 0x1FFFFFFF) { - physical_address &= 0x1FFFFFFF; - } - std::list pending_invalidates; - global_critical_region_.mutex().lock(); - // Now that we hold the lock, recheck and see if the pages are still - // protected. 
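The hunk above maps each watch type to a page protection and applies it to every view of the same guest RAM. A condensed sketch of that policy, using the `xe::memory::Protect` call seen here (the header path is an assumption):

```cpp
#include <cstdint>

#include "xenia/base/memory.h"  // assumed location of xe::memory::Protect

// Write watches leave pages readable (kReadOnly traps writes only);
// read/write watches revoke all access. The same range must be protected
// in the physical view and all three mirrored virtual views.
void ProtectWatchedRange(uint8_t* physical_membase, uint8_t* virtual_membase,
                         uint32_t address, uint32_t length, bool watch_reads) {
  auto access = watch_reads ? xe::memory::PageAccess::kNoAccess
                            : xe::memory::PageAccess::kReadOnly;
  xe::memory::Protect(physical_membase + address, length, access, nullptr);
  for (uint32_t mirror : {0xA0000000u, 0xC0000000u, 0xE0000000u}) {
    xe::memory::Protect(virtual_membase + mirror + address, length, access,
                        nullptr);
  }
}
```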
- memory::PageAccess cur_access; - size_t page_length = memory::page_size(); - memory::QueryProtect((void*)fault_address, page_length, cur_access); - if (cur_access != memory::PageAccess::kReadOnly && - cur_access != memory::PageAccess::kNoAccess) { - // Another thread has cleared this write watch. Abort. - global_critical_region_.mutex().unlock(); - return true; +bool MMIOHandler::IsRangeWatched(uint32_t physical_address, size_t length) { + auto lock = global_critical_region_.Acquire(); + + for (auto it = access_watches_.begin(); it != access_watches_.end(); ++it) { + auto entry = *it; + if ((entry->address <= physical_address && + entry->address + entry->length > physical_address) || + (entry->address >= physical_address && + entry->address < physical_address + length)) { + // This watch lies within the range. + return true; + } } - for (auto it = write_watches_.begin(); it != write_watches_.end();) { + return false; +} + +bool MMIOHandler::CheckAccessWatch(uint32_t physical_address) { + auto lock = global_critical_region_.Acquire(); + + bool hit = false; + for (auto it = access_watches_.begin(); it != access_watches_.end();) { auto entry = *it; if (entry->address <= physical_address && entry->address + entry->length > physical_address) { - // Hit! Remove the writewatch. - pending_invalidates.push_back(entry); + // Hit! Remove the watch. + hit = true; + ClearAccessWatch(entry); + entry->callback(entry->callback_context, entry->callback_data, + physical_address); - ClearWriteWatch(entry); - it = write_watches_.erase(it); + it = access_watches_.erase(it); + delete entry; continue; } ++it; } - global_critical_region_.mutex().unlock(); - if (pending_invalidates.empty()) { + + if (!hit) { // Rethrow access violation - range was not being watched. return false; } - while (!pending_invalidates.empty()) { - auto entry = pending_invalidates.back(); - pending_invalidates.pop_back(); - entry->callback(entry->callback_context, entry->callback_data, - physical_address); - delete entry; - } + // Range was watched, so lets eat this access violation. return true; } @@ -414,9 +427,33 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) { } } if (!range) { + auto fault_address = reinterpret_cast(ex->fault_address()); + uint32_t guest_address = 0; + if (fault_address >= virtual_membase_ && + fault_address < physical_membase_) { + // Faulting on a virtual address. + guest_address = static_cast(ex->fault_address()) & 0x1FFFFFFF; + } else { + // Faulting on a physical address. + guest_address = static_cast(ex->fault_address()); + } + + // HACK: Recheck if the pages are still protected (race condition - another + // thread clears the writewatch we just hit) + // Do this under the lock so we don't introduce another race condition. + auto lock = global_critical_region_.Acquire(); + memory::PageAccess cur_access; + size_t page_length = memory::page_size(); + memory::QueryProtect((void*)fault_address, page_length, cur_access); + if (cur_access != memory::PageAccess::kReadOnly && + cur_access != memory::PageAccess::kNoAccess) { + // Another thread has cleared this write watch. Abort. + return true; + } + // Access is not found within any range, so fail and let the caller handle // it (likely by aborting). 
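`IsRangeWatched` and `InvalidateRange` above share the same half-open interval overlap test; pulled out as a standalone predicate:

```cpp
#include <cstddef>
#include <cstdint>

// True if the watch [watch_start, watch_start + watch_length) intersects
// the query range [query_start, query_start + query_length).
bool WatchOverlaps(uint32_t watch_start, uint32_t watch_length,
                   uint32_t query_start, size_t query_length) {
  return (watch_start <= query_start &&
          watch_start + watch_length > query_start) ||
         (watch_start >= query_start &&
          watch_start < query_start + query_length);
}
```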
- return CheckWriteWatch(ex->fault_address()); + return CheckAccessWatch(guest_address); } auto rip = ex->pc(); diff --git a/src/xenia/cpu/mmio_handler.h b/src/xenia/cpu/mmio_handler.h index 70d89ac02..bb8cd665f 100644 --- a/src/xenia/cpu/mmio_handler.h +++ b/src/xenia/cpu/mmio_handler.h @@ -28,9 +28,8 @@ typedef uint32_t (*MMIOReadCallback)(void* ppc_context, void* callback_context, uint32_t addr); typedef void (*MMIOWriteCallback)(void* ppc_context, void* callback_context, uint32_t addr, uint32_t value); - -typedef void (*WriteWatchCallback)(void* context_ptr, void* data_ptr, - uint32_t address); +typedef void (*AccessWatchCallback)(void* context_ptr, void* data_ptr, + uint32_t address); struct MMIORange { uint32_t address; @@ -46,6 +45,12 @@ class MMIOHandler { public: virtual ~MMIOHandler(); + enum WatchType { + kWatchInvalid = 0, + kWatchWrite = 1, + kWatchReadWrite = 2, + }; + static std::unique_ptr Install(uint8_t* virtual_membase, uint8_t* physical_membase, uint8_t* membase_end); @@ -59,17 +64,24 @@ class MMIOHandler { bool CheckLoad(uint32_t virtual_address, uint32_t* out_value); bool CheckStore(uint32_t virtual_address, uint32_t value); - uintptr_t AddPhysicalWriteWatch(uint32_t guest_address, size_t length, - WriteWatchCallback callback, - void* callback_context, void* callback_data); - void CancelWriteWatch(uintptr_t watch_handle); + // Memory watches: These are one-shot alarms that fire a callback (in the + // context of the thread that caused the callback) when a memory range is + // either written to or read from, depending on the watch type. These fire as + // soon as a read/write happens, and only fire once. + // These watches may be spuriously fired if memory is accessed nearby. + uintptr_t AddPhysicalAccessWatch(uint32_t guest_address, size_t length, + WatchType type, AccessWatchCallback callback, + void* callback_context, void* callback_data); + void CancelAccessWatch(uintptr_t watch_handle); void InvalidateRange(uint32_t physical_address, size_t length); + bool IsRangeWatched(uint32_t physical_address, size_t length); protected: - struct WriteWatchEntry { + struct AccessWatchEntry { uint32_t address; uint32_t length; - WriteWatchCallback callback; + WatchType type; + AccessWatchCallback callback; void* callback_context; void* callback_data; }; @@ -83,8 +95,8 @@ class MMIOHandler { static bool ExceptionCallbackThunk(Exception* ex, void* data); bool ExceptionCallback(Exception* ex); - void ClearWriteWatch(WriteWatchEntry* entry); - bool CheckWriteWatch(uint64_t fault_address); + void ClearAccessWatch(AccessWatchEntry* entry); + bool CheckAccessWatch(uint32_t guest_address); uint8_t* virtual_membase_; uint8_t* physical_membase_; @@ -94,7 +106,7 @@ class MMIOHandler { xe::global_critical_region global_critical_region_; // TODO(benvanik): data structure magic. 
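A hypothetical caller of the renamed API, based only on the declarations above: arm a one-shot write watch over an object's guest memory. Per the doc comment, the callback fires once, on the faulting thread, and the handler code shows it runs with the global lock held, so the callback should only flag work for later.

```cpp
#include <cstdint>

#include "xenia/cpu/mmio_handler.h"

// Illustrative watched object; not part of the diff.
struct WatchedTexture {
  bool pending_invalidation = false;
};

uintptr_t WatchGuestRange(xe::cpu::MMIOHandler* mmio_handler,
                          uint32_t guest_address, size_t length,
                          WatchedTexture* texture) {
  return mmio_handler->AddPhysicalAccessWatch(
      guest_address, length, xe::cpu::MMIOHandler::kWatchWrite,
      [](void* context, void* data, uint32_t address) {
        // One-shot: the pages are already unprotected by the time this runs.
        reinterpret_cast<WatchedTexture*>(data)->pending_invalidation = true;
      },
      /*callback_context=*/nullptr, /*callback_data=*/texture);
}
```

The returned handle goes to `CancelAccessWatch()` if the object is destroyed before the watch fires.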
- std::list write_watches_; + std::list access_watches_; static MMIOHandler* global_handler_; }; diff --git a/src/xenia/gpu/command_processor.h b/src/xenia/gpu/command_processor.h index f2fbb6c54..c2784480b 100644 --- a/src/xenia/gpu/command_processor.h +++ b/src/xenia/gpu/command_processor.h @@ -84,9 +84,9 @@ class CommandProcessor { swap_request_handler_ = fn; } - void RequestFrameTrace(const std::wstring& root_path); - void BeginTracing(const std::wstring& root_path); - void EndTracing(); + virtual void RequestFrameTrace(const std::wstring& root_path); + virtual void BeginTracing(const std::wstring& root_path); + virtual void EndTracing(); void InitializeRingBuffer(uint32_t ptr, uint32_t page_count); void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size); diff --git a/src/xenia/gpu/gl4/texture_cache.cc b/src/xenia/gpu/gl4/texture_cache.cc index 4a8917e71..72e1c9639 100644 --- a/src/xenia/gpu/gl4/texture_cache.cc +++ b/src/xenia/gpu/gl4/texture_cache.cc @@ -427,7 +427,7 @@ TextureCache::TextureEntry* TextureCache::LookupOrInsertTexture( // Not found, create. auto entry = std::make_unique(); entry->texture_info = texture_info; - entry->write_watch_handle = 0; + entry->access_watch_handle = 0; entry->pending_invalidation = false; entry->handle = 0; @@ -442,6 +442,7 @@ TextureCache::TextureEntry* TextureCache::LookupOrInsertTexture( // Found! Acquire the handle and remove the readbuffer entry. read_buffer_textures_.erase(it); entry->handle = read_buffer_entry->handle; + entry->access_watch_handle = read_buffer_entry->access_watch_handle; delete read_buffer_entry; // TODO(benvanik): set more texture properties? swizzle/etc? auto entry_ptr = entry.get(); @@ -495,14 +496,15 @@ TextureCache::TextureEntry* TextureCache::LookupOrInsertTexture( // Add a write watch. If any data in the given range is touched we'll get a // callback and evict the texture. We could reuse the storage, though the // driver is likely in a better position to pool that kind of stuff. - entry->write_watch_handle = memory_->AddPhysicalWriteWatch( + entry->access_watch_handle = memory_->AddPhysicalAccessWatch( texture_info.guest_address, texture_info.input_length, + cpu::MMIOHandler::kWatchWrite, [](void* context_ptr, void* data_ptr, uint32_t address) { auto self = reinterpret_cast(context_ptr); auto touched_entry = reinterpret_cast(data_ptr); // Clear watch handle first so we don't redundantly // remove. - touched_entry->write_watch_handle = 0; + touched_entry->access_watch_handle = 0; touched_entry->pending_invalidation = true; // Add to pending list so Scavenge will clean it up. self->invalidated_textures_mutex_.lock(); @@ -574,14 +576,27 @@ GLuint TextureCache::ConvertTexture(Blitter* blitter, uint32_t guest_address, dest_rect, GL_LINEAR, swap_channels); } - // HACK: remove texture from write watch list so readback won't kill us. - // Not needed now, as readback is disabled. - /* - if (texture_entry->write_watch_handle) { - memory_->CancelWriteWatch(texture_entry->write_watch_handle); - texture_entry->write_watch_handle = 0; + // Setup a read/write access watch. If the game tries to touch the memory + // we were supposed to populate with this texture, then we'll actually + // populate it. 
+ if (texture_entry->access_watch_handle) { + memory_->CancelAccessWatch(texture_entry->access_watch_handle); + texture_entry->access_watch_handle = 0; } - //*/ + + texture_entry->access_watch_handle = memory_->AddPhysicalAccessWatch( + guest_address, texture_entry->texture_info.input_length, + cpu::MMIOHandler::kWatchReadWrite, + [](void* context, void* data, uint32_t address) { + auto touched_entry = reinterpret_cast(data); + touched_entry->access_watch_handle = 0; + + // This happens. RDR resolves to a texture then upsizes it, BF1943 + // writes to a resolved texture. + // TODO (for Vulkan): Copy this texture back into system memory. + // assert_always(); + }, + nullptr, texture_entry); return texture_entry->handle; } @@ -618,6 +633,20 @@ GLuint TextureCache::ConvertTexture(Blitter* blitter, uint32_t guest_address, entry->block_height = block_height; entry->format = format; + entry->access_watch_handle = memory_->AddPhysicalAccessWatch( + guest_address, block_height * block_width * 4, + cpu::MMIOHandler::kWatchReadWrite, + [](void* context, void* data, uint32_t address) { + auto entry = reinterpret_cast(data); + entry->access_watch_handle = 0; + + // This happens. RDR resolves to a texture then upsizes it, BF1943 + // writes to a resolved texture. + // TODO (for Vulkan): Copy this texture back into system memory. + // assert_always(); + }, + nullptr, entry.get()); + glCreateTextures(GL_TEXTURE_2D, 1, &entry->handle); glTextureParameteri(entry->handle, GL_TEXTURE_BASE_LEVEL, 0); glTextureParameteri(entry->handle, GL_TEXTURE_MAX_LEVEL, 1); @@ -636,9 +665,9 @@ GLuint TextureCache::ConvertTexture(Blitter* blitter, uint32_t guest_address, } void TextureCache::EvictTexture(TextureEntry* entry) { - if (entry->write_watch_handle) { - memory_->CancelWriteWatch(entry->write_watch_handle); - entry->write_watch_handle = 0; + if (entry->access_watch_handle) { + memory_->CancelAccessWatch(entry->access_watch_handle); + entry->access_watch_handle = 0; } for (auto& view : entry->views) { diff --git a/src/xenia/gpu/gl4/texture_cache.h b/src/xenia/gpu/gl4/texture_cache.h index d214dac53..d55aa37a1 100644 --- a/src/xenia/gpu/gl4/texture_cache.h +++ b/src/xenia/gpu/gl4/texture_cache.h @@ -44,7 +44,7 @@ class TextureCache { }; struct TextureEntry { TextureInfo texture_info; - uintptr_t write_watch_handle; + uintptr_t access_watch_handle; GLuint handle; bool pending_invalidation; std::vector> views; @@ -74,8 +74,12 @@ class TextureCache { TextureFormat format, bool swap_channels, GLuint src_texture, Rect2D src_rect, Rect2D dest_rect); + TextureEntry* LookupAddress(uint32_t guest_address, uint32_t width, + uint32_t height, TextureFormat format); + private: struct ReadBufferTexture { + uintptr_t access_watch_handle; uint32_t guest_address; uint32_t logical_width; uint32_t logical_height; @@ -90,8 +94,6 @@ class TextureCache { void EvictSampler(SamplerEntry* entry); TextureEntry* LookupOrInsertTexture(const TextureInfo& texture_info, uint64_t opt_hash = 0); - TextureEntry* LookupAddress(uint32_t guest_address, uint32_t width, - uint32_t height, TextureFormat format); void EvictTexture(TextureEntry* entry); bool UploadTexture2D(GLuint texture, const TextureInfo& texture_info); diff --git a/src/xenia/gpu/premake5.lua b/src/xenia/gpu/premake5.lua index 1f6a1eea6..1c7870edc 100644 --- a/src/xenia/gpu/premake5.lua +++ b/src/xenia/gpu/premake5.lua @@ -22,6 +22,8 @@ project("xenia-gpu") project_root.."/third_party/gflags/src", }) local_platform_files() + local_platform_files("spirv") + 
local_platform_files("spirv/passes") group("src") project("xenia-gpu-shader-compiler") diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 476369e53..7e0cd3ab2 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -99,6 +99,17 @@ struct InstructionResult { bool has_all_writes() const { return write_mask[0] && write_mask[1] && write_mask[2] && write_mask[3]; } + // Returns number of components written + uint32_t num_writes() const { + uint32_t total = 0; + for (int i = 0; i < 4; i++) { + if (write_mask[i]) { + total++; + } + } + + return total; + } // Returns true if any non-constant components are written. bool stores_non_constants() const { for (int i = 0; i < 4; ++i) { @@ -547,6 +558,9 @@ class Shader { // True if the shader was translated and prepared without error. bool is_valid() const { return is_valid_; } + // True if the shader has already been translated. + bool is_translated() const { return is_translated_; } + // Errors that occurred during translation. const std::vector& errors() const { return errors_; } @@ -591,6 +605,7 @@ class Shader { bool writes_color_targets_[4] = {false, false, false, false}; bool is_valid_ = false; + bool is_translated_ = false; std::vector errors_; std::string ucode_disassembly_; diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index a89be80f5..5bb9ba016 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -51,6 +51,7 @@ void ShaderTranslator::Reset() { ucode_disasm_buffer_.Reset(); ucode_disasm_line_number_ = 0; previous_ucode_disasm_scan_offset_ = 0; + register_count_ = 64; total_attrib_count_ = 0; vertex_bindings_.clear(); texture_bindings_.clear(); @@ -95,9 +96,21 @@ bool ShaderTranslator::GatherAllBindingInformation(Shader* shader) { return true; } +bool ShaderTranslator::Translate(Shader* shader, + xenos::xe_gpu_program_cntl_t cntl) { + Reset(); + register_count_ = shader->type() == ShaderType::kVertex ? 
cntl.vs_regs + 1 + : cntl.ps_regs + 1; + + return TranslateInternal(shader); +} + bool ShaderTranslator::Translate(Shader* shader) { Reset(); + return TranslateInternal(shader); +} +bool ShaderTranslator::TranslateInternal(Shader* shader) { shader_type_ = shader->type(); ucode_dwords_ = shader->ucode_dwords(); ucode_dword_count_ = shader->ucode_dword_count(); @@ -155,6 +168,7 @@ bool ShaderTranslator::Translate(Shader* shader) { } shader->is_valid_ = true; + shader->is_translated_ = true; for (const auto& error : shader->errors_) { if (error.is_fatal) { shader->is_valid_ = false; @@ -369,9 +383,9 @@ bool ShaderTranslator::TranslateBlocks() { AddControlFlowTargetLabel(cf_a, &label_addresses); AddControlFlowTargetLabel(cf_b, &label_addresses); - PreProcessControlFlowInstruction(cf_index); + PreProcessControlFlowInstruction(cf_index, cf_a); ++cf_index; - PreProcessControlFlowInstruction(cf_index); + PreProcessControlFlowInstruction(cf_index, cf_b); ++cf_index; } @@ -672,11 +686,11 @@ void ShaderTranslator::TranslateExecInstructions( static_cast(ucode_dwords_[instr_offset * 3] & 0x1F); if (fetch_opcode == FetchOpcode::kVertexFetch) { auto& op = *reinterpret_cast( - ucode_dwords_ + instr_offset * 3); + ucode_dwords_ + instr_offset * 3); TranslateVertexFetchInstruction(op); } else { auto& op = *reinterpret_cast( - ucode_dwords_ + instr_offset * 3); + ucode_dwords_ + instr_offset * 3); TranslateTextureFetchInstruction(op); } } else { @@ -986,16 +1000,19 @@ void ShaderTranslator::TranslateAluInstruction(const AluInstruction& op) { return; } + ParsedAluInstruction instr; if (op.has_vector_op()) { const auto& opcode_info = alu_vector_opcode_infos_[static_cast(op.vector_opcode())]; - ParseAluVectorInstruction(op, opcode_info); + ParseAluVectorInstruction(op, opcode_info, instr); + ProcessAluInstruction(instr); } if (op.has_scalar_op()) { const auto& opcode_info = alu_scalar_opcode_infos_[static_cast(op.scalar_opcode())]; - ParseAluScalarInstruction(op, opcode_info); + ParseAluScalarInstruction(op, opcode_info, instr); + ProcessAluInstruction(instr); } } @@ -1044,9 +1061,8 @@ void ParseAluInstructionOperand(const AluInstruction& op, int i, uint32_t a = swizzle & 0x3; out_op->components[0] = GetSwizzleFromComponentIndex(a); } else if (swizzle_component_count == 2) { - swizzle >>= 4; - uint32_t a = ((swizzle >> 2) + 3) & 0x3; - uint32_t b = (swizzle + 2) & 0x3; + uint32_t a = ((swizzle >> 6) + 3) & 0x3; + uint32_t b = ((swizzle >> 0) + 0) & 0x3; out_op->components[0] = GetSwizzleFromComponentIndex(a); out_op->components[1] = GetSwizzleFromComponentIndex(b); } else { @@ -1088,8 +1104,8 @@ void ParseAluInstructionOperandSpecial(const AluInstruction& op, } void ShaderTranslator::ParseAluVectorInstruction( - const AluInstruction& op, const AluOpcodeInfo& opcode_info) { - ParsedAluInstruction i; + const AluInstruction& op, const AluOpcodeInfo& opcode_info, + ParsedAluInstruction& i) { i.dword_index = 0; i.type = ParsedAluInstruction::Type::kVector; i.vector_opcode = op.vector_opcode(); @@ -1126,6 +1142,10 @@ void ShaderTranslator::ParseAluVectorInstruction( } else { // Unimplemented. 
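The new `Translate(Shader*, cntl)` overload sizes the register file from the program control register instead of assuming all 64 GPRs. Condensed below; the `+ 1` suggests the hardware fields store the top register index rather than a count (header path assumed):

```cpp
#include <cstdint>

#include "xenia/gpu/xenos.h"  // assumed home of ShaderType / program cntl

uint32_t GetRegisterCount(xe::gpu::ShaderType type,
                          xe::gpu::xenos::xe_gpu_program_cntl_t cntl) {
  // vs_regs/ps_regs appear to hold the highest used GPR index, hence +1.
  return (type == xe::gpu::ShaderType::kVertex ? cntl.vs_regs
                                               : cntl.ps_regs) +
         1;
}
```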
// assert_always(); + XELOGE( + "ShaderTranslator::ParseAluVectorInstruction: Unsupported write " + "to export %d", + dest_num); i.result.storage_target = InstructionStorageTarget::kNone; i.result.storage_index = 0; } @@ -1203,13 +1223,11 @@ void ShaderTranslator::ParseAluVectorInstruction( } i.Disassemble(&ucode_disasm_buffer_); - - ProcessAluInstruction(i); } void ShaderTranslator::ParseAluScalarInstruction( - const AluInstruction& op, const AluOpcodeInfo& opcode_info) { - ParsedAluInstruction i; + const AluInstruction& op, const AluOpcodeInfo& opcode_info, + ParsedAluInstruction& i) { i.dword_index = 0; i.type = ParsedAluInstruction::Type::kScalar; i.scalar_opcode = op.scalar_opcode(); @@ -1319,8 +1337,6 @@ void ShaderTranslator::ParseAluScalarInstruction( } i.Disassemble(&ucode_disasm_buffer_); - - ProcessAluInstruction(i); } } // namespace gpu diff --git a/src/xenia/gpu/shader_translator.h b/src/xenia/gpu/shader_translator.h index d1b27a997..9801cb2d6 100644 --- a/src/xenia/gpu/shader_translator.h +++ b/src/xenia/gpu/shader_translator.h @@ -30,6 +30,7 @@ class ShaderTranslator { // DEPRECATED(benvanik): remove this when shader cache is removed. bool GatherAllBindingInformation(Shader* shader); + bool Translate(Shader* shader, xenos::xe_gpu_program_cntl_t cntl); bool Translate(Shader* shader); protected: @@ -38,6 +39,8 @@ class ShaderTranslator { // Resets translator state before beginning translation. virtual void Reset(); + // Register count. + uint32_t register_count() const { return register_count_; } // True if the current shader is a vertex shader. bool is_vertex_shader() const { return shader_type_ == ShaderType::kVertex; } // True if the current shader is a pixel shader. @@ -79,7 +82,8 @@ class ShaderTranslator { } // Pre-process a control-flow instruction before anything else. - virtual void PreProcessControlFlowInstruction(uint32_t cf_index) {} + virtual void PreProcessControlFlowInstruction( + uint32_t cf_index, const ucode::ControlFlowInstruction& instr) {} // Handles translation for control flow label addresses. // This is triggered once for each label required (due to control flow @@ -131,6 +135,8 @@ class ShaderTranslator { int src_swizzle_component_count; }; + bool TranslateInternal(Shader* shader); + void MarkUcodeInstruction(uint32_t dword_offset); void AppendUcodeDisasm(char c); void AppendUcodeDisasm(const char* value); @@ -173,14 +179,18 @@ class ShaderTranslator { void TranslateAluInstruction(const ucode::AluInstruction& op); void ParseAluVectorInstruction(const ucode::AluInstruction& op, - const AluOpcodeInfo& opcode_info); + const AluOpcodeInfo& opcode_info, + ParsedAluInstruction& instr); void ParseAluScalarInstruction(const ucode::AluInstruction& op, - const AluOpcodeInfo& opcode_info); + const AluOpcodeInfo& opcode_info, + ParsedAluInstruction& instr); // Input shader metadata and microcode. ShaderType shader_type_; const uint32_t* ucode_dwords_; size_t ucode_dword_count_; + xenos::xe_gpu_program_cntl_t program_cntl_; + uint32_t register_count_; // Accumulated translation errors. std::vector errors_; diff --git a/src/xenia/gpu/spirv/compiler.cc b/src/xenia/gpu/spirv/compiler.cc new file mode 100644 index 000000000..d31b36996 --- /dev/null +++ b/src/xenia/gpu/spirv/compiler.cc @@ -0,0 +1,36 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. 
* + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/spirv/compiler.h" + +namespace xe { +namespace gpu { +namespace spirv { + +Compiler::Compiler() {} + +void Compiler::AddPass(std::unique_ptr pass) { + compiler_passes_.push_back(std::move(pass)); +} + +bool Compiler::Compile(spv::Module* module) { + for (auto& pass : compiler_passes_) { + if (!pass->Run(module)) { + return false; + } + } + + return true; +} + +void Compiler::Reset() { compiler_passes_.clear(); } + +} // namespace spirv +} // namespace gpu +} // namespace xe \ No newline at end of file diff --git a/src/xenia/gpu/spirv/compiler.h b/src/xenia/gpu/spirv/compiler.h new file mode 100644 index 000000000..fd27969ee --- /dev/null +++ b/src/xenia/gpu/spirv/compiler.h @@ -0,0 +1,41 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_SPIRV_COMPILER_H_ +#define XENIA_GPU_SPIRV_COMPILER_H_ + +#include "xenia/base/arena.h" +#include "xenia/gpu/spirv/compiler_pass.h" + +#include "third_party/glslang-spirv/SpvBuilder.h" +#include "third_party/spirv/GLSL.std.450.hpp11" + +namespace xe { +namespace gpu { +namespace spirv { + +// SPIR-V Compiler. Designed to optimize SPIR-V code before feeding it into the +// drivers. +class Compiler { + public: + Compiler(); + + void AddPass(std::unique_ptr pass); + void Reset(); + bool Compile(spv::Module* module); + + private: + std::vector> compiler_passes_; +}; + +} // namespace spirv +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_SPIRV_COMPILER_H_ \ No newline at end of file diff --git a/src/xenia/gpu/spirv/compiler_pass.h b/src/xenia/gpu/spirv/compiler_pass.h new file mode 100644 index 000000000..0d81aeeee --- /dev/null +++ b/src/xenia/gpu/spirv/compiler_pass.h @@ -0,0 +1,37 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_SPIRV_COMPILER_PASS_H_ +#define XENIA_GPU_SPIRV_COMPILER_PASS_H_ + +#include "xenia/base/arena.h" + +#include "third_party/glslang-spirv/SpvBuilder.h" +#include "third_party/spirv/GLSL.std.450.hpp11" + +namespace xe { +namespace gpu { +namespace spirv { + +class CompilerPass { + public: + CompilerPass() = default; + virtual ~CompilerPass() {} + + virtual bool Run(spv::Module* module) = 0; + + private: + xe::Arena ir_arena_; +}; + +} // namespace spirv +} // namespace gpu +} // namespace xe + +#endif \ No newline at end of file diff --git a/src/xenia/gpu/spirv/passes/control_flow_analysis_pass.cpp b/src/xenia/gpu/spirv/passes/control_flow_analysis_pass.cpp new file mode 100644 index 000000000..4d719f769 --- /dev/null +++ b/src/xenia/gpu/spirv/passes/control_flow_analysis_pass.cpp @@ -0,0 +1,30 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/spirv/passes/control_flow_analysis_pass.h" + +namespace xe { +namespace gpu { +namespace spirv { + +ControlFlowAnalysisPass::ControlFlowAnalysisPass() {} + +bool ControlFlowAnalysisPass::Run(spv::Module* module) { + for (auto function : module->getFunctions()) { + // For each OpBranchConditional, see if we can find a point where control + // flow converges and then append an OpSelectionMerge. + // Potential problems: while loops constructed from branch instructions + } + + return true; +} + +} // namespace spirv +} // namespace gpu +} // namespace xe \ No newline at end of file diff --git a/src/xenia/gpu/spirv/passes/control_flow_analysis_pass.h b/src/xenia/gpu/spirv/passes/control_flow_analysis_pass.h new file mode 100644 index 000000000..6b279e251 --- /dev/null +++ b/src/xenia/gpu/spirv/passes/control_flow_analysis_pass.h @@ -0,0 +1,34 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_SPIRV_PASSES_CONTROL_FLOW_ANALYSIS_PASS_H_ +#define XENIA_GPU_SPIRV_PASSES_CONTROL_FLOW_ANALYSIS_PASS_H_ + +#include "xenia/gpu/spirv/compiler_pass.h" + +namespace xe { +namespace gpu { +namespace spirv { + +// Control-flow analysis pass. Runs through control-flow and adds merge opcodes +// where necessary. 
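`Compiler` above is a thin pass manager: `Compile()` runs passes in registration order and stops at the first `false`. A hypothetical pass showing the `CompilerPass` contract:

```cpp
#include <memory>

#include "xenia/gpu/spirv/compiler.h"

namespace xe {
namespace gpu {
namespace spirv {

// Illustrative no-op pass; not part of the diff.
class NopPass : public CompilerPass {
 public:
  bool Run(spv::Module* module) override {
    // A real pass would walk and rewrite the module here. Returning false
    // aborts Compiler::Compile() early.
    return true;
  }
};

}  // namespace spirv
}  // namespace gpu
}  // namespace xe

// Registration mirrors SpirvShaderTranslator's constructor:
//   compiler_.AddPass(std::make_unique<xe::gpu::spirv::NopPass>());
```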
+class ControlFlowAnalysisPass : public CompilerPass { + public: + ControlFlowAnalysisPass(); + + bool Run(spv::Module* module) override; + + private: +}; + +} // namespace spirv +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_SPIRV_PASSES_CONTROL_FLOW_ANALYSIS_PASS_H_ \ No newline at end of file diff --git a/src/xenia/gpu/spirv/passes/control_flow_simplification_pass.cc b/src/xenia/gpu/spirv/passes/control_flow_simplification_pass.cc new file mode 100644 index 000000000..7b01aa5aa --- /dev/null +++ b/src/xenia/gpu/spirv/passes/control_flow_simplification_pass.cc @@ -0,0 +1,48 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/spirv/passes/control_flow_simplification_pass.h" + +namespace xe { +namespace gpu { +namespace spirv { + +ControlFlowSimplificationPass::ControlFlowSimplificationPass() {} + +bool ControlFlowSimplificationPass::Run(spv::Module* module) { + for (auto function : module->getFunctions()) { + // Walk through the blocks in the function and merge any blocks which are + // unconditionally dominated. + for (auto it = function->getBlocks().end() - 1; + it != function->getBlocks().begin() - 1;) { + auto block = *it; + if (!block->isUnreachable() && block->getPredecessors().size() == 1) { + auto prev_block = block->getPredecessors()[0]; + auto last_instr = + prev_block->getInstruction(prev_block->getInstructionCount() - 1); + if (last_instr->getOpCode() == spv::Op::OpBranch) { + if (prev_block->getSuccessors().size() == 1 && + prev_block->getSuccessors()[0] == block) { + // We're dominated by this block. Merge into it. + prev_block->merge(block); + block->setUnreachable(); + } + } + } + + --it; + } + } + + return true; +} + +} // namespace spirv +} // namespace gpu +} // namespace xe \ No newline at end of file diff --git a/src/xenia/gpu/spirv/passes/control_flow_simplification_pass.h b/src/xenia/gpu/spirv/passes/control_flow_simplification_pass.h new file mode 100644 index 000000000..f851d24f1 --- /dev/null +++ b/src/xenia/gpu/spirv/passes/control_flow_simplification_pass.h @@ -0,0 +1,34 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_SPIRV_PASSES_CONTROL_FLOW_SIMPLIFICATION_PASS_H_ +#define XENIA_GPU_SPIRV_PASSES_CONTROL_FLOW_SIMPLIFICATION_PASS_H_ + +#include "xenia/gpu/spirv/compiler_pass.h" + +namespace xe { +namespace gpu { +namespace spirv { + +// Control-flow simplification pass. Combines adjacent blocks and marks +// any unreachable blocks. 
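The simplification pass above folds a block into its predecessor under three conditions; isolated as a predicate using the same forked-builder block accessors the pass calls:

```cpp
#include "third_party/glslang-spirv/SpvBuilder.h"

// A reachable block can merge into its predecessor when it has exactly one
// predecessor, that predecessor ends in an unconditional OpBranch, and
// this block is the predecessor's only successor.
bool CanMergeIntoPredecessor(spv::Block* block) {
  if (block->isUnreachable() || block->getPredecessors().size() != 1) {
    return false;
  }
  auto prev_block = block->getPredecessors()[0];
  auto last_instr =
      prev_block->getInstruction(prev_block->getInstructionCount() - 1);
  return last_instr->getOpCode() == spv::Op::OpBranch &&
         prev_block->getSuccessors().size() == 1 &&
         prev_block->getSuccessors()[0] == block;
}
```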
+class ControlFlowSimplificationPass : public CompilerPass { + public: + ControlFlowSimplificationPass(); + + bool Run(spv::Module* module) override; + + private: +}; + +} // namespace spirv +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_SPIRV_PASSES_CONTROL_FLOW_SIMPLIFICATION_PASS_H_ \ No newline at end of file diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index bdd4c7e97..229951c8e 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -2,7 +2,7 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2015 Ben Vanik. All rights reserved. * + * Copyright 2016 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ @@ -12,16 +12,24 @@ #include #include "xenia/base/logging.h" +#include "xenia/gpu/spirv/passes/control_flow_analysis_pass.h" +#include "xenia/gpu/spirv/passes/control_flow_simplification_pass.h" namespace xe { namespace gpu { using namespace ucode; +constexpr uint32_t kMaxInterpolators = 16; +constexpr uint32_t kMaxTemporaryRegisters = 64; + using spv::GLSLstd450; using spv::Id; using spv::Op; -SpirvShaderTranslator::SpirvShaderTranslator() = default; +SpirvShaderTranslator::SpirvShaderTranslator() { + compiler_.AddPass(std::make_unique()); + compiler_.AddPass(std::make_unique()); +} SpirvShaderTranslator::~SpirvShaderTranslator() = default; @@ -39,6 +47,7 @@ void SpirvShaderTranslator::StartTranslation() { spv::MemoryModel::MemoryModelGLSL450); b.addCapability(spv::Capability::CapabilityShader); b.addCapability(spv::Capability::CapabilityGenericPointer); + if (is_vertex_shader()) { b.addCapability(spv::Capability::CapabilityClipDistance); b.addCapability(spv::Capability::CapabilityCullDistance); @@ -48,18 +57,18 @@ void SpirvShaderTranslator::StartTranslation() { } spv::Block* function_block = nullptr; - translated_main_ = b.makeFunctionEntry(spv::Decoration::DecorationInvariant, - b.makeVoidType(), "translated_main", - {}, {}, &function_block); + translated_main_ = + b.makeFunctionEntry(spv::NoPrecision, b.makeVoidType(), "translated_main", + {}, {}, &function_block); bool_type_ = b.makeBoolType(); float_type_ = b.makeFloatType(32); int_type_ = b.makeIntType(32); - Id uint_type = b.makeUintType(32); + uint_type_ = b.makeUintType(32); vec2_float_type_ = b.makeVectorType(float_type_, 2); vec3_float_type_ = b.makeVectorType(float_type_, 3); vec4_float_type_ = b.makeVectorType(float_type_, 4); - vec4_uint_type_ = b.makeVectorType(uint_type, 4); + vec4_uint_type_ = b.makeVectorType(uint_type_, 4); vec4_bool_type_ = b.makeVectorType(bool_type_, 4); vec4_float_one_ = b.makeCompositeConstant( @@ -71,8 +80,8 @@ void SpirvShaderTranslator::StartTranslation() { std::vector({b.makeFloatConstant(0.f), b.makeFloatConstant(0.f), b.makeFloatConstant(0.f), b.makeFloatConstant(0.f)})); - registers_type_ = - b.makeArrayType(vec4_float_type_, b.makeUintConstant(64), 0); + registers_type_ = b.makeArrayType(vec4_float_type_, + b.makeUintConstant(register_count()), 0); registers_ptr_ = b.createVariable(spv::StorageClass::StorageClassFunction, registers_type_, "r"); @@ -85,16 +94,14 @@ void SpirvShaderTranslator::StartTranslation() { "ps"); pv_ = 
b.createVariable(spv::StorageClass::StorageClassFunction, vec4_float_type_, "pv"); - a0_ = b.createVariable(spv::StorageClass::StorageClassFunction, - b.makeUintType(32), "a0"); + a0_ = b.createVariable(spv::StorageClass::StorageClassFunction, int_type_, + "a0"); // Uniform constants. Id float_consts_type = b.makeArrayType(vec4_float_type_, b.makeUintConstant(512), 1); - Id loop_consts_type = - b.makeArrayType(b.makeUintType(32), b.makeUintConstant(32), 1); - Id bool_consts_type = - b.makeArrayType(b.makeUintType(32), b.makeUintConstant(8), 1); + Id loop_consts_type = b.makeArrayType(uint_type_, b.makeUintConstant(32), 1); + Id bool_consts_type = b.makeArrayType(uint_type_, b.makeUintConstant(8), 1); Id consts_struct_type = b.makeStructType( {float_consts_type, loop_consts_type, bool_consts_type}, "consts_type"); @@ -136,7 +143,7 @@ void SpirvShaderTranslator::StartTranslation() { // Push constants, represented by SpirvPushConstants. Id push_constants_type = b.makeStructType( - {vec4_float_type_, vec4_float_type_, vec4_float_type_, uint_type}, + {vec4_float_type_, vec4_float_type_, vec4_float_type_, uint_type_}, "push_consts_type"); b.addDecoration(push_constants_type, spv::Decoration::DecorationBlock); @@ -164,48 +171,35 @@ void SpirvShaderTranslator::StartTranslation() { push_constants_type, "push_consts"); // Texture bindings - Id img_t[] = { - b.makeImageType(float_type_, spv::Dim::Dim1D, false, false, false, 1, - spv::ImageFormat::ImageFormatUnknown), - b.makeImageType(float_type_, spv::Dim::Dim2D, false, false, false, 1, - spv::ImageFormat::ImageFormatUnknown), - b.makeImageType(float_type_, spv::Dim::Dim3D, false, false, false, 1, - spv::ImageFormat::ImageFormatUnknown), - b.makeImageType(float_type_, spv::Dim::DimCube, false, false, false, 1, - spv::ImageFormat::ImageFormatUnknown)}; - Id samplers_t = b.makeSamplerType(); + Id tex_t[] = {b.makeSampledImageType(b.makeImageType( + float_type_, spv::Dim::Dim1D, false, false, false, 1, + spv::ImageFormat::ImageFormatUnknown)), + b.makeSampledImageType(b.makeImageType( + float_type_, spv::Dim::Dim2D, false, false, false, 1, + spv::ImageFormat::ImageFormatUnknown)), + b.makeSampledImageType(b.makeImageType( + float_type_, spv::Dim::Dim3D, false, false, false, 1, + spv::ImageFormat::ImageFormatUnknown)), + b.makeSampledImageType(b.makeImageType( + float_type_, spv::Dim::DimCube, false, false, false, 1, + spv::ImageFormat::ImageFormatUnknown))}; - Id img_a_t[] = {b.makeArrayType(img_t[0], b.makeUintConstant(32), 0), - b.makeArrayType(img_t[1], b.makeUintConstant(32), 0), - b.makeArrayType(img_t[2], b.makeUintConstant(32), 0), - b.makeArrayType(img_t[3], b.makeUintConstant(32), 0)}; - Id samplers_a = b.makeArrayType(samplers_t, b.makeUintConstant(32), 0); - - Id img_s[] = { - b.makeStructType({img_a_t[0]}, "img1D_type"), - b.makeStructType({img_a_t[1]}, "img2D_type"), - b.makeStructType({img_a_t[2]}, "img3D_type"), - b.makeStructType({img_a_t[3]}, "imgCube_type"), - }; - Id samplers_s = b.makeStructType({samplers_a}, "samplers_type"); + Id tex_a_t[] = {b.makeArrayType(tex_t[0], b.makeUintConstant(32), 0), + b.makeArrayType(tex_t[1], b.makeUintConstant(32), 0), + b.makeArrayType(tex_t[2], b.makeUintConstant(32), 0), + b.makeArrayType(tex_t[3], b.makeUintConstant(32), 0)}; for (int i = 0; i < 4; i++) { - img_[i] = b.createVariable(spv::StorageClass::StorageClassUniformConstant, - img_s[i], - xe::format_string("images%dD", i + 1).c_str()); - b.addDecoration(img_[i], spv::Decoration::DecorationBlock); - b.addDecoration(img_[i], 
spv::Decoration::DecorationDescriptorSet, 1); - b.addDecoration(img_[i], spv::Decoration::DecorationBinding, i + 1); + tex_[i] = b.createVariable(spv::StorageClass::StorageClassUniformConstant, + tex_a_t[i], + xe::format_string("textures%dD", i + 1).c_str()); + b.addDecoration(tex_[i], spv::Decoration::DecorationDescriptorSet, 1); + b.addDecoration(tex_[i], spv::Decoration::DecorationBinding, i); } - samplers_ = b.createVariable(spv::StorageClass::StorageClassUniformConstant, - samplers_s, "samplers"); - b.addDecoration(samplers_, spv::Decoration::DecorationBlock); - b.addDecoration(samplers_, spv::Decoration::DecorationDescriptorSet, 1); - b.addDecoration(samplers_, spv::Decoration::DecorationBinding, 0); // Interpolators. - Id interpolators_type = - b.makeArrayType(vec4_float_type_, b.makeUintConstant(16), 0); + Id interpolators_type = b.makeArrayType( + vec4_float_type_, b.makeUintConstant(kMaxInterpolators), 0); if (is_vertex_shader()) { // Vertex inputs/outputs. for (const auto& binding : vertex_bindings()) { @@ -247,47 +241,132 @@ void SpirvShaderTranslator::StartTranslation() { b.addDecoration(attrib_var, spv::Decoration::DecorationLocation, attrib.attrib_index); - vertex_binding_map_[binding.fetch_constant][attrib.fetch_instr - .attributes.offset] = - attrib_var; + vertex_binding_map_[binding.fetch_constant] + [attrib.fetch_instr.attributes.offset] = attrib_var; } } interpolators_ = b.createVariable(spv::StorageClass::StorageClassOutput, interpolators_type, "interpolators"); - b.addDecoration(interpolators_, spv::Decoration::DecorationNoPerspective); b.addDecoration(interpolators_, spv::Decoration::DecorationLocation, 0); + for (uint32_t i = 0; i < std::min(register_count(), kMaxInterpolators); + i++) { + // Zero interpolators. + auto ptr = b.createAccessChain(spv::StorageClass::StorageClassOutput, + interpolators_, + std::vector({b.makeUintConstant(i)})); + b.createStore(vec4_float_zero_, ptr); + } pos_ = b.createVariable(spv::StorageClass::StorageClassOutput, vec4_float_type_, "gl_Position"); b.addDecoration(pos_, spv::Decoration::DecorationBuiltIn, spv::BuiltIn::BuiltInPosition); + + vertex_id_ = b.createVariable(spv::StorageClass::StorageClassInput, + int_type_, "gl_VertexId"); + b.addDecoration(vertex_id_, spv::Decoration::DecorationBuiltIn, + spv::BuiltIn::BuiltInVertexId); + + auto vertex_id = b.createLoad(vertex_id_); + vertex_id = b.createUnaryOp(spv::Op::OpConvertSToF, float_type_, vertex_id); + auto r0_ptr = b.createAccessChain(spv::StorageClass::StorageClassFunction, + registers_ptr_, + std::vector({b.makeUintConstant(0)})); + auto r0 = b.createLoad(r0_ptr); + r0 = b.createCompositeInsert(vertex_id, r0, vec4_float_type_, + std::vector({0})); + b.createStore(r0, r0_ptr); } else { // Pixel inputs from vertex shader. interpolators_ = b.createVariable(spv::StorageClass::StorageClassInput, interpolators_type, "interpolators"); - b.addDecoration(interpolators_, spv::Decoration::DecorationNoPerspective); b.addDecoration(interpolators_, spv::Decoration::DecorationLocation, 0); // Pixel fragment outputs (one per render target). 
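On the vertex path above, `r0.x` is pre-seeded with `gl_VertexId` converted to float before the translated code runs. The same sequence in isolation, mirroring the builder calls used in `StartTranslation()` (the `Id` parameters stand in for the translator's members):

```cpp
#include <vector>

#include "third_party/glslang-spirv/SpvBuilder.h"

// Load gl_VertexId, convert signed int -> float, and insert it into
// component 0 of register r0.
void SeedVertexIndex(spv::Builder& b, spv::Id vertex_id_var,
                     spv::Id registers_ptr, spv::Id float_type,
                     spv::Id vec4_float_type) {
  spv::Id vid = b.createLoad(vertex_id_var);
  vid = b.createUnaryOp(spv::Op::OpConvertSToF, float_type, vid);
  spv::Id r0_ptr = b.createAccessChain(
      spv::StorageClass::StorageClassFunction, registers_ptr,
      std::vector<spv::Id>({b.makeUintConstant(0)}));
  spv::Id r0 = b.createLoad(r0_ptr);
  r0 = b.createCompositeInsert(vid, r0, vec4_float_type,
                               std::vector<unsigned int>({0}));
  b.createStore(r0, r0_ptr);
}
```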
Id frag_outputs_type = b.makeArrayType(vec4_float_type_, b.makeUintConstant(4), 0); frag_outputs_ = b.createVariable(spv::StorageClass::StorageClassOutput, - frag_outputs_type, "o"); + frag_outputs_type, "oC"); b.addDecoration(frag_outputs_, spv::Decoration::DecorationLocation, 0); + frag_depth_ = b.createVariable(spv::StorageClass::StorageClassOutput, + float_type_, "gl_FragDepth"); + b.addDecoration(frag_depth_, spv::Decoration::DecorationBuiltIn, + spv::BuiltIn::BuiltInFragDepth); + // TODO(benvanik): frag depth, etc. // Copy interpolators to r[0..16]. - b.createNoResultOp(spv::Op::OpCopyMemorySized, - {registers_ptr_, interpolators_, - b.makeUintConstant(16 * 4 * sizeof(float))}); + // TODO: Need physical addressing in order to do this. + // b.createNoResultOp(spv::Op::OpCopyMemorySized, + // {registers_ptr_, interpolators_, + // b.makeUintConstant(16 * 4 * sizeof(float))}); + for (uint32_t i = 0; i < std::min(register_count(), kMaxInterpolators); + i++) { + // For now, copy interpolators register-by-register :/ + auto idx = b.makeUintConstant(i); + auto i_a = b.createAccessChain(spv::StorageClass::StorageClassInput, + interpolators_, std::vector({idx})); + auto r_a = b.createAccessChain(spv::StorageClass::StorageClassFunction, + registers_ptr_, std::vector({idx})); + b.createNoResultOp(spv::Op::OpCopyMemory, std::vector({r_a, i_a})); + } + + // Setup ps_param_gen + auto ps_param_gen_idx_ptr = b.createAccessChain( + spv::StorageClass::StorageClassPushConstant, push_consts_, + std::vector({b.makeUintConstant(3)})); + auto ps_param_gen_idx = b.createLoad(ps_param_gen_idx_ptr); + + auto frag_coord = b.createVariable(spv::StorageClass::StorageClassInput, + vec4_float_type_, "gl_FragCoord"); + b.addDecoration(frag_coord, spv::Decoration::DecorationBuiltIn, + spv::BuiltIn::BuiltInFragCoord); + + auto point_coord = b.createVariable(spv::StorageClass::StorageClassInput, + vec2_float_type_, "gl_PointCoord"); + b.addDecoration(point_coord, spv::Decoration::DecorationBuiltIn, + spv::BuiltIn::BuiltInPointCoord); + auto param = b.createOp(spv::Op::OpVectorShuffle, vec4_float_type_, + {frag_coord, point_coord, 0, 1, 4, 5}); + /* + // TODO: gl_FrontFacing + auto param_x = b.createCompositeExtract(param, float_type_, 0); + auto param_x_inv = b.createBinOp(spv::Op::OpFMul, float_type_, param_x, + b.makeFloatConstant(-1.f)); + param_x = b.createCompositeInsert(param_x_inv, param, vec4_float_type_, 0); + */ + + auto cond = b.createBinOp(spv::Op::OpINotEqual, bool_type_, + ps_param_gen_idx, b.makeUintConstant(-1)); + spv::Builder::If ifb(cond, b); + + // FYI: We do this instead of r[ps_param_gen_idx] because that causes + // nvidia to move all registers into local memory (slow!) + for (uint32_t i = 0; i < std::min(register_count(), kMaxInterpolators); + i++) { + auto reg_ptr = b.createAccessChain( + spv::StorageClass::StorageClassFunction, registers_ptr_, + std::vector({b.makeUintConstant(i)})); + + auto cond = b.createBinOp(spv::Op::OpIEqual, bool_type_, ps_param_gen_idx, + b.makeUintConstant(i)); + auto reg = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, cond, param, + b.createLoad(reg_ptr)); + b.createStore(reg, reg_ptr); + } + + ifb.makeEndIf(); } } std::vector SpirvShaderTranslator::CompleteTranslation() { auto& b = *builder_; + assert_false(open_predicated_block_); + auto block = &b.makeNewBlock(); + b.createBranch(block); b.makeReturn(false); // main() entry point. 
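The ps_param_gen store above deliberately avoids `r[ps_param_gen_idx]`: dynamic indexing pushed the whole register array into local memory on NVIDIA, so each register is conditionally overwritten with `OpSelect` instead. The loop body, mirroring the calls above:

```cpp
#include <vector>

#include "third_party/glslang-spirv/SpvBuilder.h"

// For each register i: r[i] = (ps_param_gen_idx == i) ? param : r[i].
void StoreParamGen(spv::Builder& b, spv::Id bool_type, spv::Id vec4_float_type,
                   spv::Id registers_ptr, spv::Id ps_param_gen_idx,
                   spv::Id param, uint32_t register_count) {
  for (uint32_t i = 0; i < register_count; ++i) {
    spv::Id reg_ptr = b.createAccessChain(
        spv::StorageClass::StorageClassFunction, registers_ptr,
        std::vector<spv::Id>({b.makeUintConstant(i)}));
    spv::Id cond = b.createBinOp(spv::Op::OpIEqual, bool_type,
                                 ps_param_gen_idx, b.makeUintConstant(i));
    spv::Id reg = b.createTriOp(spv::Op::OpSelect, vec4_float_type, cond,
                                param, b.createLoad(reg_ptr));
    b.createStore(reg, reg_ptr);
  }
}
```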
@@ -330,8 +409,7 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { p_w = b.createTriOp(spv::Op::OpSelect, float_type_, c_w, p_w, p_w_inv); // pos.xyz = vtx_fmt.xyz != 0.0 ? pos.xyz / pos.w : pos.xyz - auto p_all_w = b.smearScalar(spv::Decoration::DecorationInvariant, p_w, - vec4_float_type_); + auto p_all_w = b.smearScalar(spv::NoPrecision, p_w, vec4_float_type_); auto p_inv = b.createBinOp(spv::Op::OpFDiv, vec4_float_type_, p, p_all_w); p = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c, p_inv, p); @@ -346,10 +424,66 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { {p, p_scaled, 4, 5, 2, 3}); b.createStore(p, pos_); + } else { + // Alpha test + auto alpha_test_enabled = b.createCompositeExtract( + push_consts_, float_type_, std::vector{2, 0}); + auto alpha_test_func = b.createCompositeExtract( + push_consts_, float_type_, std::vector{2, 1}); + auto alpha_test_ref = b.createCompositeExtract(push_consts_, float_type_, + std::vector{2, 2}); + alpha_test_func = + b.createUnaryOp(spv::Op::OpConvertFToU, uint_type_, alpha_test_func); + auto oC0_alpha = b.createCompositeExtract(frag_outputs_, float_type_, + std::vector({0, 3})); + + auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, + alpha_test_enabled, b.makeFloatConstant(1.f)); + spv::Builder::If alpha_if(cond, b); + + std::vector switch_segments; + b.makeSwitch(alpha_test_func, 8, std::vector({0, 1, 2, 3, 4, 5, 6, 7}), + std::vector({0, 1, 2, 3, 4, 5, 6, 7}), 7, + switch_segments); + + const static spv::Op alpha_op_map[] = { + spv::Op::OpNop, + spv::Op::OpFOrdGreaterThanEqual, + spv::Op::OpFOrdNotEqual, + spv::Op::OpFOrdGreaterThan, + spv::Op::OpFOrdLessThanEqual, + spv::Op::OpFOrdEqual, + spv::Op::OpFOrdLessThan, + spv::Op::OpNop, + }; + + // if (alpha_func == 0) passes = false; + b.nextSwitchSegment(switch_segments, 0); + b.makeDiscard(); + b.addSwitchBreak(); + + for (int i = 1; i < 7; i++) { + b.nextSwitchSegment(switch_segments, i); + auto cond = + b.createBinOp(alpha_op_map[i], bool_type_, oC0_alpha, alpha_test_ref); + spv::Builder::If discard_if(cond, b); + b.makeDiscard(); + discard_if.makeEndIf(); + b.addSwitchBreak(); + } + + // if (alpha_func == 7) passes = true; + b.nextSwitchSegment(switch_segments, 7); + b.endSwitch(switch_segments); + + alpha_if.makeEndIf(); } b.makeReturn(false); + // Compile the spv IR + compiler_.Compile(b.getModule()); + std::vector spirv_words; b.dump(spirv_words); @@ -365,28 +499,55 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { } void SpirvShaderTranslator::PostTranslation(Shader* shader) { + // Validation. + // TODO(DrChat): Only do this if a flag is set (this is pretty slow). + auto validation = validator_.Validate( + reinterpret_cast(shader->translated_binary().data()), + shader->translated_binary().size() / 4); + if (validation->has_error()) { + XELOGE("SPIR-V Shader Validation failed! Error: %s", + validation->error_string()); + } + // TODO(benvanik): only if needed? could be slowish. 
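The alpha-test switch above follows the usual 0..7 compare-function ordering (0 = never, 7 = always), and `alpha_op_map` stores the *negation* of each pass comparison, since a successful compare triggers the discard. A scalar restatement:

```cpp
#include <cstdint>

// Returns true if the fragment should be discarded for the given Xenos
// alpha compare function. Func 1 ("pass if alpha < ref") discards on
// alpha >= ref, and so on; 0 always discards, 7 never does.
bool AlphaTestDiscards(uint32_t func, float alpha, float ref) {
  switch (func) {
    case 0: return true;
    case 1: return alpha >= ref;
    case 2: return alpha != ref;
    case 3: return alpha > ref;
    case 4: return alpha <= ref;
    case 5: return alpha == ref;
    case 6: return alpha < ref;
    default: return false;  // 7: always pass
  }
}
```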
auto disasm = disassembler_.Disassemble( reinterpret_cast(shader->translated_binary().data()), shader->translated_binary().size() / 4); if (disasm->has_error()) { XELOGE("Failed to disassemble SPIRV - invalid?"); - return; + } else { + set_host_disassembly(shader, disasm->to_string()); } - set_host_disassembly(shader, disasm->to_string()); } void SpirvShaderTranslator::PreProcessControlFlowInstruction( - uint32_t cf_index) { + uint32_t cf_index, const ControlFlowInstruction& instr) { auto& b = *builder_; - cf_blocks_[cf_index] = &b.makeNewBlock(); + if (cf_blocks_.find(cf_index) == cf_blocks_.end()) { + CFBlock block; + block.block = &b.makeNewBlock(); + cf_blocks_[cf_index] = block; + } else { + cf_blocks_[cf_index].block = &b.makeNewBlock(); + } + + if (instr.opcode() == ControlFlowOpcode::kCondJmp) { + auto cf_block = cf_blocks_.find(instr.cond_jmp.address()); + if (cf_block == cf_blocks_.end()) { + CFBlock block; + block.prev_dominates = false; + cf_blocks_[instr.cond_jmp.address()] = block; + } else { + cf_block->second.prev_dominates = false; + } + } else if (instr.opcode() == ControlFlowOpcode::kLoopStart) { + // TODO + } } void SpirvShaderTranslator::ProcessLabel(uint32_t cf_index) { auto& b = *builder_; - - EmitUnimplementedTranslationError(); } void SpirvShaderTranslator::ProcessControlFlowInstructionBegin( @@ -395,7 +556,7 @@ void SpirvShaderTranslator::ProcessControlFlowInstructionBegin( if (cf_index == 0) { // Kind of cheaty, but emit a branch to the first block. - b.createBranch(cf_blocks_[cf_index]); + b.createBranch(cf_blocks_[cf_index].block); } } @@ -407,15 +568,20 @@ void SpirvShaderTranslator::ProcessControlFlowInstructionEnd( void SpirvShaderTranslator::ProcessControlFlowNopInstruction() { auto& b = *builder_; - b.createNoResultOp(spv::Op::OpNop); + // b.createNoResultOp(spv::Op::OpNop); } void SpirvShaderTranslator::ProcessExecInstructionBegin( const ParsedExecInstruction& instr) { auto& b = *builder_; + assert_false(open_predicated_block_); + open_predicated_block_ = false; + predicated_block_cond_ = false; + predicated_block_end_ = nullptr; + // Head has the logic to check if the body should execute. - auto head = cf_blocks_[instr.dword_index]; + auto head = cf_blocks_[instr.dword_index].block; b.setBuildPoint(head); auto body = head; switch (instr.type) { @@ -432,24 +598,46 @@ void SpirvShaderTranslator::ProcessExecInstructionBegin( v = b.createLoad(v); // Bitfield extract the bool constant. - v = b.createTriOp(spv::Op::OpBitFieldUExtract, b.makeUintType(32), v, + // FIXME: NVidia's compiler seems to be broken on this instruction? + /* + v = b.createTriOp(spv::Op::OpBitFieldUExtract, uint_type_, v, b.makeUintConstant(instr.bool_constant_index % 32), b.makeUintConstant(1)); + auto cond = b.createBinOp(spv::Op::OpIEqual, bool_type_, v, + b.makeUintConstant(instr.condition ? 1 : 0)); + */ + v = b.createBinOp( + spv::Op::OpBitwiseAnd, uint_type_, v, + b.makeUintConstant(1 << (instr.bool_constant_index % 32))); + auto cond = b.createBinOp( + instr.condition ? 
spv::Op::OpINotEqual : spv::Op::OpIEqual, + bool_type_, v, b.makeUintConstant(0)); + // Conditional branch assert_true(cf_blocks_.size() > instr.dword_index + 1); body = &b.makeNewBlock(); - auto cond = b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, v, - b.makeBoolConstant(instr.condition)); - b.createConditionalBranch(cond, body, cf_blocks_[instr.dword_index + 1]); + + auto next_block = cf_blocks_[instr.dword_index + 1]; + if (next_block.prev_dominates) { + b.createSelectionMerge(next_block.block, spv::SelectionControlMaskNone); + } + b.createConditionalBranch(cond, body, next_block.block); } break; case ParsedExecInstruction::Type::kPredicated: { // Branch based on p0. assert_true(cf_blocks_.size() > instr.dword_index + 1); body = &b.makeNewBlock(); - auto cond = b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, p0_, - b.makeBoolConstant(instr.condition)); - b.createConditionalBranch(cond, body, cf_blocks_[instr.dword_index + 1]); + auto cond = + b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_), + b.makeBoolConstant(instr.condition)); + + auto next_block = cf_blocks_[instr.dword_index + 1]; + if (next_block.prev_dominates) { + b.createSelectionMerge(next_block.block, spv::SelectionControlMaskNone); + } + b.createConditionalBranch(cond, body, next_block.block); + } break; } b.setBuildPoint(body); @@ -459,11 +647,19 @@ void SpirvShaderTranslator::ProcessExecInstructionEnd( const ParsedExecInstruction& instr) { auto& b = *builder_; + if (open_predicated_block_) { + b.createBranch(predicated_block_end_); + b.setBuildPoint(predicated_block_end_); + open_predicated_block_ = false; + predicated_block_cond_ = false; + predicated_block_end_ = nullptr; + } + if (instr.is_end) { b.makeReturn(false); } else { assert_true(cf_blocks_.size() > instr.dword_index + 1); - b.createBranch(cf_blocks_[instr.dword_index + 1]); + b.createBranch(cf_blocks_[instr.dword_index + 1].block); } } @@ -471,7 +667,7 @@ void SpirvShaderTranslator::ProcessLoopStartInstruction( const ParsedLoopStartInstruction& instr) { auto& b = *builder_; - auto head = cf_blocks_[instr.dword_index]; + auto head = cf_blocks_[instr.dword_index].block; b.setBuildPoint(head); // TODO: Emit a spv LoopMerge @@ -480,46 +676,50 @@ void SpirvShaderTranslator::ProcessLoopStartInstruction( EmitUnimplementedTranslationError(); assert_true(cf_blocks_.size() > instr.dword_index + 1); - b.createBranch(cf_blocks_[instr.dword_index + 1]); + b.createBranch(cf_blocks_[instr.dword_index + 1].block); } void SpirvShaderTranslator::ProcessLoopEndInstruction( const ParsedLoopEndInstruction& instr) { auto& b = *builder_; - auto head = cf_blocks_[instr.dword_index]; + auto head = cf_blocks_[instr.dword_index].block; b.setBuildPoint(head); EmitUnimplementedTranslationError(); assert_true(cf_blocks_.size() > instr.dword_index + 1); - b.createBranch(cf_blocks_[instr.dword_index + 1]); + b.createBranch(cf_blocks_[instr.dword_index + 1].block); } void SpirvShaderTranslator::ProcessCallInstruction( const ParsedCallInstruction& instr) { auto& b = *builder_; - auto head = cf_blocks_[instr.dword_index]; + auto head = cf_blocks_[instr.dword_index].block; b.setBuildPoint(head); + // Unused instruction(?) 
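The exec and jump paths above both replace `OpBitFieldUExtract` (noted as miscompiling on NVIDIA's driver at the time) with a mask-and-compare on the packed bool constants. The scalar equivalent:

```cpp
#include <cstdint>

// Test one bit of a packed 32-bool word: AND with a single-bit mask and
// compare against zero, honoring the instruction's expected condition.
bool TestBoolConstant(uint32_t packed_bools, uint32_t constant_index,
                      bool condition) {
  uint32_t bit = packed_bools & (1u << (constant_index % 32));
  return condition ? (bit != 0) : (bit == 0);
}
```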
+  assert_always();
   EmitUnimplementedTranslationError();
 
   assert_true(cf_blocks_.size() > instr.dword_index + 1);
-  b.createBranch(cf_blocks_[instr.dword_index + 1]);
+  b.createBranch(cf_blocks_[instr.dword_index + 1].block);
 }
 
 void SpirvShaderTranslator::ProcessReturnInstruction(
     const ParsedReturnInstruction& instr) {
   auto& b = *builder_;
 
-  auto head = cf_blocks_[instr.dword_index];
+  auto head = cf_blocks_[instr.dword_index].block;
   b.setBuildPoint(head);
 
+  // Unused instruction(?)
+  assert_always();
   EmitUnimplementedTranslationError();
 
   assert_true(cf_blocks_.size() > instr.dword_index + 1);
-  b.createBranch(cf_blocks_[instr.dword_index + 1]);
+  b.createBranch(cf_blocks_[instr.dword_index + 1].block);
 }
 
 // CF jump
@@ -527,13 +727,15 @@ void SpirvShaderTranslator::ProcessJumpInstruction(
     const ParsedJumpInstruction& instr) {
   auto& b = *builder_;
 
-  auto head = cf_blocks_[instr.dword_index];
+  auto head = cf_blocks_[instr.dword_index].block;
   b.setBuildPoint(head);
   switch (instr.type) {
     case ParsedJumpInstruction::Type::kUnconditional: {
-      b.createBranch(cf_blocks_[instr.target_address]);
+      b.createBranch(cf_blocks_[instr.target_address].block);
     } break;
     case ParsedJumpInstruction::Type::kConditional: {
+      assert_true(cf_blocks_.size() > instr.dword_index + 1);
+
       // Based off of bool_consts
       std::vector<Id> offsets;
       offsets.push_back(b.makeUintConstant(2));  // bool_consts
@@ -542,23 +744,35 @@ void SpirvShaderTranslator::ProcessJumpInstruction(
                                    consts_, offsets);
       v = b.createLoad(v);
 
+      // FIXME: NVidia's compiler seems to be broken on this instruction?
+      /*
       // Bitfield extract the bool constant.
-      v = b.createTriOp(spv::Op::OpBitFieldUExtract, b.makeUintType(32), v,
+      v = b.createTriOp(spv::Op::OpBitFieldUExtract, uint_type_, v,
                         b.makeUintConstant(instr.bool_constant_index % 32),
                         b.makeUintConstant(1));
 
       // Conditional branch
-      auto cond = b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, v,
-                                b.makeBoolConstant(instr.condition));
-      b.createConditionalBranch(cond, cf_blocks_[instr.target_address],
-                                cf_blocks_[instr.dword_index]);
+      auto cond = b.createBinOp(spv::Op::OpIEqual, bool_type_, v,
+                                b.makeUintConstant(instr.condition ? 1 : 0));
+      */
+      v = b.createBinOp(
+          spv::Op::OpBitwiseAnd, uint_type_, v,
+          b.makeUintConstant(1 << (instr.bool_constant_index % 32)));
+      auto cond = b.createBinOp(
+          instr.condition ? spv::Op::OpINotEqual : spv::Op::OpIEqual,
+          bool_type_, v, b.makeUintConstant(0));
+
+      b.createConditionalBranch(cond, cf_blocks_[instr.target_address].block,
+                                cf_blocks_[instr.dword_index + 1].block);
     } break;
     case ParsedJumpInstruction::Type::kPredicated: {
       assert_true(cf_blocks_.size() > instr.dword_index + 1);
-      auto cond = b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, p0_,
-                                b.makeBoolConstant(instr.condition));
-      b.createConditionalBranch(cond, cf_blocks_[instr.target_address],
-                                cf_blocks_[instr.dword_index]);
+
+      auto cond =
+          b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_),
+                        b.makeBoolConstant(instr.condition));
+      b.createConditionalBranch(cond, cf_blocks_[instr.target_address].block,
+                                cf_blocks_[instr.dword_index + 1].block);
     } break;
   }
 }
 
@@ -567,7 +781,7 @@ void SpirvShaderTranslator::ProcessAllocInstruction(
     const ParsedAllocInstruction& instr) {
   auto& b = *builder_;
 
-  auto head = cf_blocks_[instr.dword_index];
+  auto head = cf_blocks_[instr.dword_index].block;
   b.setBuildPoint(head);
 
   switch (instr.type) {
@@ -585,24 +799,113 @@ void SpirvShaderTranslator::ProcessAllocInstruction(
   }
 
   assert_true(cf_blocks_.size() > instr.dword_index + 1);
-  b.createBranch(cf_blocks_[instr.dword_index + 1]);
+  b.createBranch(cf_blocks_[instr.dword_index + 1].block);
 }
 
 void SpirvShaderTranslator::ProcessVertexFetchInstruction(
     const ParsedVertexFetchInstruction& instr) {
   auto& b = *builder_;
 
+  assert_true(is_vertex_shader());
+  assert_not_zero(vertex_id_);
 
-  // TODO: instr.is_predicated
+  // Close the open predicated block if this instr isn't predicated or the
+  // conditions do not match.
+  if (open_predicated_block_ &&
+      (!instr.is_predicated ||
+       instr.predicate_condition != predicated_block_cond_)) {
+    b.createBranch(predicated_block_end_);
+    b.setBuildPoint(predicated_block_end_);
+    open_predicated_block_ = false;
+    predicated_block_cond_ = false;
+    predicated_block_end_ = nullptr;
+  }
+
+  if (!open_predicated_block_ && instr.is_predicated) {
+    Id pred_cond =
+        b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_),
+                      b.makeBoolConstant(instr.predicate_condition));
+    auto block = &b.makeNewBlock();
+    open_predicated_block_ = true;
+    predicated_block_cond_ = instr.predicate_condition;
+    predicated_block_end_ = &b.makeNewBlock();
+
+    b.createSelectionMerge(predicated_block_end_,
+                           spv::SelectionControlMaskNone);
+    b.createConditionalBranch(pred_cond, block, predicated_block_end_);
+    b.setBuildPoint(block);
+  }
 
   // Operand 0 is the index
   // Operand 1 is the binding
   // TODO: Indexed fetch
-  auto vertex_ptr =
-      vertex_binding_map_[instr.operands[1].storage_index][instr.attributes
-                                                               .offset];
-  assert_not_zero(vertex_ptr);
+  auto vertex_id = LoadFromOperand(instr.operands[0]);
+  vertex_id = b.createCompositeExtract(vertex_id, float_type_, 0);
+  vertex_id = b.createUnaryOp(spv::Op::OpConvertFToS, int_type_, vertex_id);
+  auto shader_vertex_id = b.createLoad(vertex_id_);
+  auto cond =
+      b.createBinOp(spv::Op::OpIEqual, bool_type_, vertex_id, shader_vertex_id);
+  cond = b.smearScalar(spv::NoPrecision, cond, vec4_bool_type_);
+  // Skip loading if it's an indexed fetch.
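+  // The vertex_id == gl_VertexIndex compare above feeds the OpSelect at the
+  // end of this function: result = cond ? fetched_vertex : (0, ..., 0, 1).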
+  auto vertex_ptr = vertex_binding_map_[instr.operands[1].storage_index]
+                                       [instr.attributes.offset];
+  assert_not_zero(vertex_ptr);
   auto vertex = b.createLoad(vertex_ptr);
+
+  switch (instr.attributes.data_format) {
+    case VertexFormat::k_8_8_8_8:
+    case VertexFormat::k_16_16:
+    case VertexFormat::k_16_16_16_16:
+    case VertexFormat::k_16_16_16_16_FLOAT:
+    case VertexFormat::k_32:
+    case VertexFormat::k_32_32:
+    case VertexFormat::k_32_32_32_32:
+    case VertexFormat::k_32_FLOAT:
+    case VertexFormat::k_32_32_FLOAT:
+    case VertexFormat::k_32_32_32_FLOAT:
+    case VertexFormat::k_32_32_32_32_FLOAT:
+      // These are handled, for now.
+      break;
+
+    case VertexFormat::k_10_11_11: {
+      // No conversion needed. Natively supported.
+    } break;
+
+    case VertexFormat::k_11_11_10: {
+      // This needs to be converted.
+    } break;
+  }
+
+  auto vertex_components = b.getNumComponents(vertex);
+  Id alt_vertex = 0;
+  switch (vertex_components) {
+    case 1:
+      alt_vertex = b.makeFloatConstant(0.f);
+      break;
+    case 2:
+      alt_vertex = b.makeCompositeConstant(
+          vec2_float_type_, std::vector<Id>({b.makeFloatConstant(0.f),
+                                             b.makeFloatConstant(1.f)}));
+      break;
+    case 3:
+      alt_vertex = b.makeCompositeConstant(
+          vec3_float_type_,
+          std::vector<Id>({b.makeFloatConstant(0.f), b.makeFloatConstant(0.f),
+                           b.makeFloatConstant(1.f)}));
+      break;
+    case 4:
+      alt_vertex = b.makeCompositeConstant(
+          vec4_float_type_,
+          std::vector<Id>({b.makeFloatConstant(0.f), b.makeFloatConstant(0.f),
+                           b.makeFloatConstant(0.f),
+                           b.makeFloatConstant(1.f)}));
      break;
+    default:
+      assert_unhandled_case(vertex_components);
+  }
+
+  vertex = b.createTriOp(spv::Op::OpSelect, b.getTypeId(vertex), cond, vertex,
+                         alt_vertex);
   StoreToResult(vertex, instr.result);
 }
 
@@ -610,7 +913,33 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction(
     const ParsedTextureFetchInstruction& instr) {
   auto& b = *builder_;
 
-  // TODO: instr.is_predicated
+  // Close the open predicated block if this instr isn't predicated or the
+  // conditions do not match.
+  if (open_predicated_block_ &&
+      (!instr.is_predicated ||
+       instr.predicate_condition != predicated_block_cond_)) {
+    b.createBranch(predicated_block_end_);
+    b.setBuildPoint(predicated_block_end_);
+    open_predicated_block_ = false;
+    predicated_block_cond_ = false;
+    predicated_block_end_ = nullptr;
+  }
+
+  if (!open_predicated_block_ && instr.is_predicated) {
+    Id pred_cond =
+        b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_),
+                      b.makeBoolConstant(instr.predicate_condition));
+    auto block = &b.makeNewBlock();
+    open_predicated_block_ = true;
+    predicated_block_cond_ = instr.predicate_condition;
+    predicated_block_end_ = &b.makeNewBlock();
+
+    b.createSelectionMerge(predicated_block_end_,
+                           spv::SelectionControlMaskNone);
+    b.createConditionalBranch(pred_cond, block, predicated_block_end_);
+    b.setBuildPoint(block);
+  }
+
   // Operand 0 is the offset
   // Operand 1 is the sampler index
   Id dest = 0;
@@ -619,23 +948,13 @@
   uint32_t dim_idx = 0;
   switch (instr.dimension) {
-    case TextureDimension::k1D:
-      src = b.createCompositeExtract(src, float_type_, 0);
+    case TextureDimension::k1D: {
       dim_idx = 0;
-      break;
+    } break;
     case TextureDimension::k2D: {
-      auto s0 = b.createCompositeExtract(src, float_type_, 0);
-      auto s1 = b.createCompositeExtract(src, float_type_, 1);
-      src = b.createCompositeConstruct(vec2_float_type_,
-                                       std::vector<Id>({s0, s1}));
       dim_idx = 1;
     } break;
     case TextureDimension::k3D: {
-      auto s0 = b.createCompositeExtract(src, float_type_, 0);
-      auto s1 = b.createCompositeExtract(src, float_type_, 1);
-      auto s2 = b.createCompositeExtract(src, float_type_, 2);
-      src = b.createCompositeConstruct(vec3_float_type_,
-                                       std::vector<Id>({s0, s1, s2}));
      dim_idx = 2;
    } break;
    case TextureDimension::kCube: {
@@ -647,28 +966,21 @@
   switch (instr.opcode) {
     case FetchOpcode::kTextureFetch: {
-      auto image_index = b.makeUintConstant(instr.operands[1].storage_index);
-      auto image_ptr = b.createAccessChain(
-          spv::StorageClass::StorageClassUniformConstant, img_[dim_idx],
-          std::vector<Id>({b.makeUintConstant(0), image_index}));
-      auto sampler_ptr = b.createAccessChain(
-          spv::StorageClass::StorageClassUniformConstant, samplers_,
-          std::vector<Id>({b.makeUintConstant(0), image_index}));
-      auto image = b.createLoad(image_ptr);
-      auto sampler = b.createLoad(sampler_ptr);
-
-      auto tex = b.createBinOp(spv::Op::OpSampledImage, b.getImageType(image),
-                               image, sampler);
+      auto texture_index = b.makeUintConstant(instr.operands[1].storage_index);
+      auto texture_ptr =
+          b.createAccessChain(spv::StorageClass::StorageClassUniformConstant,
+                              tex_[dim_idx], std::vector<Id>({texture_index}));
+      auto texture = b.createLoad(texture_ptr);
 
       spv::Builder::TextureParameters params = {0};
       params.coords = src;
-      params.sampler = sampler;
-      dest = b.createTextureCall(spv::Decoration::DecorationInvariant,
-                                 vec4_float_type_, false, false, false, false,
-                                 false, params);
+      params.sampler = texture;
+      dest = b.createTextureCall(spv::NoPrecision, vec4_float_type_, false,
+                                 false, false, false, false, params);
     } break;
     default:
       // TODO: the rest of these
+      assert_always();
       break;
   }
 
@@ -698,19 +1010,41 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
     const ParsedAluInstruction& instr) {
   auto& b = *builder_;
 
+  // TODO: If we have identical operands, reuse previous one.
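+  // e.g. "mul r0, r1, r1" currently emits two identical load chains; the
+  // second LoadFromOperand call could simply return the first result.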
   Id sources[3] = {0};
   Id dest = 0;
   for (size_t i = 0; i < instr.operand_count; i++) {
     sources[i] = LoadFromOperand(instr.operands[i]);
   }
 
-  Id pred_cond = 0;
-  if (instr.is_predicated) {
-    pred_cond =
-        b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_),
-                      b.makeBoolConstant(instr.predicate_condition));
+  // Close the open predicated block if this instr isn't predicated or the
+  // conditions do not match.
+  if (open_predicated_block_ &&
+      (!instr.is_predicated ||
+       instr.predicate_condition != predicated_block_cond_)) {
+    b.createBranch(predicated_block_end_);
+    b.setBuildPoint(predicated_block_end_);
+    open_predicated_block_ = false;
+    predicated_block_cond_ = false;
+    predicated_block_end_ = nullptr;
   }
 
+  if (!open_predicated_block_ && instr.is_predicated) {
+    Id pred_cond =
+        b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_),
+                      b.makeBoolConstant(instr.predicate_condition));
+    auto block = &b.makeNewBlock();
+    open_predicated_block_ = true;
+    predicated_block_cond_ = instr.predicate_condition;
+    predicated_block_end_ = &b.makeNewBlock();
+
+    b.createSelectionMerge(predicated_block_end_,
+                           spv::SelectionControlMaskNone);
+    b.createConditionalBranch(pred_cond, block, predicated_block_end_);
+    b.setBuildPoint(block);
+  }
+
+  bool close_predicated_block = false;
   switch (instr.vector_opcode) {
     case AluVectorOpcode::kAdd: {
       dest = b.createBinOp(spv::Op::OpFAdd, vec4_float_type_, sources[0],
@@ -746,23 +1080,52 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
     } break;
 
     case AluVectorOpcode::kDst: {
-      // TODO
+      auto src0_y = b.createCompositeExtract(sources[0], float_type_, 1);
+      auto src1_y = b.createCompositeExtract(sources[1], float_type_, 1);
+      auto dst_y = b.createBinOp(spv::Op::OpFMul, float_type_, src0_y, src1_y);
+
+      auto src0_z = b.createCompositeExtract(sources[0], float_type_, 2);
+      auto src1_w = b.createCompositeExtract(sources[1], float_type_, 3);
+      dest = b.createCompositeConstruct(
+          vec4_float_type_,
+          std::vector<Id>({b.makeFloatConstant(1.f), dst_y, src0_z, src1_w}));
     } break;
 
+    case AluVectorOpcode::kDp2Add: {
+      auto src0_xy = b.createOp(spv::Op::OpVectorShuffle, vec2_float_type_,
+                                {sources[0], sources[0], 0, 1});
+      auto src1_xy = b.createOp(spv::Op::OpVectorShuffle, vec2_float_type_,
+                                {sources[1], sources[1], 0, 1});
+      auto src2_x = b.createCompositeExtract(sources[2], float_type_, 0);
+      dest = b.createBinOp(spv::Op::OpDot, float_type_, src0_xy, src1_xy);
+      dest = b.createBinOp(spv::Op::OpFAdd, float_type_, dest, src2_x);
+      dest = b.smearScalar(spv::NoPrecision, dest, vec4_float_type_);
+    } break;
+
+    case AluVectorOpcode::kDp3: {
+      auto src0_xyz = b.createOp(spv::Op::OpVectorShuffle, vec3_float_type_,
+                                 {sources[0], sources[0], 0, 1, 2});
+      auto src1_xyz = b.createOp(spv::Op::OpVectorShuffle, vec3_float_type_,
+                                 {sources[1], sources[1], 0, 1, 2});
+      dest = b.createBinOp(spv::Op::OpDot, float_type_, src0_xyz, src1_xyz);
+      dest = b.smearScalar(spv::NoPrecision, dest, vec4_float_type_);
     } break;
 
     case AluVectorOpcode::kDp4: {
       dest = b.createBinOp(spv::Op::OpDot, float_type_, sources[0],
                            sources[1]);
+      dest = b.smearScalar(spv::NoPrecision, dest, vec4_float_type_);
     } break;
 
     case AluVectorOpcode::kFloor: {
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, vec4_float_type_,
-          spv::GLSLstd450::kFloor, {sources[0]});
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_,
+                                             spv::GLSLstd450::kFloor,
+                                             {sources[0]});
     } break;
 
     case AluVectorOpcode::kFrc: {
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, vec4_float_type_,
-          spv::GLSLstd450::kFract, {sources[0]});
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_,
+                                             spv::GLSLstd450::kFract,
+                                             {sources[0]});
     } break;
 
     case AluVectorOpcode::kKillEq: {
@@ -771,10 +1134,6 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
       auto cond = b.createBinOp(spv::Op::OpFOrdEqual, vec4_bool_type_,
                                 sources[0], sources[1]);
       cond = b.createUnaryOp(spv::Op::OpAny, bool_type_, cond);
-      if (pred_cond) {
-        cond =
-            b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond);
-      }
       b.createConditionalBranch(cond, kill_block, continue_block);
 
       b.setBuildPoint(kill_block);
@@ -790,10 +1149,6 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
       auto cond = b.createBinOp(spv::Op::OpFOrdGreaterThanEqual,
                                 vec4_bool_type_, sources[0], sources[1]);
       cond = b.createUnaryOp(spv::Op::OpAny, bool_type_, cond);
-      if (pred_cond) {
-        cond =
-            b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond);
-      }
       b.createConditionalBranch(cond, kill_block, continue_block);
 
       b.setBuildPoint(kill_block);
@@ -809,10 +1164,6 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
       auto cond = b.createBinOp(spv::Op::OpFOrdGreaterThan, vec4_bool_type_,
                                 sources[0], sources[1]);
       cond = b.createUnaryOp(spv::Op::OpAny, bool_type_, cond);
-      if (pred_cond) {
-        cond =
-            b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond);
-      }
       b.createConditionalBranch(cond, kill_block, continue_block);
 
       b.setBuildPoint(kill_block);
@@ -828,10 +1179,6 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
       auto cond = b.createBinOp(spv::Op::OpFOrdNotEqual, vec4_bool_type_,
                                 sources[0], sources[1]);
       cond = b.createUnaryOp(spv::Op::OpAny, bool_type_, cond);
-      if (pred_cond) {
-        cond =
-            b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond);
-      }
      b.createConditionalBranch(cond, kill_block, continue_block);
 
      b.setBuildPoint(kill_block);
@@ -848,6 +1195,23 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
    } break;
 
    case AluVectorOpcode::kMax4: {
+      auto src0_x = b.createCompositeExtract(sources[0], float_type_, 0);
+      auto src0_y = b.createCompositeExtract(sources[0], float_type_, 1);
+      auto src0_z = b.createCompositeExtract(sources[0], float_type_, 2);
+      auto src0_w = b.createCompositeExtract(sources[0], float_type_, 3);
+
+      auto max_xy = CreateGlslStd450InstructionCall(
+          spv::NoPrecision, float_type_, spv::GLSLstd450::kFMax,
+          {src0_x, src0_y});
+      auto max_zw = CreateGlslStd450InstructionCall(
+          spv::NoPrecision, float_type_, spv::GLSLstd450::kFMax,
+          {src0_z, src0_w});
+      auto max_xyzw = CreateGlslStd450InstructionCall(
+          spv::NoPrecision, float_type_, spv::GLSLstd450::kFMax,
+          {max_xy, max_zw});
+
+      // FIXME: Docs say this only updates pv.x?
+      dest = b.smearScalar(spv::NoPrecision, max_xyzw, vec4_float_type_);
    } break;
 
    case AluVectorOpcode::kMaxA: {
@@ -857,27 +1221,38 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
                            b.makeFloatConstant(0.5f));
       addr = b.createUnaryOp(spv::Op::OpConvertFToS, int_type_, addr);
       addr = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, int_type_,
-          spv::GLSLstd450::kSClamp,
+          spv::NoPrecision, int_type_, spv::GLSLstd450::kSClamp,
          {addr, b.makeIntConstant(-256), b.makeIntConstant(255)});
       b.createStore(addr, a0_);
 
       // dest = src0 >= src1 ? src0 : src1
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, vec4_float_type_,
-          spv::GLSLstd450::kFMax, {sources[0], sources[1]});
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_,
+                                             spv::GLSLstd450::kFMax,
+                                             {sources[0], sources[1]});
     } break;
 
     case AluVectorOpcode::kMax: {
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, vec4_float_type_,
-          spv::GLSLstd450::kFMax, {sources[0], sources[1]});
+      if (sources[0] == sources[1]) {
+        // mov dst, src
+        dest = sources[0];
+        break;
+      }
+
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_,
+                                             spv::GLSLstd450::kFMax,
+                                             {sources[0], sources[1]});
     } break;
 
     case AluVectorOpcode::kMin: {
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, vec4_float_type_,
-          spv::GLSLstd450::kFMin, {sources[0], sources[1]});
+      if (sources[0] == sources[1]) {
+        // mov dst, src
+        dest = sources[0];
+        break;
+      }
+
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_,
+                                             spv::GLSLstd450::kFMin,
+                                             {sources[0], sources[1]});
     } break;
 
     case AluVectorOpcode::kMul: {
@@ -893,17 +1268,18 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
       auto c_and =
           b.createBinOp(spv::Op::OpLogicalAnd, vec4_bool_type_, c0, c1);
       auto c_and_x = b.createCompositeExtract(c_and, bool_type_, 0);
+      c_and_x = b.smearScalar(spv::NoPrecision, c_and_x, vec4_bool_type_);
       auto c_and_w = b.createCompositeExtract(c_and, bool_type_, 3);
 
       // p0
       b.createStore(c_and_w, p0_);
+      close_predicated_block = true;
 
       // dest
       auto s0_x = b.createCompositeExtract(sources[0], float_type_, 0);
       s0_x = b.createBinOp(spv::Op::OpFAdd, float_type_, s0_x,
                            b.makeFloatConstant(1.f));
-      auto s0 = b.smearScalar(spv::Decoration::DecorationInvariant, s0_x,
-                              vec4_float_type_);
+      auto s0 = b.smearScalar(spv::NoPrecision, s0_x, vec4_float_type_);
 
       dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c_and_x,
                            vec4_float_zero_, s0);
@@ -917,17 +1293,18 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
       auto c_and =
           b.createBinOp(spv::Op::OpLogicalAnd, vec4_bool_type_, c0, c1);
       auto c_and_x = b.createCompositeExtract(c_and, bool_type_, 0);
+      c_and_x = b.smearScalar(spv::NoPrecision, c_and_x, vec4_bool_type_);
       auto c_and_w = b.createCompositeExtract(c_and, bool_type_, 3);
 
       // p0
       b.createStore(c_and_w, p0_);
+      close_predicated_block = true;
 
       // dest
       auto s0_x = b.createCompositeExtract(sources[0], float_type_, 0);
       s0_x = b.createBinOp(spv::Op::OpFAdd, float_type_, s0_x,
                            b.makeFloatConstant(1.f));
-      auto s0 = b.smearScalar(spv::Decoration::DecorationInvariant, s0_x,
-                              vec4_float_type_);
+      auto s0 = b.smearScalar(spv::NoPrecision, s0_x, vec4_float_type_);
 
       dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c_and_x,
                            vec4_float_zero_, s0);
@@ -941,17 +1318,18 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
       auto c_and =
           b.createBinOp(spv::Op::OpLogicalAnd, vec4_bool_type_, c0, c1);
       auto c_and_x = b.createCompositeExtract(c_and, bool_type_, 0);
+      c_and_x = b.smearScalar(spv::NoPrecision, c_and_x, vec4_bool_type_);
       auto c_and_w = b.createCompositeExtract(c_and, bool_type_, 3);
 
       // p0
       b.createStore(c_and_w, p0_);
+      close_predicated_block = true;
 
       // dest
       auto s0_x = b.createCompositeExtract(sources[0], float_type_, 0);
       s0_x = b.createBinOp(spv::Op::OpFAdd, float_type_, s0_x,
                            b.makeFloatConstant(1.f));
-      auto s0 = b.smearScalar(spv::Decoration::DecorationInvariant, s0_x,
-                              vec4_float_type_);
+      auto s0 = b.smearScalar(spv::NoPrecision, s0_x, vec4_float_type_);
 
       dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c_and_x,
                            vec4_float_zero_, s0);
@@ -965,17 +1343,18 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
       auto c_and =
           b.createBinOp(spv::Op::OpLogicalAnd, vec4_bool_type_, c0, c1);
       auto c_and_x = b.createCompositeExtract(c_and, bool_type_, 0);
+      c_and_x = b.smearScalar(spv::NoPrecision, c_and_x, vec4_bool_type_);
       auto c_and_w = b.createCompositeExtract(c_and, bool_type_, 3);
 
       // p0
       b.createStore(c_and_w, p0_);
+      close_predicated_block = true;
 
       // dest
       auto s0_x = b.createCompositeExtract(sources[0], float_type_, 0);
       s0_x = b.createBinOp(spv::Op::OpFAdd, float_type_, s0_x,
                            b.makeFloatConstant(1.f));
-      auto s0 = b.smearScalar(spv::Decoration::DecorationInvariant, s0_x,
-                              vec4_float_type_);
+      auto s0 = b.smearScalar(spv::NoPrecision, s0_x, vec4_float_type_);
 
       dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c_and_x,
                            vec4_float_zero_, s0);
@@ -1014,25 +1393,27 @@ void SpirvShaderTranslator::ProcessVectorAluInstruction(
     } break;
 
     case AluVectorOpcode::kTrunc: {
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, vec4_float_type_,
-          GLSLstd450::kTrunc, {sources[0]});
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, vec4_float_type_,
+                                             GLSLstd450::kTrunc, {sources[0]});
     } break;
 
     default:
+      assert_unhandled_case(instr.vector_opcode);
      break;
   }
 
+  assert_not_zero(dest);
   if (dest) {
-    // If predicated, discard the result from the instruction.
-    Id pv_dest = dest;
-    if (instr.is_predicated) {
-      pv_dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, pred_cond,
-                              dest, b.createLoad(pv_));
-    }
+    b.createStore(dest, pv_);
+    StoreToResult(dest, instr.result);
+  }
 
-    b.createStore(pv_dest, pv_);
-    StoreToResult(dest, instr.result, pred_cond);
+  if (close_predicated_block && open_predicated_block_) {
+    b.createBranch(predicated_block_end_);
+    b.setBuildPoint(predicated_block_end_);
+    open_predicated_block_ = false;
+    predicated_block_cond_ = false;
+    predicated_block_end_ = nullptr;
   }
 }
 
@@ -1040,6 +1421,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
     const ParsedAluInstruction& instr) {
   auto& b = *builder_;
 
+  // TODO: If we have identical operands, reuse previous one.
   Id sources[3] = {0};
   Id dest = 0;
   for (size_t i = 0, x = 0; i < instr.operand_count; i++) {
@@ -1075,13 +1457,34 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
     }
   }
 
-  Id pred_cond = 0;
-  if (instr.is_predicated) {
-    pred_cond =
-        b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_),
-                      b.makeBoolConstant(instr.predicate_condition));
+  // Close the open predicated block if this instr isn't predicated or the
+  // conditions do not match.
+  if (open_predicated_block_ &&
+      (!instr.is_predicated ||
+       instr.predicate_condition != predicated_block_cond_)) {
+    b.createBranch(predicated_block_end_);
+    b.setBuildPoint(predicated_block_end_);
+    open_predicated_block_ = false;
+    predicated_block_cond_ = false;
+    predicated_block_end_ = nullptr;
   }
 
+  if (!open_predicated_block_ && instr.is_predicated) {
+    Id pred_cond =
+        b.createBinOp(spv::Op::OpLogicalEqual, bool_type_, b.createLoad(p0_),
+                      b.makeBoolConstant(instr.predicate_condition));
+    auto block = &b.makeNewBlock();
+    open_predicated_block_ = true;
+    predicated_block_cond_ = instr.predicate_condition;
+    predicated_block_end_ = &b.makeNewBlock();
+
+    b.createSelectionMerge(predicated_block_end_,
+                           spv::SelectionControlMaskNone);
+    b.createConditionalBranch(pred_cond, block, predicated_block_end_);
+    b.setBuildPoint(block);
+  }
+
+  bool close_predicated_block = false;
   switch (instr.scalar_opcode) {
     case AluScalarOpcode::kAdds:
     case AluScalarOpcode::kAddsc0:
@@ -1093,32 +1496,29 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
 
     case AluScalarOpcode::kAddsPrev: {
       // dest = src0 + ps
-      dest = b.createBinOp(spv::Op::OpFAdd, float_type_, sources[0], ps_);
+      dest = b.createBinOp(spv::Op::OpFAdd, float_type_, sources[0],
+                           b.createLoad(ps_));
     } break;
 
     case AluScalarOpcode::kCos: {
       // dest = cos(src0)
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kCos,
-          {sources[0]});
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_,
+                                             GLSLstd450::kCos, {sources[0]});
     } break;
 
     case AluScalarOpcode::kExp: {
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kExp2,
-          {sources[0]});
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_,
+                                             GLSLstd450::kExp2, {sources[0]});
    } break;
 
    case AluScalarOpcode::kFloors: {
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kFloor,
-          {sources[0]});
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_,
+                                             GLSLstd450::kFloor, {sources[0]});
    } break;
 
    case AluScalarOpcode::kFrcs: {
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kFract,
-          {sources[0]});
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_,
+                                             GLSLstd450::kFract, {sources[0]});
    } break;
 
    case AluScalarOpcode::kKillsEq: {
@@ -1126,7 +1526,6 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
       auto kill_block = &b.makeNewBlock();
       auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0],
                                 b.makeFloatConstant(0.f));
-      cond = b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond);
       b.createConditionalBranch(cond, kill_block, continue_block);
 
       b.setBuildPoint(kill_block);
@@ -1141,10 +1540,6 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
       auto kill_block = &b.makeNewBlock();
       auto cond = b.createBinOp(spv::Op::OpFOrdGreaterThanEqual, bool_type_,
                                 sources[0], b.makeFloatConstant(0.f));
-      if (pred_cond) {
-        cond =
-            b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond);
-      }
       b.createConditionalBranch(cond, kill_block, continue_block);
 
       b.setBuildPoint(kill_block);
@@ -1159,10 +1554,6 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
       auto kill_block = &b.makeNewBlock();
       auto cond = b.createBinOp(spv::Op::OpFOrdGreaterThan, bool_type_,
                                 sources[0], b.makeFloatConstant(0.f));
-      if (pred_cond) {
-        cond =
-            b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond);
-      }
      b.createConditionalBranch(cond, kill_block, continue_block);
 
      b.setBuildPoint(kill_block);
@@ -1177,10 +1568,6 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
       auto kill_block = &b.makeNewBlock();
       auto cond = b.createBinOp(spv::Op::OpFOrdNotEqual, bool_type_,
                                 sources[0], b.makeFloatConstant(0.f));
-      if (pred_cond) {
-        cond =
-            b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond);
-      }
      b.createConditionalBranch(cond, kill_block, continue_block);
 
      b.setBuildPoint(kill_block);
@@ -1195,10 +1582,6 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
       auto kill_block = &b.makeNewBlock();
       auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0],
                                 b.makeFloatConstant(1.f));
-      if (pred_cond) {
-        cond =
-            b.createBinOp(spv::Op::OpLogicalAnd, bool_type_, cond, pred_cond);
-      }
      b.createConditionalBranch(cond, kill_block, continue_block);
 
      b.setBuildPoint(kill_block);
@@ -1209,27 +1592,32 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
    } break;
 
    case AluScalarOpcode::kLogc: {
+      auto t = CreateGlslStd450InstructionCall(
+          spv::NoPrecision, float_type_, spv::GLSLstd450::kLog2, {sources[0]});
+
+      // FIXME: We don't check to see if t == -INF, we just check for INF
+      auto c = b.createUnaryOp(spv::Op::OpIsInf, bool_type_, t);
+      dest = b.createTriOp(spv::Op::OpSelect, float_type_, c,
+                           b.makeFloatConstant(-FLT_MAX), t);
    } break;
 
    case AluScalarOpcode::kLog: {
-      auto log = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, float_type_,
-          spv::GLSLstd450::kLog2, {sources[0]});
+      dest = CreateGlslStd450InstructionCall(
+          spv::NoPrecision, float_type_, spv::GLSLstd450::kLog2, {sources[0]});
    } break;
 
    case AluScalarOpcode::kMaxAsf: {
       auto addr =
           b.createUnaryOp(spv::Op::OpConvertFToS, int_type_, sources[0]);
       addr = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, int_type_,
-          spv::GLSLstd450::kSClamp,
+          spv::NoPrecision, int_type_, spv::GLSLstd450::kSClamp,
          {addr, b.makeIntConstant(-256), b.makeIntConstant(255)});
       b.createStore(addr, a0_);
 
       // dest = src0 >= src1 ? src0 : src1
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, float_type_,
-          spv::GLSLstd450::kFMax, {sources[0], sources[1]});
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_,
+                                             spv::GLSLstd450::kFMax,
+                                             {sources[0], sources[1]});
     } break;
 
     case AluScalarOpcode::kMaxs: {
       // dest = max(src0, src1)
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kFMax,
-          {sources[0], sources[1]});
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_,
+                                             GLSLstd450::kFMax,
+                                             {sources[0], sources[1]});
     } break;
 
     case AluScalarOpcode::kMins: {
       // dest = min(src0, src1)
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kFMin,
-          {sources[0], sources[1]});
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_,
+                                             GLSLstd450::kFMin,
+                                             {sources[0], sources[1]});
     } break;
 
     case AluScalarOpcode::kMuls:
@@ -1273,7 +1660,8 @@
 
     case AluScalarOpcode::kMulsPrev: {
       // dest = src0 * ps
-      dest = b.createBinOp(spv::Op::OpFMul, float_type_, sources[0], ps_);
+      dest = b.createBinOp(spv::Op::OpFMul, float_type_, sources[0],
+                           b.createLoad(ps_));
     } break;
 
     case AluScalarOpcode::kMulsPrev2: {
@@ -1281,28 +1669,57 @@
     } break;
 
     case AluScalarOpcode::kRcpc: {
-      // TODO: dest = src0 != 0.0 ? 1.0 / src0 : FLT_MAX;
+      dest = b.createBinOp(spv::Op::OpFDiv, float_type_,
+                           b.makeFloatConstant(1.f), sources[0]);
+      dest = CreateGlslStd450InstructionCall(
+          spv::NoPrecision, float_type_, spv::GLSLstd450::kFClamp,
+          {dest, b.makeFloatConstant(-FLT_MAX), b.makeFloatConstant(FLT_MAX)});
     } break;
 
-    case AluScalarOpcode::kRcp:
     case AluScalarOpcode::kRcpf: {
+      dest = b.createBinOp(spv::Op::OpFDiv, float_type_,
+                           b.makeFloatConstant(1.f), sources[0]);
+      auto c = b.createUnaryOp(spv::Op::OpIsInf, bool_type_, dest);
+      dest = b.createTriOp(spv::Op::OpSelect, float_type_, c,
+                           b.makeFloatConstant(0.f), dest);
+    } break;
+
+    case AluScalarOpcode::kRcp: {
       // dest = src0 != 0.0 ? 1.0 / src0 : 0.0;
       auto c = b.createBinOp(spv::Op::OpFOrdEqual, float_type_, sources[0],
                              b.makeFloatConstant(0.f));
       auto d = b.createBinOp(spv::Op::OpFDiv, float_type_,
                              b.makeFloatConstant(1.f), sources[0]);
-      dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c,
+      dest = b.createTriOp(spv::Op::OpSelect, float_type_, c,
                            b.makeFloatConstant(0.f), d);
     } break;
 
+    case AluScalarOpcode::kRsqc: {
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_,
+                                             spv::GLSLstd450::kInverseSqrt,
+                                             {sources[0]});
+      dest = CreateGlslStd450InstructionCall(
+          spv::NoPrecision, float_type_, spv::GLSLstd450::kFClamp,
+          {dest, b.makeFloatConstant(-FLT_MAX), b.makeFloatConstant(FLT_MAX)});
+    } break;
+
+    case AluScalarOpcode::kRsqf: {
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_,
+                                             spv::GLSLstd450::kInverseSqrt,
+                                             {sources[0]});
+      auto c = b.createUnaryOp(spv::Op::OpIsInf, bool_type_, dest);
+      dest = b.createTriOp(spv::Op::OpSelect, float_type_, c,
+                           b.makeFloatConstant(0.f), dest);
+    } break;
+
    case AluScalarOpcode::kRsq: {
      // dest = src0 != 0.0 ? inversesqrt(src0) : 0.0;
       auto c = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0],
                              b.makeFloatConstant(0.f));
-      auto d = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, vec4_float_type_,
-          spv::GLSLstd450::kInverseSqrt, {sources[0]});
-      dest = b.createTriOp(spv::Op::OpSelect, vec4_float_type_, c,
+      auto d = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_,
+                                               spv::GLSLstd450::kInverseSqrt,
+                                               {sources[0]});
+      dest = b.createTriOp(spv::Op::OpSelect, float_type_, c,
                            b.makeFloatConstant(0.f), d);
     } break;
 
@@ -1340,6 +1757,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
 
     case AluScalarOpcode::kSetpClr: {
       b.createStore(b.makeBoolConstant(false), p0_);
+      close_predicated_block = true;
       dest = b.makeFloatConstant(FLT_MAX);
     } break;
 
@@ -1348,6 +1766,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
                                 b.makeFloatConstant(0.f));
       // p0 = cond
       b.createStore(cond, p0_);
+      close_predicated_block = true;
 
       // dest = cond ? 0.f : 1.f;
       dest = b.createTriOp(spv::Op::OpSelect, float_type_, cond,
@@ -1359,6 +1778,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
                                 sources[0], b.makeFloatConstant(0.f));
       // p0 = cond
       b.createStore(cond, p0_);
+      close_predicated_block = true;
 
       // dest = cond ? 0.f : 1.f;
       dest = b.createTriOp(spv::Op::OpSelect, float_type_, cond,
@@ -1370,6 +1790,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
                                 sources[0], b.makeFloatConstant(0.f));
       // p0 = cond
       b.createStore(cond, p0_);
+      close_predicated_block = true;
 
       // dest = cond ? 0.f : 1.f;
       dest = b.createTriOp(spv::Op::OpSelect, float_type_, cond,
@@ -1377,12 +1798,11 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
     } break;
 
     case AluScalarOpcode::kSetpInv: {
+      // p0 = src0 == 1.0
       auto cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0],
                                 b.makeFloatConstant(1.f));
-      auto pred =
-          b.createTriOp(spv::Op::OpSelect, bool_type_, cond,
-                        b.makeBoolConstant(true), b.makeBoolConstant(false));
-      b.createStore(pred, p0_);
+      b.createStore(cond, p0_);
+      close_predicated_block = true;
 
       // if (!cond) dest = src0 == 0.0 ? 1.0 : src0;
       auto dst_cond = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_,
@@ -1399,6 +1819,7 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
 
       // p0 = cond
       b.createStore(cond, p0_);
+      close_predicated_block = true;
 
       // dest = cond ? 0.f : 1.f;
       dest = b.createTriOp(spv::Op::OpSelect, float_type_, cond,
@@ -1411,9 +1832,10 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
       auto c = b.createBinOp(spv::Op::OpFOrdLessThanEqual, bool_type_, src,
                              b.makeFloatConstant(0.f));
       b.createStore(c, p0_);
+      close_predicated_block = true;
 
       dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kFMax,
+          spv::NoPrecision, float_type_, GLSLstd450::kFMax,
          {sources[0], b.makeFloatConstant(0.f)});
     } break;
 
@@ -1421,13 +1843,18 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
       auto c = b.createBinOp(spv::Op::OpFOrdEqual, bool_type_, sources[0],
                              b.makeFloatConstant(0.f));
       b.createStore(c, p0_);
+      close_predicated_block = true;
 
       dest = sources[0];
     } break;
 
     case AluScalarOpcode::kSin: {
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kSin,
-          {sources[0]});
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_,
+                                             GLSLstd450::kSin, {sources[0]});
+    } break;
+
+    case AluScalarOpcode::kSqrt: {
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_,
+                                             GLSLstd450::kSqrt, {sources[0]});
     } break;
 
     case AluScalarOpcode::kSubs:
@@ -1438,29 +1865,32 @@ void SpirvShaderTranslator::ProcessScalarAluInstruction(
     } break;
 
     case AluScalarOpcode::kSubsPrev: {
-      dest = b.createBinOp(spv::Op::OpFSub, float_type_, sources[0], ps_);
+      dest = b.createBinOp(spv::Op::OpFSub, float_type_, sources[0],
+                           b.createLoad(ps_));
     } break;
 
     case AluScalarOpcode::kTruncs: {
-      dest = CreateGlslStd450InstructionCall(
-          spv::Decoration::DecorationInvariant, float_type_, GLSLstd450::kTrunc,
-          {sources[0]});
+      dest = CreateGlslStd450InstructionCall(spv::NoPrecision, float_type_,
+                                             GLSLstd450::kTrunc, {sources[0]});
     } break;
 
     default:
+      assert_unhandled_case(instr.scalar_opcode);
      break;
   }
 
+  assert_not_zero(dest);
   if (dest) {
-    // If predicated, discard the result from the instruction.
-    Id ps_dest = dest;
-    if (instr.is_predicated) {
-      ps_dest = b.createTriOp(spv::Op::OpSelect, float_type_, pred_cond, dest,
-                              b.createLoad(ps_));
-    }
+    b.createStore(dest, ps_);
+    StoreToResult(dest, instr.result);
+  }
 
-    b.createStore(ps_dest, ps_);
-    StoreToResult(dest, instr.result, pred_cond);
+  if (close_predicated_block && open_predicated_block_) {
+    b.createBranch(predicated_block_end_);
+    b.setBuildPoint(predicated_block_end_);
+    open_predicated_block_ = false;
+    predicated_block_cond_ = false;
+    predicated_block_end_ = nullptr;
   }
 }
 
@@ -1494,15 +1924,15 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) {
     case InstructionStorageAddressingMode::kAddressAbsolute: {
       // storage_index + a0
       storage_index =
-          b.createBinOp(spv::Op::OpIAdd, b.makeUintType(32), b.createLoad(a0_),
+          b.createBinOp(spv::Op::OpIAdd, uint_type_, b.createLoad(a0_),
                        b.makeUintConstant(storage_base + op.storage_index));
     } break;
     case InstructionStorageAddressingMode::kAddressRelative: {
       // TODO: Based on loop index
       // storage_index + aL.x
-      storage_index = b.createBinOp(
-          spv::Op::OpIAdd, b.makeUintType(32), b.makeUintConstant(0),
-          b.makeUintConstant(storage_base + op.storage_index));
+      storage_index =
+          b.createBinOp(spv::Op::OpIAdd, uint_type_, b.makeUintConstant(0),
+                        b.makeUintConstant(storage_base + op.storage_index));
     } break;
     default:
       assert_always();
@@ -1544,8 +1974,7 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) {
 
   if (op.is_absolute_value) {
     storage_value = CreateGlslStd450InstructionCall(
-        spv::Decoration::DecorationInvariant, storage_type, GLSLstd450::kFAbs,
-        {storage_value});
+        spv::NoPrecision, storage_type, GLSLstd450::kFAbs, {storage_value});
   }
   if (op.is_negated) {
     storage_value =
@@ -1598,8 +2027,7 @@ Id SpirvShaderTranslator::LoadFromOperand(const InstructionOperand& op) {
 }
 
 void SpirvShaderTranslator::StoreToResult(Id source_value_id,
-                                          const InstructionResult& result,
-                                          Id predicate_cond) {
+                                          const InstructionResult& result) {
   auto& b = *builder_;
 
   if (result.storage_target == InstructionStorageTarget::kNone) {
@@ -1624,7 +2052,7 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
     case InstructionStorageAddressingMode::kAddressAbsolute: {
       // storage_index + a0
       storage_index =
-          b.createBinOp(spv::Op::OpIAdd, b.makeUintType(32), b.createLoad(a0_),
+          b.createBinOp(spv::Op::OpIAdd, uint_type_, b.createLoad(a0_),
                        b.makeUintConstant(result.storage_index));
     } break;
    case InstructionStorageAddressingMode::kAddressRelative: {
@@ -1677,7 +2105,11 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
      break;
    case InstructionStorageTarget::kDepth:
      assert_true(is_pixel_shader());
-      // TODO(benvanik): result.storage_index
+      storage_pointer = frag_depth_;
+      storage_class = spv::StorageClass::StorageClassOutput;
+      storage_type = float_type_;
+      storage_offsets.push_back(0);
+      storage_array = false;
      break;
    case InstructionStorageTarget::kNone:
      assert_unhandled_case(result.storage_target);
@@ -1696,10 +2128,18 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
 
   // Only load from storage if we need it later.
   Id storage_value = 0;
-  if (!result.has_all_writes() || predicate_cond) {
+  if (!result.has_all_writes()) {
    storage_value = b.createLoad(storage_pointer);
  }
 
+  // Clamp the input value.
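+  // (saturate semantics: x = min(max(x, 0.0), 1.0), which is exactly the
+  // FClamp(x, 0.0, 1.0) emitted below)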
+  if (result.is_clamped) {
+    source_value_id = CreateGlslStd450InstructionCall(
+        spv::NoPrecision, b.getTypeId(source_value_id),
+        spv::GLSLstd450::kFClamp,
+        {source_value_id, b.makeFloatConstant(0.0), b.makeFloatConstant(1.0)});
+  }
+
   // Convert to the appropriate type, if needed.
   if (b.getTypeId(source_value_id) != storage_type) {
     std::vector<Id> constituents;
@@ -1707,22 +2147,22 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
     auto n_dst = b.getNumTypeComponents(storage_type);
     assert_true(n_el < n_dst);
 
-    constituents.push_back(source_value_id);
-    for (int i = n_el; i < n_dst; i++) {
-      // Pad with zeroes.
-      constituents.push_back(b.makeFloatConstant(0.f));
+    if (n_el == 1) {
+      // Smear scalar.
+      for (int i = 0; i < n_dst; i++) {
+        constituents.push_back(source_value_id);
+      }
+    } else {
+      // FIXME: This may not work as intended.
+      constituents.push_back(source_value_id);
+      for (int i = n_el; i < n_dst; i++) {
+        // Pad with zeroes.
+        constituents.push_back(b.makeFloatConstant(0.f));
+      }
     }
 
-    source_value_id = b.createConstructor(spv::Decoration::DecorationInvariant,
-                                          constituents, storage_type);
-  }
-
-  // Clamp the input value.
-  if (result.is_clamped) {
-    source_value_id = CreateGlslStd450InstructionCall(
-        spv::Decoration::DecorationInvariant, b.getTypeId(source_value_id),
-        spv::GLSLstd450::kFClamp,
-        {source_value_id, b.makeFloatConstant(0.0), b.makeFloatConstant(1.0)});
+    source_value_id =
+        b.createConstructor(spv::NoPrecision, constituents, storage_type);
   }
 
   // swizzle
@@ -1788,13 +2228,8 @@ void SpirvShaderTranslator::StoreToResult(Id source_value_id,
   assert_true(b.getNumComponents(source_value_id) ==
               b.getNumTypeComponents(storage_type));
 
-  // Discard if predicate condition is false.
-  if (predicate_cond) {
-    source_value_id =
-        b.createTriOp(spv::Op::OpSelect, storage_type, predicate_cond,
-                      source_value_id, storage_value);
-  }
-
+  assert_true(b.getTypeId(source_value_id) ==
+              b.getDerefTypeId(storage_pointer));
   b.createStore(source_value_id, storage_pointer);
 }
 
diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h
index 0d8b1e14c..b6a761a24 100644
--- a/src/xenia/gpu/spirv_shader_translator.h
+++ b/src/xenia/gpu/spirv_shader_translator.h
@@ -2,7 +2,7 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
-* Copyright 2015 Ben Vanik. All rights reserved.                             *
+* Copyright 2016 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
 
@@ -17,7 +17,9 @@
 #include "third_party/glslang-spirv/SpvBuilder.h"
 #include "third_party/spirv/GLSL.std.450.hpp11"
 #include "xenia/gpu/shader_translator.h"
+#include "xenia/gpu/spirv/compiler.h"
 #include "xenia/ui/spirv/spirv_disassembler.h"
+#include "xenia/ui/spirv/spirv_validator.h"
 
 namespace xe {
 namespace gpu {
 
@@ -54,7 +56,8 @@ class SpirvShaderTranslator : public ShaderTranslator {
   std::vector<uint8_t> CompleteTranslation() override;
   void PostTranslation(Shader* shader) override;
 
-  void PreProcessControlFlowInstruction(uint32_t cf_index) override;
+  void PreProcessControlFlowInstruction(
+      uint32_t cf_index, const ucode::ControlFlowInstruction& instr) override;
   void ProcessLabel(uint32_t cf_index) override;
   void ProcessControlFlowInstructionBegin(uint32_t cf_index) override;
   void ProcessControlFlowInstructionEnd(uint32_t cf_index) override;
@@ -91,10 +94,16 @@ class SpirvShaderTranslator : public ShaderTranslator {
   // Stores a value based on the specified result information.
   // The value will be transformed into the appropriate form for the result and
   // the proper components will be selected.
-  void StoreToResult(spv::Id source_value_id, const InstructionResult& result,
-                     spv::Id predicate_cond = 0);
+  void StoreToResult(spv::Id source_value_id, const InstructionResult& result);
 
   xe::ui::spirv::SpirvDisassembler disassembler_;
+  xe::ui::spirv::SpirvValidator validator_;
+  xe::gpu::spirv::Compiler compiler_;
+
+  // True if there's an open predicated block
+  bool open_predicated_block_ = false;
+  bool predicated_block_cond_ = false;
+  spv::Block* predicated_block_end_ = nullptr;
 
   // TODO(benvanik): replace with something better, make reusable, etc.
   std::unique_ptr<spv::Builder> builder_;
@@ -104,11 +113,10 @@ class SpirvShaderTranslator : public ShaderTranslator {
   spv::Function* translated_main_ = 0;
 
   // Types.
-  spv::Id float_type_ = 0, bool_type_ = 0, int_type_ = 0;
+  spv::Id float_type_ = 0, bool_type_ = 0, int_type_ = 0, uint_type_ = 0;
   spv::Id vec2_float_type_ = 0, vec3_float_type_ = 0, vec4_float_type_ = 0;
   spv::Id vec4_uint_type_ = 0;
   spv::Id vec4_bool_type_ = 0;
-  spv::Id sampled_image_type_ = 0;
 
   // Constants.
   spv::Id vec4_float_zero_ = 0, vec4_float_one_ = 0;
 
@@ -121,13 +129,19 @@ class SpirvShaderTranslator : public ShaderTranslator {
   spv::Id pos_ = 0;
   spv::Id push_consts_ = 0;
   spv::Id interpolators_ = 0;
-  spv::Id frag_outputs_ = 0;
+  spv::Id vertex_id_ = 0;
+  spv::Id frag_outputs_ = 0, frag_depth_ = 0;
   spv::Id samplers_ = 0;
-  spv::Id img_[4] = {0};  // Images {1D, 2D, 3D, Cube}
+  spv::Id tex_[4] = {0};  // Images {1D, 2D, 3D, Cube}
 
   // Map of {binding -> {offset -> spv input}}
   std::map<uint32_t, std::map<uint32_t, spv::Id>> vertex_binding_map_;
-  std::map<uint32_t, spv::Block*> cf_blocks_;
+
+  struct CFBlock {
+    spv::Block* block = nullptr;
+    bool prev_dominates = true;
+  };
+  std::map<uint32_t, CFBlock> cf_blocks_;
 };
 
 }  // namespace gpu
 
diff --git a/src/xenia/gpu/texture_info.h b/src/xenia/gpu/texture_info.h
index 500f22bb3..0cb2ed2ba 100644
--- a/src/xenia/gpu/texture_info.h
+++ b/src/xenia/gpu/texture_info.h
@@ -88,6 +88,66 @@ enum class TextureFormat : uint32_t {
   kUnknown = 0xFFFFFFFFu,
 };
 
+inline size_t GetTexelSize(TextureFormat format) {
+  switch (format) {
+    case TextureFormat::k_1_5_5_5:
+      return 2;
+      break;
+    case TextureFormat::k_2_10_10_10:
+      return 4;
+      break;
+    case TextureFormat::k_4_4_4_4:
+      return 2;
+      break;
+    case TextureFormat::k_5_6_5:
+      return 2;
+      break;
+    case TextureFormat::k_8:
+      return 1;
+      break;
+    case TextureFormat::k_8_8:
+      return 2;
+      break;
+    case TextureFormat::k_8_8_8_8:
+      return 4;
+      break;
+    case TextureFormat::k_16:
+      return 2;
+      break;
+    case TextureFormat::k_16_FLOAT:
+      return 2;
+      break;
+    case TextureFormat::k_16_16:
+      return 4;
+      break;
+    case TextureFormat::k_16_16_FLOAT:
+      return 4;
+      break;
+    case TextureFormat::k_16_16_16_16:
+      return 8;
+      break;
+    case TextureFormat::k_16_16_16_16_FLOAT:
+      return 8;
+      break;
+    case TextureFormat::k_32_FLOAT:
+      return 4;
+      break;
+    case TextureFormat::k_32_32_FLOAT:
+      return 8;
+      break;
+    case TextureFormat::k_32_32_32_32_FLOAT:
+      return 16;
+      break;
+    case TextureFormat::k_10_11_11:
+    case TextureFormat::k_11_11_10:
+      return 4;
+      break;
+    default:
+      assert_unhandled_case(format);
+      return 0;
+  }
+}
+
 inline TextureFormat ColorFormatToTextureFormat(ColorFormat color_format) {
   return static_cast<TextureFormat>(color_format);
 }
 
diff --git a/src/xenia/gpu/trace_player.cc b/src/xenia/gpu/trace_player.cc
index 54c199736..b79b49df2 100644
--- a/src/xenia/gpu/trace_player.cc
+++ b/src/xenia/gpu/trace_player.cc
@@ -51,7 +51,7 @@ void TracePlayer::SeekFrame(int target_frame) {
   assert_true(frame->start_ptr <= frame->end_ptr);
   PlayTrace(frame->start_ptr, frame->end_ptr - frame->start_ptr,
-            TracePlaybackMode::kBreakOnSwap);
+            TracePlaybackMode::kBreakOnSwap, false);
 }
 
 void TracePlayer::SeekCommand(int target_command) {
@@ -71,11 +71,11 @@ void TracePlayer::SeekCommand(int target_command) {
     const auto& previous_command = frame->commands[previous_command_index];
     PlayTrace(previous_command.end_ptr,
               command.end_ptr - previous_command.end_ptr,
-              TracePlaybackMode::kBreakOnSwap);
+              TracePlaybackMode::kBreakOnSwap, false);
   } else {
     // Full playback from frame start.
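+    // A from-scratch replay must not reuse GPU resources cached by an
+    // earlier seek, so this path passes clear_caches = true below; the
+    // incremental path above replays on top of the existing caches.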
     PlayTrace(frame->start_ptr, command.end_ptr - frame->start_ptr,
-              TracePlaybackMode::kBreakOnSwap);
+              TracePlaybackMode::kBreakOnSwap, true);
   }
 }
 
@@ -84,19 +84,25 @@ void TracePlayer::WaitOnPlayback() {
 }
 
 void TracePlayer::PlayTrace(const uint8_t* trace_data, size_t trace_size,
-                            TracePlaybackMode playback_mode) {
-  graphics_system_->command_processor()->CallInThread(
-      [this, trace_data, trace_size, playback_mode]() {
-        PlayTraceOnThread(trace_data, trace_size, playback_mode);
-      });
+                            TracePlaybackMode playback_mode,
+                            bool clear_caches) {
+  playing_trace_ = true;
+  graphics_system_->command_processor()->CallInThread([=]() {
+    PlayTraceOnThread(trace_data, trace_size, playback_mode, clear_caches);
+  });
 }
 
 void TracePlayer::PlayTraceOnThread(const uint8_t* trace_data,
                                     size_t trace_size,
-                                    TracePlaybackMode playback_mode) {
+                                    TracePlaybackMode playback_mode,
+                                    bool clear_caches) {
   auto memory = graphics_system_->memory();
   auto command_processor = graphics_system_->command_processor();
 
+  if (clear_caches) {
+    command_processor->ClearCaches();
+  }
+
   command_processor->set_swap_mode(SwapMode::kIgnored);
   playback_percent_ = 0;
   auto trace_end = trace_data + trace_size;
 
diff --git a/src/xenia/gpu/trace_player.h b/src/xenia/gpu/trace_player.h
index d3926d460..0c3c6571a 100644
--- a/src/xenia/gpu/trace_player.h
+++ b/src/xenia/gpu/trace_player.h
@@ -50,9 +50,9 @@ class TracePlayer : public TraceReader {
 private:
  void PlayTrace(const uint8_t* trace_data, size_t trace_size,
-                 TracePlaybackMode playback_mode);
+                 TracePlaybackMode playback_mode, bool clear_caches);
  void PlayTraceOnThread(const uint8_t* trace_data, size_t trace_size,
-                         TracePlaybackMode playback_mode);
+                         TracePlaybackMode playback_mode, bool clear_caches);
 
  xe::ui::Loop* loop_;
  GraphicsSystem* graphics_system_;
 
diff --git a/src/xenia/gpu/trace_reader.cc b/src/xenia/gpu/trace_reader.cc
index fb58c436b..6bedfb9b4 100644
--- a/src/xenia/gpu/trace_reader.cc
+++ b/src/xenia/gpu/trace_reader.cc
@@ -75,6 +75,10 @@ void TraceReader::ParseTrace() {
   const uint8_t* packet_start_ptr = nullptr;
   const uint8_t* last_ptr = trace_ptr;
   bool pending_break = false;
+  auto current_command_buffer = new CommandBuffer();
+  current_frame.command_tree =
+      std::unique_ptr<CommandBuffer>(current_command_buffer);
+
   while (trace_ptr < trace_data_ + trace_size_) {
     ++current_frame.command_count;
     auto type = static_cast<TraceCommandType>(xe::load<uint32_t>(trace_ptr));
@@ -94,11 +98,29 @@ void TraceReader::ParseTrace() {
         auto cmd = reinterpret_cast<const IndirectBufferStartCommand*>(trace_ptr);
         trace_ptr += sizeof(*cmd) + cmd->count * 4;
+
+        // Traverse down a level.
+        auto sub_command_buffer = new CommandBuffer();
+        sub_command_buffer->parent = current_command_buffer;
+        current_command_buffer->commands.push_back(
+            CommandBuffer::Command(sub_command_buffer));
+        current_command_buffer = sub_command_buffer;
         break;
       }
       case TraceCommandType::kIndirectBufferEnd: {
         auto cmd = reinterpret_cast<const IndirectBufferEndCommand*>(trace_ptr);
         trace_ptr += sizeof(*cmd);
+
+        // IB packet is wrapped in a kPacketStart/kPacketEnd. Skip the end.
+        auto end_cmd = reinterpret_cast<const PacketEndCommand*>(trace_ptr);
+        assert_true(end_cmd->type == TraceCommandType::kPacketEnd);
+        trace_ptr += sizeof(*end_cmd);
+
+        // Go back up a level. If parent is null, this frame started in an
+        // indirect buffer.
+        if (current_command_buffer->parent) {
+          current_command_buffer = current_command_buffer->parent;
+        }
         break;
       }
       case TraceCommandType::kPacketStart: {
@@ -125,6 +147,8 @@ void TraceReader::ParseTrace() {
             command.end_ptr = trace_ptr;
             current_frame.commands.push_back(std::move(command));
             last_ptr = trace_ptr;
+            current_command_buffer->commands.push_back(CommandBuffer::Command(
+                uint32_t(current_frame.commands.size() - 1)));
             break;
           }
           case PacketCategory::kSwap:
@@ -136,6 +160,9 @@ void TraceReader::ParseTrace() {
         if (pending_break) {
           current_frame.end_ptr = trace_ptr;
           frames_.push_back(std::move(current_frame));
+          current_command_buffer = new CommandBuffer();
+          current_frame.command_tree =
+              std::unique_ptr<CommandBuffer>(current_command_buffer);
+
           current_frame.start_ptr = trace_ptr;
           current_frame.end_ptr = nullptr;
           current_frame.command_count = 0;
 
diff --git a/src/xenia/gpu/trace_reader.h b/src/xenia/gpu/trace_reader.h
index 5445bd1f9..b3245da46 100644
--- a/src/xenia/gpu/trace_reader.h
+++ b/src/xenia/gpu/trace_reader.h
@@ -11,6 +11,7 @@
 #define XENIA_GPU_TRACE_READER_H_
 
 #include
+#include
 
 #include "xenia/base/mapped_memory.h"
 #include "xenia/gpu/trace_protocol.h"
@@ -51,6 +52,42 @@ namespace gpu {
 
 class TraceReader {
 public:
+  struct CommandBuffer {
+    struct Command {
+      enum class Type {
+        kCommand,
+        kBuffer,
+      };
+
+      Command() {}
+      Command(Command&& other) {
+        type = other.type;
+        command_id = other.command_id;
+        command_subtree = std::move(other.command_subtree);
+      }
+      Command(CommandBuffer* buf) {
+        type = Type::kBuffer;
+        command_subtree = std::unique_ptr<CommandBuffer>(buf);
+      }
+      Command(uint32_t id) {
+        type = Type::kCommand;
+        command_id = id;
+      }
+      ~Command() = default;
+
+      Type type;
+      uint32_t command_id = -1;
+      std::unique_ptr<CommandBuffer> command_subtree = nullptr;
+    };
+
+    CommandBuffer() {}
+    ~CommandBuffer() {}
+
+    // Parent command buffer, if one exists.
+    CommandBuffer* parent = nullptr;
+    std::vector<Command> commands;
+  };
+
   struct Frame {
     struct Command {
       enum class Type {
@@ -74,7 +111,12 @@ class TraceReader {
     const uint8_t* start_ptr = nullptr;
     const uint8_t* end_ptr = nullptr;
     int command_count = 0;
+
+    // Flat list of all commands in this frame.
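+    // (command_tree below mirrors this list with indirect-buffer nesting;
+    // its kCommand nodes store indexes into this vector)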
     std::vector<Command> commands;
+
+    // Tree of all command buffers
+    std::unique_ptr<CommandBuffer> command_tree;
   };
 
   TraceReader() = default;
 
diff --git a/src/xenia/gpu/trace_viewer.cc b/src/xenia/gpu/trace_viewer.cc
index 7ce20c7ca..8079631f5 100644
--- a/src/xenia/gpu/trace_viewer.cc
+++ b/src/xenia/gpu/trace_viewer.cc
@@ -390,6 +390,66 @@ void TraceViewer::DrawPacketDisassemblerUI() {
   ImGui::End();
 }
 
+int TraceViewer::RecursiveDrawCommandBufferUI(
+    const TraceReader::Frame* frame, TraceReader::CommandBuffer* buffer) {
+  int selected_id = -1;
+  int column_width = int(ImGui::GetContentRegionMax().x);
+
+  for (size_t i = 0; i < buffer->commands.size(); i++) {
+    switch (buffer->commands[i].type) {
+      case TraceReader::CommandBuffer::Command::Type::kBuffer: {
+        auto subtree = buffer->commands[i].command_subtree.get();
+        if (!subtree->commands.size()) {
+          continue;
+        }
+
+        ImGui::PushID(int(i));
+        if (ImGui::TreeNode((void*)0, "Indirect Buffer %d", i)) {
+          ImGui::Indent();
+          auto id = RecursiveDrawCommandBufferUI(
+              frame, buffer->commands[i].command_subtree.get());
+          ImGui::Unindent();
+          ImGui::TreePop();
+
+          if (id != -1) {
+            selected_id = id;
+          }
+        }
+        ImGui::PopID();
+      } break;
+
+      case TraceReader::CommandBuffer::Command::Type::kCommand: {
+        uint32_t command_id = buffer->commands[i].command_id;
+
+        const auto& command = frame->commands[command_id];
+        bool is_selected = command_id == player_->current_command_index();
+        const char* label;
+        switch (command.type) {
+          case TraceReader::Frame::Command::Type::kDraw:
+            label = "Draw";
+            break;
+          case TraceReader::Frame::Command::Type::kSwap:
+            label = "Swap";
+            break;
+        }
+
+        ImGui::PushID(command_id);
+        if (ImGui::Selectable(label, &is_selected)) {
+          selected_id = command_id;
+        }
+        ImGui::SameLine(column_width - 60.0f);
+        ImGui::Text("%d", command_id);
+        ImGui::PopID();
+        // if (did_seek && target_command == i) {
+        //   ImGui::SetScrollPosHere();
+        // }
+      } break;
+    }
+  }
+
+  return selected_id;
+}
+
 void TraceViewer::DrawCommandListUI() {
   ImGui::SetNextWindowPos(ImVec2(5, 70), ImGuiSetCond_FirstUseEver);
   if (!ImGui::Begin("Command List", nullptr, ImVec2(200, 640))) {
@@ -473,31 +533,12 @@ void TraceViewer::DrawCommandListUI() {
     ImGui::SetScrollPosHere();
   }
 
-  for (int i = 0; i < int(frame->commands.size()); ++i) {
-    ImGui::PushID(i);
-    is_selected = i == player_->current_command_index();
-    const auto& command = frame->commands[i];
-    const char* label;
-    switch (command.type) {
-      case TraceReader::Frame::Command::Type::kDraw:
-        label = "Draw";
-        break;
-      case TraceReader::Frame::Command::Type::kSwap:
-        label = "Swap";
-        break;
-    }
-    if (ImGui::Selectable(label, &is_selected)) {
-      if (!player_->is_playing_trace()) {
-        player_->SeekCommand(i);
-      }
-    }
-    ImGui::SameLine(column_width - 60.0f);
-    ImGui::Text("%d", i);
-    ImGui::PopID();
-    if (did_seek && target_command == i) {
-      ImGui::SetScrollPosHere();
-    }
+  auto id = RecursiveDrawCommandBufferUI(frame, frame->command_tree.get());
+  if (id != -1 && id != player_->current_command_index() &&
+      !player_->is_playing_trace()) {
+    player_->SeekCommand(id);
   }
+
   ImGui::EndChild();
   ImGui::End();
 }
@@ -639,8 +680,8 @@ void TraceViewer::DrawTextureInfo(
   ImGui::Columns(2);
   ImVec2 button_size(256, 256);
-  if (ImGui::ImageButton(ImTextureID(texture | ui::ImGuiDrawer::kIgnoreAlpha),
-                         button_size, ImVec2(0, 0), ImVec2(1, 1))) {
+  if (ImGui::ImageButton(ImTextureID(texture), button_size, ImVec2(0, 0),
+                         ImVec2(1, 1))) {
    // show viewer
  }
  ImGui::NextColumn();
@@ -1108,11 +1149,14 @@ void TraceViewer::DrawStateUI() {
                    ((window_scissor_br >> 16) & 0x7FFF) -
16) & 0x7FFF) - ((window_scissor_tl >> 16) & 0x7FFF)); uint32_t surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32; + uint32_t surface_actual = (surface_info >> 18) & 0x3FFF; uint32_t surface_pitch = surface_info & 0x3FFF; auto surface_msaa = (surface_info >> 16) & 0x3; static const char* kMsaaNames[] = { "1X", "2X", "4X", }; + ImGui::BulletText("Surface Pitch - Actual: %d - %d", surface_pitch, + surface_actual); ImGui::BulletText("Surface MSAA: %s", kMsaaNames[surface_msaa]); uint32_t vte_control = regs[XE_GPU_REG_PA_CL_VTE_CNTL].u32; bool vport_xscale_enable = (vte_control & (1 << 0)) > 0; @@ -1124,6 +1168,9 @@ void TraceViewer::DrawStateUI() { assert_true(vport_xscale_enable == vport_yscale_enable == vport_zscale_enable == vport_xoffset_enable == vport_yoffset_enable == vport_zoffset_enable); + if (!vport_xscale_enable) { + ImGui::PushStyleColor(ImGuiCol_Text, kColorIgnored); + } ImGui::BulletText( "Viewport Offset: %f, %f, %f", vport_xoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 : 0, @@ -1134,6 +1181,10 @@ void TraceViewer::DrawStateUI() { vport_xscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 : 1, vport_yscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 : 1, vport_zscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 : 1); + if (!vport_xscale_enable) { + ImGui::PopStyleColor(); + } + ImGui::BulletText("Vertex Format: %s, %s, %s, %s", ((vte_control >> 8) & 0x1) ? "x/w0" : "x", ((vte_control >> 8) & 0x1) ? "y/w0" : "y", @@ -1318,7 +1369,7 @@ void TraceViewer::DrawStateUI() { if (write_mask) { auto color_target = GetColorRenderTarget(surface_pitch, surface_msaa, color_base, color_format); - tex = ImTextureID(color_target | ui::ImGuiDrawer::kIgnoreAlpha); + tex = ImTextureID(color_target); if (ImGui::ImageButton(tex, button_size, ImVec2(0, 0), ImVec2(1, 1))) { // show viewer @@ -1330,10 +1381,9 @@ void TraceViewer::DrawStateUI() { } if (ImGui::IsItemHovered()) { ImGui::BeginTooltip(); - ImGui::Text( - "Color Target %d (%s), base %.4X, pitch %d, msaa %d, format %d", - i, write_mask ? "enabled" : "disabled", color_base, surface_pitch, - surface_msaa, color_format); + ImGui::Text("Color Target %d (%s), base %.4X, pitch %d, format %d", i, + write_mask ? 
"enabled" : "disabled", color_base, + surface_pitch, color_format); if (tex) { ImVec2 rel_pos; @@ -1407,17 +1457,19 @@ void TraceViewer::DrawStateUI() { auto button_pos = ImGui::GetCursorScreenPos(); ImVec2 button_size(256, 256); - ImGui::ImageButton( - ImTextureID(depth_target | ui::ImGuiDrawer::kIgnoreAlpha), - button_size, ImVec2(0, 0), ImVec2(1, 1)); + ImGui::ImageButton(ImTextureID(depth_target), button_size, ImVec2(0, 0), + ImVec2(1, 1)); if (ImGui::IsItemHovered()) { ImGui::BeginTooltip(); + ImGui::Text("Depth Target: base %.4X, pitch %d, format %d", depth_base, + surface_pitch, depth_format); + ImVec2 rel_pos; rel_pos.x = ImGui::GetMousePos().x - button_pos.x; rel_pos.y = ImGui::GetMousePos().y - button_pos.y; - ZoomedImage(ImTextureID(depth_target | ui::ImGuiDrawer::kIgnoreAlpha), - rel_pos, button_size, 32.f, ImVec2(256, 256)); + ZoomedImage(ImTextureID(depth_target), rel_pos, button_size, 32.f, + ImVec2(256, 256)); ImGui::EndTooltip(); } diff --git a/src/xenia/gpu/trace_viewer.h b/src/xenia/gpu/trace_viewer.h index 6f7c900fc..7e82ad831 100644 --- a/src/xenia/gpu/trace_viewer.h +++ b/src/xenia/gpu/trace_viewer.h @@ -80,6 +80,8 @@ class TraceViewer { void DrawUI(); void DrawControllerUI(); void DrawPacketDisassemblerUI(); + int RecursiveDrawCommandBufferUI(const TraceReader::Frame* frame, + TraceReader::CommandBuffer* buffer); void DrawCommandListUI(); void DrawStateUI(); diff --git a/src/xenia/gpu/vulkan/buffer_cache.cc b/src/xenia/gpu/vulkan/buffer_cache.cc index 7fd3c4768..02bd88a83 100644 --- a/src/xenia/gpu/vulkan/buffer_cache.cc +++ b/src/xenia/gpu/vulkan/buffer_cache.cc @@ -22,98 +22,19 @@ namespace vulkan { using xe::ui::vulkan::CheckResult; -// Space kept between tail and head when wrapping. -constexpr VkDeviceSize kDeadZone = 4 * 1024; - constexpr VkDeviceSize kConstantRegisterUniformRange = 512 * 4 * 4 + 8 * 4 + 32 * 4; BufferCache::BufferCache(RegisterFile* register_file, ui::vulkan::VulkanDevice* device, size_t capacity) - : register_file_(register_file), - device_(*device), - transient_capacity_(capacity) { - // Uniform buffer. - VkBufferCreateInfo uniform_buffer_info; - uniform_buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - uniform_buffer_info.pNext = nullptr; - uniform_buffer_info.flags = 0; - uniform_buffer_info.size = transient_capacity_; - uniform_buffer_info.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; - uniform_buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - uniform_buffer_info.queueFamilyIndexCount = 0; - uniform_buffer_info.pQueueFamilyIndices = nullptr; - auto err = vkCreateBuffer(device_, &uniform_buffer_info, nullptr, - &transient_uniform_buffer_); - CheckResult(err, "vkCreateBuffer"); - - // Index buffer. - VkBufferCreateInfo index_buffer_info; - index_buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - index_buffer_info.pNext = nullptr; - index_buffer_info.flags = 0; - index_buffer_info.size = transient_capacity_; - index_buffer_info.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT; - index_buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; - index_buffer_info.queueFamilyIndexCount = 0; - index_buffer_info.pQueueFamilyIndices = nullptr; - err = vkCreateBuffer(device_, &index_buffer_info, nullptr, - &transient_index_buffer_); - CheckResult(err, "vkCreateBuffer"); - - // Vertex buffer. 
-  VkBufferCreateInfo vertex_buffer_info;
-  vertex_buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
-  vertex_buffer_info.pNext = nullptr;
-  vertex_buffer_info.flags = 0;
-  vertex_buffer_info.size = transient_capacity_;
-  vertex_buffer_info.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
-  vertex_buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
-  vertex_buffer_info.queueFamilyIndexCount = 0;
-  vertex_buffer_info.pQueueFamilyIndices = nullptr;
-  err = vkCreateBuffer(*device, &vertex_buffer_info, nullptr,
-                       &transient_vertex_buffer_);
-  CheckResult(err, "vkCreateBuffer");
-
-  // Allocate the underlying buffer we use for all storage.
-  // We query all types and take the max alignment.
-  VkMemoryRequirements uniform_buffer_requirements;
-  VkMemoryRequirements index_buffer_requirements;
-  VkMemoryRequirements vertex_buffer_requirements;
-  vkGetBufferMemoryRequirements(device_, transient_uniform_buffer_,
-                                &uniform_buffer_requirements);
-  vkGetBufferMemoryRequirements(device_, transient_index_buffer_,
-                                &index_buffer_requirements);
-  vkGetBufferMemoryRequirements(device_, transient_vertex_buffer_,
-                                &vertex_buffer_requirements);
-  uniform_buffer_alignment_ = uniform_buffer_requirements.alignment;
-  index_buffer_alignment_ = index_buffer_requirements.alignment;
-  vertex_buffer_alignment_ = vertex_buffer_requirements.alignment;
-  VkMemoryRequirements buffer_requirements;
-  buffer_requirements.size = transient_capacity_;
-  buffer_requirements.alignment =
-      std::max(uniform_buffer_requirements.alignment,
-               std::max(index_buffer_requirements.alignment,
-                        vertex_buffer_requirements.alignment));
-  buffer_requirements.memoryTypeBits =
-      uniform_buffer_requirements.memoryTypeBits |
-      index_buffer_requirements.memoryTypeBits |
-      vertex_buffer_requirements.memoryTypeBits;
-  transient_buffer_memory_ = device->AllocateMemory(
-      buffer_requirements, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
-
-  // Alias all buffers to our memory.
-  vkBindBufferMemory(device_, transient_uniform_buffer_,
-                     transient_buffer_memory_, 0);
-  vkBindBufferMemory(device_, transient_index_buffer_, transient_buffer_memory_,
-                     0);
-  vkBindBufferMemory(device_, transient_vertex_buffer_,
-                     transient_buffer_memory_, 0);
-
-  // Map memory and keep it mapped while we use it.
-  err = vkMapMemory(device_, transient_buffer_memory_, 0, VK_WHOLE_SIZE, 0,
-                    &transient_buffer_data_);
-  CheckResult(err, "vkMapMemory");
+    : register_file_(register_file), device_(*device) {
+  transient_buffer_ = std::make_unique<ui::vulkan::CircularBuffer>(device);
+  if (!transient_buffer_->Initialize(capacity,
+                                     VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
+                                         VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
+                                         VK_BUFFER_USAGE_VERTEX_BUFFER_BIT)) {
+    assert_always();
+  }
 
   // Descriptor pool used for all of our cached descriptors.
   // In the steady state we don't allocate anything, so these are all manually
@@ -129,8 +50,8 @@ BufferCache::BufferCache(RegisterFile* register_file,
   pool_sizes[0].descriptorCount = 2;
   descriptor_pool_info.poolSizeCount = 1;
   descriptor_pool_info.pPoolSizes = pool_sizes;
-  err = vkCreateDescriptorPool(device_, &descriptor_pool_info, nullptr,
-                               &descriptor_pool_);
+  auto err = vkCreateDescriptorPool(device_, &descriptor_pool_info, nullptr,
+                                    &descriptor_pool_);
   CheckResult(err, "vkCreateDescriptorPool");
 
   // Create the descriptor set layout used for our uniform buffer.
@@ -180,7 +101,7 @@ BufferCache::BufferCache(RegisterFile* register_file,
 
   // Initialize descriptor set with our buffers.
   VkDescriptorBufferInfo buffer_info;
-  buffer_info.buffer = transient_uniform_buffer_;
+  buffer_info.buffer = transient_buffer_->gpu_buffer();
   buffer_info.offset = 0;
   buffer_info.range = kConstantRegisterUniformRange;
   VkWriteDescriptorSet descriptor_writes[2];
@@ -212,25 +133,20 @@ BufferCache::~BufferCache() {
                              &transient_descriptor_set_);
   vkDestroyDescriptorSetLayout(device_, descriptor_set_layout_, nullptr);
   vkDestroyDescriptorPool(device_, descriptor_pool_, nullptr);
-  vkUnmapMemory(device_, transient_buffer_memory_);
-  vkFreeMemory(device_, transient_buffer_memory_, nullptr);
-  vkDestroyBuffer(device_, transient_uniform_buffer_, nullptr);
-  vkDestroyBuffer(device_, transient_index_buffer_, nullptr);
-  vkDestroyBuffer(device_, transient_vertex_buffer_, nullptr);
+  transient_buffer_->Shutdown();
 }
 
 std::pair<VkDeviceSize, VkDeviceSize> BufferCache::UploadConstantRegisters(
     const Shader::ConstantRegisterMap& vertex_constant_register_map,
-    const Shader::ConstantRegisterMap& pixel_constant_register_map) {
+    const Shader::ConstantRegisterMap& pixel_constant_register_map,
+    std::shared_ptr<ui::vulkan::Fence> fence) {
   // Fat struct, including all registers:
   // struct {
   //   vec4 float[512];
   //   uint bool[8];
   //   uint loop[32];
   // };
-  size_t total_size =
-      xe::round_up(kConstantRegisterUniformRange, uniform_buffer_alignment_);
-  auto offset = AllocateTransientData(uniform_buffer_alignment_, total_size);
+  auto offset = AllocateTransientData(kConstantRegisterUniformRange, fence);
   if (offset == VK_WHOLE_SIZE) {
     // OOM.
     return {VK_WHOLE_SIZE, VK_WHOLE_SIZE};
@@ -238,8 +154,7 @@ std::pair<VkDeviceSize, VkDeviceSize> BufferCache::UploadConstantRegisters(
   // Copy over all the registers.
   const auto& values = register_file_->values;
-  uint8_t* dest_ptr =
-      reinterpret_cast<uint8_t*>(transient_buffer_data_) + offset;
+  uint8_t* dest_ptr = transient_buffer_->host_base() + offset;
   std::memcpy(dest_ptr, &values[XE_GPU_REG_SHADER_CONSTANT_000_X].f32,
               (512 * 4 * 4));
   dest_ptr += 512 * 4 * 4;
@@ -258,8 +173,8 @@ std::pair<VkDeviceSize, VkDeviceSize> BufferCache::UploadConstantRegisters(
   // constant indexing.
 #if 0
   // Allocate space in the buffer for our data.
-  auto offset = AllocateTransientData(uniform_buffer_alignment_,
-                                      constant_register_map.packed_byte_length);
+  auto offset =
+      AllocateTransientData(constant_register_map.packed_byte_length, fence);
   if (offset == VK_WHOLE_SIZE) {
     // OOM.
     return VK_WHOLE_SIZE;
@@ -304,11 +219,12 @@ std::pair<VkDeviceSize, VkDeviceSize> BufferCache::UploadConstantRegisters(
 }
 
 std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
-    const void* source_ptr, size_t source_length, IndexFormat format) {
+    const void* source_ptr, size_t source_length, IndexFormat format,
+    std::shared_ptr<ui::vulkan::Fence> fence) {
   // TODO(benvanik): check cache.
 
   // Allocate space in the buffer for our data.
-  auto offset = AllocateTransientData(index_buffer_alignment_, source_length);
+  auto offset = AllocateTransientData(source_length, fence);
   if (offset == VK_WHOLE_SIZE) {
     // OOM.
     return {nullptr, VK_WHOLE_SIZE};
@@ -319,25 +235,24 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
   // TODO(benvanik): memcpy then use compute shaders to swap?
   if (format == IndexFormat::kInt16) {
     // Endian::k8in16, swap half-words.
-    xe::copy_and_swap_16_aligned(
-        reinterpret_cast<uint8_t*>(transient_buffer_data_) + offset, source_ptr,
-        source_length / 2);
+    xe::copy_and_swap_16_aligned(transient_buffer_->host_base() + offset,
+                                 source_ptr, source_length / 2);
   } else if (format == IndexFormat::kInt32) {
     // Endian::k8in32, swap words.
-    xe::copy_and_swap_32_aligned(
-        reinterpret_cast<uint8_t*>(transient_buffer_data_) + offset, source_ptr,
-        source_length / 4);
+    xe::copy_and_swap_32_aligned(transient_buffer_->host_base() + offset,
+                                 source_ptr, source_length / 4);
   }
 
-  return {transient_index_buffer_, offset};
+  return {transient_buffer_->gpu_buffer(), offset};
 }
 
 std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
-    const void* source_ptr, size_t source_length) {
+    const void* source_ptr, size_t source_length, Endian endian,
+    std::shared_ptr<ui::vulkan::Fence> fence) {
   // TODO(benvanik): check cache.
 
   // Allocate space in the buffer for our data.
-  auto offset = AllocateTransientData(vertex_buffer_alignment_, source_length);
+  auto offset = AllocateTransientData(source_length, fence);
   if (offset == VK_WHOLE_SIZE) {
     // OOM.
     return {nullptr, VK_WHOLE_SIZE};
@@ -345,60 +260,38 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
   // Copy data into the buffer.
   // TODO(benvanik): memcpy then use compute shaders to swap?
-  // Endian::k8in32, swap words.
-  xe::copy_and_swap_32_aligned(
-      reinterpret_cast<uint8_t*>(transient_buffer_data_) + offset, source_ptr,
-      source_length / 4);
+  assert_true(endian == Endian::k8in32);
+  if (endian == Endian::k8in32) {
+    // Endian::k8in32, swap words.
+    xe::copy_and_swap_32_aligned(transient_buffer_->host_base() + offset,
+                                 source_ptr, source_length / 4);
+  }
 
-  return {transient_vertex_buffer_, offset};
+  return {transient_buffer_->gpu_buffer(), offset};
 }
 
-VkDeviceSize BufferCache::AllocateTransientData(VkDeviceSize alignment,
-                                                VkDeviceSize length) {
+VkDeviceSize BufferCache::AllocateTransientData(
    VkDeviceSize length, std::shared_ptr<ui::vulkan::Fence> fence) {
   // Try fast path (if we have space).
-  VkDeviceSize offset = TryAllocateTransientData(alignment, length);
+  VkDeviceSize offset = TryAllocateTransientData(length, fence);
   if (offset != VK_WHOLE_SIZE) {
     return offset;
   }
 
   // Ran out of easy allocations.
   // Try consuming fences before we panic.
-  assert_always("Reclamation not yet implemented");
+  transient_buffer_->Scavenge();
 
   // Try again. It may still fail if we didn't get enough space back.
-  return TryAllocateTransientData(alignment, length);
+  offset = TryAllocateTransientData(length, fence);
+  return offset;
 }
 
-VkDeviceSize BufferCache::TryAllocateTransientData(VkDeviceSize alignment,
-                                                   VkDeviceSize length) {
-  if (transient_tail_offset_ >= transient_head_offset_) {
-    // Tail follows head, so things are easy:
-    //   |    H----T    |
-    if (xe::round_up(transient_tail_offset_, alignment) + length <=
-        transient_capacity_) {
-      // Allocation fits from tail to end of buffer, so grow.
-      //   |    H----**T |
-      VkDeviceSize offset = xe::round_up(transient_tail_offset_, alignment);
-      transient_tail_offset_ = offset + length;
-      return offset;
-    } else if (length + kDeadZone <= transient_head_offset_) {
-      // Can't fit at the end, but can fit if we wrap around.
-      //   |**T  H----....|
-      VkDeviceSize offset = 0;
-      transient_tail_offset_ = length;
-      return offset;
-    }
-  } else {
-    // Head follows tail, so we're reversed:
-    //   |----T    H---|
-    if (xe::round_up(transient_tail_offset_, alignment) + length + kDeadZone <=
-        transient_head_offset_) {
-      // Fits from tail to head.
-      //   |----***T  H---|
-      VkDeviceSize offset = xe::round_up(transient_tail_offset_, alignment);
-      transient_tail_offset_ = offset + length;
-      return offset;
-    }
+VkDeviceSize BufferCache::TryAllocateTransientData(
+    VkDeviceSize length, std::shared_ptr<ui::vulkan::Fence> fence) {
+  auto alloc = transient_buffer_->Acquire(length, fence);
+  if (alloc) {
+    return alloc->offset;
   }
 
   // No more space.
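All three transient buffers now alias one `CircularBuffer`, and fence tracking replaces the old dead-zone bookkeeping. To illustrate the idea only (a toy model, not the real `ui::vulkan::CircularBuffer`, whose API is inferred from its uses in this diff), fence-based scavenging boils down to:

```cpp
#include <cstddef>
#include <deque>
#include <memory>

// Toy model of fence-tracked ring reclamation. A real implementation also
// handles alignment and wrap-around; here a fence is reduced to a shared bool.
struct Region {
  size_t offset;
  size_t length;
  std::shared_ptr<bool> fence_signaled;  // stand-in for ui::vulkan::Fence
};

struct ToyRing {
  std::deque<Region> in_flight;
  size_t head = 0;  // start of the oldest still-in-flight region

  // Frees regions whose fence has signaled; the space between the old and new
  // head becomes available again. This is what Scavenge() provides for the
  // retry in AllocateTransientData() above.
  void Scavenge() {
    while (!in_flight.empty() && *in_flight.front().fence_signaled) {
      head = in_flight.front().offset + in_flight.front().length;
      in_flight.pop_front();
    }
  }
};
```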
@@ -420,9 +313,9 @@ void BufferCache::Flush(VkCommandBuffer command_buffer) {
   VkMappedMemoryRange dirty_range;
   dirty_range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
   dirty_range.pNext = nullptr;
-  dirty_range.memory = transient_buffer_memory_;
+  dirty_range.memory = transient_buffer_->gpu_memory();
   dirty_range.offset = 0;
-  dirty_range.size = transient_capacity_;
+  dirty_range.size = transient_buffer_->capacity();
   vkFlushMappedMemoryRanges(device_, 1, &dirty_range);
 }
 
@@ -434,6 +327,8 @@ void BufferCache::ClearCache() {
   // TODO(benvanik): caching.
 }
 
+void BufferCache::Scavenge() { transient_buffer_->Scavenge(); }
+
 }  // namespace vulkan
 }  // namespace gpu
 }  // namespace xe
diff --git a/src/xenia/gpu/vulkan/buffer_cache.h b/src/xenia/gpu/vulkan/buffer_cache.h
index 1c7330e52..8695fc36d 100644
--- a/src/xenia/gpu/vulkan/buffer_cache.h
+++ b/src/xenia/gpu/vulkan/buffer_cache.h
@@ -13,6 +13,7 @@
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/shader.h"
 #include "xenia/gpu/xenos.h"
+#include "xenia/ui/vulkan/circular_buffer.h"
 #include "xenia/ui/vulkan/vulkan.h"
 #include "xenia/ui/vulkan/vulkan_device.h"
 
@@ -50,22 +51,24 @@ class BufferCache {
   // The returned offsets may alias.
   std::pair<VkDeviceSize, VkDeviceSize> UploadConstantRegisters(
       const Shader::ConstantRegisterMap& vertex_constant_register_map,
-      const Shader::ConstantRegisterMap& pixel_constant_register_map);
+      const Shader::ConstantRegisterMap& pixel_constant_register_map,
+      std::shared_ptr<ui::vulkan::Fence> fence);
 
   // Uploads index buffer data from guest memory, possibly eliding with
   // recently uploaded data or cached copies.
   // Returns a buffer and offset that can be used with vkCmdBindIndexBuffer.
   // Size will be VK_WHOLE_SIZE if the data could not be uploaded (OOM).
-  std::pair<VkBuffer, VkDeviceSize> UploadIndexBuffer(const void* source_ptr,
-                                                      size_t source_length,
-                                                      IndexFormat format);
+  std::pair<VkBuffer, VkDeviceSize> UploadIndexBuffer(
+      const void* source_ptr, size_t source_length, IndexFormat format,
+      std::shared_ptr<ui::vulkan::Fence> fence);
 
   // Uploads vertex buffer data from guest memory, possibly eliding with
   // recently uploaded data or cached copies.
   // Returns a buffer and offset that can be used with vkCmdBindVertexBuffers.
   // Size will be VK_WHOLE_SIZE if the data could not be uploaded (OOM).
-  std::pair<VkBuffer, VkDeviceSize> UploadVertexBuffer(const void* source_ptr,
-                                                       size_t source_length);
+  std::pair<VkBuffer, VkDeviceSize> UploadVertexBuffer(
+      const void* source_ptr, size_t source_length, Endian endian,
+      std::shared_ptr<ui::vulkan::Fence> fence);
 
   // Flushes all pending data to the GPU.
   // Until this is called the GPU is not guaranteed to see any data.
@@ -81,36 +84,26 @@ class BufferCache {
   // Clears all cached content and prevents future elision with pending data.
   void ClearCache();
 
+  // Wipes all data no longer needed.
+  void Scavenge();
+
  private:
   // Allocates a block of memory in the transient buffer.
   // When memory is not available fences are checked and space is reclaimed.
   // Returns VK_WHOLE_SIZE if requested amount of memory is not available.
-  VkDeviceSize AllocateTransientData(VkDeviceSize alignment,
-                                     VkDeviceSize length);
+  VkDeviceSize AllocateTransientData(VkDeviceSize length,
+                                     std::shared_ptr<ui::vulkan::Fence> fence);
   // Tries to allocate a block of memory in the transient buffer.
   // Returns VK_WHOLE_SIZE if requested amount of memory is not available.
-  VkDeviceSize TryAllocateTransientData(VkDeviceSize alignment,
-                                        VkDeviceSize length);
+  VkDeviceSize TryAllocateTransientData(
+      VkDeviceSize length, std::shared_ptr<ui::vulkan::Fence> fence);
 
   RegisterFile* register_file_ = nullptr;
   VkDevice device_ = nullptr;
 
   // Staging ringbuffer we cycle through fast. Used for data we don't
   // plan on keeping past the current frame.
-  size_t transient_capacity_ = 0;
-  VkBuffer transient_uniform_buffer_ = nullptr;
-  VkBuffer transient_index_buffer_ = nullptr;
-  VkBuffer transient_vertex_buffer_ = nullptr;
-  VkDeviceMemory transient_buffer_memory_ = nullptr;
-  void* transient_buffer_data_ = nullptr;
-  VkDeviceSize transient_head_offset_ = 0;
-  VkDeviceSize transient_tail_offset_ = 0;
-
-  // Required alignments for our various types.
-  // All allocations must start at the appropriate alignment.
-  VkDeviceSize uniform_buffer_alignment_ = 0;
-  VkDeviceSize index_buffer_alignment_ = 0;
-  VkDeviceSize vertex_buffer_alignment_ = 0;
+  std::unique_ptr<ui::vulkan::CircularBuffer> transient_buffer_ = nullptr;
 
   VkDescriptorPool descriptor_pool_ = nullptr;
   VkDescriptorSetLayout descriptor_set_layout_ = nullptr;
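Callers now thread the fence of the submission that will consume the data through every upload. A hedged sketch of the intended call pattern (`current_batch_fence`, `guest_ptr`, and `length` are hypothetical names, not from the patch):

```cpp
// Sketch: upload index data tagged with the current batch's fence (C++14).
std::shared_ptr<ui::vulkan::Fence> fence = current_batch_fence;
auto ib = buffer_cache->UploadIndexBuffer(guest_ptr, length,
                                          IndexFormat::kInt16, fence);
if (ib.second == VK_WHOLE_SIZE) {
  // OOM: the transient ring is full of still-in-flight data.
} else {
  vkCmdBindIndexBuffer(command_buffer, ib.first, ib.second,
                       VK_INDEX_TYPE_UINT16);
}
// Later, once the submission guarded by `fence` has completed:
buffer_cache->Scavenge();  // regions tagged with signaled fences are reused
```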
diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc
index 542329af5..e80cb4675 100644
--- a/src/xenia/gpu/vulkan/pipeline_cache.cc
+++ b/src/xenia/gpu/vulkan/pipeline_cache.cc
@@ -17,6 +17,9 @@
 #include "xenia/gpu/gpu_flags.h"
 #include "xenia/gpu/vulkan/vulkan_gpu_flags.h"
 
+#include <cinttypes>
+#include <cstring>
+
 namespace xe {
 namespace gpu {
 namespace vulkan {
@@ -154,40 +157,19 @@ VulkanShader* PipelineCache::LoadShader(ShaderType shader_type,
                                host_address, dword_count);
     shader_map_.insert({data_hash, shader});
 
-    // Perform translation.
-    // If this fails the shader will be marked as invalid and ignored later.
-    if (!shader_translator_.Translate(shader)) {
-      XELOGE("Shader translation failed; marking shader as ignored");
-      return shader;
-    }
-
-    // Prepare the shader for use (creates our VkShaderModule).
-    // It could still fail at this point.
-    if (!shader->Prepare()) {
-      XELOGE("Shader preparation failed; marking shader as ignored");
-      return shader;
-    }
-
-    if (shader->is_valid()) {
-      XELOGGPU("Generated %s shader at 0x%.8X (%db):\n%s",
-               shader_type == ShaderType::kVertex ? "vertex" : "pixel",
-               guest_address, dword_count * 4,
-               shader->ucode_disassembly().c_str());
-    }
-
-    // Dump shader files if desired.
-    if (!FLAGS_dump_shaders.empty()) {
-      shader->Dump(FLAGS_dump_shaders, "vk");
-    }
-
   return shader;
 }
 
-bool PipelineCache::ConfigurePipeline(VkCommandBuffer command_buffer,
-                                      const RenderState* render_state,
-                                      VulkanShader* vertex_shader,
-                                      VulkanShader* pixel_shader,
-                                      PrimitiveType primitive_type) {
+PipelineCache::UpdateStatus PipelineCache::ConfigurePipeline(
+    VkCommandBuffer command_buffer, const RenderState* render_state,
+    VulkanShader* vertex_shader, VulkanShader* pixel_shader,
+    PrimitiveType primitive_type, VkPipeline* pipeline_out) {
+#if FINE_GRAINED_DRAW_SCOPES
+  SCOPE_profile_cpu_f("gpu");
+#endif  // FINE_GRAINED_DRAW_SCOPES
+
+  assert_not_null(pipeline_out);
+
   // Perform a pass over all registers and state updating our cached structures.
   // This will tell us if anything has changed that requires us to either build
   // a new pipeline or use an existing one.
@@ -208,7 +190,7 @@ bool PipelineCache::ConfigurePipeline(VkCommandBuffer command_buffer,
     // Error updating state - bail out.
     // We are in an indeterminate state, so reset things for the next attempt.
     current_pipeline_ = nullptr;
-    return false;
+    return update_status;
   }
   if (!pipeline) {
     // Should have a hash key produced by the UpdateState pass.
@@ -217,24 +199,12 @@ bool PipelineCache::ConfigurePipeline(VkCommandBuffer command_buffer,
     current_pipeline_ = pipeline;
     if (!pipeline) {
       // Unable to create pipeline.
-      return false;
+      return UpdateStatus::kError;
     }
   }
 
-  // Bind the pipeline.
- vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline); - - // Issue all changed dynamic state information commands. - // TODO(benvanik): dynamic state is kept in the command buffer, so if we - // have issued it before (regardless of pipeline) we don't need to do it now. - // TODO(benvanik): track whether we have issued on the given command buffer. - bool full_dynamic_state = true; - if (!SetDynamicState(command_buffer, full_dynamic_state)) { - // Failed to update state. - return false; - } - - return true; + *pipeline_out = pipeline; + return update_status; } void PipelineCache::ClearCache() { @@ -291,16 +261,140 @@ VkPipeline PipelineCache::GetPipeline(const RenderState* render_state, pipeline_info.basePipelineHandle = nullptr; pipeline_info.basePipelineIndex = 0; VkPipeline pipeline = nullptr; - auto err = vkCreateGraphicsPipelines(device_, nullptr, 1, &pipeline_info, - nullptr, &pipeline); + auto err = vkCreateGraphicsPipelines(device_, pipeline_cache_, 1, + &pipeline_info, nullptr, &pipeline); CheckResult(err, "vkCreateGraphicsPipelines"); + // Dump shader disassembly. + if (FLAGS_vulkan_dump_disasm) { + DumpShaderDisasmNV(pipeline_info); + } + // Add to cache with the hash key for reuse. cached_pipelines_.insert({hash_key, pipeline}); return pipeline; } +bool PipelineCache::TranslateShader(VulkanShader* shader, + xenos::xe_gpu_program_cntl_t cntl) { + // Perform translation. + // If this fails the shader will be marked as invalid and ignored later. + if (!shader_translator_.Translate(shader, cntl)) { + XELOGE("Shader translation failed; marking shader as ignored"); + return false; + } + + // Prepare the shader for use (creates our VkShaderModule). + // It could still fail at this point. + if (!shader->Prepare()) { + XELOGE("Shader preparation failed; marking shader as ignored"); + return false; + } + + if (shader->is_valid()) { + XELOGGPU("Generated %s shader (%db) - hash %.16" PRIX64 ":\n%s\n", + shader->type() == ShaderType::kVertex ? "vertex" : "pixel", + shader->ucode_dword_count() * 4, shader->ucode_data_hash(), + shader->ucode_disassembly().c_str()); + } + + // Dump shader files if desired. + if (!FLAGS_dump_shaders.empty()) { + shader->Dump(FLAGS_dump_shaders, "vk"); + } + + return shader->is_valid(); +} + +void PipelineCache::DumpShaderDisasmNV( + const VkGraphicsPipelineCreateInfo& pipeline_info) { + // !! HACK !!: This only works on NVidia drivers. Dumps shader disasm. + // This code is super ugly. Update this when NVidia includes an official + // way to dump shader disassembly. + + VkPipelineCacheCreateInfo pipeline_cache_info; + VkPipelineCache dummy_pipeline_cache; + pipeline_cache_info.sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO; + pipeline_cache_info.pNext = nullptr; + pipeline_cache_info.flags = 0; + pipeline_cache_info.initialDataSize = 0; + pipeline_cache_info.pInitialData = nullptr; + auto err = vkCreatePipelineCache(device_, &pipeline_cache_info, nullptr, + &dummy_pipeline_cache); + CheckResult(err, "vkCreatePipelineCache"); + + // Create a pipeline on the dummy cache and dump it. 
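+  // (The NVIDIA driver happens to embed "!!NVvp"/"!!NVfp" text program
+  // disassembly in the raw cache blob; the scan below hunts for it.)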
+ VkPipeline dummy_pipeline; + err = vkCreateGraphicsPipelines(device_, dummy_pipeline_cache, 1, + &pipeline_info, nullptr, &dummy_pipeline); + + std::vector pipeline_data; + size_t data_size = 0; + err = vkGetPipelineCacheData(device_, dummy_pipeline_cache, &data_size, + nullptr); + if (err == VK_SUCCESS) { + pipeline_data.resize(data_size); + vkGetPipelineCacheData(device_, dummy_pipeline_cache, &data_size, + pipeline_data.data()); + + // Scan the data for the disassembly. + std::string disasm_vp, disasm_fp; + + const char* disasm_start_vp = nullptr; + const char* disasm_start_fp = nullptr; + size_t search_offset = 0; + const char* search_start = + reinterpret_cast(pipeline_data.data()); + while (true) { + auto p = reinterpret_cast( + memchr(pipeline_data.data() + search_offset, '!', + pipeline_data.size() - search_offset)); + if (!p) { + break; + } + if (!strncmp(p, "!!NV", 4)) { + if (!strncmp(p + 4, "vp", 2)) { + disasm_start_vp = p; + } else if (!strncmp(p + 4, "fp", 2)) { + disasm_start_fp = p; + } + + if (disasm_start_fp && disasm_start_vp) { + // Found all we needed. + break; + } + } + search_offset = p - search_start; + ++search_offset; + } + if (disasm_start_vp) { + disasm_vp = std::string(disasm_start_vp); + + // For some reason there's question marks all over the code. + disasm_vp.erase(std::remove(disasm_vp.begin(), disasm_vp.end(), '?'), + disasm_vp.end()); + } else { + disasm_vp = std::string("Shader disassembly not available."); + } + + if (disasm_start_fp) { + disasm_fp = std::string(disasm_start_fp); + + // For some reason there's question marks all over the code. + disasm_fp.erase(std::remove(disasm_fp.begin(), disasm_fp.end(), '?'), + disasm_fp.end()); + } else { + disasm_fp = std::string("Shader disassembly not available."); + } + + XELOGI("%s\n=====================================\n%s\n", disasm_vp.c_str(), + disasm_fp.c_str()); + } + + vkDestroyPipelineCache(device_, dummy_pipeline_cache, nullptr); +} + VkShaderModule PipelineCache::GetGeometryShader(PrimitiveType primitive_type, bool is_line_mode) { switch (primitive_type) { @@ -334,10 +428,16 @@ VkShaderModule PipelineCache::GetGeometryShader(PrimitiveType primitive_type, bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, bool full_update) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + auto& regs = set_dynamic_state_registers_; bool window_offset_dirty = SetShadowRegister(®s.pa_sc_window_offset, XE_GPU_REG_PA_SC_WINDOW_OFFSET); + window_offset_dirty |= SetShadowRegister(®s.pa_su_sc_mode_cntl, + XE_GPU_REG_PA_SU_SC_MODE_CNTL); // Window parameters. // http://ftp.tku.edu.tw/NetBSD/NetBSD-current/xsrc/external/mit/xf86-video-ati/dist/src/r600_reg_auto_r6xx.h @@ -397,22 +497,21 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, viewport_state_dirty |= SetShadowRegister(®s.pa_cl_vport_zscale, XE_GPU_REG_PA_CL_VPORT_ZSCALE); if (viewport_state_dirty) { - // HACK: no clue where to get these values. // RB_SURFACE_INFO auto surface_msaa = static_cast((regs.rb_surface_info >> 16) & 0x3); - // TODO(benvanik): ?? + + // Apply a multiplier to emulate MSAA. 
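+  // 2X MSAA doubles the surface height only; 4X doubles both width and
+  // height, mirroring how the MSAA'd surface appears to be laid out in EDRAM.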
float window_width_scalar = 1; float window_height_scalar = 1; switch (surface_msaa) { case MsaaSamples::k1X: break; case MsaaSamples::k2X: - window_width_scalar = 2; + window_height_scalar = 2; break; case MsaaSamples::k4X: - window_width_scalar = 2; - window_height_scalar = 2; + window_width_scalar = window_height_scalar = 2; break; } @@ -429,10 +528,7 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, vport_yoffset_enable == vport_zoffset_enable); VkViewport viewport_rect; - viewport_rect.x = 0; - viewport_rect.y = 0; - viewport_rect.width = 100; - viewport_rect.height = 100; + std::memset(&viewport_rect, 0, sizeof(VkViewport)); viewport_rect.minDepth = 0; viewport_rect.maxDepth = 1; @@ -443,6 +539,7 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, float voy = vport_yoffset_enable ? regs.pa_cl_vport_yoffset : 0; float vsx = vport_xscale_enable ? regs.pa_cl_vport_xscale : 1; float vsy = vport_yscale_enable ? regs.pa_cl_vport_yscale : 1; + window_width_scalar = window_height_scalar = 1; float vpw = 2 * window_width_scalar * vsx; float vph = -2 * window_height_scalar * vsy; @@ -490,25 +587,25 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, vkCmdSetBlendConstants(command_buffer, regs.rb_blend_rgba); } - // VK_DYNAMIC_STATE_LINE_WIDTH - vkCmdSetLineWidth(command_buffer, 1.0f); + if (full_update) { + // VK_DYNAMIC_STATE_LINE_WIDTH + vkCmdSetLineWidth(command_buffer, 1.0f); - // VK_DYNAMIC_STATE_DEPTH_BIAS - vkCmdSetDepthBias(command_buffer, 0.0f, 0.0f, 0.0f); + // VK_DYNAMIC_STATE_DEPTH_BIAS + vkCmdSetDepthBias(command_buffer, 0.0f, 0.0f, 0.0f); - // VK_DYNAMIC_STATE_DEPTH_BOUNDS - vkCmdSetDepthBounds(command_buffer, 0.0f, 1.0f); + // VK_DYNAMIC_STATE_DEPTH_BOUNDS + vkCmdSetDepthBounds(command_buffer, 0.0f, 1.0f); - // VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK - vkCmdSetStencilCompareMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); + // VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK + vkCmdSetStencilCompareMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); - // VK_DYNAMIC_STATE_STENCIL_REFERENCE - vkCmdSetStencilReference(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); + // VK_DYNAMIC_STATE_STENCIL_REFERENCE + vkCmdSetStencilReference(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); - // VK_DYNAMIC_STATE_STENCIL_WRITE_MASK - vkCmdSetStencilWriteMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); - - // TODO(benvanik): push constants. + // VK_DYNAMIC_STATE_STENCIL_WRITE_MASK + vkCmdSetStencilWriteMask(command_buffer, VK_STENCIL_FRONT_AND_BACK, 0); + } bool push_constants_dirty = full_update || viewport_state_dirty; push_constants_dirty |= @@ -539,7 +636,7 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, push_constants.window_scale[1] = -1.0f; } else { push_constants.window_scale[0] = 1.0f / 2560.0f; - push_constants.window_scale[1] = -1.0f / 2560.0f; + push_constants.window_scale[1] = 1.0f / 2560.0f; } // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf @@ -558,7 +655,7 @@ bool PipelineCache::SetDynamicState(VkCommandBuffer command_buffer, push_constants.vtx_fmt[3] = vtx_w0_fmt; // Alpha testing -- ALPHAREF, ALPHAFUNC, ALPHATESTENABLE - // Deprecated in Vulkan, implemented in shader. + // Emulated in shader. 
// if(ALPHATESTENABLE && frag_out.a [<=/ALPHAFUNC] ALPHAREF) discard; // ALPHATESTENABLE push_constants.alpha_test[0] = @@ -657,16 +754,32 @@ PipelineCache::UpdateStatus PipelineCache::UpdateShaderStages( bool dirty = false; dirty |= SetShadowRegister(®s.pa_su_sc_mode_cntl, XE_GPU_REG_PA_SU_SC_MODE_CNTL); + dirty |= SetShadowRegister(®s.sq_program_cntl, XE_GPU_REG_SQ_PROGRAM_CNTL); dirty |= regs.vertex_shader != vertex_shader; dirty |= regs.pixel_shader != pixel_shader; dirty |= regs.primitive_type != primitive_type; + regs.vertex_shader = vertex_shader; + regs.pixel_shader = pixel_shader; + regs.primitive_type = primitive_type; XXH64_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } - regs.vertex_shader = vertex_shader; - regs.pixel_shader = pixel_shader; - regs.primitive_type = primitive_type; + + xenos::xe_gpu_program_cntl_t sq_program_cntl; + sq_program_cntl.dword_0 = regs.sq_program_cntl; + + if (!vertex_shader->is_translated() && + !TranslateShader(vertex_shader, sq_program_cntl)) { + XELOGE("Failed to translate the vertex shader!"); + return UpdateStatus::kError; + } + + if (!pixel_shader->is_translated() && + !TranslateShader(pixel_shader, sq_program_cntl)) { + XELOGE("Failed to translate the pixel shader!"); + return UpdateStatus::kError; + } update_shader_stages_stage_count_ = 0; @@ -723,11 +836,11 @@ PipelineCache::UpdateStatus PipelineCache::UpdateVertexInputState( bool dirty = false; dirty |= vertex_shader != regs.vertex_shader; + regs.vertex_shader = vertex_shader; XXH64_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } - regs.vertex_shader = vertex_shader; state_info.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; state_info.pNext = nullptr; @@ -765,11 +878,14 @@ PipelineCache::UpdateStatus PipelineCache::UpdateVertexInputState( : VK_FORMAT_A2R10G10B10_UNORM_PACK32; break; case VertexFormat::k_10_11_11: - assert_always("unsupported?"); + assert_true(is_signed); vertex_attrib_descr.format = VK_FORMAT_B10G11R11_UFLOAT_PACK32; break; case VertexFormat::k_11_11_10: - assert_true(is_signed); + // Converted in-shader. + // TODO(DrChat) + assert_always(); + // vertex_attrib_descr.format = VK_FORMAT_R32_UINT; vertex_attrib_descr.format = VK_FORMAT_B10G11R11_UFLOAT_PACK32; break; case VertexFormat::k_16_16: @@ -802,19 +918,19 @@ PipelineCache::UpdateStatus PipelineCache::UpdateVertexInputState( is_signed ? 
VK_FORMAT_R32G32B32A32_SINT : VK_FORMAT_R32_UINT; break; case VertexFormat::k_32_FLOAT: - assert_true(is_signed); + // assert_true(is_signed); vertex_attrib_descr.format = VK_FORMAT_R32_SFLOAT; break; case VertexFormat::k_32_32_FLOAT: - assert_true(is_signed); + // assert_true(is_signed); vertex_attrib_descr.format = VK_FORMAT_R32G32_SFLOAT; break; case VertexFormat::k_32_32_32_FLOAT: - assert_true(is_signed); + // assert_true(is_signed); vertex_attrib_descr.format = VK_FORMAT_R32G32B32_SFLOAT; break; case VertexFormat::k_32_32_32_32_FLOAT: - assert_true(is_signed); + // assert_true(is_signed); vertex_attrib_descr.format = VK_FORMAT_R32G32B32A32_SFLOAT; break; default: @@ -843,11 +959,11 @@ PipelineCache::UpdateStatus PipelineCache::UpdateInputAssemblyState( XE_GPU_REG_PA_SU_SC_MODE_CNTL); dirty |= SetShadowRegister(®s.multi_prim_ib_reset_index, XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX); + regs.primitive_type = primitive_type; XXH64_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; } - regs.primitive_type = primitive_type; state_info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; @@ -934,14 +1050,17 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( auto& state_info = update_rasterization_state_info_; bool dirty = false; + dirty |= regs.primitive_type != primitive_type; dirty |= SetShadowRegister(®s.pa_su_sc_mode_cntl, XE_GPU_REG_PA_SU_SC_MODE_CNTL); dirty |= SetShadowRegister(®s.pa_sc_screen_scissor_tl, XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL); dirty |= SetShadowRegister(®s.pa_sc_screen_scissor_br, XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR); + dirty |= SetShadowRegister(®s.pa_sc_viz_query, XE_GPU_REG_PA_SC_VIZ_QUERY); dirty |= SetShadowRegister(®s.multi_prim_ib_reset_index, XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX); + regs.primitive_type = primitive_type; XXH64_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; @@ -953,10 +1072,13 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( // TODO(benvanik): right setting? state_info.depthClampEnable = VK_FALSE; - - // TODO(benvanik): use in depth-only mode? state_info.rasterizerDiscardEnable = VK_FALSE; + // KILL_PIX_POST_EARLY_Z + if (regs.pa_sc_viz_query & 0x80) { + state_info.rasterizerDiscardEnable = VK_TRUE; + } + bool poly_mode = ((regs.pa_su_sc_mode_cntl >> 3) & 0x3) != 0; if (poly_mode) { uint32_t front_poly_mode = (regs.pa_su_sc_mode_cntl >> 5) & 0x7; @@ -981,6 +1103,10 @@ PipelineCache::UpdateStatus PipelineCache::UpdateRasterizationState( case 2: state_info.cullMode = VK_CULL_MODE_BACK_BIT; break; + case 3: + // Cull both sides? 
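+        // (VK_CULL_MODE_FRONT_AND_BACK would be the natural mapping if a
+        // title is ever seen using this.)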
+ assert_always(); + break; } if (regs.pa_su_sc_mode_cntl & 0x4) { state_info.frontFace = VK_FRONT_FACE_CLOCKWISE; @@ -1007,18 +1133,53 @@ PipelineCache::UpdateStatus PipelineCache::UpdateMultisampleState() { auto& regs = update_multisample_state_regs_; auto& state_info = update_multisample_state_info_; + bool dirty = false; + dirty |= SetShadowRegister(®s.pa_sc_aa_config, XE_GPU_REG_PA_SC_AA_CONFIG); + dirty |= SetShadowRegister(®s.pa_su_sc_mode_cntl, + XE_GPU_REG_PA_SU_SC_MODE_CNTL); + dirty |= SetShadowRegister(®s.rb_surface_info, XE_GPU_REG_RB_SURFACE_INFO); + XXH64_update(&hash_state_, ®s, sizeof(regs)); + if (!dirty) { + return UpdateStatus::kCompatible; + } + state_info.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; state_info.pNext = nullptr; state_info.flags = 0; - state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + // PA_SC_AA_CONFIG MSAA_NUM_SAMPLES (0x7) + // PA_SC_AA_MASK (0xFFFF) + // PA_SU_SC_MODE_CNTL MSAA_ENABLE (0x10000) + // If set, all samples will be sampled at set locations. Otherwise, they're + // all sampled from the pixel center. + if (FLAGS_vulkan_native_msaa) { + auto msaa_num_samples = + static_cast((regs.rb_surface_info >> 16) & 0x3); + switch (msaa_num_samples) { + case MsaaSamples::k1X: + state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + break; + case MsaaSamples::k2X: + state_info.rasterizationSamples = VK_SAMPLE_COUNT_2_BIT; + break; + case MsaaSamples::k4X: + state_info.rasterizationSamples = VK_SAMPLE_COUNT_4_BIT; + break; + default: + assert_unhandled_case(msaa_num_samples); + break; + } + } else { + state_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + } + state_info.sampleShadingEnable = VK_FALSE; state_info.minSampleShading = 0; state_info.pSampleMask = nullptr; state_info.alphaToCoverageEnable = VK_FALSE; state_info.alphaToOneEnable = VK_FALSE; - return UpdateStatus::kCompatible; + return UpdateStatus::kMismatch; } PipelineCache::UpdateStatus PipelineCache::UpdateDepthStencilState() { @@ -1038,19 +1199,60 @@ PipelineCache::UpdateStatus PipelineCache::UpdateDepthStencilState() { state_info.pNext = nullptr; state_info.flags = 0; - state_info.depthTestEnable = VK_FALSE; - state_info.depthWriteEnable = VK_FALSE; - state_info.depthCompareOp = VK_COMPARE_OP_ALWAYS; + static const VkCompareOp compare_func_map[] = { + /* 0 */ VK_COMPARE_OP_NEVER, + /* 1 */ VK_COMPARE_OP_LESS, + /* 2 */ VK_COMPARE_OP_EQUAL, + /* 3 */ VK_COMPARE_OP_LESS_OR_EQUAL, + /* 4 */ VK_COMPARE_OP_GREATER, + /* 5 */ VK_COMPARE_OP_NOT_EQUAL, + /* 6 */ VK_COMPARE_OP_GREATER_OR_EQUAL, + /* 7 */ VK_COMPARE_OP_ALWAYS, + }; + static const VkStencilOp stencil_op_map[] = { + /* 0 */ VK_STENCIL_OP_KEEP, + /* 1 */ VK_STENCIL_OP_ZERO, + /* 2 */ VK_STENCIL_OP_REPLACE, + /* 3 */ VK_STENCIL_OP_INCREMENT_AND_WRAP, + /* 4 */ VK_STENCIL_OP_DECREMENT_AND_WRAP, + /* 5 */ VK_STENCIL_OP_INVERT, + /* 6 */ VK_STENCIL_OP_INCREMENT_AND_CLAMP, + /* 7 */ VK_STENCIL_OP_DECREMENT_AND_CLAMP, + }; + + // Depth state + // TODO: EARLY_Z_ENABLE (needs to be enabled in shaders) + state_info.depthWriteEnable = !!(regs.rb_depthcontrol & 0x4); + state_info.depthTestEnable = !!(regs.rb_depthcontrol & 0x2); + state_info.stencilTestEnable = !!(regs.rb_depthcontrol & 0x1); + + state_info.depthCompareOp = + compare_func_map[(regs.rb_depthcontrol >> 4) & 0x7]; state_info.depthBoundsTestEnable = VK_FALSE; - state_info.stencilTestEnable = VK_FALSE; - state_info.front.failOp = VK_STENCIL_OP_KEEP; - state_info.front.passOp = VK_STENCIL_OP_KEEP; - state_info.front.depthFailOp = 
VK_STENCIL_OP_KEEP; - state_info.front.compareOp = VK_COMPARE_OP_ALWAYS; - state_info.back.failOp = VK_STENCIL_OP_KEEP; - state_info.back.passOp = VK_STENCIL_OP_KEEP; - state_info.back.depthFailOp = VK_STENCIL_OP_KEEP; - state_info.back.compareOp = VK_COMPARE_OP_ALWAYS; + + uint32_t stencil_ref = (regs.rb_stencilrefmask & 0x000000FF); + uint32_t stencil_read_mask = (regs.rb_stencilrefmask & 0x0000FF00) >> 8; + + // Stencil state + state_info.front.compareOp = + compare_func_map[(regs.rb_depthcontrol >> 8) & 0x7]; + state_info.front.failOp = stencil_op_map[(regs.rb_depthcontrol >> 11) & 0x7]; + state_info.front.passOp = stencil_op_map[(regs.rb_depthcontrol >> 14) & 0x7]; + state_info.front.depthFailOp = + stencil_op_map[(regs.rb_depthcontrol >> 17) & 0x7]; + + // BACKFACE_ENABLE + if (!!(regs.rb_depthcontrol & 0x80)) { + state_info.back.compareOp = + compare_func_map[(regs.rb_depthcontrol >> 20) & 0x7]; + state_info.back.failOp = stencil_op_map[(regs.rb_depthcontrol >> 23) & 0x7]; + state_info.back.passOp = stencil_op_map[(regs.rb_depthcontrol >> 26) & 0x7]; + state_info.back.depthFailOp = + stencil_op_map[(regs.rb_depthcontrol >> 29) & 0x7]; + } else { + // Back state is identical to front state. + std::memcpy(&state_info.back, &state_info.front, sizeof(VkStencilOpState)); + } // Ignored; set dynamically. state_info.minDepthBounds = 0; @@ -1089,6 +1291,7 @@ PipelineCache::UpdateStatus PipelineCache::UpdateColorBlendState() { SetShadowRegister(®s.rb_blendcontrol[2], XE_GPU_REG_RB_BLENDCONTROL_2); dirty |= SetShadowRegister(®s.rb_blendcontrol[3], XE_GPU_REG_RB_BLENDCONTROL_3); + dirty |= SetShadowRegister(®s.rb_modecontrol, XE_GPU_REG_RB_MODECONTROL); XXH64_update(&hash_state_, ®s, sizeof(regs)); if (!dirty) { return UpdateStatus::kCompatible; @@ -1101,6 +1304,8 @@ PipelineCache::UpdateStatus PipelineCache::UpdateColorBlendState() { state_info.logicOpEnable = VK_FALSE; state_info.logicOp = VK_LOGIC_OP_NO_OP; + auto enable_mode = static_cast(regs.rb_modecontrol & 0x7); + static const VkBlendFactor kBlendFactorMap[] = { /* 0 */ VK_BLEND_FACTOR_ZERO, /* 1 */ VK_BLEND_FACTOR_ONE, @@ -1153,7 +1358,8 @@ PipelineCache::UpdateStatus PipelineCache::UpdateColorBlendState() { // A2XX_RB_COLOR_MASK_WRITE_* == D3DRS_COLORWRITEENABLE // Lines up with VkColorComponentFlagBits, where R=bit 1, G=bit 2, etc.. uint32_t write_mask = (regs.rb_color_mask >> (i * 4)) & 0xF; - attachment_state.colorWriteMask = write_mask; + attachment_state.colorWriteMask = + enable_mode == xenos::ModeControl::kColorDepth ? write_mask : 0; } state_info.attachmentCount = 4; diff --git a/src/xenia/gpu/vulkan/pipeline_cache.h b/src/xenia/gpu/vulkan/pipeline_cache.h index 3e623f14e..49144f50f 100644 --- a/src/xenia/gpu/vulkan/pipeline_cache.h +++ b/src/xenia/gpu/vulkan/pipeline_cache.h @@ -32,6 +32,12 @@ namespace vulkan { // including shaders, various blend/etc options, and input configuration. class PipelineCache { public: + enum class UpdateStatus { + kCompatible, + kMismatch, + kError, + }; + PipelineCache(RegisterFile* register_file, ui::vulkan::VulkanDevice* device, VkDescriptorSetLayout uniform_descriptor_set_layout, VkDescriptorSetLayout texture_descriptor_set_layout); @@ -46,11 +52,17 @@ class PipelineCache { // otherwise a new one may be created. Any state that can be set dynamically // in the command buffer is issued at this time. // Returns whether the pipeline could be successfully created. 
- bool ConfigurePipeline(VkCommandBuffer command_buffer, - const RenderState* render_state, - VulkanShader* vertex_shader, - VulkanShader* pixel_shader, - PrimitiveType primitive_type); + UpdateStatus ConfigurePipeline(VkCommandBuffer command_buffer, + const RenderState* render_state, + VulkanShader* vertex_shader, + VulkanShader* pixel_shader, + PrimitiveType primitive_type, + VkPipeline* pipeline_out); + + // Sets required dynamic state on the command buffer. + // Only state that has changed since the last call will be set unless + // full_update is true. + bool SetDynamicState(VkCommandBuffer command_buffer, bool full_update); // Pipeline layout shared by all pipelines. VkPipelineLayout pipeline_layout() const { return pipeline_layout_; } @@ -63,16 +75,14 @@ class PipelineCache { // state. VkPipeline GetPipeline(const RenderState* render_state, uint64_t hash_key); + bool TranslateShader(VulkanShader* shader, xenos::xe_gpu_program_cntl_t cntl); + void DumpShaderDisasmNV(const VkGraphicsPipelineCreateInfo& info); + // Gets a geometry shader used to emulate the given primitive type. // Returns nullptr if the primitive doesn't need to be emulated. VkShaderModule GetGeometryShader(PrimitiveType primitive_type, bool is_line_mode); - // Sets required dynamic state on the command buffer. - // Only state that has changed since the last call will be set unless - // full_update is true. - bool SetDynamicState(VkCommandBuffer command_buffer, bool full_update); - RegisterFile* register_file_ = nullptr; VkDevice device_ = nullptr; @@ -111,12 +121,6 @@ class PipelineCache { VkPipeline current_pipeline_ = nullptr; private: - enum class UpdateStatus { - kCompatible, - kMismatch, - kError, - }; - UpdateStatus UpdateState(VulkanShader* vertex_shader, VulkanShader* pixel_shader, PrimitiveType primitive_type); @@ -154,6 +158,7 @@ class PipelineCache { struct UpdateShaderStagesRegisters { PrimitiveType primitive_type; uint32_t pa_su_sc_mode_cntl; + uint32_t sq_program_cntl; VulkanShader* vertex_shader; VulkanShader* pixel_shader; @@ -205,11 +210,12 @@ class PipelineCache { VkPipelineViewportStateCreateInfo update_viewport_state_info_; struct UpdateRasterizationStateRegisters { + PrimitiveType primitive_type; uint32_t pa_su_sc_mode_cntl; uint32_t pa_sc_screen_scissor_tl; uint32_t pa_sc_screen_scissor_br; + uint32_t pa_sc_viz_query; uint32_t multi_prim_ib_reset_index; - PrimitiveType prim_type; UpdateRasterizationStateRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } @@ -217,6 +223,10 @@ class PipelineCache { VkPipelineRasterizationStateCreateInfo update_rasterization_state_info_; struct UpdateMultisampleStateeRegisters { + uint32_t pa_sc_aa_config; + uint32_t pa_su_sc_mode_cntl; + uint32_t rb_surface_info; + UpdateMultisampleStateeRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } } update_multisample_state_regs_; @@ -235,6 +245,7 @@ class PipelineCache { uint32_t rb_colorcontrol; uint32_t rb_color_mask; uint32_t rb_blendcontrol[4]; + uint32_t rb_modecontrol; UpdateColorBlendStateRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } diff --git a/src/xenia/gpu/vulkan/render_cache.cc b/src/xenia/gpu/vulkan/render_cache.cc index 5637d44eb..f3d3288a7 100644 --- a/src/xenia/gpu/vulkan/render_cache.cc +++ b/src/xenia/gpu/vulkan/render_cache.cc @@ -39,7 +39,7 @@ VkFormat ColorRenderTargetFormatToVkFormat(ColorRenderTargetFormat format) { case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_unknown: // WARNING: this is wrong, most likely - 
no float form in vulkan? XELOGW("Unsupported EDRAM format k_2_10_10_10_FLOAT used"); - return VK_FORMAT_A2R10G10B10_SSCALED_PACK32; + return VK_FORMAT_A2R10G10B10_UNORM_PACK32; case ColorRenderTargetFormat::k_16_16: return VK_FORMAT_R16G16_UNORM; case ColorRenderTargetFormat::k_16_16_16_16: @@ -71,34 +71,6 @@ VkFormat DepthRenderTargetFormatToVkFormat(DepthRenderTargetFormat format) { } } -// Cached view into the EDRAM memory. -// The image is aliased to a region of the edram_memory_ based on the tile -// parameters. -// TODO(benvanik): reuse VkImage's with multiple VkViews for compatible -// formats? -class CachedTileView { - public: - // Key identifying the view in the cache. - TileViewKey key; - // Image mapped into EDRAM. - VkImage image = nullptr; - // Simple view on the image matching the format. - VkImageView image_view = nullptr; - - CachedTileView(VkDevice device, VkDeviceMemory edram_memory, - TileViewKey view_key); - ~CachedTileView(); - - bool IsEqual(const TileViewKey& other_key) const { - auto a = reinterpret_cast(&key); - auto b = reinterpret_cast(&other_key); - return *a == *b; - } - - private: - VkDevice device_ = nullptr; -}; - // Cached framebuffer referencing tile attachments. // Each framebuffer is specific to a render pass. Ugh. class CachedFramebuffer { @@ -151,9 +123,11 @@ class CachedRenderPass { VkDevice device_ = nullptr; }; -CachedTileView::CachedTileView(VkDevice device, VkDeviceMemory edram_memory, +CachedTileView::CachedTileView(ui::vulkan::VulkanDevice* device, + VkCommandBuffer command_buffer, + VkDeviceMemory edram_memory, TileViewKey view_key) - : device_(device), key(std::move(view_key)) { + : device_(*device), key(std::move(view_key)) { // Map format to Vulkan. VkFormat vulkan_format = VK_FORMAT_UNDEFINED; uint32_t bpp = 4; @@ -175,7 +149,8 @@ CachedTileView::CachedTileView(VkDevice device, VkDeviceMemory edram_memory, vulkan_format = DepthRenderTargetFormatToVkFormat(edram_format); } assert_true(vulkan_format != VK_FORMAT_UNDEFINED); - assert_true(bpp == 4); + // FIXME(DrChat): Was this check necessary? + // assert_true(bpp == 4); // Create the image with the desired properties. VkImageCreateInfo image_info; @@ -191,8 +166,25 @@ CachedTileView::CachedTileView(VkDevice device, VkDeviceMemory edram_memory, image_info.extent.depth = 1; image_info.mipLevels = 1; image_info.arrayLayers = 1; - // TODO(benvanik): native MSAA support? 
- image_info.samples = VK_SAMPLE_COUNT_1_BIT; + if (FLAGS_vulkan_native_msaa) { + auto msaa_samples = static_cast(key.msaa_samples); + switch (msaa_samples) { + case MsaaSamples::k1X: + image_info.samples = VK_SAMPLE_COUNT_1_BIT; + break; + case MsaaSamples::k2X: + image_info.samples = VK_SAMPLE_COUNT_2_BIT; + break; + case MsaaSamples::k4X: + image_info.samples = VK_SAMPLE_COUNT_4_BIT; + break; + default: + assert_unhandled_case(msaa_samples); + } + } else { + image_info.samples = VK_SAMPLE_COUNT_1_BIT; + } + sample_count = image_info.samples; image_info.tiling = VK_IMAGE_TILING_OPTIMAL; image_info.usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | @@ -203,19 +195,17 @@ CachedTileView::CachedTileView(VkDevice device, VkDeviceMemory edram_memory, image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; image_info.queueFamilyIndexCount = 0; image_info.pQueueFamilyIndices = nullptr; - image_info.initialLayout = VK_IMAGE_LAYOUT_PREINITIALIZED; + image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; auto err = vkCreateImage(device_, &image_info, nullptr, &image); CheckResult(err, "vkCreateImage"); - // Verify our assumptions about memory layout are correct. - VkDeviceSize edram_offset = key.tile_offset * 5120; VkMemoryRequirements memory_requirements; - vkGetImageMemoryRequirements(device, image, &memory_requirements); - assert_true(edram_offset + memory_requirements.size <= kEdramBufferCapacity); - assert_true(edram_offset % memory_requirements.alignment == 0); + vkGetImageMemoryRequirements(*device, image, &memory_requirements); - // Bind to the region of EDRAM we occupy. - err = vkBindImageMemory(device_, image, edram_memory, edram_offset); + // Bind to a newly allocated chunk. + // TODO: Alias from a really big buffer? + memory = device->AllocateMemory(memory_requirements, 0); + err = vkBindImageMemory(device_, image, memory, 0); CheckResult(err, "vkBindImageMemory"); // Create the image view we'll use to attach it to a framebuffer. @@ -242,11 +232,37 @@ CachedTileView::CachedTileView(VkDevice device, VkDeviceMemory edram_memory, CheckResult(err, "vkCreateImageView"); // TODO(benvanik): transition to general layout? + VkImageMemoryBarrier image_barrier; + image_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + image_barrier.pNext = nullptr; + image_barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + image_barrier.dstAccessMask = + key.color_or_depth ? VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT + : VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + image_barrier.dstAccessMask |= + VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + image_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.image = image; + image_barrier.subresourceRange.aspectMask = + key.color_or_depth + ? 
VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + image_barrier.subresourceRange.baseMipLevel = 0; + image_barrier.subresourceRange.levelCount = 1; + image_barrier.subresourceRange.baseArrayLayer = 0; + image_barrier.subresourceRange.layerCount = 1; + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &image_barrier); } CachedTileView::~CachedTileView() { vkDestroyImageView(device_, image_view, nullptr); vkDestroyImage(device_, image, nullptr); + vkFreeMemory(device_, memory, nullptr); } CachedFramebuffer::CachedFramebuffer( @@ -293,8 +309,15 @@ bool CachedFramebuffer::IsCompatible( const RenderConfiguration& desired_config) const { // We already know all render pass things line up, so let's verify dimensions, // edram offsets, etc. We need an exact match. - if (desired_config.surface_pitch_px != width || - desired_config.surface_height_px != height) { + uint32_t surface_pitch_px = desired_config.surface_msaa != MsaaSamples::k4X + ? desired_config.surface_pitch_px + : desired_config.surface_pitch_px * 2; + uint32_t surface_height_px = desired_config.surface_msaa == MsaaSamples::k1X + ? desired_config.surface_height_px + : desired_config.surface_height_px * 2; + surface_pitch_px = std::min(surface_pitch_px, 2560u); + surface_height_px = std::min(surface_height_px, 2560u); + if (surface_pitch_px != width || surface_height_px != height) { return false; } // TODO(benvanik): separate image views from images in tiles and store in fb? @@ -327,13 +350,33 @@ CachedRenderPass::CachedRenderPass(VkDevice device, : device_(device) { std::memcpy(&config, &desired_config, sizeof(config)); + VkSampleCountFlagBits sample_count; + if (FLAGS_vulkan_native_msaa) { + switch (desired_config.surface_msaa) { + case MsaaSamples::k1X: + sample_count = VK_SAMPLE_COUNT_1_BIT; + break; + case MsaaSamples::k2X: + sample_count = VK_SAMPLE_COUNT_2_BIT; + break; + case MsaaSamples::k4X: + sample_count = VK_SAMPLE_COUNT_4_BIT; + break; + default: + assert_unhandled_case(desired_config.surface_msaa); + break; + } + } else { + sample_count = VK_SAMPLE_COUNT_1_BIT; + } + // Initialize all attachments to default unused. // As we set layout(location=RT) in shaders we must always provide 4. 
VkAttachmentDescription attachments[5]; for (int i = 0; i < 4; ++i) { attachments[i].flags = 0; attachments[i].format = VK_FORMAT_UNDEFINED; - attachments[i].samples = VK_SAMPLE_COUNT_1_BIT; + attachments[i].samples = sample_count; attachments[i].loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; attachments[i].storeOp = VK_ATTACHMENT_STORE_OP_STORE; attachments[i].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; @@ -344,7 +387,7 @@ CachedRenderPass::CachedRenderPass(VkDevice device, auto& depth_stencil_attachment = attachments[4]; depth_stencil_attachment.flags = 0; depth_stencil_attachment.format = VK_FORMAT_UNDEFINED; - depth_stencil_attachment.samples = VK_SAMPLE_COUNT_1_BIT; + depth_stencil_attachment.samples = sample_count; depth_stencil_attachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; depth_stencil_attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; depth_stencil_attachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; @@ -409,6 +452,11 @@ CachedRenderPass::~CachedRenderPass() { bool CachedRenderPass::IsCompatible( const RenderConfiguration& desired_config) const { + if (config.surface_msaa != desired_config.surface_msaa && + FLAGS_vulkan_native_msaa) { + return false; + } + for (int i = 0; i < 4; ++i) { // TODO(benvanik): allow compatible vulkan formats. if (config.color[i].format != desired_config.color[i].format) { @@ -423,9 +471,10 @@ bool CachedRenderPass::IsCompatible( RenderCache::RenderCache(RegisterFile* register_file, ui::vulkan::VulkanDevice* device) - : register_file_(register_file), device_(*device) { + : register_file_(register_file), device_(device) { + VkResult status = VK_SUCCESS; + // Create the buffer we'll bind to our memory. - // We do this first so we can get the right memory type. VkBufferCreateInfo buffer_info; buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; buffer_info.pNext = nullptr; @@ -436,55 +485,39 @@ RenderCache::RenderCache(RegisterFile* register_file, buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; buffer_info.queueFamilyIndexCount = 0; buffer_info.pQueueFamilyIndices = nullptr; - auto err = vkCreateBuffer(*device, &buffer_info, nullptr, &edram_buffer_); - CheckResult(err, "vkCreateBuffer"); + status = vkCreateBuffer(*device, &buffer_info, nullptr, &edram_buffer_); + CheckResult(status, "vkCreateBuffer"); // Query requirements for the buffer. // It should be 1:1. VkMemoryRequirements buffer_requirements; - vkGetBufferMemoryRequirements(device_, edram_buffer_, &buffer_requirements); + vkGetBufferMemoryRequirements(*device_, edram_buffer_, &buffer_requirements); assert_true(buffer_requirements.size == kEdramBufferCapacity); - // Create a dummy image so we can see what memory bits it requires. - // They should overlap with the buffer requirements but are likely more - // strict. 
-  VkImageCreateInfo test_image_info;
-  test_image_info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
-  test_image_info.pNext = nullptr;
-  test_image_info.flags = 0;
-  test_image_info.imageType = VK_IMAGE_TYPE_2D;
-  test_image_info.format = VK_FORMAT_R8G8B8A8_UINT;
-  test_image_info.extent.width = 128;
-  test_image_info.extent.height = 128;
-  test_image_info.extent.depth = 1;
-  test_image_info.mipLevels = 1;
-  test_image_info.arrayLayers = 1;
-  test_image_info.samples = VK_SAMPLE_COUNT_1_BIT;
-  test_image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
-  test_image_info.usage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
-  test_image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
-  test_image_info.queueFamilyIndexCount = 0;
-  test_image_info.pQueueFamilyIndices = nullptr;
-  test_image_info.initialLayout = VK_IMAGE_LAYOUT_GENERAL;
-  VkImage test_image = nullptr;
-  err = vkCreateImage(device_, &test_image_info, nullptr, &test_image);
-  CheckResult(err, "vkCreateImage");
-  VkMemoryRequirements image_requirements;
-  vkGetImageMemoryRequirements(device_, test_image, &image_requirements);
-  vkDestroyImage(device_, test_image, nullptr);
-  assert_true((image_requirements.memoryTypeBits &
-               buffer_requirements.memoryTypeBits) != 0);
-  // Allocate EDRAM memory.
-  VkMemoryRequirements memory_requirements;
-  memory_requirements.size = buffer_requirements.size;
-  memory_requirements.alignment = buffer_requirements.alignment;
-  memory_requirements.memoryTypeBits = image_requirements.memoryTypeBits;
   // TODO(benvanik): do we need it host visible?
-  edram_memory_ = device->AllocateMemory(memory_requirements, 0);
+  edram_memory_ = device->AllocateMemory(buffer_requirements);
+  assert_not_null(edram_memory_);

   // Bind buffer to map our entire memory.
-  vkBindBufferMemory(device_, edram_buffer_, edram_memory_, 0);
+  status = vkBindBufferMemory(*device_, edram_buffer_, edram_memory_, 0);
+  CheckResult(status, "vkBindBufferMemory");
+
+  if (status == VK_SUCCESS) {
+    // For debugging, upload a grid into the EDRAM buffer.
+    uint32_t* gpu_data = nullptr;
+    status = vkMapMemory(*device_, edram_memory_, 0, buffer_requirements.size,
+                         0, reinterpret_cast<void**>(&gpu_data));
+    CheckResult(status, "vkMapMemory");
+
+    if (status == VK_SUCCESS) {
+      for (int i = 0; i < kEdramBufferCapacity / 4; i++) {
+        gpu_data[i] = (i % 8) >= 4 ? 0xFF0000FF : 0xFFFFFFFF;
+      }
+
+      vkUnmapMemory(*device_, edram_memory_);
+    }
+  }
 }

 RenderCache::~RenderCache() {
@@ -503,13 +536,36 @@ RenderCache::~RenderCache() {
   cached_tile_views_.clear();

   // Release underlying EDRAM memory.
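+  // (Teardown order matters here: the buffer is destroyed before the memory
+  // that backs it is freed.)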
- vkDestroyBuffer(device_, edram_buffer_, nullptr); - vkFreeMemory(device_, edram_memory_, nullptr); + vkDestroyBuffer(*device_, edram_buffer_, nullptr); + vkFreeMemory(*device_, edram_memory_, nullptr); +} + +bool RenderCache::dirty() const { + auto& regs = *register_file_; + auto& cur_regs = shadow_registers_; + + bool dirty = false; + dirty |= cur_regs.rb_modecontrol != regs[XE_GPU_REG_RB_MODECONTROL].u32; + dirty |= cur_regs.rb_surface_info != regs[XE_GPU_REG_RB_SURFACE_INFO].u32; + dirty |= cur_regs.rb_color_info != regs[XE_GPU_REG_RB_COLOR_INFO].u32; + dirty |= cur_regs.rb_color1_info != regs[XE_GPU_REG_RB_COLOR1_INFO].u32; + dirty |= cur_regs.rb_color2_info != regs[XE_GPU_REG_RB_COLOR2_INFO].u32; + dirty |= cur_regs.rb_color3_info != regs[XE_GPU_REG_RB_COLOR3_INFO].u32; + dirty |= cur_regs.rb_depth_info != regs[XE_GPU_REG_RB_DEPTH_INFO].u32; + dirty |= cur_regs.pa_sc_window_scissor_tl != + regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; + dirty |= cur_regs.pa_sc_window_scissor_br != + regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; + return dirty; } const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, VulkanShader* vertex_shader, VulkanShader* pixel_shader) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + assert_null(current_command_buffer_); current_command_buffer_ = command_buffer; @@ -542,13 +598,34 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, } // Lookup or generate a new render pass and framebuffer for the new state. - if (!ConfigureRenderPass(config, &render_pass, &framebuffer)) { + if (!ConfigureRenderPass(command_buffer, config, &render_pass, + &framebuffer)) { return nullptr; } + current_state_.render_pass = render_pass; current_state_.render_pass_handle = render_pass->handle; current_state_.framebuffer = framebuffer; current_state_.framebuffer_handle = framebuffer->handle; + + // TODO(DrChat): Determine if we actually need an EDRAM buffer. + /* + // Depth + auto depth_target = current_state_.framebuffer->depth_stencil_attachment; + if (depth_target && current_state_.config.depth_stencil.used) { + UpdateTileView(command_buffer, depth_target, true); + } + + // Color + for (int i = 0; i < 4; i++) { + auto target = current_state_.framebuffer->color_attachments[i]; + if (!target || !current_state_.config.color[i].used) { + continue; + } + + UpdateTileView(command_buffer, target, true); + } + */ } if (!render_pass) { return nullptr; @@ -571,6 +648,15 @@ const RenderState* RenderCache::BeginRenderPass(VkCommandBuffer command_buffer, render_pass_begin_info.renderArea.extent.width = config->surface_pitch_px; render_pass_begin_info.renderArea.extent.height = config->surface_height_px; + if (config->surface_msaa == MsaaSamples::k2X) { + render_pass_begin_info.renderArea.extent.height = + std::min(config->surface_height_px * 2, 2560u); + } else if (config->surface_msaa == MsaaSamples::k4X) { + render_pass_begin_info.renderArea.extent.width *= 2; + render_pass_begin_info.renderArea.extent.height = + std::min(config->surface_height_px * 2, 2560u); + } + // Configure clear color, if clearing. // TODO(benvanik): enable clearing here during resolve? render_pass_begin_info.clearValueCount = 0; @@ -601,9 +687,15 @@ bool RenderCache::ParseConfiguration(RenderConfiguration* config) { // Guess the height from the scissor height. // It's wildly inaccurate, but I've never seen it be bigger than the // EDRAM tiling. 
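+  // (The scissor-based estimate is kept below, commented out, for reference;
+  // see the TODO that follows. Over-allocating tile views at the maximum
+  // height is also why EndRenderPass() warns that oversized copy-backs could
+  // clobber neighboring EDRAM contents.)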
+  /*
   uint32_t ws_y = (regs.pa_sc_window_scissor_tl >> 16) & 0x7FFF;
   uint32_t ws_h = ((regs.pa_sc_window_scissor_br >> 16) & 0x7FFF) - ws_y;
   config->surface_height_px = std::min(2560u, xe::round_up(ws_h, 16));
+  */
+
+  // TODO(DrChat): Find an accurate way to get the surface height. Until we do,
+  // we're going to hardcode it to 2560, as that's the absolute maximum.
+  config->surface_height_px = 2560;

   // Color attachment configuration.
   if (config->mode_control == ModeControl::kColorDepth) {
@@ -620,12 +712,23 @@ bool RenderCache::ParseConfiguration(RenderConfiguration* config) {
       case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
         config->color[i].format = ColorRenderTargetFormat::k_8_8_8_8;
         break;
+      case ColorRenderTargetFormat::k_2_10_10_10_unknown:
+        config->color[i].format = ColorRenderTargetFormat::k_2_10_10_10;
+        break;
+      case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_unknown:
+        config->color[i].format = ColorRenderTargetFormat::k_2_10_10_10_FLOAT;
+        break;
     }
+
+    // Make sure all unknown bits are unset.
+    // RDR sets bit 0x00400000
+    // assert_zero(color_info[i] & ~0x000F0FFF);
   }
 } else {
   for (int i = 0; i < 4; ++i) {
     config->color[i].edram_base = 0;
     config->color[i].format = ColorRenderTargetFormat::k_8_8_8_8;
+    config->color[i].used = false;
   }
 }
@@ -635,15 +738,20 @@ bool RenderCache::ParseConfiguration(RenderConfiguration* config) {
   config->depth_stencil.edram_base = regs.rb_depth_info & 0xFFF;
   config->depth_stencil.format =
       static_cast<DepthRenderTargetFormat>((regs.rb_depth_info >> 16) & 0x1);
+
+  // Make sure all unknown bits are unset.
+  // assert_zero(regs.rb_depth_info & ~0x00010FFF);
 } else {
   config->depth_stencil.edram_base = 0;
   config->depth_stencil.format = DepthRenderTargetFormat::kD24S8;
+  config->depth_stencil.used = false;
 }

 return true;
 }

-bool RenderCache::ConfigureRenderPass(RenderConfiguration* config,
+bool RenderCache::ConfigureRenderPass(VkCommandBuffer command_buffer,
+                                      RenderConfiguration* config,
                                       CachedRenderPass** out_render_pass,
                                       CachedFramebuffer** out_framebuffer) {
   *out_render_pass = nullptr;
@@ -662,7 +770,7 @@ bool RenderCache::ConfigureRenderPass(RenderConfiguration* config,
   // If no render pass was found in the cache create a new one.
   if (!render_pass) {
-    render_pass = new CachedRenderPass(device_, *config);
+    render_pass = new CachedRenderPass(*device_, *config);
     cached_render_passes_.push_back(render_pass);
   }
@@ -679,16 +787,25 @@ bool RenderCache::ConfigureRenderPass(RenderConfiguration* config,
   // If no framebuffer was found in the cache create a new one.
   if (!framebuffer) {
+    uint32_t tile_width = config->surface_msaa == MsaaSamples::k4X ? 40 : 80;
+    uint32_t tile_height = config->surface_msaa != MsaaSamples::k1X ? 8 : 16;
+
     CachedTileView* target_color_attachments[4] = {nullptr, nullptr, nullptr,
                                                    nullptr};
     for (int i = 0; i < 4; ++i) {
       TileViewKey color_key;
       color_key.tile_offset = config->color[i].edram_base;
-      color_key.tile_width = config->surface_pitch_px / 80;
-      color_key.tile_height = config->surface_height_px / 16;
+      color_key.tile_width =
+          xe::round_up(config->surface_pitch_px, tile_width) / tile_width;
+      // color_key.tile_height =
+      //     xe::round_up(config->surface_height_px, tile_height) / tile_height;
+      color_key.tile_height = 160;
       color_key.color_or_depth = 1;
+      color_key.msaa_samples =
+          0;  // static_cast<uint16_t>(config->surface_msaa);
      color_key.edram_format = static_cast<uint16_t>(config->color[i].format);
-      target_color_attachments[i] = GetTileView(color_key);
+      target_color_attachments[i] =
+          FindOrCreateTileView(command_buffer, color_key);
      if (!target_color_attachments) {
        XELOGE("Failed to get tile view for color attachment");
        return false;
@@ -697,21 +814,34 @@ bool RenderCache::ConfigureRenderPass(RenderConfiguration* config,
     TileViewKey depth_stencil_key;
     depth_stencil_key.tile_offset = config->depth_stencil.edram_base;
-    depth_stencil_key.tile_width = config->surface_pitch_px / 80;
-    depth_stencil_key.tile_height = config->surface_height_px / 16;
+    depth_stencil_key.tile_width =
+        xe::round_up(config->surface_pitch_px, tile_width) / tile_width;
+    // depth_stencil_key.tile_height =
+    //     xe::round_up(config->surface_height_px, tile_height) / tile_height;
+    depth_stencil_key.tile_height = 160;
     depth_stencil_key.color_or_depth = 0;
+    depth_stencil_key.msaa_samples =
+        0;  // static_cast<uint16_t>(config->surface_msaa);
     depth_stencil_key.edram_format =
         static_cast<uint16_t>(config->depth_stencil.format);
-    auto target_depth_stencil_attachment = GetTileView(depth_stencil_key);
+    auto target_depth_stencil_attachment =
+        FindOrCreateTileView(command_buffer, depth_stencil_key);
     if (!target_depth_stencil_attachment) {
       XELOGE("Failed to get tile view for depth/stencil attachment");
       return false;
     }

+    uint32_t surface_pitch_px = config->surface_msaa != MsaaSamples::k4X
+                                    ? config->surface_pitch_px
+                                    : config->surface_pitch_px * 2;
+    uint32_t surface_height_px = config->surface_msaa == MsaaSamples::k1X
+                                     ? config->surface_height_px
+                                     : config->surface_height_px * 2;
+    surface_pitch_px = std::min(surface_pitch_px, 2560u);
+    surface_height_px = std::min(surface_height_px, 2560u);
     framebuffer = new CachedFramebuffer(
-        device_, render_pass->handle, config->surface_pitch_px,
-        config->surface_height_px, target_color_attachments,
-        target_depth_stencil_attachment);
+        *device_, render_pass->handle, surface_pitch_px, surface_height_px,
+        target_color_attachments, target_depth_stencil_attachment);
     render_pass->cached_framebuffers.push_back(framebuffer);
   }
@@ -720,7 +850,75 @@ bool RenderCache::ConfigureRenderPass(RenderConfiguration* config,
   return true;
 }

-CachedTileView* RenderCache::GetTileView(const TileViewKey& view_key) {
+CachedTileView* RenderCache::FindOrCreateTileView(
+    VkCommandBuffer command_buffer, const TileViewKey& view_key) {
+  auto tile_view = FindTileView(view_key);
+  if (tile_view) {
+    return tile_view;
+  }
+
+  // Create a new tile and add to the cache.
+  tile_view =
+      new CachedTileView(device_, command_buffer, edram_memory_, view_key);
+  cached_tile_views_.push_back(tile_view);
+
+  return tile_view;
+}
+
+void RenderCache::UpdateTileView(VkCommandBuffer command_buffer,
+                                 CachedTileView* view, bool load,
+                                 bool insert_barrier) {
+  uint32_t tile_width =
+      view->key.msaa_samples == uint16_t(MsaaSamples::k4X) ? 40 : 80;
+  uint32_t tile_height =
+      view->key.msaa_samples != uint16_t(MsaaSamples::k1X) ? 8 : 16;
+
+  if (insert_barrier) {
+    VkBufferMemoryBarrier barrier;
+    barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
+    barrier.pNext = nullptr;
+    if (load) {
+      barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+      barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
+    } else {
+      barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
+      barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+    }
+    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.buffer = edram_buffer_;
+    barrier.offset = view->key.tile_offset * 5120;
+    barrier.size = view->key.tile_width * tile_width * view->key.tile_height *
+                   tile_height * (view->key.color_or_depth ? 4 : 1);
+    vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                         VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1,
+                         &barrier, 0, nullptr);
+  }
+
+  // TODO(DrChat): Stencil copies.
+  VkBufferImageCopy region;
+  region.bufferOffset = view->key.tile_offset * 5120;
+  region.bufferRowLength = 0;
+  region.bufferImageHeight = 0;
+  region.imageSubresource = {0, 0, 0, 1};
+  region.imageSubresource.aspectMask = view->key.color_or_depth
+                                           ? VK_IMAGE_ASPECT_COLOR_BIT
+                                           : VK_IMAGE_ASPECT_DEPTH_BIT;
+  region.imageOffset = {0, 0, 0};
+  region.imageExtent = {view->key.tile_width * tile_width,
+                        view->key.tile_height * tile_height, 1};
+  if (load) {
+    vkCmdCopyBufferToImage(command_buffer, edram_buffer_, view->image,
+                           VK_IMAGE_LAYOUT_GENERAL, 1, &region);
+  } else {
+    vkCmdCopyImageToBuffer(command_buffer, view->image,
+                           VK_IMAGE_LAYOUT_GENERAL, edram_buffer_, 1, &region);
+  }
+}
+
+CachedTileView* RenderCache::FindTileView(const TileViewKey& view_key) const {
   // Check the cache.
   // TODO(benvanik): better lookup.
   for (auto tile_view : cached_tile_views_) {
@@ -729,25 +927,341 @@ CachedTileView* RenderCache::GetTileView(const TileViewKey& view_key) {
     if (tile_view->IsEqual(view_key)) {
       return tile_view;
     }
   }

-  // Create a new tile and add to the cache.
-  auto tile_view = new CachedTileView(device_, edram_memory_, view_key);
-  cached_tile_views_.push_back(tile_view);
-  return tile_view;
+  return nullptr;
 }

 void RenderCache::EndRenderPass() {
   assert_not_null(current_command_buffer_);
-  auto command_buffer = current_command_buffer_;
-  current_command_buffer_ = nullptr;

   // End the render pass.
-  vkCmdEndRenderPass(command_buffer);
+  vkCmdEndRenderPass(current_command_buffer_);
+
+  // Copy all render targets back into our EDRAM buffer.
+  // Don't bother waiting on this command to complete, as next render pass may
+  // reuse previous framebuffer attachments. If they need this, they will wait.
+  // TODO: Should we bother re-tiling the images on copy back?
+  //
+  // FIXME: There's a case where we may have a really big render target (as we
+  // can't get the correct height atm) and we may end up overwriting the valid
+  // contents of another render target by mistake! Need to reorder copy commands
+  // to avoid this.
+
+  // TODO(DrChat): Determine if we actually need an EDRAM buffer.
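+  // (If the copy-back below is ever re-enabled: CachedTileView::operator<
+  // compares EDRAM base offsets, so the std::sort would write overlapping
+  // views back in ascending tile order rather than in attachment order.)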
+  /*
+  std::vector<CachedTileView*> cached_views;
+
+  // Depth
+  auto depth_target = current_state_.framebuffer->depth_stencil_attachment;
+  if (depth_target && current_state_.config.depth_stencil.used) {
+    cached_views.push_back(depth_target);
+  }
+
+  // Color
+  for (int i = 0; i < 4; i++) {
+    auto target = current_state_.framebuffer->color_attachments[i];
+    if (!target || !current_state_.config.color[i].used) {
+      continue;
+    }
+
+    cached_views.push_back(target);
+  }
+
+  std::sort(
+      cached_views.begin(), cached_views.end(),
+      [](CachedTileView const* a, CachedTileView const* b) { return *a < *b; });
+
+  for (auto view : cached_views) {
+    UpdateTileView(current_command_buffer_, view, false, false);
+  }
+  */
+
+  current_command_buffer_ = nullptr;
 }

 void RenderCache::ClearCache() {
   // TODO(benvanik): caching.
 }

+void RenderCache::RawCopyToImage(VkCommandBuffer command_buffer,
+                                 uint32_t edram_base, VkImage image,
+                                 VkImageLayout image_layout,
+                                 bool color_or_depth, VkOffset3D offset,
+                                 VkExtent3D extents) {
+  // Transition the texture into a transfer destination layout.
+  VkImageMemoryBarrier image_barrier;
+  image_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+  image_barrier.pNext = nullptr;
+  image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  if (image_layout != VK_IMAGE_LAYOUT_GENERAL &&
+      image_layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
+    image_barrier.srcAccessMask = 0;
+    image_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+    image_barrier.oldLayout = image_layout;
+    image_barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+    image_barrier.image = image;
+    image_barrier.subresourceRange = {0, 0, 1, 0, 1};
+    image_barrier.subresourceRange.aspectMask =
+        color_or_depth
+            ? VK_IMAGE_ASPECT_COLOR_BIT
+            : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+
+    vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                         VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
+                         nullptr, 1, &image_barrier);
+  }
+
+  VkBufferMemoryBarrier buffer_barrier;
+  buffer_barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
+  buffer_barrier.pNext = nullptr;
+  buffer_barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+  buffer_barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
+  buffer_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  buffer_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  buffer_barrier.buffer = edram_buffer_;
+  buffer_barrier.offset = edram_base * 5120;
+  // TODO: Calculate this accurately (need texel size)
+  buffer_barrier.size = extents.width * extents.height * 4;
+
+  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 1,
+                       &buffer_barrier, 0, nullptr);
+
+  // Issue the copy command.
+  // TODO(DrChat): Stencil copies.
+  VkBufferImageCopy region;
+  region.bufferOffset = edram_base * 5120;
+  region.bufferImageHeight = 0;
+  region.bufferRowLength = 0;
+  region.imageOffset = offset;
+  region.imageExtent = extents;
+  region.imageSubresource = {0, 0, 0, 1};
+  region.imageSubresource.aspectMask =
+      color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT : VK_IMAGE_ASPECT_DEPTH_BIT;
+  vkCmdCopyBufferToImage(command_buffer, edram_buffer_, image, image_layout, 1,
+                         &region);
+
+  // Transition the image back into its previous layout.
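+  // (The barrier below reuses the fields set for the first transition: the
+  // access masks swap roles and std::swap() reverses the old/new layouts, and
+  // it is guarded by the same condition so it only fires when the earlier
+  // transition did.)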
+  if (image_layout != VK_IMAGE_LAYOUT_GENERAL &&
+      image_layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
+    image_barrier.srcAccessMask = image_barrier.dstAccessMask;
+    image_barrier.dstAccessMask = 0;
+    std::swap(image_barrier.oldLayout, image_barrier.newLayout);
+    vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                         VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
+                         nullptr, 1, &image_barrier);
+  }
+}
+
+void RenderCache::BlitToImage(VkCommandBuffer command_buffer,
+                              uint32_t edram_base, uint32_t pitch,
+                              uint32_t height, MsaaSamples num_samples,
+                              VkImage image, VkImageLayout image_layout,
+                              bool color_or_depth, uint32_t format,
+                              VkFilter filter, VkOffset3D offset,
+                              VkExtent3D extents) {
+  if (color_or_depth) {
+    // Adjust similar formats for easier matching.
+    switch (static_cast<ColorRenderTargetFormat>(format)) {
+      case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
+        format = uint32_t(ColorRenderTargetFormat::k_8_8_8_8);
+        break;
+      case ColorRenderTargetFormat::k_2_10_10_10_unknown:
+        format = uint32_t(ColorRenderTargetFormat::k_2_10_10_10);
+        break;
+      case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_unknown:
+        format = uint32_t(ColorRenderTargetFormat::k_2_10_10_10_FLOAT);
+        break;
+    }
+  }
+
+  uint32_t tile_width = num_samples == MsaaSamples::k4X ? 40 : 80;
+  uint32_t tile_height = num_samples != MsaaSamples::k1X ? 8 : 16;
+
+  // Grab a tile view that represents the source image.
+  TileViewKey key;
+  key.color_or_depth = color_or_depth ? 1 : 0;
+  key.msaa_samples = 0;  // static_cast<uint16_t>(num_samples);
+  key.edram_format = format;
+  key.tile_offset = edram_base;
+  key.tile_width = xe::round_up(pitch, tile_width) / tile_width;
+  // key.tile_height = xe::round_up(height, tile_height) / tile_height;
+  key.tile_height = 160;
+  auto tile_view = FindOrCreateTileView(command_buffer, key);
+  assert_not_null(tile_view);
+
+  // Update the view with the latest contents.
+  // UpdateTileView(command_buffer, tile_view, true, true);
+
+  // Transition the image into a transfer destination layout, if needed.
+  // TODO: Util function for this
+  VkImageMemoryBarrier image_barrier;
+  image_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+  image_barrier.pNext = nullptr;
+  image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+  image_barrier.srcAccessMask = 0;
+  image_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+  image_barrier.oldLayout = image_layout;
+  image_barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+  image_barrier.image = image;
+  image_barrier.subresourceRange = {0, 0, 1, 0, 1};
+  image_barrier.subresourceRange.aspectMask =
+      color_or_depth ? VK_IMAGE_ASPECT_COLOR_BIT
+                     : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+
+  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
+                       nullptr, 1, &image_barrier);
+
+  // If we overflow we'll lose the device here.
+  assert_true(extents.width <= key.tile_width * tile_width);
+  assert_true(extents.height <= key.tile_height * tile_height);
+
+  // Now issue the blit to the destination.
+  if (tile_view->sample_count == VK_SAMPLE_COUNT_1_BIT) {
+    VkImageBlit image_blit;
+    image_blit.srcSubresource = {0, 0, 0, 1};
+    image_blit.srcSubresource.aspectMask =
+        color_or_depth
+            ? VK_IMAGE_ASPECT_COLOR_BIT
+            : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+    image_blit.srcOffsets[0] = {0, 0, offset.z};
+    image_blit.srcOffsets[1] = {int32_t(extents.width), int32_t(extents.height),
+                                int32_t(extents.depth)};
+
+    image_blit.dstSubresource = {0, 0, 0, 1};
+    image_blit.dstSubresource.aspectMask =
+        color_or_depth
+            ? VK_IMAGE_ASPECT_COLOR_BIT
+            : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+    image_blit.dstOffsets[0] = offset;
+    image_blit.dstOffsets[1] = {offset.x + int32_t(extents.width),
+                                offset.y + int32_t(extents.height),
+                                offset.z + int32_t(extents.depth)};
+    vkCmdBlitImage(command_buffer, tile_view->image, VK_IMAGE_LAYOUT_GENERAL,
+                   image, image_layout, 1, &image_blit, filter);
+  } else {
+    VkImageResolve image_resolve;
+    image_resolve.srcSubresource = {0, 0, 0, 1};
+    image_resolve.srcSubresource.aspectMask =
+        color_or_depth
+            ? VK_IMAGE_ASPECT_COLOR_BIT
+            : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+    image_resolve.srcOffset = {0, 0, 0};
+
+    image_resolve.dstSubresource = {0, 0, 0, 1};
+    image_resolve.dstSubresource.aspectMask =
+        color_or_depth
+            ? VK_IMAGE_ASPECT_COLOR_BIT
+            : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
+    image_resolve.dstOffset = offset;
+
+    image_resolve.extent = extents;
+    vkCmdResolveImage(command_buffer, tile_view->image, VK_IMAGE_LAYOUT_GENERAL,
+                      image, image_layout, 1, &image_resolve);
+  }
+
+  // Transition the image back into its previous layout.
+  image_barrier.srcAccessMask = image_barrier.dstAccessMask;
+  image_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+  std::swap(image_barrier.oldLayout, image_barrier.newLayout);
+  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
+                       nullptr, 1, &image_barrier);
+}
+
+void RenderCache::ClearEDRAMColor(VkCommandBuffer command_buffer,
+                                  uint32_t edram_base,
+                                  ColorRenderTargetFormat format,
+                                  uint32_t pitch, uint32_t height,
+                                  MsaaSamples num_samples, float* color) {
+  // TODO: For formats <= 4 bpp, we can directly fill the EDRAM buffer. Just
+  // need to detect this and calculate a value.
+
+  // Adjust similar formats for easier matching.
+  switch (format) {
+    case ColorRenderTargetFormat::k_8_8_8_8_GAMMA:
+      format = ColorRenderTargetFormat::k_8_8_8_8;
+      break;
+    case ColorRenderTargetFormat::k_2_10_10_10_unknown:
+      format = ColorRenderTargetFormat::k_2_10_10_10;
+      break;
+    case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_unknown:
+      format = ColorRenderTargetFormat::k_2_10_10_10_FLOAT;
+      break;
+  }
+
+  uint32_t tile_width = num_samples == MsaaSamples::k4X ? 40 : 80;
+  uint32_t tile_height = num_samples != MsaaSamples::k1X ? 8 : 16;
+
+  // Grab a tile view (as we need to clear an image first)
+  TileViewKey key;
+  key.color_or_depth = 1;
+  key.msaa_samples = 0;  // static_cast<uint16_t>(num_samples);
+  key.edram_format = static_cast<uint16_t>(format);
+  key.tile_offset = edram_base;
+  key.tile_width = xe::round_up(pitch, tile_width) / tile_width;
+  // key.tile_height = xe::round_up(height, tile_height) / tile_height;
+  key.tile_height = 160;
+  auto tile_view = FindOrCreateTileView(command_buffer, key);
+  assert_not_null(tile_view);
+
+  VkImageSubresourceRange range = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};
+  VkClearColorValue clear_value;
+  std::memcpy(clear_value.float32, color, sizeof(float) * 4);
+
+  // Issue a clear command
+  vkCmdClearColorImage(command_buffer, tile_view->image,
+                       VK_IMAGE_LAYOUT_GENERAL, &clear_value, 1, &range);
+
+  // Copy image back into EDRAM buffer
+  // UpdateTileView(command_buffer, tile_view, false, false);
+}
+
+void RenderCache::ClearEDRAMDepthStencil(VkCommandBuffer command_buffer,
+                                         uint32_t edram_base,
+                                         DepthRenderTargetFormat format,
+                                         uint32_t pitch, uint32_t height,
+                                         MsaaSamples num_samples, float depth,
+                                         uint32_t stencil) {
+  // TODO: For formats <= 4 bpp, we can directly fill the EDRAM buffer. Just
+  // need to detect this and calculate a value.
+
+  uint32_t tile_width = num_samples == MsaaSamples::k4X ? 40 : 80;
+  uint32_t tile_height = num_samples != MsaaSamples::k1X ? 8 : 16;
+
+  // Grab a tile view (as we need to clear an image first)
+  TileViewKey key;
+  key.color_or_depth = 0;
+  key.msaa_samples = 0;  // static_cast<uint16_t>(num_samples);
+  key.edram_format = static_cast<uint16_t>(format);
+  key.tile_offset = edram_base;
+  key.tile_width = xe::round_up(pitch, tile_width) / tile_width;
+  // key.tile_height = xe::round_up(height, tile_height) / tile_height;
+  key.tile_height = 160;
+  auto tile_view = FindOrCreateTileView(command_buffer, key);
+  assert_not_null(tile_view);
+
+  VkImageSubresourceRange range = {
+      VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT, 0, 1, 0, 1,
+  };
+  VkClearDepthStencilValue clear_value;
+  clear_value.depth = depth;
+  clear_value.stencil = stencil;
+
+  // Issue a clear command
+  vkCmdClearDepthStencilImage(command_buffer, tile_view->image,
+                              VK_IMAGE_LAYOUT_GENERAL, &clear_value, 1, &range);
+
+  // Copy image back into EDRAM buffer
+  // UpdateTileView(command_buffer, tile_view, false, false);
+}
+
+void RenderCache::FillEDRAM(VkCommandBuffer command_buffer, uint32_t value) {
+  vkCmdFillBuffer(command_buffer, edram_buffer_, 0, kEdramBufferCapacity,
+                  value);
+}
+
 bool RenderCache::SetShadowRegister(uint32_t* dest, uint32_t register_name) {
   uint32_t value = register_file_->values[register_name].u32;
   if (*dest == value) {
diff --git a/src/xenia/gpu/vulkan/render_cache.h b/src/xenia/gpu/vulkan/render_cache.h
index 4a1574e9b..c9f0adf98 100644
--- a/src/xenia/gpu/vulkan/render_cache.h
+++ b/src/xenia/gpu/vulkan/render_cache.h
@@ -12,6 +12,7 @@
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/shader.h"
+#include "xenia/gpu/texture_info.h"
 #include "xenia/gpu/vulkan/vulkan_shader.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/ui/vulkan/vulkan.h"
@@ -36,28 +37,67 @@ struct TileViewKey {
   uint16_t tile_height;
   // 1 if format is ColorRenderTargetFormat, else DepthRenderTargetFormat.
   uint16_t color_or_depth : 1;
+  // Surface MSAA samples
+  uint16_t msaa_samples : 2;
   // Either ColorRenderTargetFormat or DepthRenderTargetFormat.
-  uint16_t edram_format : 15;
+  uint16_t edram_format : 13;
 };
 static_assert(sizeof(TileViewKey) == 8, "Key must be tightly packed");

+// Cached view representing EDRAM memory.
+// TODO(benvanik): reuse VkImage's with multiple VkViews for compatible
+// formats?
+class CachedTileView {
+ public:
+  // Key identifying the view in the cache.
+  TileViewKey key;
+  // Image
+  VkImage image = nullptr;
+  // Simple view on the image matching the format.
+  VkImageView image_view = nullptr;
+  // Memory buffer
+  VkDeviceMemory memory = nullptr;
+  // Image sample count
+  VkSampleCountFlagBits sample_count = VK_SAMPLE_COUNT_1_BIT;
+
+  CachedTileView(ui::vulkan::VulkanDevice* device,
+                 VkCommandBuffer command_buffer, VkDeviceMemory edram_memory,
+                 TileViewKey view_key);
+  ~CachedTileView();
+
+  bool IsEqual(const TileViewKey& other_key) const {
+    auto a = reinterpret_cast<const uint64_t*>(&key);
+    auto b = reinterpret_cast<const uint64_t*>(&other_key);
+    return *a == *b;
+  }
+
+  bool operator<(const CachedTileView& other) const {
+    return key.tile_offset < other.key.tile_offset;
+  }
+
+ private:
+  VkDevice device_ = nullptr;
+};
+
 // Parsed render configuration from the current render state.
 struct RenderConfiguration {
   // Render mode (color+depth, depth-only, etc).
   xenos::ModeControl mode_control;
-  // Target surface pitch, in pixels.
+  // Target surface pitch multiplied by MSAA, in pixels.
   uint32_t surface_pitch_px;
-  // ESTIMATED target surface height, in pixels.
+  // ESTIMATED target surface height multiplied by MSAA, in pixels.
   uint32_t surface_height_px;
   // Surface MSAA setting.
   MsaaSamples surface_msaa;
   // Color attachments for the 4 render targets.
   struct {
+    bool used;
     uint32_t edram_base;
     ColorRenderTargetFormat format;
   } color[4];
   // Depth/stencil attachment.
   struct {
+    bool used;
     uint32_t edram_base;
     DepthRenderTargetFormat format;
   } depth_stencil;
@@ -73,6 +113,9 @@ struct RenderState {
   // Target framebuffer bound to the render pass.
   CachedFramebuffer* framebuffer = nullptr;
   VkFramebuffer framebuffer_handle = nullptr;
+
+  bool color_attachment_written[4] = {false};
+  bool depth_attachment_written = false;
 };

 // Manages the virtualized EDRAM and the render target cache.
@@ -97,9 +140,13 @@ struct RenderState {
 // 320px by rounding up to the next tile.
 //
 // MSAA and other settings will modify the exact pixel sizes, like 4X makes
-// each tile effectively 40x8px, but they are still all 5120b. As we try to
-// emulate this we adjust our viewport when rendering to stretch pixels as
-// needed.
+// each tile effectively 40x8px and 2X makes each tile 80x8px, but they are
+// still all 5120b. As we try to emulate this we adjust our viewport when
+// rendering to stretch pixels as needed.
+//
+// It appears that games also take advantage of MSAA stretching tiles when
+// doing clears. Games will clear a view with 1/2X pitch/height and 4X MSAA
+// and then later draw to that view with 1X pitch/height and 1X MSAA.
 //
 // The good news is that games cannot read EDRAM directly but must use a copy
 // operation to get the data out. That gives us a chance to do whatever we
@@ -217,6 +264,10 @@ class RenderCache {
   RenderCache(RegisterFile* register_file, ui::vulkan::VulkanDevice* device);
   ~RenderCache();

+  // Call this to determine if you should start a new render pass or continue
+  // with an already open pass.
+  bool dirty() const;
+
   // Begins a render pass targeting the state-specified framebuffer formats.
   // The command buffer will be transitioned into the render pass phase.
const RenderState* BeginRenderPass(VkCommandBuffer command_buffer, @@ -230,24 +281,63 @@ class RenderCache { // Clears all cached content. void ClearCache(); + // Queues commands to copy EDRAM contents into an image. + // The command buffer must not be inside of a render pass when calling this. + void RawCopyToImage(VkCommandBuffer command_buffer, uint32_t edram_base, + VkImage image, VkImageLayout image_layout, + bool color_or_depth, VkOffset3D offset, + VkExtent3D extents); + + // Queues commands to blit EDRAM contents into an image. + // The command buffer must not be inside of a render pass when calling this. + void BlitToImage(VkCommandBuffer command_buffer, uint32_t edram_base, + uint32_t pitch, uint32_t height, MsaaSamples num_samples, + VkImage image, VkImageLayout image_layout, + bool color_or_depth, uint32_t format, VkFilter filter, + VkOffset3D offset, VkExtent3D extents); + + // Queues commands to clear EDRAM contents with a solid color. + // The command buffer must not be inside of a render pass when calling this. + void ClearEDRAMColor(VkCommandBuffer command_buffer, uint32_t edram_base, + ColorRenderTargetFormat format, uint32_t pitch, + uint32_t height, MsaaSamples num_samples, float* color); + // Queues commands to clear EDRAM contents with depth/stencil values. + // The command buffer must not be inside of a render pass when calling this. + void ClearEDRAMDepthStencil(VkCommandBuffer command_buffer, + uint32_t edram_base, + DepthRenderTargetFormat format, uint32_t pitch, + uint32_t height, MsaaSamples num_samples, + float depth, uint32_t stencil); + // Queues commands to fill EDRAM contents with a constant value. + // The command buffer must not be inside of a render pass when calling this. + void FillEDRAM(VkCommandBuffer command_buffer, uint32_t value); + private: // Parses the current state into a configuration object. bool ParseConfiguration(RenderConfiguration* config); + // Finds a tile view. Returns nullptr if none found matching the key. + CachedTileView* FindTileView(const TileViewKey& view_key) const; + + // Gets or creates a tile view with the given parameters. + CachedTileView* FindOrCreateTileView(VkCommandBuffer command_buffer, + const TileViewKey& view_key); + + void UpdateTileView(VkCommandBuffer command_buffer, CachedTileView* view, + bool load, bool insert_barrier = true); + // Gets or creates a render pass and frame buffer for the given configuration. // This attempts to reuse as much as possible across render passes and // framebuffers. - bool ConfigureRenderPass(RenderConfiguration* config, + bool ConfigureRenderPass(VkCommandBuffer command_buffer, + RenderConfiguration* config, CachedRenderPass** out_render_pass, CachedFramebuffer** out_framebuffer); - // Gets or creates a tile view with the given parameters. - CachedTileView* GetTileView(const TileViewKey& view_key); - RegisterFile* register_file_ = nullptr; - VkDevice device_ = nullptr; + ui::vulkan::VulkanDevice* device_ = nullptr; - // Entire 10MiB of EDRAM, aliased to hell by various VkImages. + // Entire 10MiB of EDRAM. VkDeviceMemory edram_memory_ = nullptr; // Buffer overlayed 1:1 with edram_memory_ to allow raw access. 
VkBuffer edram_buffer_ = nullptr; diff --git a/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.h b/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.h index b9598cfa9..730f9f12e 100644 --- a/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.h +++ b/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.h @@ -2,7 +2,7 @@ // source: rect_list.geom const uint8_t rect_list_geom[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x08, 0x00, - 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0xCA, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x18, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x20, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x36, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, @@ -10,8 +10,8 @@ const uint8_t rect_list_geom[] = { 0x00, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x09, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6D, 0x61, 0x69, 0x6E, 0x00, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x10, 0x00, 0x03, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x33, 0x00, 0x00, 0x00, 0x10, 0x00, 0x03, 0x00, 0x04, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x10, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x03, 0x00, 0x04, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x10, 0x00, 0x04, 0x00, @@ -40,17 +40,13 @@ const uint8_t rect_list_geom[] = { 0x20, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x67, 0x6C, 0x5F, 0x43, 0x6C, 0x69, 0x70, 0x44, 0x69, 0x73, 0x74, 0x61, 0x6E, 0x63, 0x65, 0x00, 0x05, 0x00, 0x03, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x05, 0x00, 0x2F, 0x00, 0x00, 0x00, 0x56, 0x65, 0x72, 0x74, - 0x65, 0x78, 0x44, 0x61, 0x74, 0x61, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, - 0x2F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x04, 0x00, 0x31, 0x00, 0x00, 0x00, 0x6F, 0x75, 0x74, 0x5F, - 0x76, 0x74, 0x78, 0x00, 0x05, 0x00, 0x05, 0x00, 0x32, 0x00, 0x00, 0x00, - 0x56, 0x65, 0x72, 0x74, 0x65, 0x78, 0x44, 0x61, 0x74, 0x61, 0x00, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x6F, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x69, 0x6E, 0x5F, 0x76, 0x74, 0x78, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, - 0x66, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, - 0xB4, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x05, 0x00, 0x07, 0x00, 0x30, 0x00, 0x00, 0x00, 0x6F, 0x75, 0x74, 0x5F, + 0x69, 0x6E, 0x74, 0x65, 0x72, 0x70, 0x6F, 0x6C, 0x61, 0x74, 0x6F, 0x72, + 0x73, 0x00, 0x00, 0x00, 0x05, 0x00, 0x07, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x69, 0x6E, 0x5F, 0x69, 0x6E, 0x74, 0x65, 0x72, 0x70, 0x6F, 0x6C, 0x61, + 0x74, 0x6F, 0x72, 0x73, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, + 0x64, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, + 0xB2, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, @@ -65,12 +61,10 @@ const uint8_t rect_list_geom[] = { 0x20, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 
0x00, 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x22, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x2F, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x2F, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x31, 0x00, 0x00, 0x00, - 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x33, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, @@ -107,25 +101,23 @@ const uint8_t rect_list_geom[] = { 0x03, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x2D, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x1C, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0x2D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x03, 0x00, 0x2F, 0x00, 0x00, 0x00, - 0x2E, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x2F, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, - 0x30, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x1E, 0x00, 0x03, 0x00, 0x32, 0x00, 0x00, 0x00, 0x2E, 0x00, 0x00, 0x00, - 0x1C, 0x00, 0x04, 0x00, 0x33, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, 0x00, - 0x0F, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, - 0x34, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x65, 0x00, 0x00, 0x00, + 0x2D, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x2F, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, + 0x2F, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x1C, 0x00, 0x04, 0x00, 0x31, 0x00, 0x00, 0x00, 0x2E, 0x00, 0x00, 0x00, + 0x0F, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, + 0x32, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x34, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2E, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x63, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x6B, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, - 0x65, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x3B, 0x00, 0x04, 0x00, 0x65, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x07, 0x00, 
0x00, 0x00, + 0x3B, 0x00, 0x04, 0x00, 0x63, 0x00, 0x00, 0x00, 0xB2, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x41, 0x00, 0x07, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, @@ -139,7 +131,7 @@ const uint8_t rect_list_geom[] = { 0x1C, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xF7, 0x00, 0x03, 0x00, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0x00, 0x04, 0x00, - 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x7F, 0x00, 0x00, 0x00, + 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x7D, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, @@ -153,286 +145,283 @@ const uint8_t rect_list_geom[] = { 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x2C, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x2C, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, - 0x38, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x31, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x39, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x00, 0x00, - 0x39, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, - 0x3B, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x3B, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x37, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x39, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x39, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 0x00, - 0x3C, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x3E, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x00, 0x00, + 0x3A, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 
0x2B, 0x00, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 0x00, + 0x33, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x2E, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x3F, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0x42, 0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x43, 0x00, 0x00, 0x00, - 0x42, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x44, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x45, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x2B, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x46, 0x00, 0x00, 0x00, - 0x45, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0xDB, 0x00, 0x01, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x2B, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x46, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0xDB, 0x00, 0x01, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x4A, 0x00, 0x00, 0x00, - 0x49, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, - 0x4B, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x4B, 0x00, 0x00, 0x00, 0x4A, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0x4C, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 
0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x49, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0x4A, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x4D, 0x00, 0x00, 0x00, - 0x4C, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, - 0x4E, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x4E, 0x00, 0x00, 0x00, 0x4D, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x4B, 0x00, 0x00, 0x00, + 0x4A, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, + 0x4C, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x4C, 0x00, 0x00, 0x00, 0x4B, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, 0x4D, 0x00, 0x00, 0x00, + 0x33, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x2E, 0x00, 0x00, 0x00, 0x4E, 0x00, 0x00, 0x00, 0x4D, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, 0x4E, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x4F, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0x52, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x53, 0x00, 0x00, 0x00, - 0x52, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x54, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x51, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x2B, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x56, 0x00, 0x00, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, - 0x58, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x53, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x2B, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, + 0x53, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x28, 0x00, 
0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x56, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0x5A, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x5B, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x5C, 0x00, 0x00, 0x00, - 0x5B, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0x5D, 0x00, 0x00, 0x00, 0x5A, 0x00, 0x00, 0x00, 0x5C, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x5E, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x5A, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0x5B, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, 0x5A, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x5C, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x00, - 0x5E, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0x60, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, 0x00, + 0x5C, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0x5E, 0x00, 0x00, 0x00, 0x5B, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x61, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x5F, 0x00, 0x00, 0x00, 0x5E, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x64, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x66, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, - 0x67, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x67, 0x00, 0x00, 0x00, - 0xF6, 0x00, 0x04, 0x00, 0x69, 0x00, 0x00, 0x00, 0x6A, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, 0x6B, 0x00, 0x00, 0x00, - 0xF8, 0x00, 0x02, 0x00, 0x6B, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, - 0xB1, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6E, 0x00, 0x00, 0x00, - 0x6C, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00, 0xFA, 0x00, 0x04, 0x00, - 0x6E, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, + 
0x62, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0x64, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, + 0x65, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x65, 0x00, 0x00, 0x00, + 0xF6, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, 0x69, 0x00, 0x00, 0x00, + 0xF8, 0x00, 0x02, 0x00, 0x69, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x6A, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0xB1, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, + 0x6A, 0x00, 0x00, 0x00, 0x6B, 0x00, 0x00, 0x00, 0xFA, 0x00, 0x04, 0x00, + 0x6C, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0xF8, 0x00, 0x02, 0x00, 0x66, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x6E, 0x00, 0x00, 0x00, + 0x64, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x6F, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x6E, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x7F, 0x00, 0x04, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, + 0x64, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x72, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x75, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x7A, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x6D, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x7A, 0x00, 0x00, 0x00, + 0x79, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, - 0x66, 0x00, 0x00, 0x00, 0x41, 0x00, 0x07, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x71, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, - 0x7F, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, - 0x72, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, - 0x74, 0x00, 0x00, 0x00, 0x66, 0x00, 0x00, 0x00, 0x41, 0x00, 0x07, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, - 
0x77, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, - 0x66, 0x00, 0x00, 0x00, 0x41, 0x00, 0x07, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x79, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0x7A, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, - 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x7B, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x7A, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x7C, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x7C, 0x00, 0x00, 0x00, 0x7B, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, - 0x6A, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x6A, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x7D, 0x00, 0x00, 0x00, - 0x66, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x13, 0x00, 0x00, 0x00, - 0x7E, 0x00, 0x00, 0x00, 0x7D, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x66, 0x00, 0x00, 0x00, 0x7E, 0x00, 0x00, 0x00, - 0xF9, 0x00, 0x02, 0x00, 0x67, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, - 0x69, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0xDB, 0x00, 0x01, 0x00, - 0xF9, 0x00, 0x02, 0x00, 0x1F, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, - 0x7F, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0x81, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x82, 0x00, 0x00, 0x00, - 0x81, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x83, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x2B, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x85, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, - 0x86, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, - 0x86, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, - 0x87, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, 0x8A, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x8A, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x8B, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x00, 0x8B, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x8D, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x8D, 0x00, 0x00, 0x00, 0x8C, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 
0x36, 0x00, 0x00, 0x00, 0x8E, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, - 0x8F, 0x00, 0x00, 0x00, 0x8E, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x31, 0x00, 0x00, 0x00, 0x8F, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x91, 0x00, 0x00, 0x00, - 0x90, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, - 0x92, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x92, 0x00, 0x00, 0x00, 0x91, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, - 0x93, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, - 0x95, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x95, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, - 0xDA, 0x00, 0x01, 0x00, 0xDB, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, 0x9A, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x9A, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x09, 0x00, 0x00, 0x00, 0x9C, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x9D, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x9D, 0x00, 0x00, 0x00, 0x9C, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x36, 0x00, 0x00, 0x00, 0x9E, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, - 0x9F, 0x00, 0x00, 0x00, 0x9E, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0x31, 0x00, 0x00, 0x00, 0x9F, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0xA0, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x00, - 0xA0, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, - 0xA2, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0xA2, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0xA3, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0xA4, 0x00, 0x00, 0x00, - 0xA3, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, - 0xA5, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 
0x3E, 0x00, 0x03, 0x00, 0xA5, 0x00, 0x00, 0x00, 0xA4, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x32, 0x00, 0x00, 0x00, 0xA7, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x00, - 0x3E, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, 0xA7, 0x00, 0x00, 0x00, - 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, - 0xA8, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0xA9, 0x00, 0x00, 0x00, 0xA8, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x23, 0x00, 0x00, 0x00, 0xAA, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0xAB, 0x00, 0x00, 0x00, 0xAA, 0x00, 0x00, 0x00, - 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xAC, 0x00, 0x00, 0x00, - 0xA9, 0x00, 0x00, 0x00, 0xAB, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x23, 0x00, 0x00, 0x00, 0xAD, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0xAE, 0x00, 0x00, 0x00, 0xAD, 0x00, 0x00, 0x00, - 0x83, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xAF, 0x00, 0x00, 0x00, - 0xAC, 0x00, 0x00, 0x00, 0xAE, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x26, 0x00, 0x00, 0x00, 0xB0, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0xB0, 0x00, 0x00, 0x00, - 0xAF, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, - 0xB1, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, - 0xB2, 0x00, 0x00, 0x00, 0xB1, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x2B, 0x00, 0x00, 0x00, 0xB3, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0xB3, 0x00, 0x00, 0x00, - 0xB2, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0xB4, 0x00, 0x00, 0x00, - 0x14, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, 0xB5, 0x00, 0x00, 0x00, - 0xF8, 0x00, 0x02, 0x00, 0xB5, 0x00, 0x00, 0x00, 0xF6, 0x00, 0x04, 0x00, - 0xB7, 0x00, 0x00, 0x00, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xF9, 0x00, 0x02, 0x00, 0xB9, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, - 0xB9, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, - 0xBA, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, 0xB1, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0xBB, 0x00, 0x00, 0x00, 0xBA, 0x00, 0x00, 0x00, - 0x6D, 0x00, 0x00, 0x00, 0xFA, 0x00, 0x04, 0x00, 0xBB, 0x00, 0x00, 0x00, - 0xB6, 0x00, 0x00, 0x00, 0xB7, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, - 0xB6, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, - 0xBC, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0xBD, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x07, 0x00, 0x23, 0x00, 0x00, 0x00, 0xBE, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0xBD, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0xBF, 0x00, 0x00, 0x00, 0xBE, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0xC0, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x07, 0x00, 0x23, 0x00, 0x00, 0x00, 0xC1, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0xC0, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 
0xC2, 0x00, 0x00, 0x00, 0xC1, 0x00, 0x00, 0x00, 0x7F, 0x00, 0x04, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0xC3, 0x00, 0x00, 0x00, 0xC2, 0x00, 0x00, 0x00, - 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xC4, 0x00, 0x00, 0x00, - 0xBF, 0x00, 0x00, 0x00, 0xC3, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0xC5, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x07, 0x00, 0x23, 0x00, 0x00, 0x00, 0xC6, 0x00, 0x00, 0x00, - 0x35, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0xC5, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, - 0xC7, 0x00, 0x00, 0x00, 0xC6, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, - 0x0A, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0xC4, 0x00, 0x00, 0x00, - 0xC7, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x26, 0x00, 0x00, 0x00, - 0xC9, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0xBC, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0xC9, 0x00, 0x00, 0x00, - 0xC8, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, 0xB8, 0x00, 0x00, 0x00, - 0xF8, 0x00, 0x02, 0x00, 0xB8, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, - 0x13, 0x00, 0x00, 0x00, 0xCA, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x05, 0x00, 0x13, 0x00, 0x00, 0x00, 0xCB, 0x00, 0x00, 0x00, - 0xCA, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, - 0xB4, 0x00, 0x00, 0x00, 0xCB, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, - 0xB5, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0xB7, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x7B, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x13, 0x00, 0x00, 0x00, 0x7C, 0x00, 0x00, 0x00, + 0x7B, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0x64, 0x00, 0x00, 0x00, 0x7C, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, + 0x65, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x67, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0xDB, 0x00, 0x01, 0x00, 0xF9, 0x00, 0x02, 0x00, - 0x1F, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x1F, 0x00, 0x00, 0x00, - 0xFD, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, + 0x1F, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x7D, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0x7E, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x7F, 0x00, 0x00, 0x00, + 0x7E, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x80, 0x00, 0x00, 0x00, 0x7F, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x83, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x33, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x2E, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, + 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x86, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0x87, 0x00, 0x00, 0x00, 
0x86, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x87, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x8A, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x2B, 0x00, 0x00, 0x00, 0x8B, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x8B, 0x00, 0x00, 0x00, + 0x8A, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x8C, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x8D, 0x00, 0x00, 0x00, + 0x8C, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x8D, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x8E, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x8F, 0x00, 0x00, 0x00, 0x8E, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0x90, 0x00, 0x00, 0x00, 0x8F, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x91, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x92, 0x00, 0x00, 0x00, 0x91, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0x93, 0x00, 0x00, 0x00, 0x92, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, + 0x95, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, + 0xDB, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x98, 0x00, 0x00, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, + 0x9A, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x2B, 0x00, 0x00, 0x00, 0x9B, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x9B, 0x00, 0x00, 0x00, + 0x9A, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x9C, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x9D, 0x00, 0x00, 0x00, + 0x9C, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x9D, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x9E, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 
0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x9F, 0x00, 0x00, 0x00, 0x9E, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, 0xA0, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0xA0, 0x00, 0x00, 0x00, 0x9F, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x09, 0x00, 0x00, 0x00, 0xA2, 0x00, 0x00, 0x00, 0xA1, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, 0xA3, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0xA3, 0x00, 0x00, 0x00, 0xA2, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x34, 0x00, 0x00, 0x00, 0xA4, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x2E, 0x00, 0x00, 0x00, + 0xA5, 0x00, 0x00, 0x00, 0xA4, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0x30, 0x00, 0x00, 0x00, 0xA5, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0xA6, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xA7, 0x00, 0x00, 0x00, + 0xA6, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xA8, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0xA9, 0x00, 0x00, 0x00, 0xA8, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0xAA, 0x00, 0x00, 0x00, 0xA7, 0x00, 0x00, 0x00, + 0xA9, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xAB, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0xAC, 0x00, 0x00, 0x00, 0xAB, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0xAD, 0x00, 0x00, 0x00, 0xAA, 0x00, 0x00, 0x00, + 0xAC, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, + 0xAE, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0xAE, 0x00, 0x00, 0x00, 0xAD, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0xAF, 0x00, 0x00, 0x00, + 0x12, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x09, 0x00, 0x00, 0x00, 0xB0, 0x00, 0x00, 0x00, + 0xAF, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x2B, 0x00, 0x00, 0x00, + 0xB1, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0xB1, 0x00, 0x00, 0x00, 0xB0, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0xB2, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, + 0xF9, 0x00, 0x02, 0x00, 0xB3, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, + 0xB3, 0x00, 0x00, 0x00, 0xF6, 0x00, 0x04, 0x00, 0xB5, 0x00, 0x00, 0x00, + 0xB6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, + 0xB7, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0xB7, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0xB8, 0x00, 0x00, 0x00, + 0xB2, 0x00, 0x00, 0x00, 0xB1, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xB9, 0x00, 0x00, 0x00, 0xB8, 0x00, 0x00, 0x00, 0x6B, 0x00, 0x00, 0x00, + 0xFA, 0x00, 0x04, 0x00, 0xB9, 0x00, 0x00, 0x00, 0xB4, 0x00, 0x00, 0x00, + 0xB5, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0xB4, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0xBA, 0x00, 0x00, 0x00, + 0xB2, 0x00, 0x00, 0x00, 
0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, + 0xBB, 0x00, 0x00, 0x00, 0xB2, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x23, 0x00, 0x00, 0x00, 0xBC, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0xBB, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0xBD, 0x00, 0x00, 0x00, 0xBC, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0xBE, 0x00, 0x00, 0x00, + 0xB2, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xBF, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0xBE, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0xC0, 0x00, 0x00, 0x00, 0xBF, 0x00, 0x00, 0x00, 0x7F, 0x00, 0x04, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0xC1, 0x00, 0x00, 0x00, 0xC0, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xC2, 0x00, 0x00, 0x00, + 0xBD, 0x00, 0x00, 0x00, 0xC1, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x13, 0x00, 0x00, 0x00, 0xC3, 0x00, 0x00, 0x00, 0xB2, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x23, 0x00, 0x00, 0x00, 0xC4, 0x00, 0x00, 0x00, + 0x33, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0xC3, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0xC5, 0x00, 0x00, 0x00, + 0xC4, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x00, 0x00, + 0xC6, 0x00, 0x00, 0x00, 0xC2, 0x00, 0x00, 0x00, 0xC5, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x26, 0x00, 0x00, 0x00, 0xC7, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0xBA, 0x00, 0x00, 0x00, 0x3E, 0x00, 0x03, 0x00, + 0xC7, 0x00, 0x00, 0x00, 0xC6, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, + 0xB6, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0xB6, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, + 0xB2, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x13, 0x00, 0x00, 0x00, + 0xC9, 0x00, 0x00, 0x00, 0xC8, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3E, 0x00, 0x03, 0x00, 0xB2, 0x00, 0x00, 0x00, 0xC9, 0x00, 0x00, 0x00, + 0xF9, 0x00, 0x02, 0x00, 0xB3, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, + 0xB5, 0x00, 0x00, 0x00, 0xDA, 0x00, 0x01, 0x00, 0xDB, 0x00, 0x01, 0x00, + 0xF9, 0x00, 0x02, 0x00, 0x1F, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, + 0x1F, 0x00, 0x00, 0x00, 0xFD, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, }; diff --git a/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.txt b/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.txt index b047926f5..94fb6a700 100644 --- a/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.txt +++ b/src/xenia/gpu/vulkan/shaders/bin/rect_list_geom.txt @@ -1,7 +1,7 @@ ; SPIR-V ; Version: 1.0 ; Generator: Khronos Glslang Reference Front End; 1 -; Bound: 204 +; Bound: 202 ; Schema: 0 OpCapability Geometry OpCapability GeometryPointSize @@ -9,7 +9,7 @@ OpCapability GeometryStreams %1 = OpExtInstImport "GLSL.std.450" OpMemoryModel Logical GLSL450 - OpEntryPoint Geometry %4 "main" %18 %34 %49 %53 + OpEntryPoint Geometry %4 "main" %18 %34 %48 %51 OpExecutionMode %4 Triangles OpExecutionMode %4 Invocations 1 OpExecutionMode %4 OutputTriangleStrip @@ -27,14 +27,10 @@ OpMemberName %32 1 "gl_PointSize" OpMemberName %32 2 "gl_ClipDistance" OpName %34 "" - OpName %47 "VertexData" - OpMemberName %47 0 "o" - OpName %49 "out_vtx" - OpName %50 "VertexData" - OpMemberName %50 0 "o" - OpName %53 "in_vtx" - OpName %102 "i" - OpName %180 "i" + OpName %48 "out_interpolators" + OpName %51 "in_interpolators" + OpName %100 "i" + OpName %178 "i" OpMemberDecorate %14 0 BuiltIn Position OpMemberDecorate %14 1 BuiltIn PointSize OpMemberDecorate %14 2 BuiltIn ClipDistance @@ 
-45,10 +41,9 @@ OpDecorate %32 Block OpDecorate %32 Stream 0 OpDecorate %34 Stream 0 - OpMemberDecorate %47 0 Location 0 - OpDecorate %47 Stream 0 - OpDecorate %49 Stream 0 - OpMemberDecorate %50 0 Location 0 + OpDecorate %48 Location 0 + OpDecorate %48 Stream 0 + OpDecorate %51 Location 0 %2 = OpTypeVoid %3 = OpTypeFunction %2 %6 = OpTypeBool @@ -77,21 +72,19 @@ %43 = OpTypePointer Output %9 %45 = OpConstant %11 16 %46 = OpTypeArray %10 %45 - %47 = OpTypeStruct %46 - %48 = OpTypePointer Output %47 - %49 = OpVariable %48 Output - %50 = OpTypeStruct %46 - %51 = OpTypeArray %50 %15 - %52 = OpTypePointer Input %51 - %53 = OpVariable %52 Input - %54 = OpTypePointer Input %50 - %101 = OpTypePointer Function %19 - %109 = OpConstant %19 16 + %47 = OpTypePointer Output %46 + %48 = OpVariable %47 Output + %49 = OpTypeArray %46 %15 + %50 = OpTypePointer Input %49 + %51 = OpVariable %50 Input + %52 = OpTypePointer Input %46 + %99 = OpTypePointer Function %19 + %107 = OpConstant %19 16 %4 = OpFunction %2 None %3 %5 = OpLabel %8 = OpVariable %7 Function - %102 = OpVariable %101 Function - %180 = OpVariable %101 Function + %100 = OpVariable %99 Function + %178 = OpVariable %99 Function %23 = OpAccessChain %22 %18 %20 %20 %21 %24 = OpLoad %9 %23 %26 = OpAccessChain %22 %18 %25 %20 %21 @@ -100,7 +93,7 @@ OpStore %8 %28 %29 = OpLoad %6 %8 OpSelectionMerge %31 None - OpBranchConditional %29 %30 %127 + OpBranchConditional %29 %30 %125 %30 = OpLabel %36 = OpAccessChain %35 %18 %20 %20 %37 = OpLoad %10 %36 @@ -110,216 +103,216 @@ %42 = OpLoad %9 %41 %44 = OpAccessChain %43 %34 %40 OpStore %44 %42 - %55 = OpAccessChain %54 %53 %20 - %56 = OpLoad %50 %55 - OpStore %49 %56 + %53 = OpAccessChain %52 %51 %20 + %54 = OpLoad %46 %53 + OpStore %48 %54 OpEmitVertex - %57 = OpAccessChain %35 %18 %40 %20 - %58 = OpLoad %10 %57 - %59 = OpAccessChain %38 %34 %20 - OpStore %59 %58 - %60 = OpAccessChain %22 %18 %40 %40 - %61 = OpLoad %9 %60 - %62 = OpAccessChain %43 %34 %40 - OpStore %62 %61 - %63 = OpAccessChain %54 %53 %40 - %64 = OpLoad %50 %63 - OpStore %49 %64 + %55 = OpAccessChain %35 %18 %40 %20 + %56 = OpLoad %10 %55 + %57 = OpAccessChain %38 %34 %20 + OpStore %57 %56 + %58 = OpAccessChain %22 %18 %40 %40 + %59 = OpLoad %9 %58 + %60 = OpAccessChain %43 %34 %40 + OpStore %60 %59 + %61 = OpAccessChain %52 %51 %40 + %62 = OpLoad %46 %61 + OpStore %48 %62 OpEmitVertex - %65 = OpAccessChain %35 %18 %25 %20 - %66 = OpLoad %10 %65 - %67 = OpAccessChain %38 %34 %20 - OpStore %67 %66 - %68 = OpAccessChain %22 %18 %25 %40 - %69 = OpLoad %9 %68 - %70 = OpAccessChain %43 %34 %40 - OpStore %70 %69 - %71 = OpAccessChain %54 %53 %25 - %72 = OpLoad %50 %71 - OpStore %49 %72 + %63 = OpAccessChain %35 %18 %25 %20 + %64 = OpLoad %10 %63 + %65 = OpAccessChain %38 %34 %20 + OpStore %65 %64 + %66 = OpAccessChain %22 %18 %25 %40 + %67 = OpLoad %9 %66 + %68 = OpAccessChain %43 %34 %40 + OpStore %68 %67 + %69 = OpAccessChain %52 %51 %25 + %70 = OpLoad %46 %69 + OpStore %48 %70 OpEmitVertex OpEndPrimitive - %73 = OpAccessChain %35 %18 %25 %20 - %74 = OpLoad %10 %73 - %75 = OpAccessChain %38 %34 %20 - OpStore %75 %74 - %76 = OpAccessChain %22 %18 %25 %40 - %77 = OpLoad %9 %76 - %78 = OpAccessChain %43 %34 %40 - OpStore %78 %77 - %79 = OpAccessChain %54 %53 %25 - %80 = OpLoad %50 %79 - OpStore %49 %80 + %71 = OpAccessChain %35 %18 %25 %20 + %72 = OpLoad %10 %71 + %73 = OpAccessChain %38 %34 %20 + OpStore %73 %72 + %74 = OpAccessChain %22 %18 %25 %40 + %75 = OpLoad %9 %74 + %76 = OpAccessChain %43 %34 %40 + OpStore %76 %75 + %77 = OpAccessChain 
%52 %51 %25 + %78 = OpLoad %46 %77 + OpStore %48 %78 OpEmitVertex - %81 = OpAccessChain %35 %18 %40 %20 - %82 = OpLoad %10 %81 - %83 = OpAccessChain %38 %34 %20 - OpStore %83 %82 - %84 = OpAccessChain %22 %18 %40 %40 - %85 = OpLoad %9 %84 - %86 = OpAccessChain %43 %34 %40 - OpStore %86 %85 - %87 = OpAccessChain %54 %53 %40 - %88 = OpLoad %50 %87 - OpStore %49 %88 + %79 = OpAccessChain %35 %18 %40 %20 + %80 = OpLoad %10 %79 + %81 = OpAccessChain %38 %34 %20 + OpStore %81 %80 + %82 = OpAccessChain %22 %18 %40 %40 + %83 = OpLoad %9 %82 + %84 = OpAccessChain %43 %34 %40 + OpStore %84 %83 + %85 = OpAccessChain %52 %51 %40 + %86 = OpLoad %46 %85 + OpStore %48 %86 OpEmitVertex - %89 = OpAccessChain %35 %18 %40 %20 + %87 = OpAccessChain %35 %18 %40 %20 + %88 = OpLoad %10 %87 + %89 = OpAccessChain %35 %18 %25 %20 %90 = OpLoad %10 %89 - %91 = OpAccessChain %35 %18 %25 %20 - %92 = OpLoad %10 %91 - %93 = OpFAdd %10 %90 %92 - %94 = OpAccessChain %35 %18 %20 %20 - %95 = OpLoad %10 %94 - %96 = OpFSub %10 %93 %95 - %97 = OpAccessChain %38 %34 %20 - OpStore %97 %96 - %98 = OpAccessChain %22 %18 %25 %40 - %99 = OpLoad %9 %98 - %100 = OpAccessChain %43 %34 %40 - OpStore %100 %99 - OpStore %102 %20 - OpBranch %103 - %103 = OpLabel - OpLoopMerge %105 %106 None - OpBranch %107 - %107 = OpLabel - %108 = OpLoad %19 %102 - %110 = OpSLessThan %6 %108 %109 - OpBranchConditional %110 %104 %105 - %104 = OpLabel - %111 = OpLoad %19 %102 - %112 = OpLoad %19 %102 - %113 = OpAccessChain %35 %53 %20 %20 %112 - %114 = OpLoad %10 %113 - %115 = OpFNegate %10 %114 - %116 = OpLoad %19 %102 - %117 = OpAccessChain %35 %53 %40 %20 %116 - %118 = OpLoad %10 %117 - %119 = OpFAdd %10 %115 %118 - %120 = OpLoad %19 %102 - %121 = OpAccessChain %35 %53 %25 %20 %120 - %122 = OpLoad %10 %121 - %123 = OpFAdd %10 %119 %122 - %124 = OpAccessChain %38 %49 %20 %111 - OpStore %124 %123 - OpBranch %106 - %106 = OpLabel - %125 = OpLoad %19 %102 - %126 = OpIAdd %19 %125 %40 - OpStore %102 %126 - OpBranch %103 + %91 = OpFAdd %10 %88 %90 + %92 = OpAccessChain %35 %18 %20 %20 + %93 = OpLoad %10 %92 + %94 = OpFSub %10 %91 %93 + %95 = OpAccessChain %38 %34 %20 + OpStore %95 %94 + %96 = OpAccessChain %22 %18 %25 %40 + %97 = OpLoad %9 %96 + %98 = OpAccessChain %43 %34 %40 + OpStore %98 %97 + OpStore %100 %20 + OpBranch %101 + %101 = OpLabel + OpLoopMerge %103 %104 None + OpBranch %105 %105 = OpLabel + %106 = OpLoad %19 %100 + %108 = OpSLessThan %6 %106 %107 + OpBranchConditional %108 %102 %103 + %102 = OpLabel + %109 = OpLoad %19 %100 + %110 = OpLoad %19 %100 + %111 = OpAccessChain %35 %51 %20 %110 + %112 = OpLoad %10 %111 + %113 = OpFNegate %10 %112 + %114 = OpLoad %19 %100 + %115 = OpAccessChain %35 %51 %40 %114 + %116 = OpLoad %10 %115 + %117 = OpFAdd %10 %113 %116 + %118 = OpLoad %19 %100 + %119 = OpAccessChain %35 %51 %25 %118 + %120 = OpLoad %10 %119 + %121 = OpFAdd %10 %117 %120 + %122 = OpAccessChain %38 %48 %109 + OpStore %122 %121 + OpBranch %104 + %104 = OpLabel + %123 = OpLoad %19 %100 + %124 = OpIAdd %19 %123 %40 + OpStore %100 %124 + OpBranch %101 + %103 = OpLabel OpEmitVertex OpEndPrimitive OpBranch %31 - %127 = OpLabel - %128 = OpAccessChain %35 %18 %20 %20 - %129 = OpLoad %10 %128 - %130 = OpAccessChain %38 %34 %20 - OpStore %130 %129 - %131 = OpAccessChain %22 %18 %20 %40 - %132 = OpLoad %9 %131 - %133 = OpAccessChain %43 %34 %40 - OpStore %133 %132 - %134 = OpAccessChain %54 %53 %20 - %135 = OpLoad %50 %134 - OpStore %49 %135 + %125 = OpLabel + %126 = OpAccessChain %35 %18 %20 %20 + %127 = OpLoad %10 %126 + %128 = OpAccessChain %38 %34 
%20 + OpStore %128 %127 + %129 = OpAccessChain %22 %18 %20 %40 + %130 = OpLoad %9 %129 + %131 = OpAccessChain %43 %34 %40 + OpStore %131 %130 + %132 = OpAccessChain %52 %51 %20 + %133 = OpLoad %46 %132 + OpStore %48 %133 OpEmitVertex - %136 = OpAccessChain %35 %18 %40 %20 - %137 = OpLoad %10 %136 - %138 = OpAccessChain %38 %34 %20 - OpStore %138 %137 - %139 = OpAccessChain %22 %18 %40 %40 - %140 = OpLoad %9 %139 - %141 = OpAccessChain %43 %34 %40 - OpStore %141 %140 - %142 = OpAccessChain %54 %53 %40 - %143 = OpLoad %50 %142 - OpStore %49 %143 + %134 = OpAccessChain %35 %18 %40 %20 + %135 = OpLoad %10 %134 + %136 = OpAccessChain %38 %34 %20 + OpStore %136 %135 + %137 = OpAccessChain %22 %18 %40 %40 + %138 = OpLoad %9 %137 + %139 = OpAccessChain %43 %34 %40 + OpStore %139 %138 + %140 = OpAccessChain %52 %51 %40 + %141 = OpLoad %46 %140 + OpStore %48 %141 OpEmitVertex - %144 = OpAccessChain %35 %18 %25 %20 - %145 = OpLoad %10 %144 - %146 = OpAccessChain %38 %34 %20 - OpStore %146 %145 - %147 = OpAccessChain %22 %18 %25 %40 - %148 = OpLoad %9 %147 - %149 = OpAccessChain %43 %34 %40 - OpStore %149 %148 - %150 = OpAccessChain %54 %53 %25 - %151 = OpLoad %50 %150 - OpStore %49 %151 + %142 = OpAccessChain %35 %18 %25 %20 + %143 = OpLoad %10 %142 + %144 = OpAccessChain %38 %34 %20 + OpStore %144 %143 + %145 = OpAccessChain %22 %18 %25 %40 + %146 = OpLoad %9 %145 + %147 = OpAccessChain %43 %34 %40 + OpStore %147 %146 + %148 = OpAccessChain %52 %51 %25 + %149 = OpLoad %46 %148 + OpStore %48 %149 OpEmitVertex OpEndPrimitive - %152 = OpAccessChain %35 %18 %20 %20 - %153 = OpLoad %10 %152 - %154 = OpAccessChain %38 %34 %20 - OpStore %154 %153 - %155 = OpAccessChain %22 %18 %20 %40 - %156 = OpLoad %9 %155 - %157 = OpAccessChain %43 %34 %40 - OpStore %157 %156 - %158 = OpAccessChain %54 %53 %20 - %159 = OpLoad %50 %158 - OpStore %49 %159 + %150 = OpAccessChain %35 %18 %20 %20 + %151 = OpLoad %10 %150 + %152 = OpAccessChain %38 %34 %20 + OpStore %152 %151 + %153 = OpAccessChain %22 %18 %20 %40 + %154 = OpLoad %9 %153 + %155 = OpAccessChain %43 %34 %40 + OpStore %155 %154 + %156 = OpAccessChain %52 %51 %20 + %157 = OpLoad %46 %156 + OpStore %48 %157 OpEmitVertex - %160 = OpAccessChain %35 %18 %25 %20 - %161 = OpLoad %10 %160 - %162 = OpAccessChain %38 %34 %20 - OpStore %162 %161 - %163 = OpAccessChain %22 %18 %25 %40 - %164 = OpLoad %9 %163 - %165 = OpAccessChain %43 %34 %40 - OpStore %165 %164 - %166 = OpAccessChain %54 %53 %25 - %167 = OpLoad %50 %166 - OpStore %49 %167 + %158 = OpAccessChain %35 %18 %25 %20 + %159 = OpLoad %10 %158 + %160 = OpAccessChain %38 %34 %20 + OpStore %160 %159 + %161 = OpAccessChain %22 %18 %25 %40 + %162 = OpLoad %9 %161 + %163 = OpAccessChain %43 %34 %40 + OpStore %163 %162 + %164 = OpAccessChain %52 %51 %25 + %165 = OpLoad %46 %164 + OpStore %48 %165 OpEmitVertex - %168 = OpAccessChain %35 %18 %20 %20 + %166 = OpAccessChain %35 %18 %20 %20 + %167 = OpLoad %10 %166 + %168 = OpAccessChain %35 %18 %25 %20 %169 = OpLoad %10 %168 - %170 = OpAccessChain %35 %18 %25 %20 - %171 = OpLoad %10 %170 - %172 = OpFAdd %10 %169 %171 - %173 = OpAccessChain %35 %18 %40 %20 - %174 = OpLoad %10 %173 - %175 = OpFSub %10 %172 %174 - %176 = OpAccessChain %38 %34 %20 - OpStore %176 %175 - %177 = OpAccessChain %22 %18 %25 %40 - %178 = OpLoad %9 %177 - %179 = OpAccessChain %43 %34 %40 - OpStore %179 %178 - OpStore %180 %20 - OpBranch %181 - %181 = OpLabel - OpLoopMerge %183 %184 None - OpBranch %185 - %185 = OpLabel - %186 = OpLoad %19 %180 - %187 = OpSLessThan %6 %186 %109 - OpBranchConditional %187 
%182 %183 - %182 = OpLabel - %188 = OpLoad %19 %180 - %189 = OpLoad %19 %180 - %190 = OpAccessChain %35 %53 %20 %20 %189 - %191 = OpLoad %10 %190 - %192 = OpLoad %19 %180 - %193 = OpAccessChain %35 %53 %40 %20 %192 - %194 = OpLoad %10 %193 - %195 = OpFNegate %10 %194 - %196 = OpFAdd %10 %191 %195 - %197 = OpLoad %19 %180 - %198 = OpAccessChain %35 %53 %25 %20 %197 - %199 = OpLoad %10 %198 - %200 = OpFAdd %10 %196 %199 - %201 = OpAccessChain %38 %49 %20 %188 - OpStore %201 %200 - OpBranch %184 - %184 = OpLabel - %202 = OpLoad %19 %180 - %203 = OpIAdd %19 %202 %40 - OpStore %180 %203 - OpBranch %181 + %170 = OpFAdd %10 %167 %169 + %171 = OpAccessChain %35 %18 %40 %20 + %172 = OpLoad %10 %171 + %173 = OpFSub %10 %170 %172 + %174 = OpAccessChain %38 %34 %20 + OpStore %174 %173 + %175 = OpAccessChain %22 %18 %25 %40 + %176 = OpLoad %9 %175 + %177 = OpAccessChain %43 %34 %40 + OpStore %177 %176 + OpStore %178 %20 + OpBranch %179 + %179 = OpLabel + OpLoopMerge %181 %182 None + OpBranch %183 %183 = OpLabel + %184 = OpLoad %19 %178 + %185 = OpSLessThan %6 %184 %107 + OpBranchConditional %185 %180 %181 + %180 = OpLabel + %186 = OpLoad %19 %178 + %187 = OpLoad %19 %178 + %188 = OpAccessChain %35 %51 %20 %187 + %189 = OpLoad %10 %188 + %190 = OpLoad %19 %178 + %191 = OpAccessChain %35 %51 %40 %190 + %192 = OpLoad %10 %191 + %193 = OpFNegate %10 %192 + %194 = OpFAdd %10 %189 %193 + %195 = OpLoad %19 %178 + %196 = OpAccessChain %35 %51 %25 %195 + %197 = OpLoad %10 %196 + %198 = OpFAdd %10 %194 %197 + %199 = OpAccessChain %38 %48 %186 + OpStore %199 %198 + OpBranch %182 + %182 = OpLabel + %200 = OpLoad %19 %178 + %201 = OpIAdd %19 %200 %40 + OpStore %178 %201 + OpBranch %179 + %181 = OpLabel OpEmitVertex OpEndPrimitive OpBranch %31 diff --git a/src/xenia/gpu/vulkan/shaders/rect_list.geom b/src/xenia/gpu/vulkan/shaders/rect_list.geom index d796919d3..6c7e24c7e 100644 --- a/src/xenia/gpu/vulkan/shaders/rect_list.geom +++ b/src/xenia/gpu/vulkan/shaders/rect_list.geom @@ -16,11 +16,8 @@ out gl_PerVertex { float gl_ClipDistance[]; }; -struct VertexData { - vec4 o[16]; -}; -layout(location = 0) in VertexData in_vtx[]; -layout(location = 0) out VertexData out_vtx; +layout(location = 0) in vec4 in_interpolators[][16]; +layout(location = 0) out vec4 out_interpolators[16]; layout(triangles) in; layout(triangle_strip, max_vertices = 6) out; @@ -35,30 +32,30 @@ void main() { // 2 ----- [3] gl_Position = gl_in[0].gl_Position; gl_PointSize = gl_in[0].gl_PointSize; - out_vtx = in_vtx[0]; + out_interpolators = in_interpolators[0]; EmitVertex(); gl_Position = gl_in[1].gl_Position; gl_PointSize = gl_in[1].gl_PointSize; - out_vtx = in_vtx[1]; + out_interpolators = in_interpolators[1]; EmitVertex(); gl_Position = gl_in[2].gl_Position; gl_PointSize = gl_in[2].gl_PointSize; - out_vtx = in_vtx[2]; + out_interpolators = in_interpolators[2]; EmitVertex(); EndPrimitive(); gl_Position = gl_in[2].gl_Position; gl_PointSize = gl_in[2].gl_PointSize; - out_vtx = in_vtx[2]; + out_interpolators = in_interpolators[2]; EmitVertex(); gl_Position = gl_in[1].gl_Position; gl_PointSize = gl_in[1].gl_PointSize; - out_vtx = in_vtx[1]; + out_interpolators = in_interpolators[1]; EmitVertex(); gl_Position = (gl_in[1].gl_Position + gl_in[2].gl_Position) - gl_in[0].gl_Position; gl_PointSize = gl_in[2].gl_PointSize; for (int i = 0; i < 16; ++i) { - out_vtx.o[i] = -in_vtx[0].o[i] + in_vtx[1].o[i] + in_vtx[2].o[i]; + out_interpolators[i] = -in_interpolators[0][i] + in_interpolators[1][i] + in_interpolators[2][i]; } EmitVertex(); EndPrimitive(); @@ -70,30 
+67,30 @@ void main() { // [3] ----- 2 gl_Position = gl_in[0].gl_Position; gl_PointSize = gl_in[0].gl_PointSize; - out_vtx = in_vtx[0]; + out_interpolators = in_interpolators[0]; EmitVertex(); gl_Position = gl_in[1].gl_Position; gl_PointSize = gl_in[1].gl_PointSize; - out_vtx = in_vtx[1]; + out_interpolators = in_interpolators[1]; EmitVertex(); gl_Position = gl_in[2].gl_Position; gl_PointSize = gl_in[2].gl_PointSize; - out_vtx = in_vtx[2]; + out_interpolators = in_interpolators[2]; EmitVertex(); EndPrimitive(); gl_Position = gl_in[0].gl_Position; gl_PointSize = gl_in[0].gl_PointSize; - out_vtx = in_vtx[0]; + out_interpolators = in_interpolators[0]; EmitVertex(); gl_Position = gl_in[2].gl_Position; gl_PointSize = gl_in[2].gl_PointSize; - out_vtx = in_vtx[2]; + out_interpolators = in_interpolators[2]; EmitVertex(); gl_Position = (gl_in[0].gl_Position + gl_in[2].gl_Position) - gl_in[1].gl_Position; gl_PointSize = gl_in[2].gl_PointSize; for (int i = 0; i < 16; ++i) { - out_vtx.o[i] = in_vtx[0].o[i] + -in_vtx[1].o[i] + in_vtx[2].o[i]; + out_interpolators[i] = in_interpolators[0][i] + -in_interpolators[1][i] + in_interpolators[2][i]; } EmitVertex(); EndPrimitive(); diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index 4e93a46ca..a6f6dab17 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -25,26 +25,104 @@ namespace vulkan { using xe::ui::vulkan::CheckResult; constexpr uint32_t kMaxTextureSamplers = 32; +constexpr VkDeviceSize kStagingBufferSize = 64 * 1024 * 1024; -TextureCache::TextureCache(RegisterFile* register_file, +struct TextureConfig { + TextureFormat guest_format; + VkFormat host_format; +}; + +static const TextureConfig texture_configs[64] = { + {TextureFormat::k_1_REVERSE, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_1, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_8, VK_FORMAT_R8_UNORM}, + {TextureFormat::k_1_5_5_5, VK_FORMAT_R5G5B5A1_UNORM_PACK16}, + {TextureFormat::k_5_6_5, VK_FORMAT_R5G6B5_UNORM_PACK16}, + {TextureFormat::k_6_5_5, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_8_8_8_8, VK_FORMAT_R8G8B8A8_UNORM}, + {TextureFormat::k_2_10_10_10, VK_FORMAT_A2R10G10B10_UNORM_PACK32}, + {TextureFormat::k_8_A, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_8_B, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_8_8, VK_FORMAT_R8G8_UNORM}, + {TextureFormat::k_Cr_Y1_Cb_Y0, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_Y1_Cr_Y0_Cb, VK_FORMAT_UNDEFINED}, + {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_8_8_8_8_A, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_4_4_4_4, VK_FORMAT_R4G4B4A4_UNORM_PACK16}, + {TextureFormat::k_10_11_11, VK_FORMAT_B10G11R11_UFLOAT_PACK32}, // ? + {TextureFormat::k_11_11_10, VK_FORMAT_B10G11R11_UFLOAT_PACK32}, // ? + {TextureFormat::k_DXT1, VK_FORMAT_BC1_RGBA_SRGB_BLOCK}, + {TextureFormat::k_DXT2_3, VK_FORMAT_BC2_SRGB_BLOCK}, + {TextureFormat::k_DXT4_5, VK_FORMAT_BC3_SRGB_BLOCK}, + {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_24_8, VK_FORMAT_D24_UNORM_S8_UINT}, + {TextureFormat::k_24_8_FLOAT, VK_FORMAT_D24_UNORM_S8_UINT}, // ? + {TextureFormat::k_16, VK_FORMAT_R16_UNORM}, + {TextureFormat::k_16_16, VK_FORMAT_R16G16_UNORM}, + {TextureFormat::k_16_16_16_16, VK_FORMAT_R16G16B16A16_UNORM}, + {TextureFormat::k_16_EXPAND, VK_FORMAT_R16_UNORM}, // ? + {TextureFormat::k_16_16_EXPAND, VK_FORMAT_R16G16_UNORM}, // ? + {TextureFormat::k_16_16_16_16_EXPAND, VK_FORMAT_R16G16B16A16_UNORM}, // ? 
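+ // Note: rows in this table are indexed by the guest TextureFormat enum
+ // value, so their order must match the enum exactly. Entries marked "?"
+ // are best-guess host mappings; VK_FORMAT_UNDEFINED entries have no direct
+ // host equivalent and need conversion on upload.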
+ {TextureFormat::k_16_FLOAT, VK_FORMAT_R16_SFLOAT}, + {TextureFormat::k_16_16_FLOAT, VK_FORMAT_R16G16_SFLOAT}, + {TextureFormat::k_16_16_16_16_FLOAT, VK_FORMAT_R16G16B16A16_SFLOAT}, + {TextureFormat::k_32, VK_FORMAT_R32_SINT}, + {TextureFormat::k_32_32, VK_FORMAT_R32G32_SINT}, + {TextureFormat::k_32_32_32_32, VK_FORMAT_R32G32B32A32_SINT}, + {TextureFormat::k_32_FLOAT, VK_FORMAT_R32_SFLOAT}, + {TextureFormat::k_32_32_FLOAT, VK_FORMAT_R32G32_SFLOAT}, + {TextureFormat::k_32_32_32_32_FLOAT, VK_FORMAT_R32G32B32A32_SFLOAT}, + {TextureFormat::k_32_AS_8, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_32_AS_8_8, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_16_MPEG, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_16_16_MPEG, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_8_INTERLACED, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_32_AS_8_INTERLACED, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_32_AS_8_8_INTERLACED, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_16_INTERLACED, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_16_MPEG_INTERLACED, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_16_16_MPEG_INTERLACED, VK_FORMAT_UNDEFINED}, + + // http://fileadmin.cs.lth.se/cs/Personal/Michael_Doggett/talks/unc-xenos-doggett.pdf + {TextureFormat::k_DXN, VK_FORMAT_BC5_UNORM_BLOCK}, // ? + {TextureFormat::k_8_8_8_8_AS_16_16_16_16, VK_FORMAT_R8G8B8A8_UNORM}, + {TextureFormat::k_DXT1_AS_16_16_16_16, VK_FORMAT_BC1_RGB_UNORM_BLOCK}, + {TextureFormat::k_DXT2_3_AS_16_16_16_16, VK_FORMAT_BC2_UNORM_BLOCK}, + {TextureFormat::k_DXT4_5_AS_16_16_16_16, VK_FORMAT_BC3_UNORM_BLOCK}, + {TextureFormat::k_2_10_10_10_AS_16_16_16_16, + VK_FORMAT_A2R10G10B10_UNORM_PACK32}, + {TextureFormat::k_10_11_11_AS_16_16_16_16, + VK_FORMAT_B10G11R11_UFLOAT_PACK32}, // ? + {TextureFormat::k_11_11_10_AS_16_16_16_16, + VK_FORMAT_B10G11R11_UFLOAT_PACK32}, // ? + {TextureFormat::k_32_32_32_FLOAT, VK_FORMAT_R32G32B32_SFLOAT}, + {TextureFormat::k_DXT3A, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_DXT5A, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_CTX1, VK_FORMAT_UNDEFINED}, + {TextureFormat::k_DXT3A_AS_1_1_1_1, VK_FORMAT_UNDEFINED}, + {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED}, + {TextureFormat::kUnknown, VK_FORMAT_UNDEFINED}, +}; + +TextureCache::TextureCache(Memory* memory, RegisterFile* register_file, TraceWriter* trace_writer, ui::vulkan::VulkanDevice* device) - : register_file_(register_file), + : memory_(memory), + register_file_(register_file), trace_writer_(trace_writer), - device_(device) { + device_(device), + staging_buffer_(device) { // Descriptor pool used for all of our cached descriptors. 
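+ // One shared pool of combined image/sampler descriptors. Sets are returned
+ // individually (FREE_DESCRIPTOR_SET_BIT below), and 8192 is presumably just
+ // a generous upper bound on sets alive across in-flight frames.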
VkDescriptorPoolCreateInfo descriptor_pool_info; descriptor_pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; descriptor_pool_info.pNext = nullptr; descriptor_pool_info.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; - descriptor_pool_info.maxSets = 256; - VkDescriptorPoolSize pool_sizes[2]; - pool_sizes[0].type = VK_DESCRIPTOR_TYPE_SAMPLER; - pool_sizes[0].descriptorCount = 32; - pool_sizes[1].type = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; - pool_sizes[1].descriptorCount = 32; - descriptor_pool_info.poolSizeCount = 2; + descriptor_pool_info.maxSets = 8192; + VkDescriptorPoolSize pool_sizes[1]; + pool_sizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + pool_sizes[0].descriptorCount = 8192; + descriptor_pool_info.poolSizeCount = 1; descriptor_pool_info.pPoolSizes = pool_sizes; auto err = vkCreateDescriptorPool(*device_, &descriptor_pool_info, nullptr, &descriptor_pool_); @@ -52,18 +130,11 @@ TextureCache::TextureCache(RegisterFile* register_file, // Create the descriptor set layout used for rendering. // We always have the same number of samplers but only some are used. - VkDescriptorSetLayoutBinding bindings[5]; - auto& sampler_binding = bindings[0]; - sampler_binding.binding = 0; - sampler_binding.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER; - sampler_binding.descriptorCount = kMaxTextureSamplers; - sampler_binding.stageFlags = - VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT; - sampler_binding.pImmutableSamplers = nullptr; + VkDescriptorSetLayoutBinding bindings[4]; for (int i = 0; i < 4; ++i) { - auto& texture_binding = bindings[1 + i]; - texture_binding.binding = 1 + i; - texture_binding.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; + auto& texture_binding = bindings[i]; + texture_binding.binding = i; + texture_binding.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; texture_binding.descriptorCount = kMaxTextureSamplers; texture_binding.stageFlags = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT; @@ -81,96 +152,759 @@ TextureCache::TextureCache(RegisterFile* register_file, nullptr, &texture_descriptor_set_layout_); CheckResult(err, "vkCreateDescriptorSetLayout"); - SetupGridImages(); + if (!staging_buffer_.Initialize(kStagingBufferSize, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT)) { + assert_always(); + } + + invalidated_textures_sets_[0].reserve(64); + invalidated_textures_sets_[1].reserve(64); + invalidated_textures_ = &invalidated_textures_sets_[0]; } TextureCache::~TextureCache() { - vkDestroyImageView(*device_, grid_image_2d_view_, nullptr); - vkDestroyImage(*device_, grid_image_2d_, nullptr); - vkFreeMemory(*device_, grid_image_2d_memory_, nullptr); + for (auto it = samplers_.begin(); it != samplers_.end(); ++it) { + vkDestroySampler(*device_, it->second->sampler, nullptr); + delete it->second; + } + samplers_.clear(); vkDestroyDescriptorSetLayout(*device_, texture_descriptor_set_layout_, nullptr); vkDestroyDescriptorPool(*device_, descriptor_pool_, nullptr); } -void TextureCache::SetupGridImages() { - VkImageCreateInfo image_info; +TextureCache::Texture* TextureCache::AllocateTexture( + const TextureInfo& texture_info) { + // Create an image first. 
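+ // Guest Dimension maps directly onto VkImageType below; the only special
+ // case is cube maps, which Vulkan expresses as a 2D image created with the
+ // CUBE_COMPATIBLE flag.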
+ VkImageCreateInfo image_info = {}; image_info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; - image_info.pNext = nullptr; - image_info.flags = 0; - image_info.imageType = VK_IMAGE_TYPE_2D; - image_info.format = VK_FORMAT_R8G8B8A8_UNORM; - image_info.extent = {8, 8, 1}; + switch (texture_info.dimension) { + case Dimension::k1D: + image_info.imageType = VK_IMAGE_TYPE_1D; + break; + case Dimension::k2D: + image_info.imageType = VK_IMAGE_TYPE_2D; + break; + case Dimension::k3D: + image_info.imageType = VK_IMAGE_TYPE_3D; + break; + case Dimension::kCube: + image_info.imageType = VK_IMAGE_TYPE_2D; + image_info.flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT; + break; + default: + assert_unhandled_case(texture_info.dimension); + return nullptr; + } + + assert_not_null(texture_info.format_info); + auto& config = texture_configs[int(texture_info.format_info->format)]; + VkFormat format = config.host_format != VK_FORMAT_UNDEFINED + ? config.host_format + : VK_FORMAT_R8G8B8A8_UNORM; + + VkFormatProperties props; + uint32_t required_flags = VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | + VK_FORMAT_FEATURE_BLIT_DST_BIT | + VK_FORMAT_FEATURE_BLIT_SRC_BIT; + vkGetPhysicalDeviceFormatProperties(*device_, format, &props); + if ((props.optimalTilingFeatures & required_flags) != required_flags) { + // Texture needs conversion on upload to a native format. + // assert_always(); + } + + image_info.format = format; + image_info.extent = {texture_info.width + 1, texture_info.height + 1, + texture_info.depth + 1}; image_info.mipLevels = 1; image_info.arrayLayers = 1; image_info.samples = VK_SAMPLE_COUNT_1_BIT; - image_info.tiling = VK_IMAGE_TILING_LINEAR; - image_info.usage = VK_IMAGE_USAGE_SAMPLED_BIT; + image_info.tiling = VK_IMAGE_TILING_OPTIMAL; + image_info.usage = VK_IMAGE_USAGE_SAMPLED_BIT | + VK_IMAGE_USAGE_TRANSFER_SRC_BIT | + VK_IMAGE_USAGE_TRANSFER_DST_BIT; image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; image_info.queueFamilyIndexCount = 0; image_info.pQueueFamilyIndices = nullptr; - image_info.initialLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - auto err = vkCreateImage(*device_, &image_info, nullptr, &grid_image_2d_); + image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + VkImage image; + auto err = vkCreateImage(*device_, &image_info, nullptr, &image); CheckResult(err, "vkCreateImage"); - VkMemoryRequirements memory_requirements; - vkGetImageMemoryRequirements(*device_, grid_image_2d_, &memory_requirements); - grid_image_2d_memory_ = device_->AllocateMemory( - memory_requirements, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); - err = vkBindImageMemory(*device_, grid_image_2d_, grid_image_2d_memory_, 0); + VkMemoryRequirements mem_requirements; + vkGetImageMemoryRequirements(*device_, image, &mem_requirements); + + // TODO: Use a circular buffer or something else to allocate this memory. + // The device has a limited amount (around 64) of memory allocations that we + // can make. + // Now that we have the size, back the image with GPU memory. + auto memory = device_->AllocateMemory(mem_requirements, 0); + if (!memory) { + // Crap. 
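+ // Allocation failed; release the image we just created and report the
+ // texture as unavailable to the caller.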
+ assert_always(); + vkDestroyImage(*device_, image, nullptr); + return nullptr; + } + + err = vkBindImageMemory(*device_, image, memory, 0); CheckResult(err, "vkBindImageMemory"); + auto texture = new Texture(); + texture->format = image_info.format; + texture->image = image; + texture->image_layout = image_info.initialLayout; + texture->image_memory = memory; + texture->memory_offset = 0; + texture->memory_size = mem_requirements.size; + texture->texture_info = texture_info; + + // Create a default view, just for kicks. VkImageViewCreateInfo view_info; view_info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; view_info.pNext = nullptr; view_info.flags = 0; - view_info.image = grid_image_2d_; + view_info.image = image; view_info.viewType = VK_IMAGE_VIEW_TYPE_2D; - view_info.format = VK_FORMAT_R8G8B8A8_UNORM; + view_info.format = image_info.format; view_info.components = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A, }; view_info.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; - err = vkCreateImageView(*device_, &view_info, nullptr, &grid_image_2d_view_); + VkImageView view; + err = vkCreateImageView(*device_, &view_info, nullptr, &view); CheckResult(err, "vkCreateImageView"); + if (err == VK_SUCCESS) { + auto texture_view = std::make_unique(); + texture_view->texture = texture; + texture_view->view = view; + texture_view->swiz_x = 0; + texture_view->swiz_y = 1; + texture_view->swiz_z = 2; + texture_view->swiz_w = 3; + texture->views.push_back(std::move(texture_view)); + } - VkImageSubresource subresource; - subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; - subresource.mipLevel = 0; - subresource.arrayLayer = 0; - VkSubresourceLayout layout; - vkGetImageSubresourceLayout(*device_, grid_image_2d_, &subresource, &layout); + return texture; +} - void* gpu_data = nullptr; - err = vkMapMemory(*device_, grid_image_2d_memory_, 0, layout.size, 0, - &gpu_data); - CheckResult(err, "vkMapMemory"); +bool TextureCache::FreeTexture(Texture* texture) { + if (texture->in_flight_fence && + texture->in_flight_fence->status() != VK_SUCCESS) { + // Texture still in flight. + return false; + } - uint32_t grid_pixels[8 * 8]; - for (int y = 0; y < 8; ++y) { - for (int x = 0; x < 8; ++x) { - grid_pixels[y * 8 + x] = - ((y % 2 == 0) ^ (x % 2 != 0)) ? 0xFFFFFFFF : 0xFF0000FF; + for (auto it = texture->views.begin(); it != texture->views.end();) { + vkDestroyImageView(*device_, (*it)->view, nullptr); + it = texture->views.erase(it); + } + + if (texture->access_watch_handle) { + memory_->CancelAccessWatch(texture->access_watch_handle); + texture->access_watch_handle = 0; + } + + vkDestroyImage(*device_, texture->image, nullptr); + vkFreeMemory(*device_, texture->image_memory, nullptr); + delete texture; + return true; +} + +TextureCache::Texture* TextureCache::DemandResolveTexture( + const TextureInfo& texture_info, TextureFormat format, + VkOffset2D* out_offset) { + // Check to see if we've already used a texture at this location. + auto texture = LookupAddress( + texture_info.guest_address, texture_info.size_2d.block_width, + texture_info.size_2d.block_height, format, out_offset); + if (texture) { + return texture; + } + + // No texture at this location. Make a new one. + texture = AllocateTexture(texture_info); + texture->is_full_texture = false; + + // Setup an access watch. If this texture is touched, it is destroyed. 
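+ // The watch protects the guest pages backing this texture and fires on the
+ // first CPU write to the range. The callback only flags the texture; actual
+ // teardown is deferred to Scavenge, as the fault may occur on any thread.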
+ texture->access_watch_handle = memory_->AddPhysicalAccessWatch( + texture_info.guest_address, texture_info.input_length, + cpu::MMIOHandler::kWatchWrite, + [](void* context_ptr, void* data_ptr, uint32_t address) { + auto self = reinterpret_cast(context_ptr); + auto touched_texture = reinterpret_cast(data_ptr); + // Clear watch handle first so we don't redundantly + // remove. + touched_texture->access_watch_handle = 0; + touched_texture->pending_invalidation = true; + // Add to pending list so Scavenge will clean it up. + self->invalidated_resolve_textures_mutex_.lock(); + self->invalidated_resolve_textures_.push_back(touched_texture); + self->invalidated_resolve_textures_mutex_.unlock(); + }, + this, texture); + + resolve_textures_.push_back(texture); + return texture; +} + +TextureCache::Texture* TextureCache::Demand( + const TextureInfo& texture_info, VkCommandBuffer command_buffer, + std::shared_ptr completion_fence) { + // Run a tight loop to scan for an exact match existing texture. + auto texture_hash = texture_info.hash(); + for (auto it = textures_.find(texture_hash); it != textures_.end(); ++it) { + if (it->second->texture_info == texture_info) { + if (it->second->pending_invalidation) { + // This texture has been invalidated! + Scavenge(); + break; + } + + return it->second; } } - std::memcpy(gpu_data, grid_pixels, sizeof(grid_pixels)); - vkUnmapMemory(*device_, grid_image_2d_memory_); + // Check resolve textures. + for (auto it = resolve_textures_.begin(); it != resolve_textures_.end(); + ++it) { + auto texture = (*it); + if (texture_info.guest_address == texture->texture_info.guest_address && + texture_info.size_2d.logical_width == + texture->texture_info.size_2d.logical_width && + texture_info.size_2d.logical_height == + texture->texture_info.size_2d.logical_height) { + // Exact match. + // TODO: Lazy match (at an offset) + // Upgrade this texture to a full texture. + texture->is_full_texture = true; + texture->texture_info = texture_info; + + if (texture->access_watch_handle) { + memory_->CancelAccessWatch(texture->access_watch_handle); + } + + texture->access_watch_handle = memory_->AddPhysicalAccessWatch( + texture_info.guest_address, texture_info.input_length, + cpu::MMIOHandler::kWatchWrite, + [](void* context_ptr, void* data_ptr, uint32_t address) { + auto self = reinterpret_cast(context_ptr); + auto touched_texture = reinterpret_cast(data_ptr); + // Clear watch handle first so we don't redundantly + // remove. + touched_texture->access_watch_handle = 0; + touched_texture->pending_invalidation = true; + // Add to pending list so Scavenge will clean it up. + self->invalidated_textures_mutex_.lock(); + self->invalidated_textures_->push_back(touched_texture); + self->invalidated_textures_mutex_.unlock(); + }, + this, texture); + + textures_[texture_hash] = *it; + it = resolve_textures_.erase(it); + return textures_[texture_hash]; + } + } + + if (!command_buffer) { + // Texture not found and no command buffer was passed, preventing us from + // uploading a new one. + return nullptr; + } + + if (texture_info.dimension != Dimension::k2D) { + // Abort. + return nullptr; + } + + // Create a new texture and cache it. + auto texture = AllocateTexture(texture_info); + if (!texture) { + // Failed to allocate texture (out of memory?) 
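+ // Nothing sensible to recover here yet; assert loudly and let the caller
+ // treat the texture as missing.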
+    assert_always();
+    return nullptr;
+  }
+
+  bool uploaded = false;
+  switch (texture_info.dimension) {
+    case Dimension::k2D: {
+      uploaded = UploadTexture2D(command_buffer, completion_fence, texture,
+                                 texture_info);
+    } break;
+    default:
+      assert_unhandled_case(texture_info.dimension);
+      break;
+  }
+
+  if (!uploaded) {
+    FreeTexture(texture);
+    return nullptr;
+  }
+
+  // Copy in overlapping resolve textures.
+  // FIXME: RDR appears to take textures from small chunks of a resolve
+  // texture?
+  if (texture_info.dimension == Dimension::k2D) {
+    for (auto it = resolve_textures_.begin(); it != resolve_textures_.end();
+         ++it) {
+      auto resolve_texture = (*it);
+      if (texture_info.guest_address >=
+              resolve_texture->texture_info.guest_address &&
+          texture_info.guest_address <
+              resolve_texture->texture_info.guest_address +
+                  resolve_texture->texture_info.input_length) {
+        // Lazy matched a resolve texture. Copy it in and destroy it.
+        // Future resolves will just copy directly into this texture.
+        // assert_always();
+      }
+    }
+  }
+
+  // Though we didn't find an exact match, that doesn't mean we're out of the
+  // woods yet. This texture could either be a portion of another texture or
+  // vice versa. Copy any overlapping textures into this texture.
+  // TODO: Byte count -> pixel count (on x and y axes)
+  for (auto it = textures_.begin(); it != textures_.end(); ++it) {
+  }
+
+  // Okay. Now that the texture is uploaded from system memory, put a write
+  // watch on it to tell us if it's been modified by the guest.
+  texture->access_watch_handle = memory_->AddPhysicalAccessWatch(
+      texture_info.guest_address, texture_info.input_length,
+      cpu::MMIOHandler::kWatchWrite,
+      [](void* context_ptr, void* data_ptr, uint32_t address) {
+        auto self = reinterpret_cast<TextureCache*>(context_ptr);
+        auto touched_texture = reinterpret_cast<Texture*>(data_ptr);
+        // Clear the watch handle first so we don't redundantly remove it.
+        touched_texture->access_watch_handle = 0;
+        touched_texture->pending_invalidation = true;
+        // Add to pending list so Scavenge will clean it up.
+ self->invalidated_textures_mutex_.lock(); + self->invalidated_textures_->push_back(touched_texture); + self->invalidated_textures_mutex_.unlock(); + }, + this, texture); + + textures_[texture_hash] = texture; + return texture; +} + +TextureCache::TextureView* TextureCache::DemandView(Texture* texture, + uint16_t swizzle) { + for (auto it = texture->views.begin(); it != texture->views.end(); ++it) { + if ((*it)->swizzle == swizzle) { + return (*it).get(); + } + } + + VkImageViewCreateInfo view_info; + view_info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + view_info.pNext = nullptr; + view_info.flags = 0; + view_info.image = texture->image; + view_info.format = texture->format; + + switch (texture->texture_info.dimension) { + case Dimension::k1D: + view_info.viewType = VK_IMAGE_VIEW_TYPE_1D; + break; + case Dimension::k2D: + view_info.viewType = VK_IMAGE_VIEW_TYPE_2D; + break; + case Dimension::k3D: + view_info.viewType = VK_IMAGE_VIEW_TYPE_3D; + break; + case Dimension::kCube: + view_info.viewType = VK_IMAGE_VIEW_TYPE_CUBE; + break; + default: + assert_always(); + } + + VkComponentSwizzle swiz_component_map[] = { + VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, + VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A, + VK_COMPONENT_SWIZZLE_ZERO, VK_COMPONENT_SWIZZLE_ONE, + VK_COMPONENT_SWIZZLE_IDENTITY, + }; + + view_info.components = { + swiz_component_map[(swizzle >> 0) & 0x7], + swiz_component_map[(swizzle >> 3) & 0x7], + swiz_component_map[(swizzle >> 6) & 0x7], + swiz_component_map[(swizzle >> 9) & 0x7], + }; + view_info.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + VkImageView view; + auto status = vkCreateImageView(*device_, &view_info, nullptr, &view); + CheckResult(status, "vkCreateImageView"); + if (status == VK_SUCCESS) { + auto texture_view = new TextureView(); + texture_view->texture = texture; + texture_view->view = view; + texture_view->swizzle = swizzle; + texture->views.push_back(std::unique_ptr(texture_view)); + return texture_view; + } + + return nullptr; +} + +TextureCache::Sampler* TextureCache::Demand(const SamplerInfo& sampler_info) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + + auto sampler_hash = sampler_info.hash(); + for (auto it = samplers_.find(sampler_hash); it != samplers_.end(); ++it) { + if (it->second->sampler_info == sampler_info) { + // Found a compatible sampler. + return it->second; + } + } + + VkResult status = VK_SUCCESS; + + // Create a new sampler and cache it. + // TODO: Actually set the properties + VkSamplerCreateInfo sampler_create_info; + sampler_create_info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; + sampler_create_info.pNext = nullptr; + sampler_create_info.flags = 0; + sampler_create_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; + + // Texture level filtering. + VkSamplerMipmapMode mip_filter; + switch (sampler_info.mip_filter) { + case TextureFilter::kBaseMap: + // TODO(DrChat): ? 
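+      // kBaseMap presumably means "sample only the base mip level"; NEAREST
+      // combined with the maxLod of 0.0f set below should approximate that.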
+      mip_filter = VK_SAMPLER_MIPMAP_MODE_NEAREST;
+      break;
+    case TextureFilter::kPoint:
+      mip_filter = VK_SAMPLER_MIPMAP_MODE_NEAREST;
+      break;
+    case TextureFilter::kLinear:
+      mip_filter = VK_SAMPLER_MIPMAP_MODE_LINEAR;
+      break;
+    default:
+      assert_unhandled_case(sampler_info.mip_filter);
+      return nullptr;
+  }
+
+  VkFilter min_filter;
+  switch (sampler_info.min_filter) {
+    case TextureFilter::kPoint:
+      min_filter = VK_FILTER_NEAREST;
+      break;
+    case TextureFilter::kLinear:
+      min_filter = VK_FILTER_LINEAR;
+      break;
+    default:
+      assert_unhandled_case(sampler_info.min_filter);
+      return nullptr;
+  }
+  VkFilter mag_filter;
+  switch (sampler_info.mag_filter) {
+    case TextureFilter::kPoint:
+      mag_filter = VK_FILTER_NEAREST;
+      break;
+    case TextureFilter::kLinear:
+      mag_filter = VK_FILTER_LINEAR;
+      break;
+    default:
+      assert_unhandled_case(sampler_info.mag_filter);
+      return nullptr;
+  }
+
+  sampler_create_info.minFilter = min_filter;
+  sampler_create_info.magFilter = mag_filter;
+  sampler_create_info.mipmapMode = mip_filter;
+
+  // FIXME: Both halfway / mirror clamp to border aren't mapped properly.
+  VkSamplerAddressMode address_mode_map[] = {
+      /* kRepeat               */ VK_SAMPLER_ADDRESS_MODE_REPEAT,
+      /* kMirroredRepeat       */ VK_SAMPLER_ADDRESS_MODE_MIRRORED_REPEAT,
+      /* kClampToEdge          */ VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
+      /* kMirrorClampToEdge    */ VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE,
+      /* kClampToHalfway       */ VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE,
+      /* kMirrorClampToHalfway */ VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE,
+      /* kClampToBorder        */ VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER,
+      /* kMirrorClampToBorder  */ VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE,
+  };
+  sampler_create_info.addressModeU =
+      address_mode_map[static_cast<int>(sampler_info.clamp_u)];
+  sampler_create_info.addressModeV =
+      address_mode_map[static_cast<int>(sampler_info.clamp_v)];
+  sampler_create_info.addressModeW =
+      address_mode_map[static_cast<int>(sampler_info.clamp_w)];
+
+  sampler_create_info.mipLodBias = 0.0f;
+
+  float aniso = 0.f;
+  switch (sampler_info.aniso_filter) {
+    case AnisoFilter::kDisabled:
+      aniso = 1.0f;
+      break;
+    case AnisoFilter::kMax_1_1:
+      aniso = 1.0f;
+      break;
+    case AnisoFilter::kMax_2_1:
+      aniso = 2.0f;
+      break;
+    case AnisoFilter::kMax_4_1:
+      aniso = 4.0f;
+      break;
+    case AnisoFilter::kMax_8_1:
+      aniso = 8.0f;
+      break;
+    case AnisoFilter::kMax_16_1:
+      aniso = 16.0f;
+      break;
+    default:
+      assert_unhandled_case(sampler_info.aniso_filter);
+      return nullptr;
+  }
+
+  sampler_create_info.anisotropyEnable =
+      sampler_info.aniso_filter != AnisoFilter::kDisabled ?
VK_TRUE : VK_FALSE; + sampler_create_info.maxAnisotropy = aniso; + + sampler_create_info.compareEnable = VK_FALSE; + sampler_create_info.compareOp = VK_COMPARE_OP_NEVER; + sampler_create_info.minLod = 0.0f; + sampler_create_info.maxLod = 0.0f; + sampler_create_info.borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; + sampler_create_info.unnormalizedCoordinates = VK_FALSE; + VkSampler vk_sampler; + status = + vkCreateSampler(*device_, &sampler_create_info, nullptr, &vk_sampler); + CheckResult(status, "vkCreateSampler"); + if (status != VK_SUCCESS) { + return nullptr; + } + + auto sampler = new Sampler(); + sampler->sampler = vk_sampler; + sampler->sampler_info = sampler_info; + samplers_[sampler_hash] = sampler; + + return sampler; +} + +TextureCache::Texture* TextureCache::LookupAddress(uint32_t guest_address, + uint32_t width, + uint32_t height, + TextureFormat format, + VkOffset2D* out_offset) { + for (auto it = textures_.begin(); it != textures_.end(); ++it) { + const auto& texture_info = it->second->texture_info; + if (guest_address >= texture_info.guest_address && + guest_address < + texture_info.guest_address + texture_info.input_length && + texture_info.size_2d.input_width >= width && + texture_info.size_2d.input_height >= height && out_offset) { + auto offset_bytes = guest_address - texture_info.guest_address; + + if (texture_info.dimension == Dimension::k2D) { + out_offset->x = 0; + out_offset->y = offset_bytes / texture_info.size_2d.input_pitch; + if (offset_bytes % texture_info.size_2d.input_pitch != 0) { + // TODO: offset_x + } + } + + return it->second; + } + + if (texture_info.guest_address == guest_address && + texture_info.dimension == Dimension::k2D && + texture_info.size_2d.input_width == width && + texture_info.size_2d.input_height == height) { + if (out_offset) { + out_offset->x = 0; + out_offset->y = 0; + } + + return it->second; + } + } + + // Check resolve textures + for (auto it = resolve_textures_.begin(); it != resolve_textures_.end(); + ++it) { + const auto& texture_info = (*it)->texture_info; + if (texture_info.guest_address == guest_address && + texture_info.dimension == Dimension::k2D && + texture_info.size_2d.input_width == width && + texture_info.size_2d.input_height == height) { + if (out_offset) { + out_offset->x = 0; + out_offset->y = 0; + } + + return (*it); + } + } + + return nullptr; +} + +void TextureSwap(Endian endianness, void* dest, const void* src, + size_t length) { + switch (endianness) { + case Endian::k8in16: + xe::copy_and_swap_16_aligned(dest, src, length / 2); + break; + case Endian::k8in32: + xe::copy_and_swap_32_aligned(dest, src, length / 4); + break; + case Endian::k16in32: // Swap high and low 16 bits within a 32 bit word + xe::copy_and_swap_16_in_32_aligned(dest, src, length); + break; + default: + case Endian::kUnspecified: + std::memcpy(dest, src, length); + break; + } +} + +bool TextureCache::UploadTexture2D( + VkCommandBuffer command_buffer, + std::shared_ptr completion_fence, Texture* dest, + TextureInfo src) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + + assert_true(src.dimension == Dimension::k2D); + + if (!staging_buffer_.CanAcquire(src.input_length)) { + // Need to have unique memory for every upload for at least one frame. If we + // run out of memory, we need to flush all queued upload commands to the + // GPU. + // TODO: Actually flush commands. + assert_always(); + } + + // Grab some temporary memory for staging. 
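+  // The allocation is tagged with completion_fence, so the circular staging
+  // buffer only reclaims this space once the GPU has consumed the copy
+  // recorded below.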
+ size_t unpack_length = src.output_length; + auto alloc = staging_buffer_.Acquire(unpack_length, completion_fence); + assert_not_null(alloc); + + // Upload texture into GPU memory. + // TODO: If the GPU supports it, we can submit a compute batch to convert the + // texture and copy it to its destination. Otherwise, fallback to conversion + // on the CPU. + void* host_address = memory_->TranslatePhysical(src.guest_address); + if (!src.is_tiled) { + if (src.size_2d.input_pitch == src.size_2d.output_pitch) { + // Fast path copy entire image. + TextureSwap(src.endianness, alloc->host_ptr, host_address, unpack_length); + } else { + // Slow path copy row-by-row because strides differ. + // UNPACK_ROW_LENGTH only works for uncompressed images, and likely does + // this exact thing under the covers, so we just always do it here. + const uint8_t* src_mem = reinterpret_cast(host_address); + uint8_t* dest = reinterpret_cast(alloc->host_ptr); + uint32_t pitch = + std::min(src.size_2d.input_pitch, src.size_2d.output_pitch); + for (uint32_t y = 0; + y < std::min(src.size_2d.block_height, src.size_2d.logical_height); + y++) { + TextureSwap(src.endianness, dest, src_mem, pitch); + src_mem += src.size_2d.input_pitch; + dest += src.size_2d.output_pitch; + } + } + } else { + // Untile image. + // We could do this in a shader to speed things up, as this is pretty slow. + + // TODO(benvanik): optimize this inner loop (or work by tiles). + const uint8_t* src_mem = reinterpret_cast(host_address); + uint8_t* dest = reinterpret_cast(alloc->host_ptr); + uint32_t bytes_per_block = src.format_info->block_width * + src.format_info->block_height * + src.format_info->bits_per_pixel / 8; + + // Tiled textures can be packed; get the offset into the packed texture. + uint32_t offset_x; + uint32_t offset_y; + TextureInfo::GetPackedTileOffset(src, &offset_x, &offset_y); + auto bpp = (bytes_per_block >> 2) + + ((bytes_per_block >> 1) >> (bytes_per_block >> 2)); + for (uint32_t y = 0, output_base_offset = 0; + y < std::min(src.size_2d.block_height, src.size_2d.logical_height); + y++, output_base_offset += src.size_2d.output_pitch) { + auto input_base_offset = TextureInfo::TiledOffset2DOuter( + offset_y + y, + (src.size_2d.input_width / src.format_info->block_width), bpp); + for (uint32_t x = 0, output_offset = output_base_offset; + x < src.size_2d.block_width; x++, output_offset += bytes_per_block) { + auto input_offset = + TextureInfo::TiledOffset2DInner(offset_x + x, offset_y + y, bpp, + input_base_offset) >> + bpp; + TextureSwap(src.endianness, dest + output_offset, + src_mem + input_offset * bytes_per_block, bytes_per_block); + } + } + } + + staging_buffer_.Flush(alloc); + + // Transition the texture into a transfer destination layout. + VkImageMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.pNext = nullptr; + barrier.srcAccessMask = 0; + barrier.dstAccessMask = + VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT; + barrier.oldLayout = dest->image_layout; + barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = dest->image; + barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &barrier); + + // Now move the converted texture into the destination. 
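+  // Note that bufferRowLength and bufferImageHeight are expressed in texels,
+  // not bytes, which is why the output dimensions are used directly below.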
+  VkBufferImageCopy copy_region;
+  copy_region.bufferOffset = alloc->offset;
+  copy_region.bufferRowLength = src.size_2d.output_width;
+  copy_region.bufferImageHeight = src.size_2d.output_height;
+  copy_region.imageSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1};
+  copy_region.imageOffset = {0, 0, 0};
+  copy_region.imageExtent = {src.size_2d.output_width,
+                             src.size_2d.output_height, 1};
+  vkCmdCopyBufferToImage(command_buffer, staging_buffer_.gpu_buffer(),
+                         dest->image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1,
+                         &copy_region);
+
+  // Now transition the texture into a shader readonly source.
+  barrier.srcAccessMask = barrier.dstAccessMask;
+  barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+  barrier.oldLayout = barrier.newLayout;
+  barrier.newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                       VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0,
+                       nullptr, 1, &barrier);
+
+  dest->image_layout = barrier.newLayout;
+  return true;
 }

 VkDescriptorSet TextureCache::PrepareTextureSet(
     VkCommandBuffer command_buffer,
+    std::shared_ptr<ui::vulkan::Fence> completion_fence,
     const std::vector<Shader::TextureBinding>& vertex_bindings,
     const std::vector<Shader::TextureBinding>& pixel_bindings) {
   // Clear state.
   auto update_set_info = &update_set_info_;
   update_set_info->has_setup_fetch_mask = 0;
-  update_set_info->image_1d_write_count = 0;
-  update_set_info->image_2d_write_count = 0;
-  update_set_info->image_3d_write_count = 0;
-  update_set_info->image_cube_write_count = 0;
+  update_set_info->image_write_count = 0;

   std::memset(update_set_info, 0, sizeof(update_set_info_));

@@ -178,10 +912,12 @@ VkDescriptorSet TextureCache::PrepareTextureSet(
   // This does things lazily and de-dupes fetch constants reused in both
   // shaders.
   bool any_failed = false;
-  any_failed =
-      !SetupTextureBindings(update_set_info, vertex_bindings) || any_failed;
-  any_failed =
-      !SetupTextureBindings(update_set_info, pixel_bindings) || any_failed;
+  any_failed = !SetupTextureBindings(command_buffer, completion_fence,
+                                     update_set_info, vertex_bindings) ||
+               any_failed;
+  any_failed = !SetupTextureBindings(command_buffer, completion_fence,
+                                     update_set_info, pixel_bindings) ||
+               any_failed;
   if (any_failed) {
     XELOGW("Failed to setup one or more texture bindings");
     // TODO(benvanik): actually bail out here?
@@ -199,75 +935,87 @@ VkDescriptorSet TextureCache::PrepareTextureSet(
       vkAllocateDescriptorSets(*device_, &set_alloc_info, &descriptor_set);
   CheckResult(err, "vkAllocateDescriptorSets");

-  // Write all updated descriptors.
-  // TODO(benvanik): optimize? split into multiple sets? set per type?
- VkWriteDescriptorSet descriptor_writes[4]; - std::memset(descriptor_writes, 0, sizeof(descriptor_writes)); - uint32_t descriptor_write_count = 0; - if (update_set_info->sampler_write_count) { - auto& sampler_write = descriptor_writes[descriptor_write_count++]; - sampler_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - sampler_write.pNext = nullptr; - sampler_write.dstSet = descriptor_set; - sampler_write.dstBinding = 0; - sampler_write.dstArrayElement = 0; - sampler_write.descriptorCount = update_set_info->sampler_write_count; - sampler_write.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLER; - sampler_write.pImageInfo = update_set_info->sampler_infos; - } - if (update_set_info->image_1d_write_count) { - auto& image_write = descriptor_writes[descriptor_write_count++]; - image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - image_write.pNext = nullptr; - image_write.dstSet = descriptor_set; - image_write.dstBinding = 1; - image_write.dstArrayElement = 0; - image_write.descriptorCount = update_set_info->image_1d_write_count; - image_write.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; - image_write.pImageInfo = update_set_info->image_1d_infos; - } - if (update_set_info->image_2d_write_count) { - auto& image_write = descriptor_writes[descriptor_write_count++]; - image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - image_write.pNext = nullptr; - image_write.dstSet = descriptor_set; - image_write.dstBinding = 2; - image_write.dstArrayElement = 0; - image_write.descriptorCount = update_set_info->image_2d_write_count; - image_write.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; - image_write.pImageInfo = update_set_info->image_2d_infos; - } - if (update_set_info->image_3d_write_count) { - auto& image_write = descriptor_writes[descriptor_write_count++]; - image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - image_write.pNext = nullptr; - image_write.dstSet = descriptor_set; - image_write.dstBinding = 3; - image_write.dstArrayElement = 0; - image_write.descriptorCount = update_set_info->image_3d_write_count; - image_write.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; - image_write.pImageInfo = update_set_info->image_3d_infos; - } - if (update_set_info->image_cube_write_count) { - auto& image_write = descriptor_writes[descriptor_write_count++]; - image_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - image_write.pNext = nullptr; - image_write.dstSet = descriptor_set; - image_write.dstBinding = 4; - image_write.dstArrayElement = 0; - image_write.descriptorCount = update_set_info->image_cube_write_count; - image_write.descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE; - image_write.pImageInfo = update_set_info->image_cube_infos; - } - if (descriptor_write_count) { - vkUpdateDescriptorSets(*device_, descriptor_write_count, descriptor_writes, - 0, nullptr); + if (err != VK_SUCCESS) { + return nullptr; } + // Write all updated descriptors. + // TODO(benvanik): optimize? split into multiple sets? set per type? + // First: Reorganize and pool image update infos. + struct DescriptorInfo { + Dimension dimension; + uint32_t tf_binding_base; + std::vector infos; + }; + + std::vector descriptor_update_infos; + for (uint32_t i = 0; i < update_set_info->image_write_count; i++) { + auto& image_info = update_set_info->image_infos[i]; + if (descriptor_update_infos.size() > 0) { + // Check last write to see if we can pool more into it. 
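+      // Writes are poolable only when they share an image dimension and form
+      // a contiguous run of fetch-constant bindings.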
+ DescriptorInfo& last_write = + descriptor_update_infos[descriptor_update_infos.size() - 1]; + if (last_write.dimension == image_info.dimension && + last_write.tf_binding_base + last_write.infos.size() == + image_info.tf_binding) { + // Compatible! Pool into it. + last_write.infos.push_back(image_info.info); + continue; + } + } + + // Push a new descriptor write entry. + DescriptorInfo desc_info; + desc_info.dimension = image_info.dimension; + desc_info.tf_binding_base = image_info.tf_binding; + desc_info.infos.push_back(image_info.info); + descriptor_update_infos.push_back(desc_info); + } + + // Finalize the writes so they're consumable by Vulkan. + std::vector descriptor_writes; + descriptor_writes.resize(descriptor_update_infos.size()); + for (size_t i = 0; i < descriptor_update_infos.size(); i++) { + auto& update_info = descriptor_update_infos[i]; + auto& write_info = descriptor_writes[i]; + std::memset(&write_info, 0, sizeof(VkWriteDescriptorSet)); + + write_info.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + write_info.dstSet = descriptor_set; + + switch (update_info.dimension) { + case Dimension::k1D: + write_info.dstBinding = 0; + break; + case Dimension::k2D: + write_info.dstBinding = 1; + break; + case Dimension::k3D: + write_info.dstBinding = 2; + break; + case Dimension::kCube: + write_info.dstBinding = 3; + break; + } + + write_info.dstArrayElement = update_info.tf_binding_base; + write_info.descriptorCount = uint32_t(update_info.infos.size()); + write_info.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + write_info.pImageInfo = update_info.infos.data(); + } + + if (descriptor_writes.size() > 0) { + vkUpdateDescriptorSets(*device_, uint32_t(descriptor_writes.size()), + descriptor_writes.data(), 0, nullptr); + } + + in_flight_sets_.push_back({descriptor_set, completion_fence}); return descriptor_set; } bool TextureCache::SetupTextureBindings( + VkCommandBuffer command_buffer, + std::shared_ptr completion_fence, UpdateSetInfo* update_set_info, const std::vector& bindings) { bool any_failed = false; @@ -275,15 +1023,23 @@ bool TextureCache::SetupTextureBindings( uint32_t fetch_bit = 1 << binding.fetch_constant; if ((update_set_info->has_setup_fetch_mask & fetch_bit) == 0) { // Needs setup. 
- any_failed = !SetupTextureBinding(update_set_info, binding) || any_failed; + any_failed = !SetupTextureBinding(command_buffer, completion_fence, + update_set_info, binding) || + any_failed; update_set_info->has_setup_fetch_mask |= fetch_bit; } } return !any_failed; } -bool TextureCache::SetupTextureBinding(UpdateSetInfo* update_set_info, - const Shader::TextureBinding& binding) { +bool TextureCache::SetupTextureBinding( + VkCommandBuffer command_buffer, + std::shared_ptr completion_fence, + UpdateSetInfo* update_set_info, const Shader::TextureBinding& binding) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + auto& regs = *register_file_; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + binding.fetch_constant * 6; auto group = @@ -308,47 +1064,100 @@ bool TextureCache::SetupTextureBinding(UpdateSetInfo* update_set_info, return false; // invalid texture used } + auto texture = Demand(texture_info, command_buffer, completion_fence); + auto sampler = Demand(sampler_info); + // assert_true(texture != nullptr && sampler != nullptr); + if (texture == nullptr || sampler == nullptr) { + return false; + } + + uint16_t swizzle = static_cast(fetch.swizzle); + auto view = DemandView(texture, swizzle); + trace_writer_->WriteMemoryRead(texture_info.guest_address, texture_info.input_length); - // TODO(benvanik): reuse. - VkSamplerCreateInfo sampler_create_info; - sampler_create_info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; - sampler_create_info.pNext = nullptr; - sampler_create_info.flags = 0; - sampler_create_info.magFilter = VK_FILTER_NEAREST; - sampler_create_info.minFilter = VK_FILTER_NEAREST; - sampler_create_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST; - sampler_create_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT; - sampler_create_info.mipLodBias = 0.0f; - sampler_create_info.anisotropyEnable = VK_FALSE; - sampler_create_info.maxAnisotropy = 1.0f; - sampler_create_info.compareEnable = VK_FALSE; - sampler_create_info.compareOp = VK_COMPARE_OP_ALWAYS; - sampler_create_info.minLod = 0.0f; - sampler_create_info.maxLod = 0.0f; - sampler_create_info.borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; - sampler_create_info.unnormalizedCoordinates = VK_FALSE; - VkSampler sampler; - auto err = vkCreateSampler(*device_, &sampler_create_info, nullptr, &sampler); - CheckResult(err, "vkCreateSampler"); - - auto& sampler_write = - update_set_info->sampler_infos[update_set_info->sampler_write_count++]; - sampler_write.sampler = sampler; - - auto& image_write = - update_set_info->image_2d_infos[update_set_info->image_2d_write_count++]; - image_write.imageView = grid_image_2d_view_; - image_write.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + auto image_write = + &update_set_info->image_infos[update_set_info->image_write_count++]; + image_write->dimension = texture_info.dimension; + image_write->tf_binding = binding.fetch_constant; + image_write->info.imageView = view->view; + image_write->info.imageLayout = texture->image_layout; + image_write->info.sampler = sampler->sampler; + texture->in_flight_fence = completion_fence; return true; } void TextureCache::ClearCache() { - // TODO(benvanik): caching. + // TODO(DrChat): Nuke everything. 
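+  // A full clear would need to wait on (or poll) every in-flight fence before
+  // freeing, the same check FreeTexture() performs per texture.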
+} + +void TextureCache::Scavenge() { + // Free unused descriptor sets + for (auto it = in_flight_sets_.begin(); it != in_flight_sets_.end();) { + if (vkGetFenceStatus(*device_, *it->second) == VK_SUCCESS) { + // We can free this one. + vkFreeDescriptorSets(*device_, descriptor_pool_, 1, &it->first); + it = in_flight_sets_.erase(it); + continue; + } + + // We've encountered an item that hasn't been used yet, so any items + // afterwards are guaranteed to be unused. + break; + } + + staging_buffer_.Scavenge(); + + // Kill all pending delete textures. + if (!pending_delete_textures_.empty()) { + for (auto it = pending_delete_textures_.begin(); + it != pending_delete_textures_.end();) { + if (!FreeTexture(*it)) { + break; + } + + it = pending_delete_textures_.erase(it); + } + } + + // Clean up any invalidated textures. + invalidated_textures_mutex_.lock(); + std::vector& invalidated_textures = *invalidated_textures_; + if (invalidated_textures_ == &invalidated_textures_sets_[0]) { + invalidated_textures_ = &invalidated_textures_sets_[1]; + } else { + invalidated_textures_ = &invalidated_textures_sets_[0]; + } + invalidated_textures_mutex_.unlock(); + if (!invalidated_textures.empty()) { + for (auto it = invalidated_textures.begin(); + it != invalidated_textures.end(); ++it) { + pending_delete_textures_.push_back(*it); + textures_.erase((*it)->texture_info.hash()); + } + + invalidated_textures.clear(); + } + + // Invalidated resolve textures. + invalidated_resolve_textures_mutex_.lock(); + if (!invalidated_resolve_textures_.empty()) { + for (auto it = invalidated_resolve_textures_.begin(); + it != invalidated_resolve_textures_.end(); ++it) { + pending_delete_textures_.push_back(*it); + + auto tex = + std::find(resolve_textures_.begin(), resolve_textures_.end(), *it); + if (tex != resolve_textures_.end()) { + resolve_textures_.erase(tex); + } + } + + invalidated_resolve_textures_.clear(); + } + invalidated_resolve_textures_mutex_.unlock(); } } // namespace vulkan diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h index 9ba3f3577..8f47f33df 100644 --- a/src/xenia/gpu/vulkan/texture_cache.h +++ b/src/xenia/gpu/vulkan/texture_cache.h @@ -10,10 +10,16 @@ #ifndef XENIA_GPU_VULKAN_TEXTURE_CACHE_H_ #define XENIA_GPU_VULKAN_TEXTURE_CACHE_H_ +#include + #include "xenia/gpu/register_file.h" +#include "xenia/gpu/sampler_info.h" #include "xenia/gpu/shader.h" +#include "xenia/gpu/texture_info.h" #include "xenia/gpu/trace_writer.h" +#include "xenia/gpu/vulkan/vulkan_command_processor.h" #include "xenia/gpu/xenos.h" +#include "xenia/ui/vulkan/circular_buffer.h" #include "xenia/ui/vulkan/vulkan.h" #include "xenia/ui/vulkan/vulkan_device.h" @@ -24,8 +30,51 @@ namespace vulkan { // class TextureCache { public: - TextureCache(RegisterFile* register_file, TraceWriter* trace_writer, - ui::vulkan::VulkanDevice* device); + struct TextureView; + + // This represents an uploaded Vulkan texture. + struct Texture { + TextureInfo texture_info; + std::vector> views; + + // True if we know all info about this texture, false otherwise. + // (e.g. we resolve to system memory and may not know the full details about + // this texture) + bool is_full_texture; + VkFormat format; + VkImage image; + VkImageLayout image_layout; + VkDeviceMemory image_memory; + VkDeviceSize memory_offset; + VkDeviceSize memory_size; + + uintptr_t access_watch_handle; + bool pending_invalidation; + + // Pointer to the latest usage fence. 
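+    // Signaled once the last submission referencing this texture completes;
+    // FreeTexture() checks it before destroying anything.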
+ std::shared_ptr in_flight_fence; + }; + + struct TextureView { + Texture* texture; + VkImageView view; + + union { + struct { + // FIXME: This only applies on little-endian platforms! + uint16_t swiz_x : 3; + uint16_t swiz_y : 3; + uint16_t swiz_z : 3; + uint16_t swiz_w : 3; + uint16_t : 4; + }; + + uint16_t swizzle; + }; + }; + + TextureCache(Memory* memory, RegisterFile* register_file, + TraceWriter* trace_writer, ui::vulkan::VulkanDevice* device); ~TextureCache(); // Descriptor set layout containing all possible texture bindings. @@ -36,8 +85,11 @@ class TextureCache { // Prepares a descriptor set containing the samplers and images for all // bindings. The textures will be uploaded/converted/etc as needed. + // Requires a fence to be provided that will be signaled when finished + // using the returned descriptor set. VkDescriptorSet PrepareTextureSet( - VkCommandBuffer command_buffer, + VkCommandBuffer setup_command_buffer, + std::shared_ptr completion_fence, const std::vector& vertex_bindings, const std::vector& pixel_bindings); @@ -45,45 +97,106 @@ class TextureCache { // TODO(benvanik): Resolve. // TODO(benvanik): ReadTexture. + // Looks for a texture either containing or matching these parameters. + // Caller is responsible for checking if the texture returned is an exact + // match or just contains the texture given by the parameters. + // If offset_x and offset_y are not null, this may return a texture that + // contains this address at an offset. + Texture* LookupAddress(uint32_t guest_address, uint32_t width, + uint32_t height, TextureFormat format, + VkOffset2D* out_offset = nullptr); + + // Demands a texture for the purpose of resolving from EDRAM. This either + // creates a new texture or returns a previously created texture. texture_info + // is not required to be completely filled out, just guest_address and all + // sizes. + // + // It's possible that this may return an image that is larger than the + // requested size (e.g. resolving into a bigger texture) or an image that + // must have an offset applied. If so, the caller must handle this. + // At the very least, it's guaranteed that the image will be large enough to + // hold the requested size. + Texture* DemandResolveTexture(const TextureInfo& texture_info, + TextureFormat format, VkOffset2D* out_offset); + // Clears all cached content. void ClearCache(); + // Frees any unused resources + void Scavenge(); + private: struct UpdateSetInfo; - void SetupGridImages(); + // Cached Vulkan sampler. + struct Sampler { + SamplerInfo sampler_info; + VkSampler sampler; + }; + + // Allocates a new texture and memory to back it on the GPU. + Texture* AllocateTexture(const TextureInfo& texture_info); + bool FreeTexture(Texture* texture); + + // Demands a texture. If command_buffer is null and the texture hasn't been + // uploaded to graphics memory already, we will return null and bail. + Texture* Demand( + const TextureInfo& texture_info, VkCommandBuffer command_buffer = nullptr, + std::shared_ptr completion_fence = nullptr); + TextureView* DemandView(Texture* texture, uint16_t swizzle); + Sampler* Demand(const SamplerInfo& sampler_info); + + // Queues commands to upload a texture from system memory, applying any + // conversions necessary. This may flush the command buffer to the GPU if we + // run out of staging memory. 
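+  // (The flush isn't implemented yet; running out of staging space currently
+  // trips an assert in UploadTexture2D.)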
+ bool UploadTexture2D(VkCommandBuffer command_buffer, + std::shared_ptr completion_fence, + Texture* dest, TextureInfo src); bool SetupTextureBindings( + VkCommandBuffer command_buffer, + std::shared_ptr completion_fence, UpdateSetInfo* update_set_info, const std::vector& bindings); - bool SetupTextureBinding(UpdateSetInfo* update_set_info, + bool SetupTextureBinding(VkCommandBuffer command_buffer, + std::shared_ptr completion_fence, + UpdateSetInfo* update_set_info, const Shader::TextureBinding& binding); + Memory* memory_ = nullptr; + RegisterFile* register_file_ = nullptr; TraceWriter* trace_writer_ = nullptr; ui::vulkan::VulkanDevice* device_ = nullptr; VkDescriptorPool descriptor_pool_ = nullptr; VkDescriptorSetLayout texture_descriptor_set_layout_ = nullptr; + std::list>> + in_flight_sets_; - VkDeviceMemory grid_image_2d_memory_ = nullptr; - VkImage grid_image_2d_ = nullptr; - VkImageView grid_image_2d_view_ = nullptr; + ui::vulkan::CircularBuffer staging_buffer_; + std::unordered_map textures_; + std::unordered_map samplers_; + std::vector resolve_textures_; + std::list pending_delete_textures_; + + std::mutex invalidated_textures_mutex_; + std::vector* invalidated_textures_; + std::vector invalidated_textures_sets_[2]; + + std::mutex invalidated_resolve_textures_mutex_; + std::vector invalidated_resolve_textures_; struct UpdateSetInfo { // Bitmap of all 32 fetch constants and whether they have been setup yet. // This prevents duplication across the vertex and pixel shader. uint32_t has_setup_fetch_mask; - uint32_t sampler_write_count = 0; - VkDescriptorImageInfo sampler_infos[32]; - uint32_t image_1d_write_count = 0; - VkDescriptorImageInfo image_1d_infos[32]; - uint32_t image_2d_write_count = 0; - VkDescriptorImageInfo image_2d_infos[32]; - uint32_t image_3d_write_count = 0; - VkDescriptorImageInfo image_3d_infos[32]; - uint32_t image_cube_write_count = 0; - VkDescriptorImageInfo image_cube_infos[32]; + uint32_t image_write_count = 0; + struct ImageSetInfo { + Dimension dimension; + uint32_t tf_binding; + VkDescriptorImageInfo info; + } image_infos[32]; } update_set_info_; }; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index f04ec1ad3..f31b28142 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -37,9 +37,22 @@ VulkanCommandProcessor::VulkanCommandProcessor( VulkanCommandProcessor::~VulkanCommandProcessor() = default; +void VulkanCommandProcessor::RequestFrameTrace(const std::wstring& root_path) { + // Override traces if renderdoc is attached. + if (device_->is_renderdoc_attached()) { + trace_requested_ = true; + return; + } + + return CommandProcessor::RequestFrameTrace(root_path); +} + void VulkanCommandProcessor::ClearCaches() { CommandProcessor::ClearCaches(); + auto status = vkQueueWaitIdle(queue_); + CheckResult(status, "vkQueueWaitIdle"); + buffer_cache_->ClearCache(); pipeline_cache_->ClearCache(); render_cache_->ClearCache(); @@ -69,8 +82,8 @@ bool VulkanCommandProcessor::SetupContext() { // Initialize the state machine caches. 
   buffer_cache_ = std::make_unique<BufferCache>(register_file_, device_,
                                                 kDefaultBufferCacheCapacity);
-  texture_cache_ =
-      std::make_unique<TextureCache>(register_file_, &trace_writer_, device_);
+  texture_cache_ = std::make_unique<TextureCache>(memory_, register_file_,
+                                                  &trace_writer_, device_);
   pipeline_cache_ = std::make_unique<PipelineCache>(
       register_file_, device_, buffer_cache_->constant_descriptor_set_layout(),
       texture_cache_->texture_descriptor_set_layout());
@@ -82,6 +95,11 @@ bool VulkanCommandProcessor::SetupContext() {
 void VulkanCommandProcessor::ShutdownContext() {
   // TODO(benvanik): wait until idle.

+  if (swap_state_.front_buffer_texture) {
+    // Free swap chain images.
+    DestroySwapImages();
+  }
+
   buffer_cache_.reset();
   pipeline_cache_.reset();
   render_cache_.reset();
@@ -90,7 +108,7 @@ void VulkanCommandProcessor::ShutdownContext() {
   // Free all pools. This must come after all of our caches clean up.
   command_buffer_pool_.reset();

-  // Release queue, if were using an acquired one.
+  // Release queue, if we were using an acquired one.
   if (!queue_mutex_) {
     device_->ReleaseQueue(queue_);
     queue_ = nullptr;
@@ -131,24 +149,241 @@ void VulkanCommandProcessor::ReturnFromWait() {
   CommandProcessor::ReturnFromWait();
 }

+void VulkanCommandProcessor::CreateSwapImages(VkCommandBuffer setup_buffer,
+                                              VkExtent2D extents) {
+  VkImageCreateInfo image_info;
+  std::memset(&image_info, 0, sizeof(VkImageCreateInfo));
+  image_info.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
+  image_info.imageType = VK_IMAGE_TYPE_2D;
+  image_info.format = VK_FORMAT_R8G8B8A8_UNORM;
+  image_info.extent = {extents.width, extents.height, 1};
+  image_info.mipLevels = 1;
+  image_info.arrayLayers = 1;
+  image_info.samples = VK_SAMPLE_COUNT_1_BIT;
+  image_info.tiling = VK_IMAGE_TILING_OPTIMAL;
+  image_info.usage =
+      VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT;
+  image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+  image_info.queueFamilyIndexCount = 0;
+  image_info.pQueueFamilyIndices = nullptr;
+  image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+
+  VkImage image_fb, image_bb;
+  auto status = vkCreateImage(*device_, &image_info, nullptr, &image_fb);
+  CheckResult(status, "vkCreateImage");
+
+  status = vkCreateImage(*device_, &image_info, nullptr, &image_bb);
+  CheckResult(status, "vkCreateImage");
+
+  // Bind memory to images.
+  VkMemoryRequirements mem_requirements;
+  vkGetImageMemoryRequirements(*device_, image_fb, &mem_requirements);
+  fb_memory = device_->AllocateMemory(mem_requirements, 0);
+  assert_not_null(fb_memory);
+
+  status = vkBindImageMemory(*device_, image_fb, fb_memory, 0);
+  CheckResult(status, "vkBindImageMemory");
+
+  vkGetImageMemoryRequirements(*device_, image_bb, &mem_requirements);
+  bb_memory = device_->AllocateMemory(mem_requirements, 0);
+  assert_not_null(bb_memory);
+
+  status = vkBindImageMemory(*device_, image_bb, bb_memory, 0);
+  CheckResult(status, "vkBindImageMemory");
+
+  std::lock_guard<std::mutex> lock(swap_state_.mutex);
+  swap_state_.front_buffer_texture = reinterpret_cast<uintptr_t>(image_fb);
+  swap_state_.back_buffer_texture = reinterpret_cast<uintptr_t>(image_bb);
+
+  // Transition both images to general layout.
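+  // Using VK_IMAGE_LAYOUT_UNDEFINED as oldLayout lets the driver discard any
+  // previous contents; both images then stay in GENERAL so they can serve as
+  // blit source/destination without further transitions.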
+ VkImageMemoryBarrier barrier; + std::memset(&barrier, 0, sizeof(VkImageMemoryBarrier)); + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.srcAccessMask = 0; + barrier.dstAccessMask = 0; + barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = image_fb; + barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + + vkCmdPipelineBarrier(setup_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &barrier); + + barrier.image = image_bb; + + vkCmdPipelineBarrier(setup_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &barrier); +} + +void VulkanCommandProcessor::DestroySwapImages() { + std::lock_guard lock(swap_state_.mutex); + vkDestroyImage(*device_, + reinterpret_cast(swap_state_.front_buffer_texture), + nullptr); + vkDestroyImage(*device_, + reinterpret_cast(swap_state_.back_buffer_texture), + nullptr); + vkFreeMemory(*device_, fb_memory, nullptr); + vkFreeMemory(*device_, bb_memory, nullptr); + + swap_state_.front_buffer_texture = 0; + swap_state_.back_buffer_texture = 0; + fb_memory = nullptr; + bb_memory = nullptr; +} + void VulkanCommandProcessor::PerformSwap(uint32_t frontbuffer_ptr, uint32_t frontbuffer_width, uint32_t frontbuffer_height) { - // Ensure we issue any pending draws. - // draw_batcher_.Flush(DrawBatcher::FlushMode::kMakeCoherent); + SCOPE_profile_cpu_f("gpu"); - // Need to finish to be sure the other context sees the right data. - // TODO(benvanik): prevent this? fences? - // glFinish(); - - if (context_->WasLost()) { - // We've lost the context due to a TDR. - // TODO: Dump the current commands to a tracefile. - assert_always(); + // Build a final command buffer that copies the game's frontbuffer texture + // into our backbuffer texture. + VkCommandBuffer copy_commands = nullptr; + bool opened_batch; + if (command_buffer_pool_->has_open_batch()) { + copy_commands = command_buffer_pool_->AcquireEntry(); + opened_batch = false; + } else { + command_buffer_pool_->BeginBatch(); + copy_commands = command_buffer_pool_->AcquireEntry(); + current_batch_fence_.reset(new ui::vulkan::Fence(*device_)); + opened_batch = true; } - // Remove any dead textures, etc. - // texture_cache_.Scavenge(); + VkCommandBufferBeginInfo begin_info; + std::memset(&begin_info, 0, sizeof(begin_info)); + begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + auto status = vkBeginCommandBuffer(copy_commands, &begin_info); + CheckResult(status, "vkBeginCommandBuffer"); + + if (!frontbuffer_ptr) { + // Trace viewer does this. + frontbuffer_ptr = last_copy_base_; + } + + if (!swap_state_.back_buffer_texture) { + CreateSwapImages(copy_commands, {frontbuffer_width, frontbuffer_height}); + } + auto swap_bb = reinterpret_cast(swap_state_.back_buffer_texture); + + // Issue the commands to copy the game's frontbuffer to our backbuffer. + auto texture = texture_cache_->LookupAddress( + frontbuffer_ptr, xe::round_up(frontbuffer_width, 32), + xe::round_up(frontbuffer_height, 32), TextureFormat::k_8_8_8_8); + if (texture) { + texture->in_flight_fence = current_batch_fence_; + + // Insert a barrier so the GPU finishes writing to the image. 
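+    // oldLayout equals newLayout here: no transition is needed, only
+    // visibility of the prior color/transfer writes to the upcoming blit read.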
+ VkImageMemoryBarrier barrier; + std::memset(&barrier, 0, sizeof(VkImageMemoryBarrier)); + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.srcAccessMask = + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barrier.oldLayout = texture->image_layout; + barrier.newLayout = texture->image_layout; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = texture->image; + barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + + vkCmdPipelineBarrier(copy_commands, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &barrier); + + // Now issue a blit command. + VkImageBlit blit; + std::memset(&blit, 0, sizeof(VkImageBlit)); + blit.srcSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; + blit.srcOffsets[0] = {0, 0, 0}; + blit.srcOffsets[1] = {int32_t(frontbuffer_width), + int32_t(frontbuffer_height), 1}; + blit.dstSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; + blit.dstOffsets[0] = {0, 0, 0}; + blit.dstOffsets[1] = {int32_t(frontbuffer_width), + int32_t(frontbuffer_height), 1}; + + vkCmdBlitImage(copy_commands, texture->image, texture->image_layout, + swap_bb, VK_IMAGE_LAYOUT_GENERAL, 1, &blit, + VK_FILTER_LINEAR); + + std::lock_guard lock(swap_state_.mutex); + swap_state_.width = frontbuffer_width; + swap_state_.height = frontbuffer_height; + } + + status = vkEndCommandBuffer(copy_commands); + CheckResult(status, "vkEndCommandBuffer"); + + // Queue up current command buffers. + // TODO(benvanik): bigger batches. + std::vector submit_buffers; + if (current_command_buffer_) { + if (current_render_state_) { + render_cache_->EndRenderPass(); + current_render_state_ = nullptr; + } + + status = vkEndCommandBuffer(current_setup_buffer_); + CheckResult(status, "vkEndCommandBuffer"); + status = vkEndCommandBuffer(current_command_buffer_); + CheckResult(status, "vkEndCommandBuffer"); + + // TODO(DrChat): If the setup buffer is empty, don't bother queueing it up. + submit_buffers.push_back(current_setup_buffer_); + submit_buffers.push_back(current_command_buffer_); + + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + } + + submit_buffers.push_back(copy_commands); + if (!submit_buffers.empty()) { + // TODO(benvanik): move to CP or to host (trace dump, etc). + // This only needs to surround a vkQueueSubmit. + if (queue_mutex_) { + queue_mutex_->lock(); + } + + VkSubmitInfo submit_info; + std::memset(&submit_info, 0, sizeof(VkSubmitInfo)); + submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_info.commandBufferCount = uint32_t(submit_buffers.size()); + submit_info.pCommandBuffers = submit_buffers.data(); + status = vkQueueSubmit(queue_, 1, &submit_info, *current_batch_fence_); + CheckResult(status, "vkQueueSubmit"); + + if (device_->is_renderdoc_attached() && capturing_) { + device_->EndRenderDocFrameCapture(); + capturing_ = false; + } + if (queue_mutex_) { + queue_mutex_->unlock(); + } + } + + command_buffer_pool_->EndBatch(current_batch_fence_); + + // Scavenging. 
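+  // Each cache walks its in-flight lists and releases anything whose fence
+  // has signaled: descriptor sets, staging allocations, and invalidated
+  // textures.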
+ { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_i( + "gpu", + "xe::gpu::vulkan::VulkanCommandProcessor::PerformSwap Scavenging"); +#endif // FINE_GRAINED_DRAW_SCOPES + command_buffer_pool_->Scavenge(); + + texture_cache_->Scavenge(); + buffer_cache_->Scavenge(); + } + + current_batch_fence_ = nullptr; } Shader* VulkanCommandProcessor::LoadShader(ShaderType shader_type, @@ -178,16 +413,16 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, return IssueCopy(); } - // TODO(benvanik): move to CP or to host (trace dump, etc). - if (FLAGS_vulkan_renderdoc_capture_all && device_->is_renderdoc_attached()) { - device_->BeginRenderDocFrameCapture(); + if ((regs[XE_GPU_REG_RB_SURFACE_INFO].u32 & 0x3FFF) == 0) { + // Doesn't actually draw. + return true; } // Shaders will have already been defined by previous loads. - // We need the to do just about anything so validate here. + // We need them to do just about anything so validate here. auto vertex_shader = static_cast(active_vertex_shader()); auto pixel_shader = static_cast(active_pixel_shader()); - if (!vertex_shader || !vertex_shader->is_valid()) { + if (!vertex_shader) { // Always need a vertex shader. return true; } @@ -196,61 +431,142 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, // Use a dummy pixel shader when required. // TODO(benvanik): dummy pixel shader. assert_not_null(pixel_shader); - } else if (!pixel_shader || !pixel_shader->is_valid()) { + } else if (!pixel_shader) { // Need a pixel shader in normal color mode. return true; } - // TODO(benvanik): bigger batches. - command_buffer_pool_->BeginBatch(); - VkCommandBuffer command_buffer = command_buffer_pool_->AcquireEntry(); - VkCommandBufferBeginInfo command_buffer_begin_info; - command_buffer_begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; - command_buffer_begin_info.pNext = nullptr; - command_buffer_begin_info.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; - command_buffer_begin_info.pInheritanceInfo = nullptr; - auto err = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); - CheckResult(err, "vkBeginCommandBuffer"); + bool started_command_buffer = false; + if (!current_command_buffer_) { + // TODO(benvanik): bigger batches. + // TODO(DrChat): Decouple setup buffer from current batch. 
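+    // The setup buffer records resource uploads (e.g. texture staging copies)
+    // and is queued ahead of the draw buffer in PerformSwap, so uploads land
+    // before the draws that consume them.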
+ command_buffer_pool_->BeginBatch(); + current_command_buffer_ = command_buffer_pool_->AcquireEntry(); + current_setup_buffer_ = command_buffer_pool_->AcquireEntry(); + current_batch_fence_.reset(new ui::vulkan::Fence(*device_)); + + VkCommandBufferBeginInfo command_buffer_begin_info; + command_buffer_begin_info.sType = + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + command_buffer_begin_info.pNext = nullptr; + command_buffer_begin_info.flags = + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + command_buffer_begin_info.pInheritanceInfo = nullptr; + auto status = vkBeginCommandBuffer(current_command_buffer_, + &command_buffer_begin_info); + CheckResult(status, "vkBeginCommandBuffer"); + + status = + vkBeginCommandBuffer(current_setup_buffer_, &command_buffer_begin_info); + CheckResult(status, "vkBeginCommandBuffer"); + + static uint32_t frame = 0; + if (device_->is_renderdoc_attached() && !capturing_ && + (FLAGS_vulkan_renderdoc_capture_all || trace_requested_)) { + if (queue_mutex_) { + queue_mutex_->lock(); + } + + capturing_ = true; + trace_requested_ = false; + device_->BeginRenderDocFrameCapture(); + + if (queue_mutex_) { + queue_mutex_->unlock(); + } + } + + started_command_buffer = true; + } + auto command_buffer = current_command_buffer_; + auto setup_buffer = current_setup_buffer_; // Begin the render pass. // This will setup our framebuffer and begin the pass in the command buffer. - auto render_state = render_cache_->BeginRenderPass( - command_buffer, vertex_shader, pixel_shader); - if (!render_state) { - return false; + // This reuses a previous render pass if one is already open. + if (render_cache_->dirty() || !current_render_state_) { + if (current_render_state_) { + render_cache_->EndRenderPass(); + current_render_state_ = nullptr; + } + + current_render_state_ = render_cache_->BeginRenderPass( + command_buffer, vertex_shader, pixel_shader); + if (!current_render_state_) { + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; + return false; + } } // Configure the pipeline for drawing. // This encodes all render state (blend, depth, etc), our shader stages, // and our vertex input layout. - if (!pipeline_cache_->ConfigurePipeline(command_buffer, render_state, - vertex_shader, pixel_shader, - primitive_type)) { + VkPipeline pipeline = nullptr; + auto pipeline_status = pipeline_cache_->ConfigurePipeline( + command_buffer, current_render_state_, vertex_shader, pixel_shader, + primitive_type, &pipeline); + if (pipeline_status == PipelineCache::UpdateStatus::kMismatch || + started_command_buffer) { + vkCmdBindPipeline(command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, + pipeline); + } else if (pipeline_status == PipelineCache::UpdateStatus::kError) { render_cache_->EndRenderPass(); + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; + current_render_state_ = nullptr; return false; } + pipeline_cache_->SetDynamicState(command_buffer, started_command_buffer); // Pass registers to the shaders. if (!PopulateConstants(command_buffer, vertex_shader, pixel_shader)) { render_cache_->EndRenderPass(); + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; + current_render_state_ = nullptr; return false; } // Upload and bind index buffer data (if we have any). 
if (!PopulateIndexBuffer(command_buffer, index_buffer_info)) { render_cache_->EndRenderPass(); + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; + current_render_state_ = nullptr; return false; } // Upload and bind all vertex buffer data. if (!PopulateVertexBuffers(command_buffer, vertex_shader)) { render_cache_->EndRenderPass(); + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; + current_render_state_ = nullptr; return false; } - // Upload and set descriptors for all textures. - if (!PopulateSamplers(command_buffer, vertex_shader, pixel_shader)) { + // Bind samplers/textures. + // Uploads all textures that need it. + // Setup buffer may be flushed to GPU if the texture cache needs it. + if (!PopulateSamplers(command_buffer, setup_buffer, vertex_shader, + pixel_shader)) { render_cache_->EndRenderPass(); + command_buffer_pool_->CancelBatch(); + current_command_buffer_ = nullptr; + current_setup_buffer_ = nullptr; + current_batch_fence_ = nullptr; + current_render_state_ = nullptr; return false; } @@ -273,68 +589,21 @@ bool VulkanCommandProcessor::IssueDraw(PrimitiveType primitive_type, vertex_offset, first_instance); } - // End the rendering pass. - render_cache_->EndRenderPass(); - - // TODO(benvanik): bigger batches. - err = vkEndCommandBuffer(command_buffer); - CheckResult(err, "vkEndCommandBuffer"); - VkFence fence; - VkFenceCreateInfo fence_info; - fence_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fence_info.pNext = nullptr; - fence_info.flags = 0; - vkCreateFence(*device_, &fence_info, nullptr, &fence); - command_buffer_pool_->EndBatch(fence); - VkSubmitInfo submit_info; - submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submit_info.pNext = nullptr; - submit_info.waitSemaphoreCount = 0; - submit_info.pWaitSemaphores = nullptr; - submit_info.commandBufferCount = 1; - submit_info.pCommandBuffers = &command_buffer; - submit_info.signalSemaphoreCount = 0; - submit_info.pSignalSemaphores = nullptr; - if (queue_mutex_) { - queue_mutex_->lock(); - } - err = vkQueueSubmit(queue_, 1, &submit_info, fence); - if (queue_mutex_) { - queue_mutex_->unlock(); - } - CheckResult(err, "vkQueueSubmit"); - if (queue_mutex_) { - queue_mutex_->lock(); - } - err = vkQueueWaitIdle(queue_); - CheckResult(err, "vkQueueWaitIdle"); - err = vkDeviceWaitIdle(*device_); - CheckResult(err, "vkDeviceWaitIdle"); - if (queue_mutex_) { - queue_mutex_->unlock(); - } - while (command_buffer_pool_->has_pending()) { - command_buffer_pool_->Scavenge(); - xe::threading::MaybeYield(); - } - vkDestroyFence(*device_, fence, nullptr); - - // TODO(benvanik): move to CP or to host (trace dump, etc). - if (FLAGS_vulkan_renderdoc_capture_all && device_->is_renderdoc_attached()) { - device_->EndRenderDocFrameCapture(); - } - return true; } bool VulkanCommandProcessor::PopulateConstants(VkCommandBuffer command_buffer, VulkanShader* vertex_shader, VulkanShader* pixel_shader) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + // Upload the constants the shaders require. // These are optional, and if none are defined 0 will be returned. 
auto constant_offsets = buffer_cache_->UploadConstantRegisters( vertex_shader->constant_register_map(), - pixel_shader->constant_register_map()); + pixel_shader->constant_register_map(), current_batch_fence_); if (constant_offsets.first == VK_WHOLE_SIZE || constant_offsets.second == VK_WHOLE_SIZE) { // Shader wants constants but we couldn't upload them. @@ -387,8 +656,8 @@ bool VulkanCommandProcessor::PopulateIndexBuffer( size_t source_length = info.count * (info.format == IndexFormat::kInt32 ? sizeof(uint32_t) : sizeof(uint16_t)); - auto buffer_ref = - buffer_cache_->UploadIndexBuffer(source_ptr, source_length, info.format); + auto buffer_ref = buffer_cache_->UploadIndexBuffer( + source_ptr, source_length, info.format, current_batch_fence_); if (buffer_ref.second == VK_WHOLE_SIZE) { // Failed to upload buffer. return false; @@ -413,6 +682,11 @@ bool VulkanCommandProcessor::PopulateVertexBuffers( #endif // FINE_GRAINED_DRAW_SCOPES auto& vertex_bindings = vertex_shader->vertex_bindings(); + if (vertex_bindings.empty()) { + // No bindings. + return true; + } + assert_true(vertex_bindings.size() <= 32); VkBuffer all_buffers[32]; VkDeviceSize all_buffer_offsets[32]; @@ -434,7 +708,6 @@ bool VulkanCommandProcessor::PopulateVertexBuffers( fetch = &group->vertex_fetch_2; break; } - assert_true(fetch->endian == 2); // TODO(benvanik): compute based on indices or vertex count. // THIS CAN BE MASSIVELY INCORRECT (too large). @@ -446,8 +719,9 @@ bool VulkanCommandProcessor::PopulateVertexBuffers( const void* source_ptr = memory_->TranslatePhysical(fetch->address << 2); size_t source_length = valid_range; - auto buffer_ref = - buffer_cache_->UploadVertexBuffer(source_ptr, source_length); + auto buffer_ref = buffer_cache_->UploadVertexBuffer( + source_ptr, source_length, static_cast(fetch->endian), + current_batch_fence_); if (buffer_ref.second == VK_WHOLE_SIZE) { // Failed to upload buffer. return false; @@ -467,6 +741,7 @@ bool VulkanCommandProcessor::PopulateVertexBuffers( } bool VulkanCommandProcessor::PopulateSamplers(VkCommandBuffer command_buffer, + VkCommandBuffer setup_buffer, VulkanShader* vertex_shader, VulkanShader* pixel_shader) { #if FINE_GRAINED_DRAW_SCOPES @@ -474,14 +749,13 @@ bool VulkanCommandProcessor::PopulateSamplers(VkCommandBuffer command_buffer, #endif // FINE_GRAINED_DRAW_SCOPES auto descriptor_set = texture_cache_->PrepareTextureSet( - command_buffer, vertex_shader->texture_bindings(), + setup_buffer, current_batch_fence_, vertex_shader->texture_bindings(), pixel_shader->texture_bindings()); if (!descriptor_set) { // Unable to bind set. return false; } - // Bind samplers/textures. vkCmdBindDescriptorSets(command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_cache_->pipeline_layout(), 1, 1, &descriptor_set, 0, nullptr); @@ -491,7 +765,294 @@ bool VulkanCommandProcessor::PopulateSamplers(VkCommandBuffer command_buffer, bool VulkanCommandProcessor::IssueCopy() { SCOPE_profile_cpu_f("gpu"); - // TODO(benvanik): resolve. + auto& regs = *register_file_; + + // This is used to resolve surfaces, taking them from EDRAM render targets + // to system memory. It can optionally clear color/depth surfaces, too. + // The command buffer has stuff for actually doing this by drawing, however + // we should be able to do it without that much easier. 
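+
+  // RB_COPY_CONTROL packs the copy source select in bits 0-2 (render targets
+  // 0-3, or 4 for depth), the color/depth clear enables in bits 8-9, and the
+  // copy command in bits 20-21, as decoded below.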
+
+ uint32_t copy_control = regs[XE_GPU_REG_RB_COPY_CONTROL].u32;
+ // Render targets 0-3, 4 = depth
+ uint32_t copy_src_select = copy_control & 0x7;
+ bool color_clear_enabled = (copy_control >> 8) & 0x1;
+ bool depth_clear_enabled = (copy_control >> 9) & 0x1;
+ auto copy_command = static_cast<CopyCommand>((copy_control >> 20) & 0x3);
+
+ uint32_t copy_dest_info = regs[XE_GPU_REG_RB_COPY_DEST_INFO].u32;
+ auto copy_dest_endian = static_cast<Endian128>(copy_dest_info & 0x7);
+ uint32_t copy_dest_array = (copy_dest_info >> 3) & 0x1;
+ assert_true(copy_dest_array == 0);
+ uint32_t copy_dest_slice = (copy_dest_info >> 4) & 0x7;
+ assert_true(copy_dest_slice == 0);
+ auto copy_dest_format =
+     static_cast<ColorFormat>((copy_dest_info >> 7) & 0x3F);
+ uint32_t copy_dest_number = (copy_dest_info >> 13) & 0x7;
+ // assert_true(copy_dest_number == 0); // ?
+ uint32_t copy_dest_bias = (copy_dest_info >> 16) & 0x3F;
+ // assert_true(copy_dest_bias == 0);
+ uint32_t copy_dest_swap = (copy_dest_info >> 25) & 0x1;
+
+ uint32_t copy_dest_base = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32;
+ uint32_t copy_dest_pitch = regs[XE_GPU_REG_RB_COPY_DEST_PITCH].u32;
+ uint32_t copy_dest_height = (copy_dest_pitch >> 16) & 0x3FFF;
+ copy_dest_pitch &= 0x3FFF;
+
+ // None of this is supported yet:
+ uint32_t copy_surface_slice = regs[XE_GPU_REG_RB_COPY_SURFACE_SLICE].u32;
+ assert_true(copy_surface_slice == 0);
+ uint32_t copy_func = regs[XE_GPU_REG_RB_COPY_FUNC].u32;
+ assert_true(copy_func == 0);
+ uint32_t copy_ref = regs[XE_GPU_REG_RB_COPY_REF].u32;
+ assert_true(copy_ref == 0);
+ uint32_t copy_mask = regs[XE_GPU_REG_RB_COPY_MASK].u32;
+ assert_true(copy_mask == 0);
+
+ // Supported in GL4, not supported here yet.
+ assert_zero(copy_dest_swap);
+
+ // RB_SURFACE_INFO
+ // http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html
+ uint32_t surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32;
+ uint32_t surface_pitch = surface_info & 0x3FFF;
+ auto surface_msaa = static_cast<MsaaSamples>((surface_info >> 16) & 0x3);
+
+ // TODO(benvanik): any way to scissor this? a200 has:
+ // REG_A2XX_RB_COPY_DEST_OFFSET = A2XX_RB_COPY_DEST_OFFSET_X(tile->xoff) |
+ //     A2XX_RB_COPY_DEST_OFFSET_Y(tile->yoff);
+ // but I can't seem to find something similar.
+ uint32_t dest_logical_width = copy_dest_pitch;
+ uint32_t dest_logical_height = copy_dest_height;
+ uint32_t dest_block_width = xe::round_up(dest_logical_width, 32);
+ uint32_t dest_block_height = xe::round_up(dest_logical_height, 32);
+
+ uint32_t window_offset = regs[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32;
+ int16_t window_offset_x = window_offset & 0x7FFF;
+ int16_t window_offset_y = (window_offset >> 16) & 0x7FFF;
+ // Sign-extension
+ if (window_offset_x & 0x4000) {
+   window_offset_x |= 0x8000;
+ }
+ if (window_offset_y & 0x4000) {
+   window_offset_y |= 0x8000;
+ }
+
+ size_t read_size = GetTexelSize(ColorFormatToTextureFormat(copy_dest_format));
+
+ // Adjust the copy base offset to point to the beginning of the texture, so
+ // we don't run into hiccups down the road (e.g. resolving the last part going
+ // backwards).
+ int32_t dest_offset = window_offset_y * copy_dest_pitch * int(read_size);
+ dest_offset += window_offset_x * 32 * int(read_size);
+ copy_dest_base += dest_offset;
+
+ // HACK: vertices to use are always in vf0.
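+ // Fetch constants pack three vertex fetches per six-dword register group,
+ // so slot N decodes as below; worked through for vf0 (N == 0):
+ //   r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (0 / 3) * 6;  // group 0
+ //   fetch = &group->vertex_fetch_0;                           // 0 % 3 == 0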
+ int copy_vertex_fetch_slot = 0;
+ int r =
+     XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (copy_vertex_fetch_slot / 3) * 6;
+ const auto group =
+     reinterpret_cast<const xe_gpu_fetch_group_t*>(&regs.values[r]);
+ const xe_gpu_vertex_fetch_t* fetch = nullptr;
+ switch (copy_vertex_fetch_slot % 3) {
+   case 0:
+     fetch = &group->vertex_fetch_0;
+     break;
+   case 1:
+     fetch = &group->vertex_fetch_1;
+     break;
+   case 2:
+     fetch = &group->vertex_fetch_2;
+     break;
+ }
+ assert_true(fetch->type == 3);
+ assert_true(fetch->endian == 2);
+ assert_true(fetch->size == 6);
+ const uint8_t* vertex_addr = memory_->TranslatePhysical(fetch->address << 2);
+ trace_writer_.WriteMemoryRead(fetch->address << 2, fetch->size * 4);
+ int32_t dest_min_x = int32_t((std::min(
+     std::min(
+         GpuSwap(xe::load<float>(vertex_addr + 0), Endian(fetch->endian)),
+         GpuSwap(xe::load<float>(vertex_addr + 8), Endian(fetch->endian))),
+     GpuSwap(xe::load<float>(vertex_addr + 16), Endian(fetch->endian)))));
+ int32_t dest_max_x = int32_t((std::max(
+     std::max(
+         GpuSwap(xe::load<float>(vertex_addr + 0), Endian(fetch->endian)),
+         GpuSwap(xe::load<float>(vertex_addr + 8), Endian(fetch->endian))),
+     GpuSwap(xe::load<float>(vertex_addr + 16), Endian(fetch->endian)))));
+ int32_t dest_min_y = int32_t((std::min(
+     std::min(
+         GpuSwap(xe::load<float>(vertex_addr + 4), Endian(fetch->endian)),
+         GpuSwap(xe::load<float>(vertex_addr + 12), Endian(fetch->endian))),
+     GpuSwap(xe::load<float>(vertex_addr + 20), Endian(fetch->endian)))));
+ int32_t dest_max_y = int32_t((std::max(
+     std::max(
+         GpuSwap(xe::load<float>(vertex_addr + 4), Endian(fetch->endian)),
+         GpuSwap(xe::load<float>(vertex_addr + 12), Endian(fetch->endian))),
+     GpuSwap(xe::load<float>(vertex_addr + 20), Endian(fetch->endian)))));
+
+ uint32_t color_edram_base = 0;
+ uint32_t depth_edram_base = 0;
+ ColorRenderTargetFormat color_format;
+ DepthRenderTargetFormat depth_format;
+ if (copy_src_select <= 3) {
+   // Source from a color target.
+   uint32_t color_info[4] = {
+       regs[XE_GPU_REG_RB_COLOR_INFO].u32, regs[XE_GPU_REG_RB_COLOR1_INFO].u32,
+       regs[XE_GPU_REG_RB_COLOR2_INFO].u32,
+       regs[XE_GPU_REG_RB_COLOR3_INFO].u32,
+   };
+   color_edram_base = color_info[copy_src_select] & 0xFFF;
+
+   color_format = static_cast<ColorRenderTargetFormat>(
+       (color_info[copy_src_select] >> 16) & 0xF);
+ }
+
+ if (copy_src_select > 3 || depth_clear_enabled) {
+   // Source from a depth target.
+   uint32_t depth_info = regs[XE_GPU_REG_RB_DEPTH_INFO].u32;
+   depth_edram_base = depth_info & 0xFFF;
+
+   depth_format =
+       static_cast<DepthRenderTargetFormat>((depth_info >> 16) & 0x1);
+ }
+
+ // Demand a resolve texture from the texture cache.
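+ // Note: TextureInfo is assumed here to follow the Xenos convention of
+ // storing sizes minus one (hence the width/height below), while the
+ // size_2d.logical_* fields keep the full values.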
+ TextureInfo tex_info = {}; + tex_info.guest_address = copy_dest_base; + tex_info.width = dest_logical_width - 1; + tex_info.height = dest_logical_height - 1; + tex_info.dimension = gpu::Dimension::k2D; + tex_info.input_length = copy_dest_pitch * copy_dest_height * 4; + tex_info.format_info = + FormatInfo::Get(uint32_t(ColorFormatToTextureFormat(copy_dest_format))); + tex_info.size_2d.logical_width = dest_logical_width; + tex_info.size_2d.logical_height = dest_logical_height; + tex_info.size_2d.block_width = dest_block_width; + tex_info.size_2d.block_height = dest_block_height; + tex_info.size_2d.input_width = dest_block_width; + tex_info.size_2d.input_height = dest_block_height; + tex_info.size_2d.input_pitch = copy_dest_pitch * 4; + auto texture = texture_cache_->DemandResolveTexture( + tex_info, ColorFormatToTextureFormat(copy_dest_format), nullptr); + assert_not_null(texture); + texture->in_flight_fence = current_batch_fence_; + + // For debugging purposes only (trace viewer) + last_copy_base_ = texture->texture_info.guest_address; + + if (!current_command_buffer_) { + command_buffer_pool_->BeginBatch(); + current_command_buffer_ = command_buffer_pool_->AcquireEntry(); + current_setup_buffer_ = command_buffer_pool_->AcquireEntry(); + current_batch_fence_.reset(new ui::vulkan::Fence(*device_)); + + VkCommandBufferBeginInfo command_buffer_begin_info; + command_buffer_begin_info.sType = + VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + command_buffer_begin_info.pNext = nullptr; + command_buffer_begin_info.flags = + VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + command_buffer_begin_info.pInheritanceInfo = nullptr; + auto status = vkBeginCommandBuffer(current_command_buffer_, + &command_buffer_begin_info); + CheckResult(status, "vkBeginCommandBuffer"); + + status = + vkBeginCommandBuffer(current_setup_buffer_, &command_buffer_begin_info); + CheckResult(status, "vkBeginCommandBuffer"); + } else if (current_render_state_) { + render_cache_->EndRenderPass(); + current_render_state_ = nullptr; + } + auto command_buffer = current_command_buffer_; + + if (texture->image_layout == VK_IMAGE_LAYOUT_UNDEFINED) { + // Transition the image to a general layout. + VkImageMemoryBarrier image_barrier; + image_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + image_barrier.pNext = nullptr; + image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.srcAccessMask = 0; + image_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + image_barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + image_barrier.image = texture->image; + image_barrier.subresourceRange = {0, 0, 1, 0, 1}; + image_barrier.subresourceRange.aspectMask = + copy_src_select <= 3 + ? VK_IMAGE_ASPECT_COLOR_BIT + : VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + texture->image_layout = VK_IMAGE_LAYOUT_GENERAL; + + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &image_barrier); + } + + VkOffset3D resolve_offset = {dest_min_x, dest_min_y, 0}; + VkExtent3D resolve_extent = {uint32_t(dest_max_x - dest_min_x), + uint32_t(dest_max_y - dest_min_y), 1}; + + // Ask the render cache to copy to the resolve texture. + auto edram_base = copy_src_select <= 3 ? color_edram_base : depth_edram_base; + uint32_t src_format = copy_src_select <= 3 + ? 
static_cast<uint32_t>(color_format)
+                           : static_cast<uint32_t>(depth_format);
+ switch (copy_command) {
+   case CopyCommand::kRaw:
+     /*
+     render_cache_->RawCopyToImage(command_buffer, edram_base, texture->image,
+                                   texture->image_layout, copy_src_select <= 3,
+                                   resolve_offset, resolve_extent);
+     break;
+     */
+   case CopyCommand::kConvert:
+     render_cache_->BlitToImage(
+         command_buffer, edram_base, surface_pitch, resolve_extent.height,
+         surface_msaa, texture->image, texture->image_layout,
+         copy_src_select <= 3, src_format, VK_FILTER_LINEAR, resolve_offset,
+         resolve_extent);
+     break;
+
+   case CopyCommand::kConstantOne:
+   case CopyCommand::kNull:
+     assert_always();
+     break;
+ }
+
+ // Perform any requested clears.
+ uint32_t copy_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32;
+ uint32_t copy_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32;
+ uint32_t copy_color_clear_low = regs[XE_GPU_REG_RB_COLOR_CLEAR_LOW].u32;
+ assert_true(copy_color_clear == copy_color_clear_low);
+
+ if (color_clear_enabled) {
+   // If color clear is enabled, we can only clear a selected color target!
+   assert_true(copy_src_select <= 3);
+
+   // TODO(benvanik): verify color order.
+   float color[] = {((copy_color_clear >> 0) & 0xFF) / 255.0f,
+                    ((copy_color_clear >> 8) & 0xFF) / 255.0f,
+                    ((copy_color_clear >> 16) & 0xFF) / 255.0f,
+                    ((copy_color_clear >> 24) & 0xFF) / 255.0f};
+
+   // TODO(DrChat): Do we know the surface height at this point?
+   render_cache_->ClearEDRAMColor(command_buffer, color_edram_base,
+                                  color_format, surface_pitch,
+                                  resolve_extent.height, surface_msaa, color);
+ }
+
+ if (depth_clear_enabled) {
+   float depth =
+       (copy_depth_clear & 0xFFFFFF00) / static_cast<float>(0xFFFFFF00);
+   uint8_t stencil = copy_depth_clear & 0xFF;
+
+   // TODO(DrChat): Do we know the surface height at this point?
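+   // Worked example of the D24S8 decode above (illustrative values):
+   //   copy_depth_clear = 0x800000FF -> depth ~0.5, stencil 0xFF
+   //   copy_depth_clear = 0xFFFFFF00 -> depth  1.0, stencil 0x00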
+ render_cache_->ClearEDRAMDepthStencil( + command_buffer, depth_edram_base, depth_format, surface_pitch, + resolve_extent.height, surface_msaa, depth, stencil); + } + return true; } diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 43aec9edd..f58e2319b 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -34,12 +34,14 @@ #include "xenia/ui/vulkan/fenced_pools.h" #include "xenia/ui/vulkan/vulkan_context.h" #include "xenia/ui/vulkan/vulkan_device.h" +#include "xenia/ui/vulkan/vulkan_util.h" namespace xe { namespace gpu { namespace vulkan { class VulkanGraphicsSystem; +class TextureCache; class VulkanCommandProcessor : public CommandProcessor { public: @@ -47,8 +49,11 @@ class VulkanCommandProcessor : public CommandProcessor { kernel::KernelState* kernel_state); ~VulkanCommandProcessor() override; + virtual void RequestFrameTrace(const std::wstring& root_path) override; void ClearCaches() override; + RenderCache* render_cache() { return render_cache_.get(); } + private: bool SetupContext() override; void ShutdownContext() override; @@ -57,6 +62,9 @@ class VulkanCommandProcessor : public CommandProcessor { void PrepareForWait() override; void ReturnFromWait() override; + void CreateSwapImages(VkCommandBuffer setup_buffer, VkExtent2D extents); + void DestroySwapImages(); + void PerformSwap(uint32_t frontbuffer_ptr, uint32_t frontbuffer_width, uint32_t frontbuffer_height) override; @@ -74,12 +82,17 @@ class VulkanCommandProcessor : public CommandProcessor { bool PopulateVertexBuffers(VkCommandBuffer command_buffer, VulkanShader* vertex_shader); bool PopulateSamplers(VkCommandBuffer command_buffer, + VkCommandBuffer setup_buffer, VulkanShader* vertex_shader, VulkanShader* pixel_shader); bool IssueCopy() override; xe::ui::vulkan::VulkanDevice* device_ = nullptr; + // front buffer / back buffer memory + VkDeviceMemory fb_memory = nullptr; + VkDeviceMemory bb_memory = nullptr; + // TODO(benvanik): abstract behind context? // Queue used to submit work. This may be a dedicated queue for the command // processor and no locking will be required for use. If a dedicated queue @@ -88,12 +101,22 @@ class VulkanCommandProcessor : public CommandProcessor { VkQueue queue_ = nullptr; std::mutex* queue_mutex_ = nullptr; + // Last copy base address, for debugging only. + uint32_t last_copy_base_ = 0; + bool capturing_ = false; + bool trace_requested_ = false; + std::unique_ptr buffer_cache_; std::unique_ptr pipeline_cache_; std::unique_ptr render_cache_; std::unique_ptr texture_cache_; std::unique_ptr command_buffer_pool_; + + const RenderState* current_render_state_ = nullptr; + VkCommandBuffer current_command_buffer_ = nullptr; + VkCommandBuffer current_setup_buffer_ = nullptr; + std::shared_ptr current_batch_fence_; }; } // namespace vulkan diff --git a/src/xenia/gpu/vulkan/vulkan_gpu_flags.cc b/src/xenia/gpu/vulkan/vulkan_gpu_flags.cc index 1f018db54..fd2fe7789 100644 --- a/src/xenia/gpu/vulkan/vulkan_gpu_flags.cc +++ b/src/xenia/gpu/vulkan/vulkan_gpu_flags.cc @@ -11,3 +11,6 @@ DEFINE_bool(vulkan_renderdoc_capture_all, false, "Capture everything with RenderDoc."); +DEFINE_bool(vulkan_native_msaa, false, "Use native MSAA"); +DEFINE_bool(vulkan_dump_disasm, false, + "Dump shader disassembly. 
NVIDIA only supported."); diff --git a/src/xenia/gpu/vulkan/vulkan_gpu_flags.h b/src/xenia/gpu/vulkan/vulkan_gpu_flags.h index ca83dfb7a..169e797c8 100644 --- a/src/xenia/gpu/vulkan/vulkan_gpu_flags.h +++ b/src/xenia/gpu/vulkan/vulkan_gpu_flags.h @@ -15,5 +15,7 @@ #define FINE_GRAINED_DRAW_SCOPES 1 DECLARE_bool(vulkan_renderdoc_capture_all); +DECLARE_bool(vulkan_native_msaa); +DECLARE_bool(vulkan_dump_disasm); #endif // XENIA_GPU_VULKAN_VULKAN_GPU_FLAGS_H_ diff --git a/src/xenia/gpu/vulkan/vulkan_graphics_system.cc b/src/xenia/gpu/vulkan/vulkan_graphics_system.cc index 74ec57849..08c6120d7 100644 --- a/src/xenia/gpu/vulkan/vulkan_graphics_system.cc +++ b/src/xenia/gpu/vulkan/vulkan_graphics_system.cc @@ -19,14 +19,14 @@ #include "xenia/gpu/vulkan/vulkan_command_processor.h" #include "xenia/gpu/vulkan/vulkan_gpu_flags.h" #include "xenia/ui/vulkan/vulkan_provider.h" +#include "xenia/ui/vulkan/vulkan_swap_chain.h" #include "xenia/ui/window.h" namespace xe { namespace gpu { namespace vulkan { -VulkanGraphicsSystem::VulkanGraphicsSystem() = default; - +VulkanGraphicsSystem::VulkanGraphicsSystem() {} VulkanGraphicsSystem::~VulkanGraphicsSystem() = default; X_STATUS VulkanGraphicsSystem::Setup(cpu::Processor* processor, @@ -74,12 +74,41 @@ void VulkanGraphicsSystem::Swap(xe::ui::UIEvent* e) { return; } - // Blit the frontbuffer. - // display_context_->blitter()->BlitTexture2D( - // static_cast(swap_state.front_buffer_texture), - // Rect2D(0, 0, swap_state.width, swap_state.height), - // Rect2D(0, 0, target_window_->width(), target_window_->height()), - // GL_LINEAR, false); + auto swap_chain = display_context_->swap_chain(); + auto copy_cmd_buffer = swap_chain->copy_cmd_buffer(); + auto front_buffer = + reinterpret_cast(swap_state.front_buffer_texture); + + VkImageMemoryBarrier barrier; + std::memset(&barrier, 0, sizeof(VkImageMemoryBarrier)); + barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL; + barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.image = front_buffer; + barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + vkCmdPipelineBarrier(copy_cmd_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &barrier); + + VkImageBlit region; + region.srcSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; + region.srcOffsets[0] = {0, 0, 0}; + region.srcOffsets[1] = {static_cast(swap_state.width), + static_cast(swap_state.height), 1}; + + region.dstSubresource = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 0, 1}; + region.dstOffsets[0] = {0, 0, 0}; + region.dstOffsets[1] = {static_cast(swap_chain->surface_width()), + static_cast(swap_chain->surface_height()), + 1}; + vkCmdBlitImage(copy_cmd_buffer, front_buffer, VK_IMAGE_LAYOUT_GENERAL, + swap_chain->surface_image(), + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, ®ion, + VK_FILTER_LINEAR); } } // namespace vulkan diff --git a/src/xenia/gpu/vulkan/vulkan_shader.cc b/src/xenia/gpu/vulkan/vulkan_shader.cc index b3c72abf3..c18341a71 100644 --- a/src/xenia/gpu/vulkan/vulkan_shader.cc +++ b/src/xenia/gpu/vulkan/vulkan_shader.cc @@ -44,11 +44,11 @@ bool VulkanShader::Prepare() { shader_info.codeSize = translated_binary_.size(); shader_info.pCode = reinterpret_cast(translated_binary_.data()); - auto err = + auto status = 
vkCreateShaderModule(device_, &shader_info, nullptr, &shader_module_); - CheckResult(err, "vkCreateShaderModule"); + CheckResult(status, "vkCreateShaderModule"); - return true; + return status == VK_SUCCESS; } } // namespace vulkan diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 65c1f0bad..32c33cae8 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -49,6 +49,7 @@ enum class PrimitiveType : uint32_t { kLineLoop = 0x0C, kQuadList = 0x0D, kQuadStrip = 0x0E, + kUnknown0x11 = 0x11, }; enum class Dimension : uint32_t { @@ -382,7 +383,7 @@ XEPACKEDUNION(xe_gpu_vertex_fetch_t, { uint32_t type : 2; uint32_t address : 30; uint32_t endian : 2; - uint32_t size : 24; + uint32_t size : 24; // size in words uint32_t unk1 : 6; }); XEPACKEDSTRUCTANONYMOUS({ @@ -486,6 +487,46 @@ XEPACKEDUNION(xe_gpu_fetch_group_t, { }); }); +enum Event { + SAMPLE_STREAMOUTSTATS1 = (1 << 0), + SAMPLE_STREAMOUTSTATS2 = (2 << 0), + SAMPLE_STREAMOUTSTATS3 = (3 << 0), + CACHE_FLUSH_TS = (4 << 0), + CACHE_FLUSH = (6 << 0), + CS_PARTIAL_FLUSH = (7 << 0), + VGT_STREAMOUT_RESET = (10 << 0), + END_OF_PIPE_INCR_DE = (11 << 0), + END_OF_PIPE_IB_END = (12 << 0), + RST_PIX_CNT = (13 << 0), + VS_PARTIAL_FLUSH = (15 << 0), + PS_PARTIAL_FLUSH = (16 << 0), + CACHE_FLUSH_AND_INV_TS_EVENT = (20 << 0), + ZPASS_DONE = (21 << 0), + CACHE_FLUSH_AND_INV_EVENT = (22 << 0), + PERFCOUNTER_START = (23 << 0), + PERFCOUNTER_STOP = (24 << 0), + PIPELINESTAT_START = (25 << 0), + PIPELINESTAT_STOP = (26 << 0), + PERFCOUNTER_SAMPLE = (27 << 0), + SAMPLE_PIPELINESTAT = (30 << 0), + SAMPLE_STREAMOUTSTATS = (32 << 0), + RESET_VTX_CNT = (33 << 0), + VGT_FLUSH = (36 << 0), + BOTTOM_OF_PIPE_TS = (40 << 0), + DB_CACHE_FLUSH_AND_INV = (42 << 0), + FLUSH_AND_INV_DB_DATA_TS = (43 << 0), + FLUSH_AND_INV_DB_META = (44 << 0), + FLUSH_AND_INV_CB_DATA_TS = (45 << 0), + FLUSH_AND_INV_CB_META = (46 << 0), + CS_DONE = (47 << 0), + PS_DONE = (48 << 0), + FLUSH_AND_INV_CB_PIXEL_DATA = (49 << 0), + THREAD_TRACE_START = (51 << 0), + THREAD_TRACE_STOP = (52 << 0), + THREAD_TRACE_FLUSH = (54 << 0), + THREAD_TRACE_FINISH = (55 << 0), +}; + // Opcodes (IT_OPCODE) for Type-3 commands in the ringbuffer. // https://github.com/freedreno/amd-gpu/blob/master/include/api/gsl_pm4types.h // Not sure if all of these are used. @@ -501,7 +542,7 @@ enum Type3Opcode { PM4_WAIT_FOR_IDLE = 0x26, // wait for the IDLE state of the engine PM4_WAIT_REG_MEM = 0x3c, // wait until a register or memory location is a specific value PM4_WAIT_REG_EQ = 0x52, // wait until a register location is equal to a specific value - PM4_WAT_REG_GTE = 0x53, // wait until a register location is >= a specific value + PM4_WAIT_REG_GTE = 0x53, // wait until a register location is >= a specific value PM4_WAIT_UNTIL_READ = 0x5c, // wait until a read completes PM4_WAIT_IB_PFD_COMPLETE = 0x5d, // wait until all base/size writes from an IB_PFD packet have completed diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc index e979cb62a..208473cf2 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc @@ -366,7 +366,7 @@ void VdSwap(lpvoid_t buffer_ptr, // ptr into primary ringbuffer auto dwords = buffer_ptr.as_array(); dwords[0] = xenos::MakePacketType3(); dwords[1] = 'SWAP'; - dwords[2] = *frontbuffer_ptr; + dwords[2] = (*frontbuffer_ptr) & 0x1FFFFFFF; // Set by VdCallGraphicsNotificationRoutines. 
dwords[3] = last_frontbuffer_width_;
diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc
index d7507df23..5dcf5bfa8 100644
--- a/src/xenia/memory.cc
+++ b/src/xenia/memory.cc
@@ -376,17 +376,19 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) {
   return mmio_handler_->LookupRange(virtual_address);
 }

-uintptr_t Memory::AddPhysicalWriteWatch(uint32_t physical_address,
-                                        uint32_t length,
-                                        cpu::WriteWatchCallback callback,
-                                        void* callback_context,
-                                        void* callback_data) {
-  return mmio_handler_->AddPhysicalWriteWatch(
-      physical_address, length, callback, callback_context, callback_data);
+uintptr_t Memory::AddPhysicalAccessWatch(uint32_t physical_address,
+                                         uint32_t length,
+                                         cpu::MMIOHandler::WatchType type,
+                                         cpu::AccessWatchCallback callback,
+                                         void* callback_context,
+                                         void* callback_data) {
+  return mmio_handler_->AddPhysicalAccessWatch(physical_address, length, type,
+                                               callback, callback_context,
+                                               callback_data);
 }

-void Memory::CancelWriteWatch(uintptr_t watch_handle) {
-  mmio_handler_->CancelWriteWatch(watch_handle);
+void Memory::CancelAccessWatch(uintptr_t watch_handle) {
+  mmio_handler_->CancelAccessWatch(watch_handle);
 }

 uint32_t Memory::SystemHeapAlloc(uint32_t size, uint32_t alignment,
@@ -453,6 +455,7 @@ bool Memory::Save(ByteStream* stream) {
 }

 bool Memory::Restore(ByteStream* stream) {
+  XELOGD("Restoring memory...");
   heaps_.v00000000.Restore(stream);
   heaps_.v40000000.Restore(stream);
   heaps_.v80000000.Restore(stream);
@@ -577,6 +580,8 @@ bool BaseHeap::Save(ByteStream* stream) {
 }

 bool BaseHeap::Restore(ByteStream* stream) {
+  XELOGD("Heap %.8X-%.8X", heap_base_, heap_base_ + heap_size_);
+
   for (size_t i = 0; i < page_table_.size(); i++) {
     auto& page = page_table_[i];
     page.qword = stream->Read<uint64_t>();
@@ -897,7 +902,7 @@ bool BaseHeap::Release(uint32_t base_address, uint32_t* out_region_size) {
   auto base_page_entry = page_table_[base_page_number];
   if (base_page_entry.base_address != base_page_number) {
     XELOGE("BaseHeap::Release failed because address is not a region start");
-    // return false;
+    return false;
   }

   if (out_region_size) {
diff --git a/src/xenia/memory.h b/src/xenia/memory.h
index 6a0fc9c5d..e27976de2 100644
--- a/src/xenia/memory.h
+++ b/src/xenia/memory.h
@@ -303,12 +303,13 @@ class Memory {
   //
   // This has a significant performance penalty for writes in the range or
   // nearby (sharing 64KiB pages).
-  uintptr_t AddPhysicalWriteWatch(uint32_t physical_address, uint32_t length,
-                                  cpu::WriteWatchCallback callback,
-                                  void* callback_context, void* callback_data);
+  uintptr_t AddPhysicalAccessWatch(uint32_t physical_address, uint32_t length,
+                                   cpu::MMIOHandler::WatchType type,
+                                   cpu::AccessWatchCallback callback,
+                                   void* callback_context, void* callback_data);

-  // Cancels a write watch requested with AddPhysicalWriteWatch.
-  void CancelWriteWatch(uintptr_t watch_handle);
+  // Cancels an access watch requested with AddPhysicalAccessWatch.
+  void CancelAccessWatch(uintptr_t watch_handle);

   // Allocates virtual memory from the 'system' heap.
// System memory is kept separate from game memory but is still accessible diff --git a/src/xenia/ui/spirv/spirv_validator.cc b/src/xenia/ui/spirv/spirv_validator.cc new file mode 100644 index 000000000..734688eb6 --- /dev/null +++ b/src/xenia/ui/spirv/spirv_validator.cc @@ -0,0 +1,80 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/ui/spirv/spirv_validator.h" + +#include "third_party/spirv-tools/include/spirv-tools/libspirv.h" +#include "xenia/base/logging.h" + +namespace xe { +namespace ui { +namespace spirv { + +SpirvValidator::Result::Result(spv_text text, spv_diagnostic diagnostic) + : text_(text), diagnostic_(diagnostic) {} + +SpirvValidator::Result::~Result() { + if (text_) { + spvTextDestroy(text_); + } + if (diagnostic_) { + spvDiagnosticDestroy(diagnostic_); + } +} + +bool SpirvValidator::Result::has_error() const { return !!diagnostic_; } + +size_t SpirvValidator::Result::error_word_index() const { + return diagnostic_ ? diagnostic_->position.index : 0; +} + +const char* SpirvValidator::Result::error_string() const { + return diagnostic_ ? diagnostic_->error : ""; +} + +const char* SpirvValidator::Result::text() const { + return text_ ? text_->str : ""; +} + +std::string SpirvValidator::Result::to_string() const { + return text_ ? std::string(text_->str, text_->length) : ""; +} + +void SpirvValidator::Result::AppendText(StringBuffer* target_buffer) const { + if (text_) { + target_buffer->AppendBytes(reinterpret_cast(text_->str), + text_->length); + } +} + +SpirvValidator::SpirvValidator() : spv_context_(spvContextCreate()) {} +SpirvValidator::~SpirvValidator() { spvContextDestroy(spv_context_); } + +std::unique_ptr SpirvValidator::Validate( + const uint32_t* words, size_t word_count) { + spv_text text = nullptr; + spv_diagnostic diagnostic = nullptr; + spv_const_binary_t binary = {words, word_count}; + auto result_code = + spvValidate(spv_context_, &binary, SPV_VALIDATE_ALL, &diagnostic); + std::unique_ptr result(new Result(text, diagnostic)); + if (result_code) { + XELOGE("Failed to validate spv: %d", result_code); + if (result->has_error()) { + return result; + } else { + return nullptr; + } + } + return result; +} + +} // namespace spirv +} // namespace ui +} // namespace xe \ No newline at end of file diff --git a/src/xenia/ui/spirv/spirv_validator.h b/src/xenia/ui/spirv/spirv_validator.h new file mode 100644 index 000000000..890843f27 --- /dev/null +++ b/src/xenia/ui/spirv/spirv_validator.h @@ -0,0 +1,66 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2016 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_UI_SPIRV_SPIRV_VALIDATOR_H_ +#define XENIA_UI_SPIRV_SPIRV_VALIDATOR_H_ + +#include +#include + +#include "xenia/base/string_buffer.h" +#include "xenia/ui/spirv/spirv_util.h" + +namespace xe { +namespace ui { +namespace spirv { + +class SpirvValidator { + public: + class Result { + public: + Result(spv_text text, spv_diagnostic diagnostic); + ~Result(); + + // True if the result has an error associated with it. + bool has_error() const; + // Index of the error in the provided binary word data. + size_t error_word_index() const; + // Human-readable description of the error. + const char* error_string() const; + + // Disassembled source text. + // Returned pointer lifetime is tied to this Result instance. + const char* text() const; + // Converts the disassembled source text to a string. + std::string to_string() const; + // Appends the disassembled source text to the given buffer. + void AppendText(StringBuffer* target_buffer) const; + + private: + spv_text text_ = nullptr; + spv_diagnostic diagnostic_ = nullptr; + }; + + SpirvValidator(); + ~SpirvValidator(); + + // Validates the given SPIRV binary. + // The return will be nullptr if validation fails due to a library error. + // The return may have an error set on it if the SPIRV binary is malformed. + std::unique_ptr Validate(const uint32_t* words, size_t word_count); + + private: + spv_context spv_context_ = nullptr; +}; + +} // namespace spirv +} // namespace ui +} // namespace xe + +#endif // XENIA_UI_SPIRV_SPIRV_VALIDATOR_H_ diff --git a/src/xenia/ui/vulkan/circular_buffer.cc b/src/xenia/ui/vulkan/circular_buffer.cc new file mode 100644 index 000000000..94d2996ce --- /dev/null +++ b/src/xenia/ui/vulkan/circular_buffer.cc @@ -0,0 +1,227 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2015 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include "xenia/base/assert.h" +#include "xenia/base/logging.h" +#include "xenia/base/math.h" + +#include "xenia/ui/vulkan/circular_buffer.h" + +namespace xe { +namespace ui { +namespace vulkan { + +CircularBuffer::CircularBuffer(VulkanDevice* device) : device_(device) {} +CircularBuffer::~CircularBuffer() { Shutdown(); } + +bool CircularBuffer::Initialize(VkDeviceSize capacity, VkBufferUsageFlags usage, + VkDeviceSize alignment) { + VkResult status = VK_SUCCESS; + capacity = xe::round_up(capacity, alignment); + + // Create our internal buffer. + VkBufferCreateInfo buffer_info; + buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + buffer_info.pNext = nullptr; + buffer_info.flags = 0; + buffer_info.size = capacity; + buffer_info.usage = usage; + buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + buffer_info.queueFamilyIndexCount = 0; + buffer_info.pQueueFamilyIndices = nullptr; + status = vkCreateBuffer(*device_, &buffer_info, nullptr, &gpu_buffer_); + CheckResult(status, "vkCreateBuffer"); + if (status != VK_SUCCESS) { + return false; + } + + VkMemoryRequirements reqs; + vkGetBufferMemoryRequirements(*device_, gpu_buffer_, &reqs); + + // Allocate memory from the device to back the buffer. 
+ assert_true(reqs.size == capacity); + reqs.alignment = std::max(alignment, reqs.alignment); + gpu_memory_ = device_->AllocateMemory(reqs); + if (!gpu_memory_) { + XELOGE("CircularBuffer::Initialize - Failed to allocate memory!"); + Shutdown(); + return false; + } + + alignment_ = reqs.alignment; + capacity_ = reqs.size; + gpu_base_ = 0; + + // Bind the buffer to its backing memory. + status = vkBindBufferMemory(*device_, gpu_buffer_, gpu_memory_, gpu_base_); + CheckResult(status, "vkBindBufferMemory"); + if (status != VK_SUCCESS) { + XELOGE("CircularBuffer::Initialize - Failed to bind memory!"); + Shutdown(); + return false; + } + + // Map the memory so we can access it. + status = vkMapMemory(*device_, gpu_memory_, gpu_base_, capacity_, 0, + reinterpret_cast(&host_base_)); + CheckResult(status, "vkMapMemory"); + if (status != VK_SUCCESS) { + XELOGE("CircularBuffer::Initialize - Failed to map memory!"); + Shutdown(); + return false; + } + + return true; +} + +void CircularBuffer::Shutdown() { + Clear(); + if (host_base_) { + vkUnmapMemory(*device_, gpu_memory_); + host_base_ = nullptr; + } + if (gpu_buffer_) { + vkDestroyBuffer(*device_, gpu_buffer_, nullptr); + gpu_buffer_ = nullptr; + } + if (gpu_memory_) { + vkFreeMemory(*device_, gpu_memory_, nullptr); + gpu_memory_ = nullptr; + } +} + +bool CircularBuffer::CanAcquire(VkDeviceSize length) { + // Make sure the length is aligned. + length = xe::round_up(length, alignment_); + if (allocations_.empty()) { + // Read head has caught up to write head (entire buffer available for write) + assert_true(read_head_ == write_head_); + return capacity_ >= length; + } else if (write_head_ < read_head_) { + // Write head wrapped around and is behind read head. + // | write |---- read ----| + return (read_head_ - write_head_) >= length; + } else if (write_head_ > read_head_) { + // Read head behind write head. + // 1. Check if there's enough room from write -> capacity + // | |---- read ----| write | + if ((capacity_ - write_head_) >= length) { + return true; + } + + // 2. Check if there's enough room from 0 -> read + // | write |---- read ----| | + if ((read_head_ - 0) >= length) { + return true; + } + } + + return false; +} + +CircularBuffer::Allocation* CircularBuffer::Acquire( + VkDeviceSize length, std::shared_ptr fence) { + VkDeviceSize aligned_length = xe::round_up(length, alignment_); + if (!CanAcquire(aligned_length)) { + return nullptr; + } + + assert_true(write_head_ % alignment_ == 0); + if (write_head_ < read_head_) { + // Write head behind read head. 
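+    // (This is the "| write |---- read ----|" case from CanAcquire above:
+    // the free region is the single span between the two heads.)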
+ assert_true(read_head_ - write_head_ >= aligned_length); + + auto alloc = new Allocation(); + alloc->host_ptr = host_base_ + write_head_; + alloc->gpu_memory = gpu_memory_; + alloc->offset = gpu_base_ + write_head_; + alloc->length = length; + alloc->aligned_length = aligned_length; + alloc->fence = fence; + write_head_ += aligned_length; + allocations_.push_back(alloc); + + return alloc; + } else { + // Write head equal to/after read head + if (capacity_ - write_head_ >= aligned_length) { + // Free space from write -> capacity + auto alloc = new Allocation(); + alloc->host_ptr = host_base_ + write_head_; + alloc->gpu_memory = gpu_memory_; + alloc->offset = gpu_base_ + write_head_; + alloc->length = length; + alloc->aligned_length = aligned_length; + alloc->fence = fence; + write_head_ += aligned_length; + allocations_.push_back(alloc); + + return alloc; + } else if ((read_head_ - 0) >= aligned_length) { + // Free space from begin -> read + auto alloc = new Allocation(); + alloc->host_ptr = host_base_ + 0; + alloc->gpu_memory = gpu_memory_; + alloc->offset = gpu_base_ + 0; + alloc->length = length; + alloc->aligned_length = aligned_length; + alloc->fence = fence; + write_head_ = aligned_length; + allocations_.push_back(alloc); + + return alloc; + } + } + + return nullptr; +} + +void CircularBuffer::Flush(Allocation* allocation) { + VkMappedMemoryRange range; + range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + range.pNext = nullptr; + range.memory = gpu_memory_; + range.offset = gpu_base_ + allocation->offset; + range.size = allocation->length; + vkFlushMappedMemoryRanges(*device_, 1, &range); +} + +void CircularBuffer::Clear() { + for (auto alloc : allocations_) { + delete alloc; + } + allocations_.clear(); + + write_head_ = read_head_ = 0; +} + +void CircularBuffer::Scavenge() { + for (auto it = allocations_.begin(); it != allocations_.end();) { + if ((*it)->fence->status() != VK_SUCCESS) { + // Don't bother freeing following allocations to ensure proper ordering. + break; + } + + if (capacity_ - read_head_ < (*it)->aligned_length) { + // This allocation is stored at the beginning of the buffer. + read_head_ = (*it)->aligned_length; + } else { + read_head_ += (*it)->aligned_length; + } + + delete *it; + it = allocations_.erase(it); + } +} + +} // namespace vulkan +} // namespace ui +} // namespace xe \ No newline at end of file diff --git a/src/xenia/ui/vulkan/circular_buffer.h b/src/xenia/ui/vulkan/circular_buffer.h new file mode 100644 index 000000000..54aa916fd --- /dev/null +++ b/src/xenia/ui/vulkan/circular_buffer.h @@ -0,0 +1,87 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2015 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_UI_VULKAN_CIRCULAR_BUFFER_H_ +#define XENIA_UI_VULKAN_CIRCULAR_BUFFER_H_ + +#include + +#include "xenia/ui/vulkan/vulkan.h" +#include "xenia/ui/vulkan/vulkan_device.h" + +namespace xe { +namespace ui { +namespace vulkan { + +// A circular buffer, intended to hold (fairly) temporary memory that will be +// released when a fence is signaled. Best used when allocations are taken +// in-order with command buffer submission. 
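+//
+// Illustrative usage (a sketch, not lifted from a call site; Fence is
+// xe::ui::vulkan::Fence from vulkan_util.h):
+//   CircularBuffer transient(device);
+//   transient.Initialize(1024 * 1024, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT);
+//   std::shared_ptr<Fence> fence(new Fence(*device));
+//   if (auto* alloc = transient.Acquire(size, fence)) {
+//     std::memcpy(alloc->host_ptr, data, size);
+//     transient.Flush(alloc);
+//   }
+//   // ... submit GPU work that signals the fence ...
+//   transient.Scavenge();  // reclaims the allocation once signaled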
+//
+// Allocations loop around the buffer in circles (but are not fragmented at the
+// ends of the buffer), where trailing older allocations are freed after use.
+class CircularBuffer {
+ public:
+  CircularBuffer(VulkanDevice* device);
+  ~CircularBuffer();
+
+  struct Allocation {
+    void* host_ptr;
+    VkDeviceMemory gpu_memory;
+    VkDeviceSize offset;
+    VkDeviceSize length;
+    VkDeviceSize aligned_length;
+
+    // Allocation usage fence. This allocation will be deleted when the fence
+    // becomes signaled.
+    std::shared_ptr<Fence> fence;
+  };
+
+  bool Initialize(VkDeviceSize capacity, VkBufferUsageFlags usage,
+                  VkDeviceSize alignment = 256);
+  void Shutdown();
+
+  VkDeviceSize alignment() const { return alignment_; }
+  VkDeviceSize capacity() const { return capacity_; }
+  VkBuffer gpu_buffer() const { return gpu_buffer_; }
+  VkDeviceMemory gpu_memory() const { return gpu_memory_; }
+  uint8_t* host_base() const { return host_base_; }
+
+  bool CanAcquire(VkDeviceSize length);
+
+  // Acquires space to hold memory. This allocation is only freed when the
+  // fence reaches the signaled state.
+  Allocation* Acquire(VkDeviceSize length, std::shared_ptr<Fence> fence);
+  void Flush(Allocation* allocation);
+
+  // Clears all allocations, regardless of whether they've been consumed or
+  // not.
+  void Clear();
+
+  // Frees any allocations whose fences have been signaled.
+  void Scavenge();
+
+ private:
+  VkDeviceSize capacity_ = 0;
+  VkDeviceSize alignment_ = 0;
+  VkDeviceSize write_head_ = 0;
+  VkDeviceSize read_head_ = 0;
+
+  VulkanDevice* device_;
+  VkBuffer gpu_buffer_ = nullptr;
+  VkDeviceMemory gpu_memory_ = nullptr;
+  VkDeviceSize gpu_base_ = 0;
+  uint8_t* host_base_ = nullptr;
+
+  std::list<Allocation*> allocations_;
+};
+
+}  // namespace vulkan
+}  // namespace ui
+}  // namespace xe
+
+#endif  // XENIA_UI_VULKAN_CIRCULAR_BUFFER_H_
diff --git a/src/xenia/ui/vulkan/fenced_pools.h b/src/xenia/ui/vulkan/fenced_pools.h
index a50f82d08..d62ad7452 100644
--- a/src/xenia/ui/vulkan/fenced_pools.h
+++ b/src/xenia/ui/vulkan/fenced_pools.h
@@ -14,6 +14,7 @@

 #include "xenia/base/assert.h"
 #include "xenia/ui/vulkan/vulkan.h"
+#include "xenia/ui/vulkan/vulkan_util.h"

 namespace xe {
 namespace ui {
 namespace vulkan {
@@ -40,13 +41,15 @@ class BaseFencedPool {

   // True if one or more batches are still pending on the GPU.
   bool has_pending() const { return pending_batch_list_head_ != nullptr; }
+  // True if a batch is open.
+  bool has_open_batch() const { return open_batch_ != nullptr; }

   // Checks all pending batches for completion and scavenges their entries.
   // This should be called as frequently as reasonable.
   void Scavenge() {
     while (pending_batch_list_head_) {
       auto batch = pending_batch_list_head_;
-      if (vkGetFenceStatus(device_, batch->fence) == VK_SUCCESS) {
+      if (vkGetFenceStatus(device_, *batch->fence) == VK_SUCCESS) {
         // Batch has completed. Reclaim.
         pending_batch_list_head_ = batch->next;
         if (batch == pending_batch_list_tail_) {
@@ -88,6 +91,24 @@ class BaseFencedPool {
     open_batch_ = batch;
   }

+  // Cancels an open batch, and releases all entries acquired within.
+  void CancelBatch() {
+    assert_not_null(open_batch_);
+
+    auto batch = open_batch_;
+    open_batch_ = nullptr;
+
+    // Relink the batch back into the free batch list.
+    batch->next = free_batch_list_head_;
+    free_batch_list_head_ = batch;
+
+    // Relink entries back into free entries list.
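+    // (Note: this splice assumes the canceled batch acquired at least one
+    // entry; entry_list_tail is null for a batch that never called
+    // AcquireEntry.)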
+ batch->entry_list_tail->next = free_entry_list_head_; + free_entry_list_head_ = batch->entry_list_head; + batch->entry_list_head = nullptr; + batch->entry_list_tail = nullptr; + } + // Attempts to acquire an entry from the pool in the current batch. // If none are available a new one will be allocated. HANDLE AcquireEntry() { @@ -114,7 +135,7 @@ class BaseFencedPool { // Ends the current batch using the given fence to indicate when the batch // has completed execution on the GPU. - void EndBatch(VkFence fence) { + void EndBatch(std::shared_ptr fence) { assert_not_null(open_batch_); // Close and see if we have anything. @@ -137,6 +158,7 @@ class BaseFencedPool { } if (pending_batch_list_tail_) { pending_batch_list_tail_->next = batch; + pending_batch_list_tail_ = batch; } else { pending_batch_list_tail_ = batch; } @@ -176,7 +198,7 @@ class BaseFencedPool { Batch* next; Entry* entry_list_head; Entry* entry_list_tail; - VkFence fence; + std::shared_ptr fence; }; Batch* free_batch_list_head_ = nullptr; diff --git a/src/xenia/ui/vulkan/vulkan_device.cc b/src/xenia/ui/vulkan/vulkan_device.cc index 42077ca82..7b1dc7f8d 100644 --- a/src/xenia/ui/vulkan/vulkan_device.cc +++ b/src/xenia/ui/vulkan/vulkan_device.cc @@ -93,8 +93,8 @@ bool VulkanDevice::Initialize(DeviceInfo device_info) { } ENABLE_AND_EXPECT(geometryShader); ENABLE_AND_EXPECT(depthClamp); - ENABLE_AND_EXPECT(alphaToOne); ENABLE_AND_EXPECT(multiViewport); + ENABLE_AND_EXPECT(independentBlend); // TODO(benvanik): add other features. if (any_features_missing) { XELOGE( diff --git a/src/xenia/ui/vulkan/vulkan_immediate_drawer.cc b/src/xenia/ui/vulkan/vulkan_immediate_drawer.cc index 23dffd6c6..49b0cbc4d 100644 --- a/src/xenia/ui/vulkan/vulkan_immediate_drawer.cc +++ b/src/xenia/ui/vulkan/vulkan_immediate_drawer.cc @@ -136,6 +136,46 @@ class LightweightCircularBuffer { class VulkanImmediateTexture : public ImmediateTexture { public: + VulkanImmediateTexture(VulkanDevice* device, VkDescriptorPool descriptor_pool, + VkDescriptorSetLayout descriptor_set_layout, + VkImageView image_view, VkSampler sampler, + uint32_t width, uint32_t height) + : ImmediateTexture(width, height), + device_(*device), + descriptor_pool_(descriptor_pool), + image_view_(image_view), + sampler_(sampler) { + handle = reinterpret_cast(this); + + // Create descriptor set used just for this texture. + // It never changes, so we can reuse it and not worry with updates. + VkDescriptorSetAllocateInfo set_alloc_info; + set_alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + set_alloc_info.pNext = nullptr; + set_alloc_info.descriptorPool = descriptor_pool_; + set_alloc_info.descriptorSetCount = 1; + set_alloc_info.pSetLayouts = &descriptor_set_layout; + auto err = + vkAllocateDescriptorSets(device_, &set_alloc_info, &descriptor_set_); + CheckResult(err, "vkAllocateDescriptorSets"); + + // Initialize descriptor with our texture. 
+ VkDescriptorImageInfo texture_info; + texture_info.sampler = sampler_; + texture_info.imageView = image_view_; + texture_info.imageLayout = VK_IMAGE_LAYOUT_GENERAL; + VkWriteDescriptorSet descriptor_write; + descriptor_write.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + descriptor_write.pNext = nullptr; + descriptor_write.dstSet = descriptor_set_; + descriptor_write.dstBinding = 0; + descriptor_write.dstArrayElement = 0; + descriptor_write.descriptorCount = 1; + descriptor_write.descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + descriptor_write.pImageInfo = &texture_info; + vkUpdateDescriptorSets(device_, 1, &descriptor_write, 0, nullptr); + } + VulkanImmediateTexture(VulkanDevice* device, VkDescriptorPool descriptor_pool, VkDescriptorSetLayout descriptor_set_layout, VkSampler sampler, uint32_t width, uint32_t height) @@ -161,7 +201,7 @@ class VulkanImmediateTexture : public ImmediateTexture { image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; image_info.queueFamilyIndexCount = 0; image_info.pQueueFamilyIndices = nullptr; - image_info.initialLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + image_info.initialLayout = VK_IMAGE_LAYOUT_PREINITIALIZED; auto err = vkCreateImage(device_, &image_info, nullptr, &image_); CheckResult(err, "vkCreateImage"); @@ -221,9 +261,12 @@ class VulkanImmediateTexture : public ImmediateTexture { ~VulkanImmediateTexture() override { vkFreeDescriptorSets(device_, descriptor_pool_, 1, &descriptor_set_); - vkDestroyImageView(device_, image_view_, nullptr); - vkDestroyImage(device_, image_, nullptr); - vkFreeMemory(device_, device_memory_, nullptr); + + if (device_memory_) { + vkDestroyImageView(device_, image_view_, nullptr); + vkDestroyImage(device_, image_, nullptr); + vkFreeMemory(device_, device_memory_, nullptr); + } } void Upload(const uint8_t* src_data) { @@ -238,25 +281,49 @@ class VulkanImmediateTexture : public ImmediateTexture { vkGetImageSubresourceLayout(device_, image_, &subresource, &layout); // Map memory for upload. - void* gpu_data = nullptr; - auto err = - vkMapMemory(device_, device_memory_, 0, layout.size, 0, &gpu_data); + uint8_t* gpu_data = nullptr; + auto err = vkMapMemory(device_, device_memory_, 0, layout.size, 0, + reinterpret_cast(&gpu_data)); CheckResult(err, "vkMapMemory"); // Copy the entire texture, hoping its layout matches what we expect. - std::memcpy(gpu_data, src_data, layout.size); + std::memcpy(gpu_data + layout.offset, src_data, layout.size); vkUnmapMemory(device_, device_memory_); } + // Queues a command to transition this texture to a new layout. This assumes + // the command buffer WILL be queued and executed by the device. 
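+  // (image_layout_ is updated at record time rather than execution time, so
+  // dropping the buffer unsubmitted would leave the cached layout stale.)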
+ void TransitionLayout(VkCommandBuffer command_buffer, + VkImageLayout new_layout) { + VkImageMemoryBarrier image_barrier; + image_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + image_barrier.pNext = nullptr; + image_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + image_barrier.srcAccessMask = 0; + image_barrier.dstAccessMask = 0; + image_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + image_barrier.newLayout = new_layout; + image_barrier.image = image_; + image_barrier.subresourceRange = {0, 0, 1, 0, 1}; + image_barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + image_layout_ = new_layout; + + vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &image_barrier); + } + VkDescriptorSet descriptor_set() const { return descriptor_set_; } + VkImageLayout layout() const { return image_layout_; } private: VkDevice device_ = nullptr; VkDescriptorPool descriptor_pool_ = nullptr; VkSampler sampler_ = nullptr; // Not owned. VkImage image_ = nullptr; - VkImageLayout image_layout_ = VK_IMAGE_LAYOUT_UNDEFINED; + VkImageLayout image_layout_ = VK_IMAGE_LAYOUT_PREINITIALIZED; VkDeviceMemory device_memory_ = nullptr; VkImageView image_view_ = nullptr; VkDescriptorSet descriptor_set_ = nullptr; @@ -538,7 +605,7 @@ VulkanImmediateDrawer::VulkanImmediateDrawer(VulkanContext* graphics_context) pipeline_info.renderPass = context_->swap_chain()->render_pass(); pipeline_info.subpass = 0; pipeline_info.basePipelineHandle = nullptr; - pipeline_info.basePipelineIndex = 0; + pipeline_info.basePipelineIndex = -1; err = vkCreateGraphicsPipelines(*device, nullptr, 1, &pipeline_info, nullptr, &triangle_pipeline_); CheckResult(err, "vkCreateGraphicsPipelines"); @@ -547,7 +614,7 @@ VulkanImmediateDrawer::VulkanImmediateDrawer(VulkanContext* graphics_context) pipeline_info.flags = VK_PIPELINE_CREATE_DERIVATIVE_BIT; input_info.topology = VK_PRIMITIVE_TOPOLOGY_LINE_LIST; pipeline_info.basePipelineHandle = triangle_pipeline_; - pipeline_info.basePipelineIndex = 0; + pipeline_info.basePipelineIndex = -1; err = vkCreateGraphicsPipelines(*device, nullptr, 1, &pipeline_info, nullptr, &line_pipeline_); CheckResult(err, "vkCreateGraphicsPipelines"); @@ -604,6 +671,14 @@ std::unique_ptr VulkanImmediateDrawer::CreateTexture( return std::unique_ptr(texture.release()); } +std::unique_ptr VulkanImmediateDrawer::WrapTexture( + VkImageView image_view, VkSampler sampler, uint32_t width, + uint32_t height) { + return std::make_unique( + context_->device(), descriptor_pool_, texture_set_layout_, image_view, + sampler, width, height); +} + void VulkanImmediateDrawer::UpdateTexture(ImmediateTexture* texture, const uint8_t* data) { static_cast(texture)->Upload(data); @@ -672,9 +747,6 @@ void VulkanImmediateDrawer::BeginDrawBatch(const ImmediateDrawBatch& batch) { void VulkanImmediateDrawer::Draw(const ImmediateDraw& draw) { auto swap_chain = context_->swap_chain(); - if (draw.primitive_type != ImmediatePrimitiveType::kTriangles) { - return; - } switch (draw.primitive_type) { case ImmediatePrimitiveType::kLines: vkCmdBindPipeline(current_cmd_buffer_, VK_PIPELINE_BIND_POINT_GRAPHICS, @@ -689,6 +761,10 @@ void VulkanImmediateDrawer::Draw(const ImmediateDraw& draw) { // Setup texture binding. 
auto texture = reinterpret_cast(draw.texture_handle); if (texture) { + if (texture->layout() != VK_IMAGE_LAYOUT_GENERAL) { + texture->TransitionLayout(current_cmd_buffer_, VK_IMAGE_LAYOUT_GENERAL); + } + auto texture_set = texture->descriptor_set(); vkCmdBindDescriptorSets(current_cmd_buffer_, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout_, diff --git a/src/xenia/ui/vulkan/vulkan_immediate_drawer.h b/src/xenia/ui/vulkan/vulkan_immediate_drawer.h index d14a6eb7c..1db47f0d8 100644 --- a/src/xenia/ui/vulkan/vulkan_immediate_drawer.h +++ b/src/xenia/ui/vulkan/vulkan_immediate_drawer.h @@ -32,6 +32,10 @@ class VulkanImmediateDrawer : public ImmediateDrawer { ImmediateTextureFilter filter, bool repeat, const uint8_t* data) override; + std::unique_ptr WrapTexture(VkImageView image_view, + VkSampler sampler, + uint32_t width, + uint32_t height); void UpdateTexture(ImmediateTexture* texture, const uint8_t* data) override; void Begin(int render_target_width, int render_target_height) override; diff --git a/src/xenia/ui/vulkan/vulkan_swap_chain.cc b/src/xenia/ui/vulkan/vulkan_swap_chain.cc index 15d2795fd..ad383f32f 100644 --- a/src/xenia/ui/vulkan/vulkan_swap_chain.cc +++ b/src/xenia/ui/vulkan/vulkan_swap_chain.cc @@ -187,6 +187,10 @@ bool VulkanSwapChain::Initialize(VkSurfaceKHR surface) { vkAllocateCommandBuffers(*device_, &cmd_buffer_info, &render_cmd_buffer_); CheckResult(err, "vkCreateCommandBuffer"); + // Create another command buffer that handles image copies. + err = vkAllocateCommandBuffers(*device_, &cmd_buffer_info, ©_cmd_buffer_); + CheckResult(err, "vkCreateCommandBuffer"); + // Create the render pass used to draw to the swap chain. // The actual framebuffer attached will depend on which image we are drawing // into. @@ -194,7 +198,7 @@ bool VulkanSwapChain::Initialize(VkSurfaceKHR surface) { color_attachment.flags = 0; color_attachment.format = surface_format_; color_attachment.samples = VK_SAMPLE_COUNT_1_BIT; - color_attachment.loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; + color_attachment.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; // CLEAR; color_attachment.storeOp = VK_ATTACHMENT_STORE_OP_STORE; color_attachment.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; color_attachment.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; @@ -388,6 +392,7 @@ bool VulkanSwapChain::Begin() { // Reset all command buffers. vkResetCommandBuffer(render_cmd_buffer_, 0); + vkResetCommandBuffer(copy_cmd_buffer_, 0); auto& current_buffer = buffers_[current_buffer_index_]; // Build the command buffer that will execute all queued rendering buffers. @@ -399,14 +404,18 @@ bool VulkanSwapChain::Begin() { err = vkBeginCommandBuffer(render_cmd_buffer_, &begin_info); CheckResult(err, "vkBeginCommandBuffer"); - // Transition the image to a format we can render to. + // Start recording the copy command buffer as well. + err = vkBeginCommandBuffer(copy_cmd_buffer_, &begin_info); + CheckResult(err, "vkBeginCommandBuffer"); + + // Transition the image to a format we can copy to. 
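+  // Frame setup is now split across two command buffers (sketch of what is
+  // recorded below):
+  //   copy_cmd_buffer_:   PRESENT_SRC -> TRANSFER_DST, vkCmdClearColorImage,
+  //                       then any queued blits (e.g. the frontbuffer copy)
+  //   render_cmd_buffer_: TRANSFER_DST -> COLOR_ATTACHMENT, render pass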
VkImageMemoryBarrier pre_image_memory_barrier; pre_image_memory_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; pre_image_memory_barrier.pNext = nullptr; pre_image_memory_barrier.srcAccessMask = 0; - pre_image_memory_barrier.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + pre_image_memory_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; pre_image_memory_barrier.oldLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; - pre_image_memory_barrier.newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + pre_image_memory_barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; pre_image_memory_barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; pre_image_memory_barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; pre_image_memory_barrier.image = current_buffer.image; @@ -416,23 +425,37 @@ bool VulkanSwapChain::Begin() { pre_image_memory_barrier.subresourceRange.levelCount = 1; pre_image_memory_barrier.subresourceRange.baseArrayLayer = 0; pre_image_memory_barrier.subresourceRange.layerCount = 1; + vkCmdPipelineBarrier(copy_cmd_buffer_, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, + VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, + nullptr, 1, &pre_image_memory_barrier); + + // First: Issue a command to clear the render target. + VkImageSubresourceRange clear_range = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1}; + VkClearColorValue clear_color; + clear_color.float32[0] = 238 / 255.0f; + clear_color.float32[1] = 238 / 255.0f; + clear_color.float32[2] = 238 / 255.0f; + clear_color.float32[3] = 1.0f; + if (FLAGS_vulkan_random_clear_color) { + clear_color.float32[0] = + rand() / static_cast(RAND_MAX); // NOLINT(runtime/threadsafe_fn) + clear_color.float32[1] = 1.0f; + clear_color.float32[2] = 0.0f; + } + vkCmdClearColorImage(copy_cmd_buffer_, current_buffer.image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, &clear_color, 1, + &clear_range); + + // Transition the image to a color attachment target for drawing. + pre_image_memory_barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + pre_image_memory_barrier.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + pre_image_memory_barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL; + pre_image_memory_barrier.newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; vkCmdPipelineBarrier(render_cmd_buffer_, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0, 0, nullptr, 0, nullptr, 1, &pre_image_memory_barrier); // Begin render pass. 
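+  // The color attachment is loaded (VK_ATTACHMENT_LOAD_OP_LOAD, set above),
+  // so the pass carries no clear values; clearing already happened in
+  // copy_cmd_buffer_.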
- VkClearValue color_clear_value; - color_clear_value.color.float32[0] = 238 / 255.0f; - color_clear_value.color.float32[1] = 238 / 255.0f; - color_clear_value.color.float32[2] = 238 / 255.0f; - color_clear_value.color.float32[3] = 1.0f; - if (FLAGS_vulkan_random_clear_color) { - color_clear_value.color.float32[0] = - rand() / static_cast(RAND_MAX); // NOLINT(runtime/threadsafe_fn) - color_clear_value.color.float32[1] = 1.0f; - color_clear_value.color.float32[2] = 0.0f; - } - VkClearValue clear_values[] = {color_clear_value}; VkRenderPassBeginInfo render_pass_begin_info; render_pass_begin_info.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO; render_pass_begin_info.pNext = nullptr; @@ -442,9 +465,8 @@ bool VulkanSwapChain::Begin() { render_pass_begin_info.renderArea.offset.y = 0; render_pass_begin_info.renderArea.extent.width = surface_width_; render_pass_begin_info.renderArea.extent.height = surface_height_; - render_pass_begin_info.clearValueCount = - static_cast(xe::countof(clear_values)); - render_pass_begin_info.pClearValues = clear_values; + render_pass_begin_info.clearValueCount = 0; + render_pass_begin_info.pClearValues = nullptr; vkCmdBeginRenderPass(render_cmd_buffer_, &render_pass_begin_info, VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS); @@ -458,6 +480,7 @@ bool VulkanSwapChain::End() { vkCmdEndRenderPass(render_cmd_buffer_); // Transition the image to a format the presentation engine can source from. + // FIXME: Do we need more synchronization here between the copy buffer? VkImageMemoryBarrier post_image_memory_barrier; post_image_memory_barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; post_image_memory_barrier.pNext = nullptr; @@ -483,14 +506,20 @@ bool VulkanSwapChain::End() { auto err = vkEndCommandBuffer(render_cmd_buffer_); CheckResult(err, "vkEndCommandBuffer"); + err = vkEndCommandBuffer(copy_cmd_buffer_); + CheckResult(err, "vkEndCommandBuffer"); + + VkCommandBuffer command_buffers[] = {copy_cmd_buffer_, render_cmd_buffer_}; + // Submit rendering. VkSubmitInfo render_submit_info; render_submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; render_submit_info.pNext = nullptr; render_submit_info.waitSemaphoreCount = 0; render_submit_info.pWaitSemaphores = nullptr; - render_submit_info.commandBufferCount = 1; - render_submit_info.pCommandBuffers = &render_cmd_buffer_; + render_submit_info.commandBufferCount = + static_cast(xe::countof(command_buffers)); + render_submit_info.pCommandBuffers = command_buffers; render_submit_info.signalSemaphoreCount = 0; render_submit_info.pSignalSemaphores = nullptr; { diff --git a/src/xenia/ui/vulkan/vulkan_swap_chain.h b/src/xenia/ui/vulkan/vulkan_swap_chain.h index 1d1f578c3..773a52053 100644 --- a/src/xenia/ui/vulkan/vulkan_swap_chain.h +++ b/src/xenia/ui/vulkan/vulkan_swap_chain.h @@ -35,11 +35,16 @@ class VulkanSwapChain { uint32_t surface_width() const { return surface_width_; } uint32_t surface_height() const { return surface_height_; } + VkImage surface_image() const { + return buffers_[current_buffer_index_].image; + } // Render pass used for compositing. VkRenderPass render_pass() const { return render_pass_; } // Render command buffer, active inside the render pass from Begin to End. VkCommandBuffer render_cmd_buffer() const { return render_cmd_buffer_; } + // Copy commands, ran before the render command buffer. + VkCommandBuffer copy_cmd_buffer() const { return copy_cmd_buffer_; } // Initializes the swap chain with the given WSI surface. 
   bool Initialize(VkSurfaceKHR surface);
@@ -74,6 +79,7 @@ class VulkanSwapChain {
   uint32_t surface_height_ = 0;
   VkFormat surface_format_ = VK_FORMAT_UNDEFINED;
   VkCommandPool cmd_pool_ = nullptr;
+  VkCommandBuffer copy_cmd_buffer_ = nullptr;
   VkCommandBuffer render_cmd_buffer_ = nullptr;
   VkRenderPass render_pass_ = nullptr;
   VkSemaphore image_available_semaphore_ = nullptr;
diff --git a/src/xenia/ui/vulkan/vulkan_util.h b/src/xenia/ui/vulkan/vulkan_util.h
index fcf9e4f8f..f5475edd8 100644
--- a/src/xenia/ui/vulkan/vulkan_util.h
+++ b/src/xenia/ui/vulkan/vulkan_util.h
@@ -25,6 +25,30 @@ namespace xe {
 namespace ui {
 namespace vulkan {

+class Fence {
+ public:
+  Fence(VkDevice device) : device_(device) {
+    VkFenceCreateInfo fence_info;
+    fence_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+    fence_info.pNext = nullptr;
+    fence_info.flags = 0;
+    vkCreateFence(device, &fence_info, nullptr, &fence_);
+  }
+  ~Fence() {
+    vkDestroyFence(device_, fence_, nullptr);
+    fence_ = nullptr;
+  }
+
+  VkResult status() const { return vkGetFenceStatus(device_, fence_); }
+
+  VkFence fence() const { return fence_; }
+  operator VkFence() const { return fence_; }
+
+ private:
+  VkDevice device_;
+  VkFence fence_ = nullptr;
+};
+
 struct Version {
   uint32_t major;
   uint32_t minor;
diff --git a/third_party/glslang-spirv/SpvBuilder.cpp b/third_party/glslang-spirv/SpvBuilder.cpp
index 0a2fa2139..13a6c946a 100644
--- a/third_party/glslang-spirv/SpvBuilder.cpp
+++ b/third_party/glslang-spirv/SpvBuilder.cpp
@@ -1166,6 +1166,7 @@ void Builder::createMemoryBarrier(unsigned executionScope, unsigned memorySemantics)
 // An opcode that has one operand, a result id, and a type
 Id Builder::createUnaryOp(Op opCode, Id typeId, Id operand)
 {
+    assert(operand != 0);
     Instruction* op = new Instruction(getUniqueId(), typeId, opCode);
     op->addIdOperand(operand);
     buildPoint->addInstruction(std::unique_ptr<Instruction>(op));
@@ -1175,6 +1176,8 @@ Id Builder::createUnaryOp(Op opCode, Id typeId, Id operand)
 Id Builder::createBinOp(Op opCode, Id typeId, Id left, Id right)
 {
+    assert(left != 0);
+    assert(right != 0);
     Instruction* op = new Instruction(getUniqueId(), typeId, opCode);
     op->addIdOperand(left);
     op->addIdOperand(right);
@@ -1185,6 +1188,9 @@ Id Builder::createBinOp(Op opCode, Id typeId, Id left, Id right)
 Id Builder::createTriOp(Op opCode, Id typeId, Id op1, Id op2, Id op3)
 {
+    assert(op1 != 0);
+    assert(op2 != 0);
+    assert(op3 != 0);
     Instruction* op = new Instruction(getUniqueId(), typeId, opCode);
     op->addIdOperand(op1);
     op->addIdOperand(op2);
diff --git a/third_party/glslang-spirv/SpvBuilder.h b/third_party/glslang-spirv/SpvBuilder.h
index d6dc61218..7eae4fe91 100644
--- a/third_party/glslang-spirv/SpvBuilder.h
+++ b/third_party/glslang-spirv/SpvBuilder.h
@@ -93,6 +93,8 @@ public:
         return id;
     }

+    Module* getModule() { return &module; }
+
     // For creating new types (will return old type if the requested one was already made).
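The Fence RAII wrapper added above creates an unsignaled fence on construction and destroys it with the device handle it captured. A usage sketch; |device|, |queue|, and |submit_info| are assumed call-site names, not part of the patch.

```cpp
#include <vulkan/vulkan.h>

// Sketch of using the Fence wrapper added in vulkan_util.h above.
void SubmitWithFence(VkDevice device, VkQueue queue,
                     const VkSubmitInfo& submit_info) {
  xe::ui::vulkan::Fence fence(device);
  // operator VkFence lets the wrapper pass straight into the API call.
  vkQueueSubmit(queue, 1, &submit_info, fence);
  // status() wraps vkGetFenceStatus: VK_NOT_READY until the GPU signals.
  while (fence.status() == VK_NOT_READY) {
    // Spin (or yield / do other work) until the submission completes.
  }
}  // ~Fence destroys the VkFence here.
```

Note the wrapper exposes no reset(); reusing one fence across frames would additionally require vkResetFences, so as written each use constructs a fresh fence.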
     Id makeVoidType();
     Id makeBoolType();
@@ -517,6 +519,7 @@ public:
     void createBranch(Block* block);
     void createConditionalBranch(Id condition, Block* thenBlock, Block* elseBlock);
     void createLoopMerge(Block* mergeBlock, Block* continueBlock, unsigned int control);
+    void createSelectionMerge(Block* mergeBlock, unsigned int control);

 protected:
     Id makeIntConstant(Id typeId, unsigned value, bool specConstant);
@@ -527,7 +530,6 @@ public:
     void transferAccessChainSwizzle(bool dynamic);
     void simplifyAccessChainSwizzle();
     void createAndSetNoPredecessorBlock(const char*);
-    void createSelectionMerge(Block* mergeBlock, unsigned int control);
     void dumpInstructions(std::vector<unsigned int>&, const std::vector<std::unique_ptr<Instruction> >&) const;

     SourceLanguage source;
diff --git a/third_party/glslang-spirv/spvIR.h b/third_party/glslang-spirv/spvIR.h
index 98f4971b4..63e460ebb 100644
--- a/third_party/glslang-spirv/spvIR.h
+++ b/third_party/glslang-spirv/spvIR.h
@@ -180,6 +180,11 @@ public:
     void addInstruction(std::unique_ptr<Instruction> inst);
     void addPredecessor(Block* pred) { predecessors.push_back(pred); pred->successors.push_back(this); }
     void addLocalVariable(std::unique_ptr<Instruction> inst) { localVariables.push_back(std::move(inst)); }
+    void insertInstruction(size_t pos, std::unique_ptr<Instruction> inst);
+
+    size_t getInstructionCount() { return instructions.size(); }
+    Instruction* getInstruction(size_t i) { return instructions[i].get(); }
+    void removeInstruction(size_t i) { instructions.erase(instructions.begin() + i); }
     const std::vector<Block*>& getPredecessors() const { return predecessors; }
     const std::vector<Block*>& getSuccessors() const { return successors; }
     void setUnreachable() { unreachable = true; }
@@ -200,6 +205,10 @@ public:
     bool isTerminated() const
     {
+        if (instructions.size() == 0) {
+            return false;
+        }
+
         switch (instructions.back()->getOpCode()) {
         case OpBranch:
         case OpBranchConditional:
@@ -215,6 +224,7 @@ public:
     void dump(std::vector<unsigned int>& out) const
     {
+        // OpLabel
         instructions[0]->dump(out);
         for (int i = 0; i < (int)localVariables.size(); ++i)
             localVariables[i]->dump(out);
@@ -222,7 +232,51 @@ public:
             instructions[i]->dump(out);
     }

-protected:
+    // Moves all instructions from a target block into this block, and removes
+    // the target block from our list of successors.
+    // This function assumes this block unconditionally branches directly to
+    // the target block.
+    void merge(Block* target_block) {
+        if (isTerminated()) {
+            instructions.erase(instructions.end() - 1);
+        }
+
+        // Find and remove the target block from our successors first.
+        for (auto it = successors.begin(); it != successors.end(); ++it) {
+            if (*it == target_block) {
+                it = successors.erase(it);
+                break;
+            }
+        }
+
+        // Add the target block's successors to our successors.
+        successors.insert(successors.end(), target_block->successors.begin(),
+                          target_block->successors.end());
+
+        // For each successor, replace the target block in their predecessors
+        // with us.
+        for (auto block : successors) {
+            std::replace(block->predecessors.begin(), block->predecessors.end(),
+                         target_block, this);
+        }
+
+        // Move instructions from the target block into this block.
+        for (auto it = target_block->instructions.begin();
+             it != target_block->instructions.end();) {
+            if ((*it)->getOpCode() == spv::Op::OpLabel) {
+                ++it;
+                continue;
+            }
+
+            instructions.push_back(std::move(*it));
+            it = target_block->instructions.erase(it);
+        }
+
+        target_block->predecessors.clear();
+        target_block->successors.clear();
+    }
+
+ protected:
     Block(const Block&);
     Block& operator=(Block&);
@@ -275,6 +329,17 @@ public:
     Module& getParent() const { return parent; }
     Block* getEntryBlock() const { return blocks.front(); }
     Block* getLastBlock() const { return blocks.back(); }
+    Block* findBlockById(Id id)
+    {
+        for (auto block : blocks) {
+            if (block->getId() == id) {
+                return block;
+            }
+        }
+
+        return nullptr;
+    }
+
+    std::vector<Block*>& getBlocks() { return blocks; }
     void addLocalVariable(std::unique_ptr<Instruction> inst);
     Id getReturnType() const { return functionInstruction.getTypeId(); }
     void dump(std::vector<unsigned int>& out) const
@@ -315,6 +380,8 @@ public:
     }

     void addFunction(Function *fun) { functions.push_back(fun); }
+    const std::vector<Function*>& getFunctions() const { return functions; }
+    std::vector<Function*>& getFunctions() { return functions; }

     void mapInstruction(Instruction *instruction)
     {
@@ -398,6 +465,14 @@ __inline void Block::addInstruction(std::unique_ptr<Instruction> inst)
     parent.getParent().mapInstruction(raw_instruction);
 }

+__inline void Block::insertInstruction(size_t pos, std::unique_ptr<Instruction> inst) {
+    Instruction* raw_instruction = inst.get();
+    instructions.insert(instructions.begin() + pos, std::move(inst));
+    raw_instruction->setBlock(this);
+    if (raw_instruction->getResultId())
+        parent.getParent().mapInstruction(raw_instruction);
+}
+
 };  // end spv namespace

 #endif  // spvIR_H
diff --git a/third_party/spirv-tools.lua b/third_party/spirv-tools.lua
index 4218ff08e..afa3cdef5 100644
--- a/third_party/spirv-tools.lua
+++ b/third_party/spirv-tools.lua
@@ -13,9 +13,9 @@ project("spirv-tools")
     "spirv-tools/include",
   })
   files({
-    "spirv-tools/external/include/headers/GLSL.std.450.h",
-    "spirv-tools/external/include/headers/OpenCL.std.h",
-    "spirv-tools/external/include/headers/spirv.h",
+    "spirv-tools/include/spirv/GLSL.std.450.h",
+    "spirv-tools/include/spirv/OpenCL.std.h",
+    "spirv-tools/include/spirv/spirv.h",
     "spirv-tools/include/spirv-tools/libspirv.h",
     "spirv-tools/source/assembly_grammar.cpp",
     "spirv-tools/source/assembly_grammar.h",
diff --git a/xenia-build b/xenia-build
index 4587374c4..98330b6a5 100755
--- a/xenia-build
+++ b/xenia-build
@@ -642,8 +642,7 @@ class GenSpirvCommand(Command):
     print('Generating SPIR-V binaries...')
     print('')

-    # TODO(benvanik): actually find vulkan SDK. Env var? etc?
-    vulkan_sdk_path = 'C:\\VulkanSDK\\1.0.3.1'
+    vulkan_sdk_path = os.environ['VULKAN_SDK']
     vulkan_bin_path = os.path.join(vulkan_sdk_path, 'bin')
     glslang = os.path.join(vulkan_bin_path, 'glslangValidator')
     spirv_dis = os.path.join(vulkan_bin_path, 'spirv-dis')
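Taken together, the new spvIR.h surface (getFunctions, getBlocks, findBlockById, the instruction accessors, and Block::merge) gives external passes enough to rewrite straight-line control flow. A hypothetical cleanup pass sketch follows; the function name and the single-predecessor policy are illustrative, not something this diff adds.

```cpp
#include "third_party/glslang-spirv/spvIR.h"

// Hypothetical pass (not from the patch): collapse unconditional branches
// whose target block has exactly one predecessor.
void MergeStraightLineBlocks(spv::Module& module) {
  for (auto fn : module.getFunctions()) {
    for (auto block : fn->getBlocks()) {
      if (!block->isTerminated()) {
        continue;
      }
      auto terminator =
          block->getInstruction(block->getInstructionCount() - 1);
      if (terminator->getOpCode() != spv::Op::OpBranch) {
        continue;
      }
      // OpBranch's only operand is the target label id.
      auto target = fn->findBlockById(terminator->getIdOperand(0));
      if (target && target->getPredecessors().size() == 1) {
        // Safe while iterating: merge() empties the target block in place
        // rather than removing it from the function's block list, and an
        // emptied block (OpLabel only) reports isTerminated() == false.
        block->merge(target);
      }
    }
  }
}
```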