From c1812406f52085f26e0ee8a1dedb4f2277daedba Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Wed, 28 May 2014 19:19:39 -0700 Subject: [PATCH] Adding a bunch of profiling tracers. --- src/alloy/backend/x64/x64_assembler.cc | 2 + src/alloy/backend/x64/x64_code_cache.cc | 2 + src/alloy/backend/x64/x64_emitter.cc | 2 + src/alloy/compiler/compiler.cc | 2 + .../passes/constant_propagation_pass.cc | 2 + .../compiler/passes/context_promotion_pass.cc | 2 + .../passes/control_flow_analysis_pass.cc | 2 + .../passes/data_flow_analysis_pass.cc | 2 + .../passes/dead_code_elimination_pass.cc | 2 + .../compiler/passes/finalization_pass.cc | 2 + .../passes/register_allocation_pass.cc | 2 + .../compiler/passes/simplification_pass.cc | 2 + src/alloy/compiler/passes/validation_pass.cc | 2 + .../compiler/passes/value_reduction_pass.cc | 2 + src/alloy/frontend/ppc/ppc_hir_builder.cc | 2 + src/alloy/frontend/ppc/ppc_scanner.cc | 4 ++ src/alloy/frontend/ppc/ppc_translator.cc | 2 + src/alloy/hir/hir_builder.cc | 4 ++ src/alloy/runtime/entry_table.cc | 2 + src/alloy/runtime/function.cc | 2 + src/alloy/runtime/module.cc | 4 ++ src/alloy/runtime/runtime.cc | 8 ++++ src/xenia/apu/audio_system.cc | 38 ++++++++++++------- src/xenia/apu/xaudio2/xaudio2_audio_driver.cc | 2 + src/xenia/cpu/processor.cc | 6 +++ src/xenia/gpu/d3d11/d3d11_geometry_shader.cc | 7 ++++ src/xenia/gpu/d3d11/d3d11_graphics_driver.cc | 35 +++++++++++++++++ src/xenia/gpu/d3d11/d3d11_graphics_system.cc | 3 ++ src/xenia/gpu/d3d11/d3d11_shader.cc | 7 ++++ src/xenia/gpu/d3d11/d3d11_shader_cache.cc | 1 + src/xenia/gpu/d3d11/d3d11_window.cc | 2 + src/xenia/gpu/ring_buffer_worker.cc | 2 + src/xenia/gpu/shader_cache.cc | 2 + src/xenia/hid/input_system.cc | 8 ++++ 34 files changed, 156 insertions(+), 13 deletions(-) diff --git a/src/alloy/backend/x64/x64_assembler.cc b/src/alloy/backend/x64/x64_assembler.cc index 5a7028e11..d70afe909 100644 --- a/src/alloy/backend/x64/x64_assembler.cc +++ b/src/alloy/backend/x64/x64_assembler.cc @@ -66,6 +66,8 @@ int X64Assembler::Assemble( FunctionInfo* symbol_info, HIRBuilder* builder, uint32_t debug_info_flags, DebugInfo* debug_info, Function** out_function) { + SCOPE_profile_cpu_f("alloy"); + int result = 0; // Lower HIR -> x64. diff --git a/src/alloy/backend/x64/x64_code_cache.cc b/src/alloy/backend/x64/x64_code_cache.cc index 7282c2e23..9d1c2ce60 100644 --- a/src/alloy/backend/x64/x64_code_cache.cc +++ b/src/alloy/backend/x64/x64_code_cache.cc @@ -75,6 +75,8 @@ int X64CodeCache::Initialize() { void* X64CodeCache::PlaceCode(void* machine_code, size_t code_size, size_t stack_size) { + SCOPE_profile_cpu_f("alloy"); + // Add unwind info into the allocation size. Keep things 16b aligned. code_size += XEROUNDUP(X64CodeChunk::UNWIND_INFO_SIZE, 16); diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 58ac912d9..6616b0d52 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -77,6 +77,8 @@ int X64Emitter::Emit( HIRBuilder* builder, uint32_t debug_info_flags, runtime::DebugInfo* debug_info, void*& out_code_address, size_t& out_code_size) { + SCOPE_profile_cpu_f("alloy"); + // Reset. if (debug_info_flags & DEBUG_INFO_SOURCE_MAP) { source_map_count_ = 0; diff --git a/src/alloy/compiler/compiler.cc b/src/alloy/compiler/compiler.cc index a28f6b48b..62c6e5a4b 100644 --- a/src/alloy/compiler/compiler.cc +++ b/src/alloy/compiler/compiler.cc @@ -49,6 +49,8 @@ void Compiler::Reset() { } int Compiler::Compile(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // TODO(benvanik): sophisticated stuff. Run passes in parallel, run until they // stop changing things, etc. for (auto it = passes_.begin(); it != passes_.end(); ++it) { diff --git a/src/alloy/compiler/passes/constant_propagation_pass.cc b/src/alloy/compiler/passes/constant_propagation_pass.cc index 5804ed218..f8430c509 100644 --- a/src/alloy/compiler/passes/constant_propagation_pass.cc +++ b/src/alloy/compiler/passes/constant_propagation_pass.cc @@ -23,6 +23,8 @@ ConstantPropagationPass::~ConstantPropagationPass() { } int ConstantPropagationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Once ContextPromotion has run there will likely be a whole slew of // constants that can be pushed through the function. // Example: diff --git a/src/alloy/compiler/passes/context_promotion_pass.cc b/src/alloy/compiler/passes/context_promotion_pass.cc index c880c4f0e..dc225aea6 100644 --- a/src/alloy/compiler/passes/context_promotion_pass.cc +++ b/src/alloy/compiler/passes/context_promotion_pass.cc @@ -51,6 +51,8 @@ int ContextPromotionPass::Initialize(Compiler* compiler) { } int ContextPromotionPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Like mem2reg, but because context memory is unaliasable it's easier to // check and convert LoadContext/StoreContext into value operations. // Example of load->value promotion: diff --git a/src/alloy/compiler/passes/control_flow_analysis_pass.cc b/src/alloy/compiler/passes/control_flow_analysis_pass.cc index 9c1abf118..5cf6ea6a6 100644 --- a/src/alloy/compiler/passes/control_flow_analysis_pass.cc +++ b/src/alloy/compiler/passes/control_flow_analysis_pass.cc @@ -30,6 +30,8 @@ ControlFlowAnalysisPass::~ControlFlowAnalysisPass() { } int ControlFlowAnalysisPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // TODO(benvanik): reset edges for all blocks? Needed to be re-runnable. // Add edges. diff --git a/src/alloy/compiler/passes/data_flow_analysis_pass.cc b/src/alloy/compiler/passes/data_flow_analysis_pass.cc index 2a44f076d..209410016 100644 --- a/src/alloy/compiler/passes/data_flow_analysis_pass.cc +++ b/src/alloy/compiler/passes/data_flow_analysis_pass.cc @@ -36,6 +36,8 @@ DataFlowAnalysisPass::~DataFlowAnalysisPass() { } int DataFlowAnalysisPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Linearize blocks so that we can detect cycles and propagate dependencies. uint32_t block_count = LinearizeBlocks(builder); diff --git a/src/alloy/compiler/passes/dead_code_elimination_pass.cc b/src/alloy/compiler/passes/dead_code_elimination_pass.cc index d295cebec..afb8d87b2 100644 --- a/src/alloy/compiler/passes/dead_code_elimination_pass.cc +++ b/src/alloy/compiler/passes/dead_code_elimination_pass.cc @@ -23,6 +23,8 @@ DeadCodeEliminationPass::~DeadCodeEliminationPass() { } int DeadCodeEliminationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // ContextPromotion/DSE will likely leave around a lot of dead statements. // Code generated for comparison/testing produces many unused statements and // with proper use analysis it should be possible to remove most of them: diff --git a/src/alloy/compiler/passes/finalization_pass.cc b/src/alloy/compiler/passes/finalization_pass.cc index 7f827da15..e6358f242 100644 --- a/src/alloy/compiler/passes/finalization_pass.cc +++ b/src/alloy/compiler/passes/finalization_pass.cc @@ -30,6 +30,8 @@ FinalizationPass::~FinalizationPass() { } int FinalizationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Process the HIR and prepare it for lowering. // After this is done the HIR should be ready for emitting. diff --git a/src/alloy/compiler/passes/register_allocation_pass.cc b/src/alloy/compiler/passes/register_allocation_pass.cc index a89e1415c..7c3a0a7a9 100644 --- a/src/alloy/compiler/passes/register_allocation_pass.cc +++ b/src/alloy/compiler/passes/register_allocation_pass.cc @@ -59,6 +59,8 @@ RegisterAllocationPass::~RegisterAllocationPass() { } int RegisterAllocationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Simple per-block allocator that operates on SSA form. // Registers do not move across blocks, though this could be // optimized with some intra-block analysis (dominators/etc). diff --git a/src/alloy/compiler/passes/simplification_pass.cc b/src/alloy/compiler/passes/simplification_pass.cc index 14cea8681..7fc53c940 100644 --- a/src/alloy/compiler/passes/simplification_pass.cc +++ b/src/alloy/compiler/passes/simplification_pass.cc @@ -23,6 +23,8 @@ SimplificationPass::~SimplificationPass() { } int SimplificationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + EliminateConversions(builder); SimplifyAssignments(builder); return 0; diff --git a/src/alloy/compiler/passes/validation_pass.cc b/src/alloy/compiler/passes/validation_pass.cc index bc77ab482..265c82fe9 100644 --- a/src/alloy/compiler/passes/validation_pass.cc +++ b/src/alloy/compiler/passes/validation_pass.cc @@ -30,6 +30,8 @@ ValidationPass::~ValidationPass() { } int ValidationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + StringBuffer str; builder->Dump(&str); printf(str.GetString()); diff --git a/src/alloy/compiler/passes/value_reduction_pass.cc b/src/alloy/compiler/passes/value_reduction_pass.cc index 4eb61a38b..94453e294 100644 --- a/src/alloy/compiler/passes/value_reduction_pass.cc +++ b/src/alloy/compiler/passes/value_reduction_pass.cc @@ -53,6 +53,8 @@ void ValueReductionPass::ComputeLastUse(Value* value) { } int ValueReductionPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Walk each block and reuse variable ordinals as much as possible. llvm::BitVector ordinals(builder->max_value_ordinal()); diff --git a/src/alloy/frontend/ppc/ppc_hir_builder.cc b/src/alloy/frontend/ppc/ppc_hir_builder.cc index 1b254ea4e..a8bec8435 100644 --- a/src/alloy/frontend/ppc/ppc_hir_builder.cc +++ b/src/alloy/frontend/ppc/ppc_hir_builder.cc @@ -44,6 +44,8 @@ void PPCHIRBuilder::Reset() { } int PPCHIRBuilder::Emit(FunctionInfo* symbol_info, bool with_debug_info) { + SCOPE_profile_cpu_f("alloy"); + Memory* memory = frontend_->memory(); const uint8_t* p = memory->membase(); diff --git a/src/alloy/frontend/ppc/ppc_scanner.cc b/src/alloy/frontend/ppc/ppc_scanner.cc index f75229b9e..9658bd595 100644 --- a/src/alloy/frontend/ppc/ppc_scanner.cc +++ b/src/alloy/frontend/ppc/ppc_scanner.cc @@ -38,6 +38,8 @@ bool PPCScanner::IsRestGprLr(uint64_t address) { } int PPCScanner::FindExtents(FunctionInfo* symbol_info) { + SCOPE_profile_cpu_f("alloy"); + // This is a simple basic block analyizer. It walks the start address to the // end address looking for branches. Each span of instructions between // branches is considered a basic block. When the last blr (that has no @@ -286,6 +288,8 @@ int PPCScanner::FindExtents(FunctionInfo* symbol_info) { } std::vector PPCScanner::FindBlocks(FunctionInfo* symbol_info) { + SCOPE_profile_cpu_f("alloy"); + Memory* memory = frontend_->memory(); const uint8_t* p = memory->membase(); diff --git a/src/alloy/frontend/ppc/ppc_translator.cc b/src/alloy/frontend/ppc/ppc_translator.cc index 61617db33..4f879336c 100644 --- a/src/alloy/frontend/ppc/ppc_translator.cc +++ b/src/alloy/frontend/ppc/ppc_translator.cc @@ -86,6 +86,8 @@ int PPCTranslator::Translate( FunctionInfo* symbol_info, uint32_t debug_info_flags, Function** out_function) { + SCOPE_profile_cpu_f("alloy"); + // Scan the function to find its extents. We only need to do this if we // haven't already been provided with them from some other source. if (!symbol_info->has_end_address()) { diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc index f93a310e8..158e08224 100644 --- a/src/alloy/hir/hir_builder.cc +++ b/src/alloy/hir/hir_builder.cc @@ -51,6 +51,8 @@ void HIRBuilder::Reset() { } int HIRBuilder::Finalize() { + SCOPE_profile_cpu_f("alloy"); + // Scan blocks in order and add fallthrough branches. These are needed for // analysis passes to work. We may have also added blocks out of order and // need to ensure they fall through in the right order. @@ -141,6 +143,8 @@ void HIRBuilder::DumpOp( } void HIRBuilder::Dump(StringBuffer* str) { + SCOPE_profile_cpu_f("alloy"); + if (attributes_) { str->Append("; attributes = %.8X\n", attributes_); } diff --git a/src/alloy/runtime/entry_table.cc b/src/alloy/runtime/entry_table.cc index cf6da5d70..ebec56ea4 100644 --- a/src/alloy/runtime/entry_table.cc +++ b/src/alloy/runtime/entry_table.cc @@ -75,6 +75,8 @@ Entry::Status EntryTable::GetOrCreate(uint64_t address, Entry** out_entry) { } std::vector EntryTable::FindWithAddress(uint64_t address) { + SCOPE_profile_cpu_f("alloy"); + std::vector fns; LockMutex(lock_); for (auto it = map_.begin(); it != map_.end(); ++it) { diff --git a/src/alloy/runtime/function.cc b/src/alloy/runtime/function.cc index 853808d53..2dd0ddce5 100644 --- a/src/alloy/runtime/function.cc +++ b/src/alloy/runtime/function.cc @@ -74,6 +74,8 @@ Breakpoint* Function::FindBreakpoint(uint64_t address) { } int Function::Call(ThreadState* thread_state, uint64_t return_address) { + SCOPE_profile_cpu_f("alloy"); + ThreadState* original_thread_state = ThreadState::Get(); if (original_thread_state != thread_state) { ThreadState::Bind(thread_state); diff --git a/src/alloy/runtime/module.cc b/src/alloy/runtime/module.cc index ea056e0dd..5e38c3902 100644 --- a/src/alloy/runtime/module.cc +++ b/src/alloy/runtime/module.cc @@ -161,6 +161,8 @@ SymbolInfo::Status Module::DefineVariable(VariableInfo* symbol_info) { } void Module::ForEachFunction(std::function callback) { + SCOPE_profile_cpu_f("alloy"); + LockMutex(lock_); for (auto it = list_.begin(); it != list_.end(); ++it) { SymbolInfo* symbol_info = *it; @@ -174,6 +176,8 @@ void Module::ForEachFunction(std::function callback) { void Module::ForEachFunction(size_t since, size_t& version, std::function callback) { + SCOPE_profile_cpu_f("alloy"); + LockMutex(lock_); size_t count = list_.size(); version = count; diff --git a/src/alloy/runtime/runtime.cc b/src/alloy/runtime/runtime.cc index 3fc45a447..1aff92e04 100644 --- a/src/alloy/runtime/runtime.cc +++ b/src/alloy/runtime/runtime.cc @@ -159,6 +159,8 @@ std::vector Runtime::FindFunctionsWithAddress(uint64_t address) { } int Runtime::ResolveFunction(uint64_t address, Function** out_function) { + SCOPE_profile_cpu_f("alloy"); + *out_function = NULL; Entry* entry; Entry::Status status = entry_table_.GetOrCreate(address, &entry); @@ -192,6 +194,8 @@ int Runtime::ResolveFunction(uint64_t address, Function** out_function) { int Runtime::LookupFunctionInfo( uint64_t address, FunctionInfo** out_symbol_info) { + SCOPE_profile_cpu_f("alloy"); + *out_symbol_info = NULL; // TODO(benvanik): fast reject invalid addresses/log errors. @@ -220,6 +224,8 @@ int Runtime::LookupFunctionInfo( int Runtime::LookupFunctionInfo(Module* module, uint64_t address, FunctionInfo** out_symbol_info) { + SCOPE_profile_cpu_f("alloy"); + // Atomic create/lookup symbol in module. // If we get back the NEW flag we must declare it now. FunctionInfo* symbol_info = NULL; @@ -241,6 +247,8 @@ int Runtime::LookupFunctionInfo(Module* module, uint64_t address, int Runtime::DemandFunction( FunctionInfo* symbol_info, Function** out_function) { + SCOPE_profile_cpu_f("alloy"); + *out_function = NULL; // Lock function for generation. If it's already being generated diff --git a/src/xenia/apu/audio_system.cc b/src/xenia/apu/audio_system.cc index 1793fc92d..144d6bb15 100644 --- a/src/xenia/apu/audio_system.cc +++ b/src/xenia/apu/audio_system.cc @@ -82,21 +82,26 @@ void AudioSystem::ThreadStart() { if (result == WAIT_FAILED) { DWORD err = GetLastError(); XEASSERTALWAYS(); + break; } + size_t pumped = 0; - if (result >= WAIT_OBJECT_0 && result <= WAIT_OBJECT_0 + (maximum_client_count_ - 1)) { - size_t index = result - WAIT_OBJECT_0; - do { - xe_mutex_lock(lock_); - uint32_t client_callback = clients_[index].callback; - uint32_t client_callback_arg = clients_[index].wrapped_callback_arg; - xe_mutex_unlock(lock_); - if (client_callback) { - processor->Execute(thread_state_, client_callback, client_callback_arg, 0); - } - pumped++; - index++; - } while (index < maximum_client_count_ && WaitForSingleObject(client_wait_handles_[index], 0) == WAIT_OBJECT_0); + { + SCOPE_profile_cpu_i("apu", "Pump"); + if (result >= WAIT_OBJECT_0 && result <= WAIT_OBJECT_0 + (maximum_client_count_ - 1)) { + size_t index = result - WAIT_OBJECT_0; + do { + xe_mutex_lock(lock_); + uint32_t client_callback = clients_[index].callback; + uint32_t client_callback_arg = clients_[index].wrapped_callback_arg; + xe_mutex_unlock(lock_); + if (client_callback) { + processor->Execute(thread_state_, client_callback, client_callback_arg, 0); + } + pumped++; + index++; + } while (index < maximum_client_count_ && WaitForSingleObject(client_wait_handles_[index], 0) == WAIT_OBJECT_0); + } } if (!running_) { @@ -104,6 +109,7 @@ void AudioSystem::ThreadStart() { } if (!pumped) { + SCOPE_profile_cpu_i("apu", "Sleep"); Sleep(500); } } @@ -126,6 +132,8 @@ void AudioSystem::Shutdown() { X_STATUS AudioSystem::RegisterClient( uint32_t callback, uint32_t callback_arg, size_t* out_index) { + SCOPE_profile_cpu_f("apu"); + XEASSERTTRUE(unused_clients_.size()); xe_mutex_lock(lock_); @@ -157,6 +165,8 @@ X_STATUS AudioSystem::RegisterClient( } void AudioSystem::SubmitFrame(size_t index, uint32_t samples_ptr) { + SCOPE_profile_cpu_f("apu"); + xe_mutex_lock(lock_); XEASSERTTRUE(index < maximum_client_count_); XEASSERTTRUE(clients_[index].driver != NULL); @@ -166,6 +176,8 @@ void AudioSystem::SubmitFrame(size_t index, uint32_t samples_ptr) { } void AudioSystem::UnregisterClient(size_t index) { + SCOPE_profile_cpu_f("apu"); + xe_mutex_lock(lock_); XEASSERTTRUE(index < maximum_client_count_); DestroyDriver(clients_[index].driver); diff --git a/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc b/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc index 0155753c4..6f2cd6659 100644 --- a/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc +++ b/src/xenia/apu/xaudio2/xaudio2_audio_driver.cc @@ -121,6 +121,8 @@ void XAudio2AudioDriver::Initialize() { } void XAudio2AudioDriver::SubmitFrame(uint32_t frame_ptr) { + SCOPE_profile_cpu_f("apu"); + // Process samples! They are big-endian floats. HRESULT hr; diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index b77664482..0c780ce22 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -147,6 +147,8 @@ void Processor::AddRegisterAccessCallbacks( } int Processor::Execute(XenonThreadState* thread_state, uint64_t address) { + SCOPE_profile_cpu_f("cpu"); + // Attempt to get the function. Function* fn; if (runtime_->ResolveFunction(address, &fn)) { @@ -171,6 +173,8 @@ int Processor::Execute(XenonThreadState* thread_state, uint64_t address) { uint64_t Processor::Execute( XenonThreadState* thread_state, uint64_t address, uint64_t arg0) { + SCOPE_profile_cpu_f("cpu"); + PPCContext* context = thread_state->context(); context->r[3] = arg0; if (Execute(thread_state, address)) { @@ -182,6 +186,8 @@ uint64_t Processor::Execute( uint64_t Processor::Execute( XenonThreadState* thread_state, uint64_t address, uint64_t arg0, uint64_t arg1) { + SCOPE_profile_cpu_f("cpu"); + PPCContext* context = thread_state->context(); context->r[3] = arg0; context->r[4] = arg1; diff --git a/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc b/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc index 5984631fe..ba677f7a0 100644 --- a/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc +++ b/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc @@ -34,6 +34,8 @@ D3D11GeometryShader::~D3D11GeometryShader() { } int D3D11GeometryShader::Prepare(D3D11VertexShader* vertex_shader) { + SCOPE_profile_cpu_f("gpu"); + if (handle_) { return 0; } @@ -74,6 +76,8 @@ int D3D11GeometryShader::Prepare(D3D11VertexShader* vertex_shader) { } ID3D10Blob* D3D11GeometryShader::Compile(const char* shader_source) { + SCOPE_profile_cpu_f("gpu"); + // TODO(benvanik): pick shared runtime mode defines. D3D10_SHADER_MACRO defines[] = { "TEST_DEFINE", "1", @@ -161,6 +165,7 @@ D3D11PointSpriteGeometryShader::~D3D11PointSpriteGeometryShader() { int D3D11PointSpriteGeometryShader::Generate(D3D11VertexShader* vertex_shader, alloy::StringBuffer* output) { + SCOPE_profile_cpu_f("gpu"); if (D3D11GeometryShader::Generate(vertex_shader, output)) { return 1; } @@ -215,6 +220,7 @@ D3D11RectListGeometryShader::~D3D11RectListGeometryShader() { int D3D11RectListGeometryShader::Generate(D3D11VertexShader* vertex_shader, alloy::StringBuffer* output) { + SCOPE_profile_cpu_f("gpu"); if (D3D11GeometryShader::Generate(vertex_shader, output)) { return 1; } @@ -259,6 +265,7 @@ D3D11QuadListGeometryShader::~D3D11QuadListGeometryShader() { int D3D11QuadListGeometryShader::Generate(D3D11VertexShader* vertex_shader, alloy::StringBuffer* output) { + SCOPE_profile_cpu_f("gpu"); if (D3D11GeometryShader::Generate(vertex_shader, output)) { return 1; } diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc index 25410bf6f..11518f71c 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc @@ -190,6 +190,8 @@ void D3D11GraphicsDriver::SetShader( } int D3D11GraphicsDriver::SetupDraw(XE_GPU_PRIMITIVE_TYPE prim_type) { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; // Ignore copies. @@ -296,6 +298,8 @@ void D3D11GraphicsDriver::DrawIndexBuffer( XE_GPU_PRIMITIVE_TYPE prim_type, bool index_32bit, uint32_t index_count, uint32_t index_base, uint32_t index_size, uint32_t endianness) { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; XETRACED3D("D3D11: draw indexed %d (%d indicies) from %.8X", @@ -321,6 +325,8 @@ void D3D11GraphicsDriver::DrawIndexBuffer( void D3D11GraphicsDriver::DrawIndexAuto( XE_GPU_PRIMITIVE_TYPE prim_type, uint32_t index_count) { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; XETRACED3D("D3D11: draw indexed %d (%d indicies)", @@ -346,6 +352,8 @@ int D3D11GraphicsDriver::RebuildRenderTargets( return 0; } + SCOPE_profile_cpu_f("gpu"); + // Remove old versions. for (int n = 0; n < XECOUNT(render_targets_.color_buffers); n++) { auto& cb = render_targets_.color_buffers[n]; @@ -426,6 +434,8 @@ int D3D11GraphicsDriver::RebuildRenderTargets( } int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { + SCOPE_profile_cpu_f("gpu"); + // Most information comes from here: // https://chromium.googlesource.com/chromiumos/third_party/mesa/+/6173cc19c45d92ef0b7bc6aa008aa89bb29abbda/src/gallium/drivers/freedreno/freedreno_zsa.c // http://cgit.freedesktop.org/mesa/mesa/diff/?id=aac7f06ad843eaa696363e8e9c7781ca30cb4914 @@ -768,6 +778,8 @@ int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { } int D3D11GraphicsDriver::UpdateConstantBuffers() { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; D3D11_MAPPED_SUBRESOURCE res; @@ -799,6 +811,8 @@ int D3D11GraphicsDriver::UpdateConstantBuffers() { } int D3D11GraphicsDriver::BindShaders() { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; xe_gpu_program_cntl_t program_cntl; program_cntl.dword_0 = rf.values[XE_GPU_REG_SQ_PROGRAM_CNTL].u32; @@ -892,6 +906,8 @@ int D3D11GraphicsDriver::BindShaders() { } int D3D11GraphicsDriver::PrepareFetchers() { + SCOPE_profile_cpu_f("gpu"); + // Input assembly. XEASSERTNOTNULL(state_.vertex_shader); auto vtx_inputs = state_.vertex_shader->GetVertexBufferInputs(); @@ -934,6 +950,8 @@ int D3D11GraphicsDriver::PrepareFetchers() { } int D3D11GraphicsDriver::PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc) { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (desc.fetch_slot / 3) * 6; xe_gpu_fetch_group_t* group = (xe_gpu_fetch_group_t*)&rf.values[r]; @@ -1009,6 +1027,8 @@ int D3D11GraphicsDriver::PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc) { } int D3D11GraphicsDriver::PrepareTextureFetchers() { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; for (int n = 0; n < XECOUNT(state_.texture_fetchers); n++) { @@ -1275,6 +1295,8 @@ int D3D11GraphicsDriver::FetchTexture1D( xe_gpu_texture_fetch_t& fetch, TextureInfo& info, ID3D11Resource** out_texture) { + SCOPE_profile_cpu_f("gpu"); + uint32_t address = (fetch.address << 12) + address_translation_; uint32_t width = 1 + fetch.size_1d.width; @@ -1299,6 +1321,8 @@ int D3D11GraphicsDriver::FetchTexture1D( } XEFORCEINLINE void TextureSwap(uint8_t* dest, const uint8_t* src, uint32_t pitch, XE_GPU_ENDIAN endianness) { + SCOPE_profile_cpu_f("gpu"); + switch (endianness) { case XE_GPU_ENDIAN_8IN16: for (uint32_t i = 0; i < pitch; i += 2, src += 2, dest += 2) { @@ -1344,6 +1368,8 @@ int D3D11GraphicsDriver::FetchTexture2D( xe_gpu_texture_fetch_t& fetch, TextureInfo& info, ID3D11Resource** out_texture) { + SCOPE_profile_cpu_f("gpu"); + XEASSERTTRUE(fetch.dimension == 1); uint32_t address = (fetch.address << 12) + address_translation_; @@ -1448,6 +1474,8 @@ int D3D11GraphicsDriver::FetchTexture3D( xe_gpu_texture_fetch_t& fetch, TextureInfo& info, ID3D11Resource** out_texture) { + SCOPE_profile_cpu_f("gpu"); + XELOGE("D3D11: FetchTexture2D not yet implemented"); XEASSERTALWAYS(); return 1; @@ -1470,6 +1498,8 @@ int D3D11GraphicsDriver::FetchTextureCube( xe_gpu_texture_fetch_t& fetch, TextureInfo& info, ID3D11Resource** out_texture) { + SCOPE_profile_cpu_f("gpu"); + XELOGE("D3D11: FetchTextureCube not yet implemented"); XEASSERTALWAYS(); return 1; @@ -1477,6 +1507,7 @@ int D3D11GraphicsDriver::FetchTextureCube( int D3D11GraphicsDriver::PrepareTextureSampler( xenos::XE_GPU_SHADER_TYPE shader_type, Shader::tex_buffer_desc_t& desc) { + SCOPE_profile_cpu_f("gpu"); auto& fetcher = state_.texture_fetchers[desc.fetch_slot]; auto& info = fetcher.info; @@ -1588,6 +1619,8 @@ int D3D11GraphicsDriver::PrepareTextureSampler( int D3D11GraphicsDriver::PrepareIndexBuffer( bool index_32bit, uint32_t index_count, uint32_t index_base, uint32_t index_size, uint32_t endianness) { + SCOPE_profile_cpu_f("gpu"); + RegisterFile& rf = register_file_; uint32_t address = index_base + address_translation_; @@ -1634,6 +1667,8 @@ int D3D11GraphicsDriver::PrepareIndexBuffer( } int D3D11GraphicsDriver::Resolve() { + SCOPE_profile_cpu_f("gpu"); + // No clue how this is supposed to work yet. ID3D11Texture2D* back_buffer = 0; swap_chain_->GetBuffer(0, __uuidof(ID3D11Texture2D), diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc index 03c91038c..825e9f1ff 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc @@ -29,6 +29,7 @@ void __stdcall D3D11GraphicsSystemVsyncCallback( thread_name_set = true; Profiler::ThreadEnter("VsyncTimer"); } + SCOPE_profile_cpu_f("gpu"); gs->MarkVblank(); gs->DispatchInterruptCallback(0); @@ -151,6 +152,8 @@ void D3D11GraphicsSystem::Initialize() { } void D3D11GraphicsSystem::Pump() { + SCOPE_profile_cpu_f("gpu"); + if (swap_pending_) { swap_pending_ = false; diff --git a/src/xenia/gpu/d3d11/d3d11_shader.cc b/src/xenia/gpu/d3d11/d3d11_shader.cc index a60a7bdf3..97e0cb295 100644 --- a/src/xenia/gpu/d3d11/d3d11_shader.cc +++ b/src/xenia/gpu/d3d11/d3d11_shader.cc @@ -145,6 +145,8 @@ void D3D11Shader::set_translated_src(char* value) { } ID3D10Blob* D3D11Shader::Compile(const char* shader_source) { + SCOPE_profile_cpu_f("gpu"); + // TODO(benvanik): pick shared runtime mode defines. D3D10_SHADER_MACRO defines[] = { "TEST_DEFINE", "1", @@ -256,6 +258,7 @@ D3D11VertexShader::~D3D11VertexShader() { } int D3D11VertexShader::Prepare(xe_gpu_program_cntl_t* program_cntl) { + SCOPE_profile_cpu_f("gpu"); if (handle_) { return 0; } @@ -411,6 +414,8 @@ int D3D11VertexShader::Prepare(xe_gpu_program_cntl_t* program_cntl) { } const char* D3D11VertexShader::Translate(xe_gpu_program_cntl_t* program_cntl) { + SCOPE_profile_cpu_f("gpu"); + Output* output = new Output(); xe_gpu_translate_ctx_t ctx; ctx.output = output; @@ -599,6 +604,7 @@ D3D11PixelShader::~D3D11PixelShader() { int D3D11PixelShader::Prepare(xe_gpu_program_cntl_t* program_cntl, D3D11VertexShader* input_shader) { + SCOPE_profile_cpu_f("gpu"); if (handle_) { return 0; } @@ -641,6 +647,7 @@ int D3D11PixelShader::Prepare(xe_gpu_program_cntl_t* program_cntl, const char* D3D11PixelShader::Translate( xe_gpu_program_cntl_t* program_cntl, D3D11VertexShader* input_shader) { + SCOPE_profile_cpu_f("gpu"); Output* output = new Output(); xe_gpu_translate_ctx_t ctx; ctx.output = output; diff --git a/src/xenia/gpu/d3d11/d3d11_shader_cache.cc b/src/xenia/gpu/d3d11/d3d11_shader_cache.cc index 7f6a5a722..be9352b50 100644 --- a/src/xenia/gpu/d3d11/d3d11_shader_cache.cc +++ b/src/xenia/gpu/d3d11/d3d11_shader_cache.cc @@ -31,6 +31,7 @@ Shader* D3D11ShaderCache::CreateCore( xenos::XE_GPU_SHADER_TYPE type, const uint8_t* src_ptr, size_t length, uint64_t hash) { + SCOPE_profile_cpu_f("gpu"); switch (type) { case XE_GPU_SHADER_TYPE_VERTEX: return new D3D11VertexShader( diff --git a/src/xenia/gpu/d3d11/d3d11_window.cc b/src/xenia/gpu/d3d11/d3d11_window.cc index 64a9b0df6..da33ab6bb 100644 --- a/src/xenia/gpu/d3d11/d3d11_window.cc +++ b/src/xenia/gpu/d3d11/d3d11_window.cc @@ -114,6 +114,8 @@ int D3D11Window::Initialize(const char* title, uint32_t width, uint32_t height) } void D3D11Window::Swap() { + SCOPE_profile_cpu_f("gpu"); + // Present profiler. context_->OMSetRenderTargets(1, &render_target_view_, NULL); Profiler::Present(); diff --git a/src/xenia/gpu/ring_buffer_worker.cc b/src/xenia/gpu/ring_buffer_worker.cc index 3792c0b61..9999601bb 100644 --- a/src/xenia/gpu/ring_buffer_worker.cc +++ b/src/xenia/gpu/ring_buffer_worker.cc @@ -125,6 +125,8 @@ void RingBufferWorker::Pump() { void RingBufferWorker::ExecutePrimaryBuffer( uint32_t start_index, uint32_t end_index) { + SCOPE_profile_cpu_f("gpu"); + // Adjust pointer base. uint32_t ptr = primary_buffer_ptr_ + start_index * 4; ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (ptr & 0x1FFFFFFF); diff --git a/src/xenia/gpu/shader_cache.cc b/src/xenia/gpu/shader_cache.cc index 9aee3e2b7..33033bc36 100644 --- a/src/xenia/gpu/shader_cache.cc +++ b/src/xenia/gpu/shader_cache.cc @@ -55,6 +55,8 @@ Shader* ShaderCache::Find( Shader* ShaderCache::FindOrCreate( XE_GPU_SHADER_TYPE type, const uint8_t* src_ptr, size_t length) { + SCOPE_profile_cpu_f("gpu"); + uint64_t hash = Hash(src_ptr, length); unordered_map::iterator it = map_.find(hash); if (it != map_.end()) { diff --git a/src/xenia/hid/input_system.cc b/src/xenia/hid/input_system.cc index b82ca11af..6ad1ab177 100644 --- a/src/xenia/hid/input_system.cc +++ b/src/xenia/hid/input_system.cc @@ -42,6 +42,8 @@ void InputSystem::AddDriver(InputDriver* driver) { X_RESULT InputSystem::GetCapabilities( uint32_t user_index, uint32_t flags, X_INPUT_CAPABILITIES& out_caps) { + SCOPE_profile_cpu_f("hid"); + for (auto it = drivers_.begin(); it != drivers_.end(); ++it) { InputDriver* driver = *it; if (XSUCCEEDED(driver->GetCapabilities(user_index, flags, out_caps))) { @@ -52,6 +54,8 @@ X_RESULT InputSystem::GetCapabilities( } X_RESULT InputSystem::GetState(uint32_t user_index, X_INPUT_STATE& out_state) { + SCOPE_profile_cpu_f("hid"); + for (auto it = drivers_.begin(); it != drivers_.end(); ++it) { InputDriver* driver = *it; if (driver->GetState(user_index, out_state) == X_ERROR_SUCCESS) { @@ -63,6 +67,8 @@ X_RESULT InputSystem::GetState(uint32_t user_index, X_INPUT_STATE& out_state) { X_RESULT InputSystem::SetState( uint32_t user_index, X_INPUT_VIBRATION& vibration) { + SCOPE_profile_cpu_f("hid"); + for (auto it = drivers_.begin(); it != drivers_.end(); ++it) { InputDriver* driver = *it; if (XSUCCEEDED(driver->SetState(user_index, vibration))) { @@ -74,6 +80,8 @@ X_RESULT InputSystem::SetState( X_RESULT InputSystem::GetKeystroke( uint32_t user_index, uint32_t flags, X_INPUT_KEYSTROKE& out_keystroke) { + SCOPE_profile_cpu_f("hid"); + for (auto it = drivers_.begin(); it != drivers_.end(); ++it) { InputDriver* driver = *it; if (XSUCCEEDED(driver->GetKeystroke(user_index, flags, out_keystroke))) {