diff --git a/README.md b/README.md
index 63525bece..e0460e25c 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,7 @@
 Xenia - Xbox 360 Emulator Research Project
 ==========================================
-Xenia is an experimental emulator for the Xbox 360. It does not run games (yet),
-and if you are unable to understand that please leave now.
+Xenia is an experimental emulator for the Xbox 360. It does not run games (yet).
 
 Pull requests are welcome but the code is in a very high churn state and may not
 be accepted, so ask in IRC before taking on anything big. Contributions are
@@ -54,7 +53,7 @@ See [building](docs/building.md) for setup and information about the
 
 Have some spare time, know advanced C++, and want to write an emulator?
 Contribute! There's a ton of work that needs to be done, a lot of which
-is wide open greenfield fun.
+is wide open greenfield fun.
 That said, the project is currently undergoing a lot of major foundational
 development and core pieces are changing rapidly and are poorly documented.
 
@@ -64,12 +63,12 @@ doing.
 
 Fixes and optimizations are always welcome (please!), but in addition to
 that there are some major work areas still untouched:
+* Help work through missing functionality/bugs in game [compat](https://github.com/benvanik/xenia/issues?labels=compat)
 * Write an [OpenGL driver](https://github.com/benvanik/xenia/issues/59)
 * Add input drivers for [OSX](https://github.com/benvanik/xenia/issues/61) and
   [PS4 controllers](https://github.com/benvanik/xenia/issues/60) (or anything else)
 * Start [hacking on audio](https://github.com/benvanik/xenia/issues/62)
-* Support [loading of PIRS files](https://github.com/benvanik/xenia/issues/63)
 * Build a [virtual LIVE service](https://github.com/benvanik/xenia/issues/64)
-
+
 See more projects [good for contributors](https://github.com/benvanik/xenia/issues?labels=good+for+contributors&page=1&state=open).
 
 It's a good idea to ask on IRC/the bugs before beginning work on something.
@@ -85,11 +84,9 @@ Come on people. Jeez.
 
 ### What kind of machine do I need to run this?
 
-You'll need 64-bit Windows 7 with a processor supporting at least SSE4.
-It's only tested on Windows 8 and that may become a requirement as several of
-the APIs exposed there are beneficial to emulation. In general if you have to
-ask if your machine is good enough to run games at a decent speed the answer is
-no.
+You'll need 64-bit Windows 8 with a processor supporting at least AVX2 - in
+other words, a Haswell. In general if you have to ask if your machine is good
+enough to run games at a decent speed the answer is no.
 
 ### What about Linux/OSX?
 
@@ -108,7 +105,7 @@ be required in the future.
 
 I get asked this about once a day. Yes, I have heard of them. In fact, I
 spent a long time trying them out:
-[LLVM](https://github.com/benvanik/xenia/tree/85bdbd24d1b5923cfb104f45194a96e7ac57026e/src/xenia/cpu/codegen),
+[LLVM](https://github.com/benvanik/xenia/tree/85bdbd24d1b5923cfb104f45194a96e7ac57026e/src/xenia/cpu/codegen),
 [libjit](https://github.com/benvanik/xenia/tree/eee856be0499a4bc721b6097f5f2b9446929f2cc/src/xenia/cpu/libjit),
 [asmjit](https://github.com/benvanik/xenia/tree/ca208fa60a0285d396409743064784cc2320c094/src/xenia/cpu/x64).
 They don't work for this purpose.
 I understand if you disagree, but please
diff --git a/src/alloy/alloy-private.h b/src/alloy/alloy-private.h
index 76b59e381..a22be71c4 100644
--- a/src/alloy/alloy-private.h
+++ b/src/alloy/alloy-private.h
@@ -18,6 +18,11 @@
 DECLARE_bool(debug);
 DECLARE_bool(always_disasm);
 
+DECLARE_bool(validate_hir);
+
+DECLARE_uint64(break_on_instruction);
+DECLARE_uint64(break_on_memory);
+
 namespace alloy {
diff --git a/src/alloy/alloy.cc b/src/alloy/alloy.cc
index 1fd5261c0..bae955976 100644
--- a/src/alloy/alloy.cc
+++ b/src/alloy/alloy.cc
@@ -21,6 +21,14 @@ using namespace alloy;
 
 DEFINE_bool(debug, DEFAULT_DEBUG_FLAG,
     "Allow debugging and retain debug information.");
-
 DEFINE_bool(always_disasm, false,
     "Always add debug info to functions, even when no debugger is attached.");
+
+DEFINE_bool(validate_hir, false,
+    "Perform validation checks on the HIR during compilation.");
+
+// Breakpoints:
+DEFINE_uint64(break_on_instruction, 0,
+    "int3 before the given guest address is executed.");
+DEFINE_uint64(break_on_memory, 0,
+    "int3 on read/write to the given memory address.");
diff --git a/src/alloy/backend/backend.cc b/src/alloy/backend/backend.cc
index 2f6531fb5..d49fb713e 100644
--- a/src/alloy/backend/backend.cc
+++ b/src/alloy/backend/backend.cc
@@ -18,6 +18,7 @@ using namespace alloy::runtime;
 
 Backend::Backend(Runtime* runtime) :
     runtime_(runtime) {
+  xe_zero_struct(&machine_info_, sizeof(machine_info_));
 }
 
 Backend::~Backend() {
diff --git a/src/alloy/backend/backend.h b/src/alloy/backend/backend.h
index 885844d3f..b6c2c431e 100644
--- a/src/alloy/backend/backend.h
+++ b/src/alloy/backend/backend.h
@@ -11,6 +11,7 @@
 #define ALLOY_BACKEND_BACKEND_H_
 
 #include <alloy/core.h>
+#include <alloy/backend/machine_info.h>
 
 namespace alloy { namespace runtime { class Runtime; } }
 
@@ -27,6 +28,7 @@ public:
   virtual ~Backend();
 
   runtime::Runtime* runtime() const { return runtime_; }
+  const MachineInfo* machine_info() const { return &machine_info_; }
 
   virtual int Initialize();
 
@@ -37,6 +39,7 @@ protected:
   runtime::Runtime* runtime_;
+  MachineInfo machine_info_;
 };
diff --git a/src/alloy/backend/ivm/ivm_assembler.cc b/src/alloy/backend/ivm/ivm_assembler.cc
index d2b08b964..0431f7ab2 100644
--- a/src/alloy/backend/ivm/ivm_assembler.cc
+++ b/src/alloy/backend/ivm/ivm_assembler.cc
@@ -61,7 +61,6 @@ int IVMAssembler::Assemble(
   fn->set_debug_info(debug_info);
 
   TranslationContext ctx;
-  ctx.access_callbacks = backend_->runtime()->access_callbacks();
   ctx.register_count = 0;
   ctx.intcode_count = 0;
   ctx.intcode_arena = &intcode_arena_;
@@ -74,6 +73,19 @@ int IVMAssembler::Assemble(
   builder->ResetLabelTags();
 
   // Function prologue.
+  size_t stack_offset = 0;
+  auto locals = builder->locals();
+  for (auto it = locals.begin(); it != locals.end(); ++it) {
+    auto slot = *it;
+    size_t type_size = GetTypeSize(slot->type);
+    // Align to natural size.
+    stack_offset = XEALIGN(stack_offset, type_size);
+    slot->set_constant((uint32_t)stack_offset);
+    stack_offset += type_size;
+  }
+  // Ensure 16b alignment.
+  stack_offset = XEALIGN(stack_offset, 16);
+  ctx.stack_size = stack_offset;
 
   auto block = builder->first_block();
   while (block) {
@@ -96,7 +108,7 @@ int IVMAssembler::Assemble(
   // Fixup label references.
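The prologue added above gives every HIR local a slot in a flat per-function frame: each slot is rounded up to its natural alignment, the resulting offset is baked onto the value as a constant, and the total is padded to 16 bytes so v128 slots stay aligned. A minimal sketch of that round-up arithmetic, assuming XEALIGN is the usual power-of-two round-up (its definition is outside this diff):

```cpp
#include <cstddef>
#include <cstdio>

// Round value up to the next multiple of align (align must be a power of
// two) - the usual expansion of an XEALIGN-style macro.
constexpr size_t Align(size_t value, size_t align) {
  return (value + align - 1) & ~(align - 1);
}

int main() {
  // Laying out i8, i16, i32, and v128 locals, in that order.
  size_t stack_offset = 0;
  const size_t sizes[] = {1, 2, 4, 16};
  for (size_t size : sizes) {
    stack_offset = Align(stack_offset, size);  // align slot to natural size
    printf("slot of size %zu at offset %zu\n", size, stack_offset);
    stack_offset += size;
  }
  stack_offset = Align(stack_offset, 16);  // final 16-byte frame alignment
  printf("stack_size = %zu\n", stack_offset);  // 32
}
```

CallImpl then reserves ctx.stack_size bytes per invocation (see the alloca in the hunks below), and the new LOAD_LOCAL/STORE_LOCAL handlers index into that buffer with the baked-in offsets.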
LabelRef* label_ref = ctx.label_ref_head; while (label_ref) { - label_ref->instr->src1_reg = (uint32_t)label_ref->label->tag & ~0x80000000; + label_ref->instr->src1_reg = (uint32_t)(intptr_t)label_ref->label->tag & ~0x80000000; label_ref = label_ref->next; } diff --git a/src/alloy/backend/ivm/ivm_backend.cc b/src/alloy/backend/ivm/ivm_backend.cc index bb2a42f67..67703f7d4 100644 --- a/src/alloy/backend/ivm/ivm_backend.cc +++ b/src/alloy/backend/ivm/ivm_backend.cc @@ -34,6 +34,20 @@ int IVMBackend::Initialize() { return result; } + machine_info_.register_sets[0] = { + 0, + "gpr", + MachineInfo::RegisterSet::INT_TYPES, + 16, + }; + machine_info_.register_sets[1] = { + 1, + "vec", + MachineInfo::RegisterSet::FLOAT_TYPES | + MachineInfo::RegisterSet::VEC_TYPES, + 16, + }; + alloy::tracing::WriteEvent(EventType::Init({ })); diff --git a/src/alloy/backend/ivm/ivm_function.cc b/src/alloy/backend/ivm/ivm_function.cc index c4c0d97f9..88306b228 100644 --- a/src/alloy/backend/ivm/ivm_function.cc +++ b/src/alloy/backend/ivm/ivm_function.cc @@ -23,7 +23,7 @@ using namespace alloy::runtime; IVMFunction::IVMFunction(FunctionInfo* symbol_info) : register_count_(0), intcode_count_(0), intcodes_(0), source_map_count_(0), source_map_(0), - GuestFunction(symbol_info) { + Function(symbol_info) { } IVMFunction::~IVMFunction() { @@ -33,6 +33,7 @@ IVMFunction::~IVMFunction() { void IVMFunction::Setup(TranslationContext& ctx) { register_count_ = ctx.register_count; + stack_size_ = ctx.stack_size; intcode_count_ = ctx.intcode_count; intcodes_ = (IntCode*)ctx.intcode_arena->CloneContents(); source_map_count_ = ctx.source_map_count; @@ -104,22 +105,25 @@ void IVMFunction::OnBreakpointHit(ThreadState* thread_state, IntCode* i) { #undef TRACE_SOURCE_OFFSET -int IVMFunction::CallImpl(ThreadState* thread_state) { +int IVMFunction::CallImpl(ThreadState* thread_state, uint64_t return_address) { // Setup register file on stack. 
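As the CallImpl hunk continuing below shows, the locals buffer is carved out with alloca on each call, so recursive guest calls each get a fresh frame that disappears when the host frame unwinds. A toy illustration (the 32-byte size and offset 0 are made up; the real size is the stack_size_ computed by the assembler):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#ifdef _WIN32
#include <malloc.h>  // alloca on MSVC
#else
#include <alloca.h>
#endif

int main() {
  const size_t stack_size = 32;
  // Per-call local frame, as in the CallImpl change.
  uint8_t* locals = (uint8_t*)alloca(stack_size);
  *(uint32_t*)(locals + 0) = 42;             // what STORE_LOCAL i32 does
  printf("%u\n", *(uint32_t*)(locals + 0));  // what LOAD_LOCAL i32 reads
}
```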
auto stack = (IVMStack*)thread_state->backend_data(); auto register_file = (Register*)stack->Alloc(register_count_); + auto local_stack = (uint8_t*)alloca(stack_size_); Memory* memory = thread_state->memory(); IntCodeState ics; ics.rf = register_file; + ics.locals = local_stack; ics.context = (uint8_t*)thread_state->raw_context(); ics.membase = memory->membase(); - ics.reserve_address = memory->reserve_address(); + ics.page_table = ics.membase + memory->page_table(); ics.did_carry = 0; ics.did_saturate = 0; - ics.access_callbacks = thread_state->runtime()->access_callbacks(); ics.thread_state = thread_state; + ics.return_address = return_address; + ics.call_return_address = 0; volatile int* suspend_flag_address = thread_state->suspend_flag_address(); diff --git a/src/alloy/backend/ivm/ivm_function.h b/src/alloy/backend/ivm/ivm_function.h index 7fee49db0..0169ee5b1 100644 --- a/src/alloy/backend/ivm/ivm_function.h +++ b/src/alloy/backend/ivm/ivm_function.h @@ -21,7 +21,7 @@ namespace backend { namespace ivm { -class IVMFunction : public runtime::GuestFunction { +class IVMFunction : public runtime::Function { public: IVMFunction(runtime::FunctionInfo* symbol_info); virtual ~IVMFunction(); @@ -31,16 +31,18 @@ public: protected: virtual int AddBreakpointImpl(runtime::Breakpoint* breakpoint); virtual int RemoveBreakpointImpl(runtime::Breakpoint* breakpoint); - virtual int CallImpl(runtime::ThreadState* thread_state); + virtual int CallImpl(runtime::ThreadState* thread_state, + uint64_t return_address); private: IntCode* GetIntCodeAtSourceOffset(uint64_t offset); void OnBreakpointHit(runtime::ThreadState* thread_state, IntCode* i); private: - size_t register_count_; - size_t intcode_count_; - IntCode* intcodes_; + size_t register_count_; + size_t stack_size_; + size_t intcode_count_; + IntCode* intcodes_; size_t source_map_count_; SourceMapEntry* source_map_; }; diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 7100deaa8..1badeab7e 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -196,213 +196,6 @@ int DispatchToC(TranslationContext& ctx, Instr* i, IntCodeFn fn) { return 0; } -uint32_t IntCode_LOAD_REGISTER_I8(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i8 = (int8_t)cbs->read(cbs->context, address); - return IA_NEXT; -} -uint32_t IntCode_LOAD_REGISTER_I16(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i16 = XESWAP16((int16_t)cbs->read(cbs->context, address)); - return IA_NEXT; -} -uint32_t IntCode_LOAD_REGISTER_I32(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i32 = XESWAP32((int32_t)cbs->read(cbs->context, address)); - return IA_NEXT; -} -uint32_t IntCode_LOAD_REGISTER_I64(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src2_reg | ((uint64_t)i->src3_reg << 32)); - ics.rf[i->dest_reg].i64 = XESWAP64((int64_t)cbs->read(cbs->context, address)); - return IA_NEXT; -} -int DispatchRegisterRead( - 
TranslationContext& ctx, Instr* i, RegisterAccessCallbacks* cbs) { - static IntCodeFn fns[] = { - IntCode_LOAD_REGISTER_I8, - IntCode_LOAD_REGISTER_I16, - IntCode_LOAD_REGISTER_I32, - IntCode_LOAD_REGISTER_I64, - IntCode_INVALID_TYPE, - IntCode_INVALID_TYPE, - IntCode_INVALID_TYPE, - }; - IntCodeFn fn = fns[i->dest->type]; - XEASSERT(fn != IntCode_INVALID_TYPE); - uint32_t dest_reg = AllocDynamicRegister(ctx, i->dest); - uint32_t src1_reg = AllocOpRegister(ctx, OPCODE_SIG_TYPE_V, &i->src1); - ctx.intcode_count++; - IntCode* ic = ctx.intcode_arena->Alloc(); - ic->intcode_fn = fn; - ic->flags = i->flags; - ic->debug_flags = 0; - ic->dest_reg = dest_reg; - ic->src1_reg = src1_reg; - ic->src2_reg = (uint32_t)((uint64_t)cbs); - ic->src3_reg = (uint32_t)(((uint64_t)cbs) >> 32); - return 0; -} -uint32_t IntCode_LOAD_REGISTER_I8_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i8 = (int8_t)cbs->read(cbs->context, address); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} -uint32_t IntCode_LOAD_REGISTER_I16_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i16 = XESWAP16((int16_t)cbs->read(cbs->context, address)); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} -uint32_t IntCode_LOAD_REGISTER_I32_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i32 = XESWAP32((int32_t)cbs->read(cbs->context, address)); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} -uint32_t IntCode_LOAD_REGISTER_I64_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - ics.rf[i->dest_reg].i64 = XESWAP64((int64_t)cbs->read(cbs->context, address)); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} - -uint32_t IntCode_STORE_REGISTER_I8(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src3_reg | ((uint64_t)i->dest_reg << 32)); - cbs->write(cbs->context, address, ics.rf[i->src2_reg].i8); - return IA_NEXT; -} -uint32_t IntCode_STORE_REGISTER_I16(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src3_reg | ((uint64_t)i->dest_reg << 32)); - cbs->write(cbs->context, address, XESWAP16(ics.rf[i->src2_reg].i16)); - return IA_NEXT; -} -uint32_t IntCode_STORE_REGISTER_I32(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src3_reg | ((uint64_t)i->dest_reg << 32)); - cbs->write(cbs->context, address, XESWAP32(ics.rf[i->src2_reg].i32)); - return IA_NEXT; -} -uint32_t IntCode_STORE_REGISTER_I64(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = (RegisterAccessCallbacks*) - (i->src3_reg | ((uint64_t)i->dest_reg << 32)); - 
cbs->write(cbs->context, address, XESWAP64(ics.rf[i->src2_reg].i64)); - return IA_NEXT; -} -int DispatchRegisterWrite( - TranslationContext& ctx, Instr* i, RegisterAccessCallbacks* cbs) { - static IntCodeFn fns[] = { - IntCode_STORE_REGISTER_I8, - IntCode_STORE_REGISTER_I16, - IntCode_STORE_REGISTER_I32, - IntCode_STORE_REGISTER_I64, - IntCode_INVALID_TYPE, - IntCode_INVALID_TYPE, - IntCode_INVALID_TYPE, - }; - IntCodeFn fn = fns[i->src2.value->type]; - XEASSERT(fn != IntCode_INVALID_TYPE); - uint32_t src1_reg = AllocOpRegister(ctx, OPCODE_SIG_TYPE_V, &i->src1); - uint32_t src2_reg = AllocOpRegister(ctx, OPCODE_SIG_TYPE_V, &i->src2); - ctx.intcode_count++; - IntCode* ic = ctx.intcode_arena->Alloc(); - ic->intcode_fn = fn; - ic->flags = i->flags; - ic->debug_flags = 0; - ic->dest_reg = (uint32_t)(((uint64_t)cbs) >> 32); - ic->src1_reg = src1_reg; - ic->src2_reg = src2_reg; - ic->src3_reg = (uint32_t)((uint64_t)cbs); - return 0; -} -uint32_t IntCode_STORE_REGISTER_I8_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, ics.rf[i->src2_reg].i8); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} -uint32_t IntCode_STORE_REGISTER_I16_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, XESWAP16(ics.rf[i->src2_reg].i16)); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} -uint32_t IntCode_STORE_REGISTER_I32_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, XESWAP32(ics.rf[i->src2_reg].i32)); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} -uint32_t IntCode_STORE_REGISTER_I64_DYNAMIC(IntCodeState& ics, const IntCode* i) { - uint64_t address = ics.rf[i->src1_reg].u32; - RegisterAccessCallbacks* cbs = ics.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - cbs->write(cbs->context, address, XESWAP64(ics.rf[i->src2_reg].i64)); - return IA_NEXT; - } - cbs = cbs->next; - } - return IA_NEXT; -} - - uint32_t IntCode_INVALID(IntCodeState& ics, const IntCode* i) { XEASSERTALWAYS(); return IA_NEXT; @@ -417,7 +210,7 @@ int TranslateInvalid(TranslationContext& ctx, Instr* i) { uint32_t IntCode_COMMENT(IntCodeState& ics, const IntCode* i) { char* value = (char*)(i->src1_reg | ((uint64_t)i->src2_reg << 32)); - IPRINT("XE[t] :%d: %s\n", ics.thread_state->GetThreadID(), value); + IPRINT("XE[t] :%d: %s\n", ics.thread_state->thread_id(), value); IFLUSH(); return IA_NEXT; } @@ -576,11 +369,15 @@ int Translate_TRAP_TRUE(TranslationContext& ctx, Instr* i) { uint32_t IntCode_CALL_XX(IntCodeState& ics, const IntCode* i, uint32_t reg) { FunctionInfo* symbol_info = (FunctionInfo*)ics.rf[reg].u64; - Function* fn = NULL; - ics.thread_state->runtime()->ResolveFunction(symbol_info->address(), &fn); + Function* fn = symbol_info->function(); + if (!fn) { + ics.thread_state->runtime()->ResolveFunction(symbol_info->address(), &fn); + } XEASSERTNOTNULL(fn); // TODO(benvanik): proper tail call support, somehow. 
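The call changes in the hunks around here thread a guest return address through every call: a tail call forwards the caller's return_address, a plain call passes the address recorded by SET_RETURN_ADDRESS, and an indirect call flagged CALL_POSSIBLE_RETURN whose target equals the current return_address is treated as a function return rather than a new call. A toy model of that convention (names are illustrative, not the real Alloy API):

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for IntCodeState's return-address fields.
struct CallState {
  uint64_t return_address;       // address this function should return to
  uint64_t call_return_address;  // set by SET_RETURN_ADDRESS before a call
};

// Indirect branch handling: a target equal to the recorded return address
// is really a return, not another call.
const char* Dispatch(const CallState& s, uint64_t target) {
  if (target == s.return_address) {
    return "IA_RETURN";  // unwind to the caller
  }
  return "resolve + call";  // genuinely a new callee
}

int main() {
  CallState s = {0x82001004, 0};
  printf("%s\n", Dispatch(s, 0x82001004));  // IA_RETURN
  printf("%s\n", Dispatch(s, 0x82114470));  // resolve + call
}
```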
- fn->Call(ics.thread_state); + uint64_t return_address = + (i->flags & CALL_TAIL) ? ics.return_address : ics.call_return_address; + fn->Call(ics.thread_state, return_address); if (i->flags & CALL_TAIL) { return IA_RETURN; } @@ -645,12 +442,21 @@ int Translate_CALL_TRUE(TranslationContext& ctx, Instr* i) { uint32_t IntCode_CALL_INDIRECT_XX(IntCodeState& ics, const IntCode* i, uint32_t reg) { uint64_t target = ics.rf[reg].u32; + // Check if return address - if so, return. + if (i->flags & CALL_POSSIBLE_RETURN) { + if (target == ics.return_address) { + return IA_RETURN; + } + } + // Real call. Function* fn = NULL; ics.thread_state->runtime()->ResolveFunction(target, &fn); XEASSERTNOTNULL(fn); // TODO(benvanik): proper tail call support, somehow. - fn->Call(ics.thread_state); + uint64_t return_address = + (i->flags & CALL_TAIL) ? ics.return_address : ics.call_return_address; + fn->Call(ics.thread_state, return_address); if (i->flags & CALL_TAIL) { return IA_RETURN; } @@ -712,6 +518,13 @@ int Translate_CALL_INDIRECT_TRUE(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->src1.value->type]); } +uint32_t IntCode_CALL_EXTERN(IntCodeState& ics, const IntCode* i) { + return IntCode_CALL_XX(ics, i, i->src1_reg); +} +int Translate_CALL_EXTERN(TranslationContext& ctx, Instr* i) { + return DispatchToC(ctx, i, IntCode_CALL_EXTERN); +} + uint32_t IntCode_RETURN(IntCodeState& ics, const IntCode* i) { return IA_RETURN; } @@ -768,6 +581,14 @@ int Translate_RETURN_TRUE(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->src1.value->type]); } +uint32_t IntCode_SET_RETURN_ADDRESS(IntCodeState& ics, const IntCode* i) { + ics.call_return_address = ics.rf[i->src1_reg].u32; + return IA_NEXT; +} +int Translate_SET_RETURN_ADDRESS(TranslationContext& ctx, Instr* i) { + return DispatchToC(ctx, i, IntCode_SET_RETURN_ADDRESS); +} + uint32_t IntCode_BRANCH_XX(IntCodeState& ics, const IntCode* i, uint32_t reg) { return ics.rf[reg].u32; } @@ -1335,34 +1156,116 @@ int Translate_LOAD_CLOCK(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, IntCode_LOAD_CLOCK); } +uint32_t IntCode_LOAD_LOCAL_I8(IntCodeState& ics, const IntCode* i) { + ics.rf[i->dest_reg].i8 = *((int8_t*)(ics.locals + ics.rf[i->src1_reg].u32)); + return IA_NEXT; +} +uint32_t IntCode_LOAD_LOCAL_I16(IntCodeState& ics, const IntCode* i) { + ics.rf[i->dest_reg].i16 = *((int16_t*)(ics.locals + ics.rf[i->src1_reg].u32)); + return IA_NEXT; +} +uint32_t IntCode_LOAD_LOCAL_I32(IntCodeState& ics, const IntCode* i) { + ics.rf[i->dest_reg].i32 = *((int32_t*)(ics.locals + ics.rf[i->src1_reg].u32)); + return IA_NEXT; +} +uint32_t IntCode_LOAD_LOCAL_I64(IntCodeState& ics, const IntCode* i) { + ics.rf[i->dest_reg].i64 = *((int64_t*)(ics.locals + ics.rf[i->src1_reg].u32)); + return IA_NEXT; +} +uint32_t IntCode_LOAD_LOCAL_F32(IntCodeState& ics, const IntCode* i) { + ics.rf[i->dest_reg].f32 = *((float*)(ics.locals + ics.rf[i->src1_reg].u32)); + return IA_NEXT; +} +uint32_t IntCode_LOAD_LOCAL_F64(IntCodeState& ics, const IntCode* i) { + ics.rf[i->dest_reg].f64 = *((double*)(ics.locals + ics.rf[i->src1_reg].u32)); + return IA_NEXT; +} +uint32_t IntCode_LOAD_LOCAL_V128(IntCodeState& ics, const IntCode* i) { + ics.rf[i->dest_reg].v128 = *((vec128_t*)(ics.locals + ics.rf[i->src1_reg].u32)); + return IA_NEXT; +} +int Translate_LOAD_LOCAL(TranslationContext& ctx, Instr* i) { + static IntCodeFn fns[] = { + IntCode_LOAD_LOCAL_I8, + IntCode_LOAD_LOCAL_I16, + IntCode_LOAD_LOCAL_I32, + IntCode_LOAD_LOCAL_I64, + IntCode_LOAD_LOCAL_F32, + 
IntCode_LOAD_LOCAL_F64, + IntCode_LOAD_LOCAL_V128, + }; + return DispatchToC(ctx, i, fns[i->dest->type]); +} + +uint32_t IntCode_STORE_LOCAL_I8(IntCodeState& ics, const IntCode* i) { + *((int8_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].i8; + return IA_NEXT; +} +uint32_t IntCode_STORE_LOCAL_I16(IntCodeState& ics, const IntCode* i) { + *((int16_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].i16; + return IA_NEXT; +} +uint32_t IntCode_STORE_LOCAL_I32(IntCodeState& ics, const IntCode* i) { + *((int32_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].i32; + return IA_NEXT; +} +uint32_t IntCode_STORE_LOCAL_I64(IntCodeState& ics, const IntCode* i) { + *((int64_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].i64; + return IA_NEXT; +} +uint32_t IntCode_STORE_LOCAL_F32(IntCodeState& ics, const IntCode* i) { + *((float*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].f32; + return IA_NEXT; +} +uint32_t IntCode_STORE_LOCAL_F64(IntCodeState& ics, const IntCode* i) { + *((double*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].f64; + return IA_NEXT; +} +uint32_t IntCode_STORE_LOCAL_V128(IntCodeState& ics, const IntCode* i) { + *((vec128_t*)(ics.locals + ics.rf[i->src1_reg].u32)) = ics.rf[i->src2_reg].v128; + return IA_NEXT; +} +int Translate_STORE_LOCAL(TranslationContext& ctx, Instr* i) { + static IntCodeFn fns[] = { + IntCode_STORE_LOCAL_I8, + IntCode_STORE_LOCAL_I16, + IntCode_STORE_LOCAL_I32, + IntCode_STORE_LOCAL_I64, + IntCode_STORE_LOCAL_F32, + IntCode_STORE_LOCAL_F64, + IntCode_STORE_LOCAL_V128, + }; + return DispatchToC(ctx, i, fns[i->src2.value->type]); +} + uint32_t IntCode_LOAD_CONTEXT_I8(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].i8 = *((int8_t*)(ics.context + ics.rf[i->src1_reg].u64)); - DPRINT("%d (%.X) = ctx i8 +%d\n", ics.rf[i->dest_reg].i8, ics.rf[i->dest_reg].u8, ics.rf[i->src1_reg].u64); + DPRINT("%d (%X) = ctx i8 +%d\n", ics.rf[i->dest_reg].i8, ics.rf[i->dest_reg].u8, ics.rf[i->src1_reg].u64); return IA_NEXT; } uint32_t IntCode_LOAD_CONTEXT_I16(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].i16 = *((int16_t*)(ics.context + ics.rf[i->src1_reg].u64)); - DPRINT("%d (%.X) = ctx i16 +%d\n", ics.rf[i->dest_reg].i16, ics.rf[i->dest_reg].u16, ics.rf[i->src1_reg].u64); + DPRINT("%d (%X) = ctx i16 +%d\n", ics.rf[i->dest_reg].i16, ics.rf[i->dest_reg].u16, ics.rf[i->src1_reg].u64); return IA_NEXT; } uint32_t IntCode_LOAD_CONTEXT_I32(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].i32 = *((int32_t*)(ics.context + ics.rf[i->src1_reg].u64)); - DPRINT("%d (%.X) = ctx i32 +%d\n", ics.rf[i->dest_reg].i32, ics.rf[i->dest_reg].u32, ics.rf[i->src1_reg].u64); + DPRINT("%d (%X) = ctx i32 +%d\n", ics.rf[i->dest_reg].i32, ics.rf[i->dest_reg].u32, ics.rf[i->src1_reg].u64); return IA_NEXT; } uint32_t IntCode_LOAD_CONTEXT_I64(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].i64 = *((int64_t*)(ics.context + ics.rf[i->src1_reg].u64)); - DPRINT("%lld (%.llX) = ctx i64 +%d\n", ics.rf[i->dest_reg].i64, ics.rf[i->dest_reg].u64, ics.rf[i->src1_reg].u64); + DPRINT("%lld (%llX) = ctx i64 +%d\n", ics.rf[i->dest_reg].i64, ics.rf[i->dest_reg].u64, ics.rf[i->src1_reg].u64); return IA_NEXT; } uint32_t IntCode_LOAD_CONTEXT_F32(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].f32 = *((float*)(ics.context + ics.rf[i->src1_reg].u64)); - DPRINT("%e (%.X) = ctx f32 +%d\n", ics.rf[i->dest_reg].f32, ics.rf[i->dest_reg].u32, ics.rf[i->src1_reg].u64); + DPRINT("%e (%X) = ctx 
f32 +%d\n", ics.rf[i->dest_reg].f32, ics.rf[i->dest_reg].u32, ics.rf[i->src1_reg].u64); return IA_NEXT; } uint32_t IntCode_LOAD_CONTEXT_F64(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].f64 = *((double*)(ics.context + ics.rf[i->src1_reg].u64)); - DPRINT("%lle (%.llX) = ctx f64 +%d\n", ics.rf[i->dest_reg].f64, ics.rf[i->dest_reg].u64, ics.rf[i->src1_reg].u64); + DPRINT("%lle (%llX) = ctx f64 +%d\n", ics.rf[i->dest_reg].f64, ics.rf[i->dest_reg].u64, ics.rf[i->src1_reg].u64); return IA_NEXT; } uint32_t IntCode_LOAD_CONTEXT_V128(IntCodeState& ics, const IntCode* i) { @@ -1388,39 +1291,39 @@ int Translate_LOAD_CONTEXT(TranslationContext& ctx, Instr* i) { uint32_t IntCode_STORE_CONTEXT_I8(IntCodeState& ics, const IntCode* i) { *((int8_t*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i8; - DPRINT("ctx i8 +%d = %d (%.X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i8, ics.rf[i->src2_reg].u8); + DPRINT("ctx i8 +%d = %d (%X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i8, ics.rf[i->src2_reg].u8); return IA_NEXT; } uint32_t IntCode_STORE_CONTEXT_I16(IntCodeState& ics, const IntCode* i) { *((int16_t*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i16; - DPRINT("ctx i16 +%d = %d (%.X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i16, ics.rf[i->src2_reg].u16); + DPRINT("ctx i16 +%d = %d (%X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i16, ics.rf[i->src2_reg].u16); return IA_NEXT; } uint32_t IntCode_STORE_CONTEXT_I32(IntCodeState& ics, const IntCode* i) { *((int32_t*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i32; - DPRINT("ctx i32 +%d = %d (%.X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i32, ics.rf[i->src2_reg].u32); + DPRINT("ctx i32 +%d = %d (%X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i32, ics.rf[i->src2_reg].u32); return IA_NEXT; } uint32_t IntCode_STORE_CONTEXT_I64(IntCodeState& ics, const IntCode* i) { *((int64_t*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].i64; - DPRINT("ctx i64 +%d = %lld (%.llX)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i64, ics.rf[i->src2_reg].u64); + DPRINT("ctx i64 +%d = %lld (%llX)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].i64, ics.rf[i->src2_reg].u64); return IA_NEXT; } uint32_t IntCode_STORE_CONTEXT_F32(IntCodeState& ics, const IntCode* i) { *((float*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].f32; - DPRINT("ctx f32 +%d = %e (%.X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].f32, ics.rf[i->src2_reg].u32); + DPRINT("ctx f32 +%d = %e (%X)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].f32, ics.rf[i->src2_reg].u32); return IA_NEXT; } uint32_t IntCode_STORE_CONTEXT_F64(IntCodeState& ics, const IntCode* i) { *((double*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].f64; - DPRINT("ctx f64 +%d = %lle (%.llX)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].f64, ics.rf[i->src2_reg].u64); + DPRINT("ctx f64 +%d = %lle (%llX)\n", ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].f64, ics.rf[i->src2_reg].u64); return IA_NEXT; } uint32_t IntCode_STORE_CONTEXT_V128(IntCodeState& ics, const IntCode* i) { *((vec128_t*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].v128; DPRINT("ctx v128 +%d = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", ics.rf[i->src1_reg].u64, VECF4(ics.rf[i->src2_reg].v128,0), VECF4(ics.rf[i->src2_reg].v128,1), VECF4(ics.rf[i->src2_reg].v128,2), VECF4(ics.rf[i->src2_reg].v128,3), - VECI4(ics.rf[i->src2_reg].v128,0), VECI4(ics.rf[i->src2_reg].v128,1), VECI4(ics.rf[i->src2_reg].v128,2), 
VECF4(ics.rf[i->src2_reg].v128,3)); + VECI4(ics.rf[i->src2_reg].v128,0), VECI4(ics.rf[i->src2_reg].v128,1), VECI4(ics.rf[i->src2_reg].v128,2), VECI4(ics.rf[i->src2_reg].v128,3)); return IA_NEXT; } int Translate_STORE_CONTEXT(TranslationContext& ctx, Instr* i) { @@ -1439,7 +1342,8 @@ int Translate_STORE_CONTEXT(TranslationContext& ctx, Instr* i) { uint32_t IntCode_LOAD_I8(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_LOAD_REGISTER_I8_DYNAMIC(ics, i); + ics.rf[i->dest_reg].i8 = ics.thread_state->memory()->LoadI8(address); + return IA_NEXT; } DPRINT("%d (%X) = load.i8 %.8X\n", *((int8_t*)(ics.membase + address)), @@ -1452,7 +1356,9 @@ uint32_t IntCode_LOAD_I8(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_LOAD_I16(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_LOAD_REGISTER_I16_DYNAMIC(ics, i); + ics.rf[i->dest_reg].i16 = + XESWAP16(ics.thread_state->memory()->LoadI16(address)); + return IA_NEXT; } DPRINT("%d (%X) = load.i16 %.8X\n", *((int16_t*)(ics.membase + address)), @@ -1465,7 +1371,9 @@ uint32_t IntCode_LOAD_I16(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_LOAD_I32(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_LOAD_REGISTER_I32_DYNAMIC(ics, i); + ics.rf[i->dest_reg].i32 = + XESWAP32(ics.thread_state->memory()->LoadI32(address)); + return IA_NEXT; } DFLUSH(); DPRINT("%d (%X) = load.i32 %.8X\n", @@ -1479,7 +1387,9 @@ uint32_t IntCode_LOAD_I32(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_LOAD_I64(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_LOAD_REGISTER_I64(ics, i); + ics.rf[i->dest_reg].i64 = + XESWAP64(ics.thread_state->memory()->LoadI64(address)); + return IA_NEXT; } DPRINT("%lld (%llX) = load.i64 %.8X\n", *((int64_t*)(ics.membase + address)), @@ -1515,7 +1425,7 @@ uint32_t IntCode_LOAD_V128(IntCodeState& ics, const IntCode* i) { for (int n = 0; n < 4; n++) { VECI4(dest,n) = *((uint32_t*)(ics.membase + address + n * 4)); } - DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = load v128 %.8X\n", + DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = load.v128 %.8X\n", VECF4(dest,0), VECF4(dest,1), VECF4(dest,2), VECF4(dest,3), VECI4(dest,0), VECI4(dest,1), VECI4(dest,2), VECI4(dest,3), address); @@ -1532,90 +1442,95 @@ int Translate_LOAD(TranslationContext& ctx, Instr* i) { IntCode_LOAD_F64, IntCode_LOAD_V128, }; - if (i->src1.value->IsConstant()) { - // Constant address - check register access callbacks. - // NOTE: we still will likely want to check on access in debug mode, as - // constant propagation may not have happened. - uint64_t address = i->src1.value->AsUint64(); - RegisterAccessCallbacks* cbs = ctx.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - return DispatchRegisterRead(ctx, i, cbs); - } - cbs = cbs->next; - } - } return DispatchToC(ctx, i, fns[i->dest->type]); } +void MarkPageDirty(IntCodeState& ics, uint32_t address) { + // 16KB pages. 
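Every store handler below now calls MarkPageDirty, whose one-line body follows: one byte per 16KB page, with the shift by 14 selecting the page and the 0x7FFF mask wrapping the index into a 0x8000-entry table (512MB of guest space). A small sketch of the indexing, assuming page_table is a plain byte array as the code suggests:

```cpp
#include <cstdint>
#include <cstdio>

// One dirty byte per 16KB page: 0x8000 entries cover 512MB of guest space.
constexpr uint32_t kPageShift = 14;     // log2(16KB)
constexpr uint32_t kPageMask = 0x7FFF;  // table has kPageMask + 1 entries

int main() {
  static uint8_t page_table[kPageMask + 1] = {0};
  uint32_t address = 0x00012345;
  uint32_t page = (address >> kPageShift) & kPageMask;  // == 4
  page_table[page] = 1;  // same indexing as MarkPageDirty
  printf("page %u dirty\n", page);
}
```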
+ ics.page_table[(address >> 14) & 0x7FFF] = 1; +} uint32_t IntCode_STORE_I8(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_STORE_REGISTER_I8_DYNAMIC(ics, i); + ics.thread_state->memory()->StoreI8(address, ics.rf[i->src2_reg].i8); + return IA_NEXT; } DPRINT("store.i8 %.8X = %d (%X)\n", - address, ics.rf[i->src2_reg].i8, ics.rf[i->src2_reg].i8); + address, ics.rf[i->src2_reg].i8, ics.rf[i->src2_reg].u8); DFLUSH(); *((int8_t*)(ics.membase + address)) = ics.rf[i->src2_reg].i8; + MarkPageDirty(ics, address); return IA_NEXT; } uint32_t IntCode_STORE_I16(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_STORE_REGISTER_I16_DYNAMIC(ics, i); + ics.thread_state->memory()->StoreI16(address, + XESWAP16(ics.rf[i->src2_reg].i16)); + return IA_NEXT; } DPRINT("store.i16 %.8X = %d (%X)\n", - address, ics.rf[i->src2_reg].i16, ics.rf[i->src2_reg].i16); + address, ics.rf[i->src2_reg].i16, ics.rf[i->src2_reg].u16); DFLUSH(); *((int16_t*)(ics.membase + address)) = ics.rf[i->src2_reg].i16; + MarkPageDirty(ics, address); return IA_NEXT; } uint32_t IntCode_STORE_I32(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_STORE_REGISTER_I32_DYNAMIC(ics, i); + ics.thread_state->memory()->StoreI32(address, + XESWAP32(ics.rf[i->src2_reg].i32)); + return IA_NEXT; } DPRINT("store.i32 %.8X = %d (%X)\n", - address, ics.rf[i->src2_reg].i32, ics.rf[i->src2_reg].i32); + address, ics.rf[i->src2_reg].i32, ics.rf[i->src2_reg].u32); DFLUSH(); *((int32_t*)(ics.membase + address)) = ics.rf[i->src2_reg].i32; + MarkPageDirty(ics, address); return IA_NEXT; } uint32_t IntCode_STORE_I64(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; if (DYNAMIC_REGISTER_ACCESS_CHECK(address)) { - return IntCode_STORE_REGISTER_I64_DYNAMIC(ics, i); + ics.thread_state->memory()->StoreI64(address, + XESWAP64(ics.rf[i->src2_reg].i64)); + return IA_NEXT; } DPRINT("store.i64 %.8X = %lld (%llX)\n", - address, ics.rf[i->src2_reg].i64, ics.rf[i->src2_reg].i64); + address, ics.rf[i->src2_reg].i64, ics.rf[i->src2_reg].u64); DFLUSH(); *((int64_t*)(ics.membase + address)) = ics.rf[i->src2_reg].i64; + MarkPageDirty(ics, address); return IA_NEXT; } uint32_t IntCode_STORE_F32(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; DPRINT("store.f32 %.8X = %e (%X)\n", - address, ics.rf[i->src2_reg].f32, ics.rf[i->src2_reg].i32); + address, ics.rf[i->src2_reg].f32, ics.rf[i->src2_reg].u32); DFLUSH(); *((float*)(ics.membase + address)) = ics.rf[i->src2_reg].f32; + MarkPageDirty(ics, address); return IA_NEXT; } uint32_t IntCode_STORE_F64(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; DPRINT("store.f64 %.8X = %lle (%llX)\n", - address, ics.rf[i->src2_reg].f64, ics.rf[i->src2_reg].i64); + address, ics.rf[i->src2_reg].f64, ics.rf[i->src2_reg].u64); DFLUSH(); *((double*)(ics.membase + address)) = ics.rf[i->src2_reg].f64; + MarkPageDirty(ics, address); return IA_NEXT; } uint32_t IntCode_STORE_V128(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; - DPRINT("store v128 %.8X = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", + DPRINT("store.v128 %.8X = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", address, VECF4(ics.rf[i->src2_reg].v128,0), VECF4(ics.rf[i->src2_reg].v128,1), 
VECF4(ics.rf[i->src2_reg].v128,2), VECF4(ics.rf[i->src2_reg].v128,3), VECI4(ics.rf[i->src2_reg].v128,0), VECI4(ics.rf[i->src2_reg].v128,1), VECI4(ics.rf[i->src2_reg].v128,2), VECI4(ics.rf[i->src2_reg].v128,3)); DFLUSH(); *((vec128_t*)(ics.membase + address)) = ics.rf[i->src2_reg].v128; + MarkPageDirty(ics, address); return IA_NEXT; } int Translate_STORE(TranslationContext& ctx, Instr* i) { @@ -1628,19 +1543,6 @@ int Translate_STORE(TranslationContext& ctx, Instr* i) { IntCode_STORE_F64, IntCode_STORE_V128, }; - if (i->src1.value->IsConstant()) { - // Constant address - check register access callbacks. - // NOTE: we still will likely want to check on access in debug mode, as - // constant propagation may not have happened. - uint64_t address = i->src1.value->AsUint64(); - RegisterAccessCallbacks* cbs = ctx.access_callbacks; - while (cbs) { - if (cbs->handles(cbs->context, address)) { - return DispatchRegisterWrite(ctx, i, cbs); - } - cbs = cbs->next; - } - } return DispatchToC(ctx, i, fns[i->src2.value->type]); } @@ -2093,19 +1995,19 @@ int Translate_DID_SATURATE(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, IntCode_DID_SATURATE); } -#define VECTOR_COMPARER(type, value, count, op) \ +#define VECTOR_COMPARER(type, value, dest_value, count, op) \ const vec128_t& src1 = ics.rf[i->src1_reg].v128; \ const vec128_t& src2 = ics.rf[i->src2_reg].v128; \ vec128_t& dest = ics.rf[i->dest_reg].v128; \ for (int n = 0; n < count; n++) { \ - dest.value[n] = (type)src1.value[n] op (type)src2.value[n]; \ + dest.dest_value[n] = ((type)src1.value[n] op (type)src2.value[n]) ? 0xFFFFFFFF : 0; \ } \ return IA_NEXT; -uint32_t IntCode_VECTOR_COMPARE_EQ_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, 16, ==) }; -uint32_t IntCode_VECTOR_COMPARE_EQ_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, 8, ==) }; -uint32_t IntCode_VECTOR_COMPARE_EQ_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, 4, ==) }; -uint32_t IntCode_VECTOR_COMPARE_EQ_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, ==) }; +uint32_t IntCode_VECTOR_COMPARE_EQ_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, b16, 16, ==) }; +uint32_t IntCode_VECTOR_COMPARE_EQ_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, s8, 8, ==) }; +uint32_t IntCode_VECTOR_COMPARE_EQ_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, i4, 4, ==) }; +uint32_t IntCode_VECTOR_COMPARE_EQ_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, ==) }; int Translate_VECTOR_COMPARE_EQ(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_EQ_I8, @@ -2119,10 +2021,10 @@ int Translate_VECTOR_COMPARE_EQ(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->flags]); } -uint32_t IntCode_VECTOR_COMPARE_SGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, 16, >) }; -uint32_t IntCode_VECTOR_COMPARE_SGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, 8, >) }; -uint32_t IntCode_VECTOR_COMPARE_SGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, 4, >) }; -uint32_t IntCode_VECTOR_COMPARE_SGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >) }; +uint32_t IntCode_VECTOR_COMPARE_SGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, b16, 16, >) }; +uint32_t IntCode_VECTOR_COMPARE_SGT_I16(IntCodeState& ics, const IntCode* i) { 
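The VECTOR_COMPARER change above switches the interpreter to AltiVec-style compare semantics: each lane receives an all-ones mask on true rather than a 0/1 boolean, and float compares write the mask through the integer view of the lane (hence the new dest_value parameter: f4 compared, i4 written). For narrower integer lanes the 0xFFFFFFFF constant simply truncates to 0xFF or 0xFFFF. A standalone illustration of the float case:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float b[4] = {1.0f, 0.0f, 3.0f, 9.0f};
  uint32_t mask[4];
  for (int n = 0; n < 4; n++) {
    // Lane mask, not a 0/1 bool - matches the updated macro body.
    mask[n] = (a[n] == b[n]) ? 0xFFFFFFFF : 0;
  }
  printf("%08X %08X %08X %08X\n", mask[0], mask[1], mask[2], mask[3]);
  // -> FFFFFFFF 00000000 FFFFFFFF 00000000
}
```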
VECTOR_COMPARER(int16_t, s8, s8, 8, >) }; +uint32_t IntCode_VECTOR_COMPARE_SGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, i4, 4, >) }; +uint32_t IntCode_VECTOR_COMPARE_SGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >) }; int Translate_VECTOR_COMPARE_SGT(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_SGT_I8, @@ -2136,10 +2038,10 @@ int Translate_VECTOR_COMPARE_SGT(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->flags]); } -uint32_t IntCode_VECTOR_COMPARE_SGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, 16, >=) }; -uint32_t IntCode_VECTOR_COMPARE_SGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, 8, >=) }; -uint32_t IntCode_VECTOR_COMPARE_SGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, 4, >=) }; -uint32_t IntCode_VECTOR_COMPARE_SGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >=) }; +uint32_t IntCode_VECTOR_COMPARE_SGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, b16, 16, >=) }; +uint32_t IntCode_VECTOR_COMPARE_SGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, s8, 8, >=) }; +uint32_t IntCode_VECTOR_COMPARE_SGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, i4, 4, >=) }; +uint32_t IntCode_VECTOR_COMPARE_SGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >=) }; int Translate_VECTOR_COMPARE_SGE(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_SGE_I8, @@ -2153,10 +2055,10 @@ int Translate_VECTOR_COMPARE_SGE(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->flags]); } -uint32_t IntCode_VECTOR_COMPARE_UGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, 16, >) }; -uint32_t IntCode_VECTOR_COMPARE_UGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, 8, >) }; -uint32_t IntCode_VECTOR_COMPARE_UGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, 4, >) }; -uint32_t IntCode_VECTOR_COMPARE_UGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >) }; +uint32_t IntCode_VECTOR_COMPARE_UGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, b16, 16, >) }; +uint32_t IntCode_VECTOR_COMPARE_UGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, s8, 8, >) }; +uint32_t IntCode_VECTOR_COMPARE_UGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, i4, 4, >) }; +uint32_t IntCode_VECTOR_COMPARE_UGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >) }; int Translate_VECTOR_COMPARE_UGT(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_UGT_I8, @@ -2170,10 +2072,10 @@ int Translate_VECTOR_COMPARE_UGT(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->flags]); } -uint32_t IntCode_VECTOR_COMPARE_UGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, 16, >=) }; -uint32_t IntCode_VECTOR_COMPARE_UGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, 8, >=) }; -uint32_t IntCode_VECTOR_COMPARE_UGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, 4, >=) }; -uint32_t IntCode_VECTOR_COMPARE_UGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >=) }; +uint32_t IntCode_VECTOR_COMPARE_UGE_I8(IntCodeState& ics, const 
IntCode* i) { VECTOR_COMPARER(uint8_t, b16, b16, 16, >=) }; +uint32_t IntCode_VECTOR_COMPARE_UGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, s8, 8, >=) }; +uint32_t IntCode_VECTOR_COMPARE_UGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, i4, 4, >=) }; +uint32_t IntCode_VECTOR_COMPARE_UGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >=) }; int Translate_VECTOR_COMPARE_UGE(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_UGE_I8, @@ -2466,9 +2368,9 @@ uint32_t IntCode_SUB_I16_I16(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_SUB_I32_I32(IntCodeState& ics, const IntCode* i) { int32_t a = ics.rf[i->src1_reg].i32; int32_t b = ics.rf[i->src2_reg].i32; if (i->flags == ARITHMETIC_SET_CARRY) { - ics.did_carry = a < ~b; + ics.did_carry = SUB_DID_CARRY(a, b); } - ics.did_carry = SUB_DID_CARRY(a, b); + ics.rf[i->dest_reg].i32 = a - b; return IA_NEXT; } uint32_t IntCode_SUB_I64_I64(IntCodeState& ics, const IntCode* i) { @@ -3605,17 +3507,17 @@ int Translate_CNTLZ(TranslationContext& ctx, Instr* i) { uint32_t IntCode_EXTRACT_INT8_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; - ics.rf[i->dest_reg].i8 = VECB16(src1,ics.rf[i->src2_reg].i64); + ics.rf[i->dest_reg].i8 = VECB16(src1,ics.rf[i->src2_reg].i8); return IA_NEXT; } uint32_t IntCode_EXTRACT_INT16_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; - ics.rf[i->dest_reg].i16 = VECS8(src1,ics.rf[i->src2_reg].i64); + ics.rf[i->dest_reg].i16 = VECS8(src1,ics.rf[i->src2_reg].i8); return IA_NEXT; } uint32_t IntCode_EXTRACT_INT32_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; - ics.rf[i->dest_reg].i32 = VECI4(src1,ics.rf[i->src2_reg].i64); + ics.rf[i->dest_reg].i32 = VECI4(src1,ics.rf[i->src2_reg].i8); return IA_NEXT; } int Translate_EXTRACT(TranslationContext& ctx, Instr* i) { @@ -3817,6 +3719,7 @@ uint32_t IntCode_PACK_FLOAT16_2(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_PACK_FLOAT16_4(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; + dest.ix = dest.iy = 0; dest.iz = ((uint32_t)DirectX::PackedVector::XMConvertFloatToHalf(src1.x) << 16) | DirectX::PackedVector::XMConvertFloatToHalf(src1.y); @@ -4009,8 +3912,10 @@ static const TranslateFn dispatch_table[] = { Translate_CALL_TRUE, Translate_CALL_INDIRECT, Translate_CALL_INDIRECT_TRUE, + Translate_CALL_EXTERN, Translate_RETURN, Translate_RETURN_TRUE, + Translate_SET_RETURN_ADDRESS, Translate_BRANCH, Translate_BRANCH_TRUE, @@ -4031,6 +3936,9 @@ static const TranslateFn dispatch_table[] = { Translate_LOAD_CLOCK, + Translate_LOAD_LOCAL, + Translate_STORE_LOCAL, + Translate_LOAD_CONTEXT, Translate_STORE_CONTEXT, diff --git a/src/alloy/backend/ivm/ivm_intcode.h b/src/alloy/backend/ivm/ivm_intcode.h index 9f361b2f9..389ccbef2 100644 --- a/src/alloy/backend/ivm/ivm_intcode.h +++ b/src/alloy/backend/ivm/ivm_intcode.h @@ -14,7 +14,6 @@ #include #include -#include namespace alloy { namespace runtime { class ThreadState; } } @@ -41,13 +40,15 @@ typedef union { typedef struct { Register* rf; + uint8_t* locals; uint8_t* context; uint8_t* membase; - uint32_t* reserve_address; + uint8_t* page_table; int8_t did_carry; int8_t did_saturate; - runtime::RegisterAccessCallbacks* access_callbacks; runtime::ThreadState* thread_state; + uint64_t return_address; + uint64_t 
call_return_address; } IntCodeState; @@ -95,8 +96,6 @@ typedef struct SourceMapEntry_s { typedef struct { - runtime::RegisterAccessCallbacks* access_callbacks; - uint32_t register_count; size_t intcode_count; Arena* intcode_arena; @@ -104,6 +103,7 @@ typedef struct { Arena* source_map_arena; Arena* scratch_arena; LabelRef* label_ref_head; + size_t stack_size; } TranslationContext; diff --git a/src/alloy/backend/ivm/tracing.h b/src/alloy/backend/ivm/tracing.h index a1fcdf20d..526aa912e 100644 --- a/src/alloy/backend/ivm/tracing.h +++ b/src/alloy/backend/ivm/tracing.h @@ -32,17 +32,17 @@ public: ALLOY_BACKEND_IVM_ASSEMBLER_DEINIT = ALLOY_BACKEND_IVM_ASSEMBLER | (2), }; - typedef struct { + typedef struct Init_s { static const uint32_t event_type = ALLOY_BACKEND_IVM_INIT; } Init; - typedef struct { + typedef struct Deinit_s { static const uint32_t event_type = ALLOY_BACKEND_IVM_DEINIT; } Deinit; - typedef struct { + typedef struct AssemblerInit_s { static const uint32_t event_type = ALLOY_BACKEND_IVM_ASSEMBLER_INIT; } AssemblerInit; - typedef struct { + typedef struct AssemblerDeinit_s { static const uint32_t event_type = ALLOY_BACKEND_IVM_ASSEMBLER_DEINIT; } AssemblerDeinit; }; diff --git a/src/alloy/backend/machine_info.h b/src/alloy/backend/machine_info.h new file mode 100644 index 000000000..2aa7add22 --- /dev/null +++ b/src/alloy/backend/machine_info.h @@ -0,0 +1,39 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef ALLOY_BACKEND_MACHINE_INFO_H_ +#define ALLOY_BACKEND_MACHINE_INFO_H_ + +#include + + +namespace alloy { +namespace backend { + + +struct MachineInfo { + struct RegisterSet { + enum Types { + INT_TYPES = (1 << 1), + FLOAT_TYPES = (1 << 2), + VEC_TYPES = (1 << 3), + }; + uint8_t id; + char name[4]; + uint32_t types; + uint32_t count; + } register_sets[8]; +}; + + +} // namespace backend +} // namespace alloy + + +#endif // ALLOY_BACKEND_MACHINE_INFO_H_ diff --git a/src/alloy/backend/sources.gypi b/src/alloy/backend/sources.gypi index 154cd75ad..41419ac7a 100644 --- a/src/alloy/backend/sources.gypi +++ b/src/alloy/backend/sources.gypi @@ -5,6 +5,7 @@ 'assembler.h', 'backend.cc', 'backend.h', + 'machine_info.h', 'tracing.h', ], diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc deleted file mode 100644 index 5b2cac041..000000000 --- a/src/alloy/backend/x64/lowering/lowering_sequences.cc +++ /dev/null @@ -1,1805 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
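The new machine_info.h above lets a backend describe its register files to backend-independent compiler passes. As a compilable sketch, this restates the struct from the diff and fills it in the way IVMBackend::Initialize() does earlier in this change (main() and the printout are illustrative only):

```cpp
#include <cstdint>
#include <cstdio>

// Restates the MachineInfo struct introduced by this diff.
struct MachineInfo {
  struct RegisterSet {
    enum Types {
      INT_TYPES = (1 << 1),
      FLOAT_TYPES = (1 << 2),
      VEC_TYPES = (1 << 3),
    };
    uint8_t id;
    char name[4];
    uint32_t types;
    uint32_t count;
  } register_sets[8];
};

int main() {
  MachineInfo mi = {};
  // Mirrors the IVM backend: a 16-entry integer set and a 16-entry
  // float/vector set.
  mi.register_sets[0] = {0, "gpr", MachineInfo::RegisterSet::INT_TYPES, 16};
  mi.register_sets[1] = {1, "vec",
                         MachineInfo::RegisterSet::FLOAT_TYPES |
                             MachineInfo::RegisterSet::VEC_TYPES,
                         16};
  for (const auto& set : mi.register_sets) {
    if (!set.count) continue;  // unused slots stay zeroed
    printf("set %u '%s': %u regs, types 0x%X\n",
           (unsigned)set.id, set.name, (unsigned)set.count,
           (unsigned)set.types);
  }
}
```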
* - ****************************************************************************** - */ - -#include - -#include -#include -#include -#include -#include - -using namespace alloy; -using namespace alloy::backend::x64; -using namespace alloy::backend::x64::lowering; -using namespace alloy::hir; -using namespace alloy::runtime; - -using namespace Xbyak; - - -namespace { - -#define UNIMPLEMENTED_SEQ() __debugbreak() -#define ASSERT_INVALID_TYPE() XEASSERTALWAYS() - -// TODO(benvanik): emit traces/printfs/etc - -void Dummy() { - // -} - -void PrintString(void* raw_context, uint8_t* membase, const char* str) { - // TODO(benvanik): generate this thunk at runtime? or a shim? - auto thread_state = *((ThreadState**)raw_context); - fprintf(stdout, "XE[t] :%d: %s\n", thread_state->GetThreadID(), str); - fflush(stdout); -} - -// TODO(benvanik): fancy stuff. -void CallThunk(void* raw_context, uint8_t* membase, - FunctionInfo* symbol_info) { - // TODO(benvanik): generate this thunk at runtime? or a shim? - auto thread_state = *((ThreadState**)raw_context); - - Function* fn = NULL; - thread_state->runtime()->ResolveFunction(symbol_info->address(), &fn); - XEASSERTNOTNULL(fn); - fn->Call(thread_state); -} -void IssueCall(X64Emitter& e, FunctionInfo* symbol_info, uint32_t flags) { - e.mov(e.r8, (uint64_t)symbol_info); - e.mov(e.rax, (uint64_t)CallThunk); - if (flags & CALL_TAIL) { - e.jmp(e.rax); - } else { - e.call(e.rax); - e.mov(e.rdx, e.qword[e.rsp + 8]); - e.mov(e.rcx, e.qword[e.rsp + 0]); - } -} - -void IndirectCallThunk(void* raw_context, uint8_t* membase, - uint64_t target_address) { - // TODO(benvanik): generate this thunk at runtime? or a shim? - auto thread_state = *((ThreadState**)raw_context); - XEASSERTALWAYS(); -} -void IssueCallIndirect(X64Emitter& e, Value* target, uint32_t flags) { - Reg64 r; - e.BeginOp(target, r, 0); - if (r != e.r8) { - e.mov(e.r8, r); - } - e.EndOp(r); - e.mov(e.rax, (uint64_t)IndirectCallThunk); - if (flags & CALL_TAIL) { - e.jmp(e.rax); - } else { - e.sub(e.rsp, 0x20); - e.call(e.rax); - e.add(e.rsp, 0x20); - } -} - -// Sets EFLAGs with zf for the given value. -void CheckBoolean(X64Emitter& e, Value* v) { - if (v->IsConstant()) { - e.mov(e.ah, (v->IsConstantZero() ? 
1 : 0) << 6); - e.sahf(); - } else if (v->type == INT8_TYPE) { - Reg8 src; - e.BeginOp(v, src, 0); - e.test(src, src); - e.EndOp(src); - } else if (v->type == INT16_TYPE) { - Reg16 src; - e.BeginOp(v, src, 0); - e.test(src, src); - e.EndOp(src); - } else if (v->type == INT32_TYPE) { - Reg32 src; - e.BeginOp(v, src, 0); - e.test(src, src); - e.EndOp(src); - } else if (v->type == INT64_TYPE) { - Reg64 src; - e.BeginOp(v, src, 0); - e.test(src, src); - e.EndOp(src); - } else if (v->type == FLOAT32_TYPE) { - UNIMPLEMENTED_SEQ(); - } else if (v->type == FLOAT64_TYPE) { - UNIMPLEMENTED_SEQ(); - } else if (v->type == VEC128_TYPE) { - UNIMPLEMENTED_SEQ(); - } else { - ASSERT_INVALID_TYPE(); - } -} - -void CompareXX(X64Emitter& e, Instr*& i, void(set_fn)(X64Emitter& e, Reg8& dest, bool invert)) { - if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8)) { - Reg8 dest; - Reg8 src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.cmp(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8C)) { - Reg8 dest; - Reg8 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - e.cmp(src1, i->src2.value->constant.i8); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8C, SIG_TYPE_I8)) { - Reg8 dest; - Reg8 src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - e.cmp(src2, i->src1.value->constant.i8); - set_fn(e, dest, true); - e.EndOp(dest, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16)) { - Reg8 dest; - Reg16 src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.cmp(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16C)) { - Reg8 dest; - Reg16 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - e.cmp(src1, i->src2.value->constant.i16); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16C, SIG_TYPE_I16)) { - Reg8 dest; - Reg16 src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - e.cmp(src2, i->src1.value->constant.i16); - e.sete(dest); - set_fn(e, dest, true); - e.EndOp(dest, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32)) { - Reg8 dest; - Reg32 src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.cmp(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32C)) { - Reg8 dest; - Reg32 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - e.cmp(src1, i->src2.value->constant.i32); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32C, SIG_TYPE_I32)) { - Reg8 dest; - Reg32 src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - e.cmp(src2, i->src1.value->constant.i32); - set_fn(e, dest, true); - e.EndOp(dest, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64)) { - Reg8 dest; - Reg64 src1, src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - e.cmp(src1, src2); - set_fn(e, dest, false); - e.EndOp(dest, src1, src2); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64C)) { - Reg8 dest; - Reg64 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, 
src1, 0); - e.mov(e.rax, i->src2.value->constant.i64); - e.cmp(src1, e.rax); - set_fn(e, dest, false); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64C, SIG_TYPE_I64)) { - Reg8 dest; - Reg64 src2; - e.BeginOp(i->dest, dest, REG_DEST, - i->src2.value, src2, 0); - e.mov(e.rax, i->src1.value->constant.i64); - e.cmp(src2, e.rax); - set_fn(e, dest, true); - e.EndOp(dest, src2); - } else { - UNIMPLEMENTED_SEQ(); - } -}; - -typedef void(v_fn)(X64Emitter& e, Instr& i, const Reg& dest_src); -template -void UnaryOpV(X64Emitter& e, Instr*& i, v_fn v_fn, - T& dest, T& src1) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest == src1) { - v_fn(e, *i, dest); - } else { - e.mov(dest, src1); - v_fn(e, *i, dest); - } - e.EndOp(dest, src1); -} -template -void UnaryOpC(X64Emitter& e, Instr*& i, v_fn v_fn, - T& dest, Value* src1) { - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, (uint64_t)src1->get_constant(CT())); - v_fn(e, *i, dest); - e.EndOp(dest); -} -void UnaryOp(X64Emitter& e, Instr*& i, v_fn v_fn) { - if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8)) { - Reg8 dest, src1; - UnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8C)) { - Reg8 dest; - UnaryOpC(e, i, v_fn, dest, i->src1.value); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16)) { - Reg16 dest, src1; - UnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C)) { - Reg16 dest; - UnaryOpC(e, i, v_fn, dest, i->src1.value); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32)) { - Reg32 dest, src1; - UnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C)) { - Reg32 dest; - UnaryOpC(e, i, v_fn, dest, i->src1.value); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64)) { - Reg64 dest, src1; - UnaryOpV(e, i, v_fn, dest, src1); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C)) { - Reg64 dest; - UnaryOpC(e, i, v_fn, dest, i->src1.value); - } else { - ASSERT_INVALID_TYPE(); - } - if (i->flags & ARITHMETIC_SET_CARRY) { - // EFLAGS should have CA set? - // (so long as we don't fuck with it) - // UNIMPLEMENTED_SEQ(); - } -}; - -typedef void(vv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src); -typedef void(vc_fn)(X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src); -template -void BinaryOpVV(X64Emitter& e, Instr*& i, vv_fn vv_fn, - TD& dest, TS1& src1, TS2& src2) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - if (dest == src1) { - vv_fn(e, *i, dest, src2); - } else if (dest == src2) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vv_fn(e, *i, dest, src1); - } else { - // Eww. - e.mov(e.rax, src1); - vv_fn(e, *i, e.rax, src2); - e.mov(dest, e.rax); - } - } else { - e.mov(dest, src1); - vv_fn(e, *i, dest, src2); - } - e.EndOp(dest, src1, src2); -} -template -void BinaryOpVC(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, - TD& dest, TS1& src1, Value* src2) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest.getBit() <= 32) { - // 32-bit. - if (dest == src1) { - vc_fn(e, *i, dest, (uint32_t)src2->get_constant(CT())); - } else { - e.mov(dest, src1); - vc_fn(e, *i, dest, (uint32_t)src2->get_constant(CT())); - } - } else { - // 64-bit. 
- if (dest == src1) { - e.mov(e.rax, src2->constant.i64); - vv_fn(e, *i, dest, e.rax); - } else { - e.mov(e.rax, src2->constant.i64); - e.mov(dest, src1); - vv_fn(e, *i, dest, e.rax); - } - } - e.EndOp(dest, src1); -} -template -void BinaryOpCV(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn, - TD& dest, Value* src1, TS2& src2) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src2, 0); - if (dest.getBit() <= 32) { - // 32-bit. - if (dest == src2) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT())); - } else { - // Eww. - e.mov(e.rax, src2); - e.mov(dest, (uint32_t)src1->get_constant(CT())); - vv_fn(e, *i, dest, e.rax); - } - } else { - e.mov(dest, src2); - vc_fn(e, *i, dest, (uint32_t)src1->get_constant(CT())); - } - } else { - // 64-bit. - if (dest == src2) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - e.mov(e.rax, src1->constant.i64); - vv_fn(e, *i, dest, e.rax); - } else { - // Eww. - e.mov(e.rax, src1->constant.i64); - vv_fn(e, *i, e.rax, src2); - e.mov(dest, e.rax); - } - } else { - e.mov(e.rax, src2); - e.mov(dest, src1->constant.i64); - vv_fn(e, *i, dest, e.rax); - } - } - e.EndOp(dest, src2); -} -void BinaryOp(X64Emitter& e, Instr*& i, vv_fn vv_fn, vc_fn vc_fn) { - if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { - Reg8 dest, src1, src2; - BinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8C)) { - Reg8 dest, src1; - BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I8C, SIG_TYPE_I8)) { - Reg8 dest, src2; - BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I16)) { - Reg16 dest, src1, src2; - BinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I16C)) { - Reg16 dest, src1; - BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16C, SIG_TYPE_I16)) { - Reg16 dest, src2; - BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I32)) { - Reg32 dest, src1, src2; - BinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I32C)) { - Reg32 dest, src1; - BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32C, SIG_TYPE_I32)) { - Reg32 dest, src2; - BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I64)) { - Reg64 dest, src1, src2; - BinaryOpVV(e, i, vv_fn, dest, src1, src2); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I64C)) { - Reg64 dest, src1; - BinaryOpVC(e, i, vv_fn, vc_fn, dest, src1, i->src2.value); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64C, SIG_TYPE_I64)) { - Reg64 dest, src2; - BinaryOpCV(e, i, vv_fn, vc_fn, dest, i->src1.value, src2); - } else { - ASSERT_INVALID_TYPE(); - } - if (i->flags & ARITHMETIC_SET_CARRY) { - // EFLAGS should have CA set? 
- // (so long as we don't fuck with it) - // UNIMPLEMENTED_SEQ(); - } -}; - -typedef void(vvv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, const Operand& src2, const Operand& src3); -typedef void(vvc_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, const Operand& src2, uint32_t src3); -typedef void(vcv_fn)(X64Emitter& e, Instr& i, const Reg& dest_src1, uint32_t src2, const Operand& src3); -template -void TernaryOpVVV(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, - TD& dest, TS1& src1, TS2& src2, TS3& src3) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0, - i->src3.value, src3, 0); - if (dest == src1) { - vvv_fn(e, *i, dest, src2, src3); - } else if (dest == src2) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vvv_fn(e, *i, dest, src1, src3); - } else { - UNIMPLEMENTED_SEQ(); - } - } else { - e.mov(dest, src1); - vvv_fn(e, *i, dest, src2, src3); - } - e.EndOp(dest, src1, src2, src3); -} -template -void TernaryOpVVC(X64Emitter& e, Instr*& i, vvv_fn vvv_fn, vvc_fn vvc_fn, - TD& dest, TS1& src1, TS2& src2, Value* src3) { - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0, - i->src2.value, src2, 0); - if (dest.getBit() <= 32) { - // 32-bit. - if (dest == src1) { - vvc_fn(e, *i, dest, src2, (uint32_t)src3->get_constant(CT())); - } else if (dest == src2) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - vvc_fn(e, *i, dest, src1, (uint32_t)src3->get_constant(CT())); - } else { - // Eww. - e.mov(e.rax, src2); - e.mov(dest, src1); - vvc_fn(e, *i, dest, e.rax, (uint32_t)src3->get_constant(CT())); - } - } else { - e.mov(dest, src1); - vvc_fn(e, *i, dest, src2, (uint32_t)src3->get_constant(CT())); - } - } else { - // 64-bit. - if (dest == src1) { - e.mov(e.rax, src3->constant.i64); - vvv_fn(e, *i, dest, src2, e.rax); - } else if (dest == src2) { - if (i->opcode->flags & OPCODE_FLAG_COMMUNATIVE) { - e.mov(e.rax, src3->constant.i64); - vvv_fn(e, *i, dest, src1, e.rax); - } else { - // Eww. - e.mov(e.rax, src1); - e.mov(src1, src2); - e.mov(dest, e.rax); - e.mov(e.rax, src3->constant.i64); - vvv_fn(e, *i, dest, src1, e.rax); - } - } else { - e.mov(e.rax, src3->constant.i64); - e.mov(dest, src1); - vvv_fn(e, *i, dest, src2, e.rax); - } - } - e.EndOp(dest, src1); -} - -} // namespace - - -void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) { - // -------------------------------------------------------------------------- - // General - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_COMMENT, [](X64Emitter& e, Instr*& i) { - // TODO(benvanik): pass through. - auto str = (const char*)i->src1.offset; - auto str_copy = xestrdupa(str); - e.mov(e.r8, (uint64_t)str_copy); - e.mov(e.rax, (uint64_t)PrintString); - e.call(e.rax); - e.mov(e.rdx, e.qword[e.rsp + 8]); - e.mov(e.rcx, e.qword[e.rsp + 0]); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_NOP, [](X64Emitter& e, Instr*& i) { - // If we got this, chances are we want it. 
- e.nop(); - i = e.Advance(i); - return true; - }); - - // -------------------------------------------------------------------------- - // Debugging - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_SOURCE_OFFSET, [](X64Emitter& e, Instr*& i) { -#if XE_DEBUG - e.nop(); - e.nop(); - e.mov(e.eax, (uint32_t)i->src1.offset); - e.nop(); - e.nop(); -#endif // XE_DEBUG - - e.MarkSourceOffset(i); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_DEBUG_BREAK, [](X64Emitter& e, Instr*& i) { - // TODO(benvanik): insert a call to the debug break function to let the - // debugger know. - e.db(0xCC); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_DEBUG_BREAK_TRUE, [](X64Emitter& e, Instr*& i) { - e.inLocalLabel(); - CheckBoolean(e, i->src1.value); - e.jne(".x", e.T_SHORT); - // TODO(benvanik): insert a call to the debug break function to let the - // debugger know. - e.db(0xCC); - e.L(".x"); - e.outLocalLabel(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_TRAP, [](X64Emitter& e, Instr*& i) { - // TODO(benvanik): insert a call to the trap function to let the - // debugger know. - e.db(0xCC); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_TRAP_TRUE, [](X64Emitter& e, Instr*& i) { - e.inLocalLabel(); - CheckBoolean(e, i->src1.value); - e.jne(".x", e.T_SHORT); - // TODO(benvanik): insert a call to the trap function to let the - // debugger know. - e.db(0xCC); - e.L(".x"); - e.outLocalLabel(); - i = e.Advance(i); - return true; - }); - - // -------------------------------------------------------------------------- - // Calls - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_CALL, [](X64Emitter& e, Instr*& i) { - IssueCall(e, i->src1.symbol_info, i->flags); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_CALL_TRUE, [](X64Emitter& e, Instr*& i) { - e.inLocalLabel(); - CheckBoolean(e, i->src1.value); - e.jne(".x", e.T_SHORT); - IssueCall(e, i->src2.symbol_info, i->flags); - e.L(".x"); - e.outLocalLabel(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_CALL_INDIRECT, [](X64Emitter& e, Instr*& i) { - IssueCallIndirect(e, i->src1.value, i->flags); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_CALL_INDIRECT_TRUE, [](X64Emitter& e, Instr*& i) { - e.inLocalLabel(); - CheckBoolean(e, i->src1.value); - e.jne(".x", e.T_SHORT); - IssueCallIndirect(e, i->src2.value, i->flags); - e.L(".x"); - e.outLocalLabel(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_RETURN, [](X64Emitter& e, Instr*& i) { - // If this is the last instruction in the last block, just let us - // fall through. 
- if (i->next || i->block->next) { - e.jmp("epilog"); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_RETURN_TRUE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - e.je("epilog"); - i = e.Advance(i); - return true; - }); - - // -------------------------------------------------------------------------- - // Branches - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_BRANCH, [](X64Emitter& e, Instr*& i) { - auto target = i->src1.label; - e.jmp(target->name, e.T_NEAR); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_BRANCH_TRUE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - auto target = i->src2.label; - e.je(target->name, e.T_NEAR); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_BRANCH_FALSE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - auto target = i->src2.label; - e.jne(target->name, e.T_NEAR); - i = e.Advance(i); - return true; - }); - - // -------------------------------------------------------------------------- - // Types - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_ASSIGN, [](X64Emitter& e, Instr*& i) { - UnaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src) { - // nop - the mov will have happened. - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) { - // Need a matrix. - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_ZERO_EXTEND, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I16, SIG_TYPE_I8)) { - Reg16 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I8)) { - Reg32 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I16)) { - Reg32 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I8)) { - Reg64 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src.cvt16()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I16)) { - Reg64 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movzx(dest, src.cvt16()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I32)) { - Reg64 dest; - Reg32 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest.cvt32(), src.cvt32()); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_SIGN_EXTEND, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I16, SIG_TYPE_I8)) { - Reg16 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I8)) { - Reg32 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I16)) { - Reg32 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src.cvt8()); - e.EndOp(dest, src); - } 
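// --- [Editor's aside: illustrative sketch, not part of this patch] ----------
// Two details in the extend sequences above: the I64 <- I32 zero-extend is a
// plain 32-bit mov because writing a 32-bit register on x86-64 implicitly
// clears bits 63:32, and movzx/movsx must be passed the *source* width (note
// the I32 <- I16 cases above pass src.cvt8(), which truncates the source to
// 8 bits first - one of the quirks in this soon-to-be-deleted file). The
// intended scalar semantics are simply:
#include <cstdint>
static inline uint64_t ZeroExtend32To64(uint32_t v) { return v; }  // mov r32, r32
static inline int64_t SignExtend16To64(int16_t v) { return v; }    // movsx r64, r16
// -----------------------------------------------------------------------------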
else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I8)) { - Reg64 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src.cvt16()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I16)) { - Reg64 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src.cvt16()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I32)) { - Reg64 dest; - Reg32 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.movsx(dest, src.cvt32()); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_TRUNCATE, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I8, SIG_TYPE_I16)) { - Reg8 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I32)) { - Reg8 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I8, SIG_TYPE_I64)) { - Reg8 dest; - Reg64 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt8()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I32)) { - Reg16 dest; - Reg32 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt16()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_I64)) { - Reg16 dest; - Reg64 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt16()); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I64)) { - Reg32 dest; - Reg64 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.mov(dest, src.cvt32()); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_ROUND, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_CONVERT_I2F, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_CONVERT_F2I, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - // -------------------------------------------------------------------------- - // Constants - // -------------------------------------------------------------------------- - - // specials for zeroing/etc (xor/etc) - - table->AddSequence(OPCODE_LOAD_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_LOAD_VECTOR_SHR, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_LOAD_CLOCK, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - // -------------------------------------------------------------------------- - // Context - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) { - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.byte[e.rcx + i->src1.offset]); - 
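// --- [Editor's aside: illustrative sketch, not part of this patch] ----------
// LOAD_CONTEXT/STORE_CONTEXT lower to single [rcx + offset] accesses because
// RCX stays pinned to the guest context structure for the whole function (the
// new prolog later in this patch also caches membase in RDX from it). With a
// hypothetical raw context pointer, the semantics are just:
#include <cstddef>
#include <cstdint>
#include <cstring>
static inline uint32_t LoadContextU32(const uint8_t* context, size_t offset) {
  uint32_t value;
  std::memcpy(&value, context + offset, sizeof(value));  // e.dword[rcx + offset]
  return value;
}
// -----------------------------------------------------------------------------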
e.EndOp(dest); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) { - Reg16 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.word[e.rcx + i->src1.offset]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) { - Reg32 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.dword[e.rcx + i->src1.offset]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) { - Reg64 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.qword[e.rcx + i->src1.offset]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movss(dest, e.dword[e.rcx + i->src1.offset]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movsd(dest, e.qword[e.rcx + i->src1.offset]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - // TODO(benvanik): we should try to stick to movaps if possible. - e.movups(dest, e.ptr[e.rcx + i->src1.offset]); - e.EndOp(dest); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) { - Reg8 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.byte[e.rcx + i->src1.offset], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) { - e.mov(e.byte[e.rcx + i->src1.offset], i->src2.value->constant.i8); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) { - Reg16 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.word[e.rcx + i->src1.offset], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) { - e.mov(e.word[e.rcx + i->src1.offset], i->src2.value->constant.i16); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) { - Reg32 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.dword[e.rcx + i->src1.offset], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) { - e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) { - Reg64 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.qword[e.rcx + i->src1.offset], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { - e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.i64); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - e.movss(e.dword[e.rcx + i->src1.offset], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) { - e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - e.movsd(e.qword[e.rcx + i->src1.offset], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { - e.mov(e.qword[e.rcx + i->src1.offset], i->src2.value->constant.i64); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - // NOTE: we always know we are aligned. 
- e.movaps(e.ptr[e.rcx + i->src1.offset], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { - e.mov(e.ptr[e.rcx + i->src1.offset], i->src2.value->constant.v128.low); - e.mov(e.ptr[e.rcx + i->src1.offset + 8], i->src2.value->constant.v128.high); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; - }); - - // -------------------------------------------------------------------------- - // Memory - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) { - // TODO(benvanik): dynamic register access check. - // mov reg, [membase + address.32] - Reg64 addr_off; - RegExp addr; - if (i->src1.value->IsConstant()) { - // TODO(benvanik): a way to do this without using a register. - e.mov(e.eax, i->src1.value->AsUint32()); - addr = e.rdx + e.rax; - } else { - e.BeginOp(i->src1.value, addr_off, 0); - e.mov(addr_off.cvt32(), addr_off.cvt32()); // trunc to 32bits - addr = e.rdx + addr_off; - } - if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) { - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.byte[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) { - Reg16 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.word[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) { - Reg32 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.dword[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) { - Reg64 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.mov(dest, e.qword[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movss(dest, e.dword[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.movsd(dest, e.qword[addr]); - e.EndOp(dest); - } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) { - Xmm dest; - e.BeginOp(i->dest, dest, REG_DEST); - // TODO(benvanik): we should try to stick to movaps if possible. 
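// --- [Editor's aside: illustrative sketch, not part of this patch] ----------
// OPCODE_LOAD above forms the host address as membase (RDX) plus the guest
// address truncated to 32 bits; the `mov(addr_off.cvt32(), addr_off.cvt32())`
// relies on implicit upper-half clearing to do the truncation. The movaps
// TODO is an alignment question: movaps faults on unaligned addresses, so
// movups stays the safe default until alignment is proven. In plain C++,
// assuming a flat 4 GiB guest space based at `membase`:
#include <cstdint>
static inline uint8_t* GuestToHost(uint8_t* membase, uint64_t guest_address) {
  return membase + static_cast<uint32_t>(guest_address);  // truncate, then add
}
// -----------------------------------------------------------------------------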
- e.movups(dest, e.ptr[addr]); - e.EndOp(dest); - } else { - ASSERT_INVALID_TYPE(); - } - if (!i->src1.value->IsConstant()) { - e.EndOp(addr_off); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) { - // TODO(benvanik): dynamic register access check - // mov [membase + address.32], reg - Reg64 addr_off; - RegExp addr; - if (i->src1.value->IsConstant()) { - e.mov(e.eax, i->src1.value->AsUint32()); - addr = e.rdx + e.rax; - } else { - e.BeginOp(i->src1.value, addr_off, 0); - e.mov(addr_off.cvt32(), addr_off.cvt32()); // trunc to 32bits - addr = e.rdx + addr_off; - } - if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) { - Reg8 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.byte[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) { - e.mov(e.byte[addr], i->src2.value->constant.i8); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) { - Reg16 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.word[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) { - e.mov(e.word[addr], i->src2.value->constant.i16); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) { - Reg32 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.dword[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) { - e.mov(e.dword[addr], i->src2.value->constant.i32); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) { - Reg64 src; - e.BeginOp(i->src2.value, src, 0); - e.mov(e.qword[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) { - e.mov(e.qword[addr], i->src2.value->constant.i64); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - e.movss(e.dword[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) { - e.mov(e.dword[addr], i->src2.value->constant.i32); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - e.movsd(e.qword[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) { - e.mov(e.qword[addr], i->src2.value->constant.i64); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) { - Xmm src; - e.BeginOp(i->src2.value, src, 0); - // TODO(benvanik): we should try to stick to movaps if possible. 
- e.movups(e.ptr[addr], src); - e.EndOp(src); - } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) { - e.mov(e.ptr[addr], i->src2.value->constant.v128.low); - e.mov(e.ptr[addr + 8], i->src2.value->constant.v128.high); - } else { - ASSERT_INVALID_TYPE(); - } - if (!i->src1.value->IsConstant()) { - e.EndOp(addr_off); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_PREFETCH, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - // -------------------------------------------------------------------------- - // Comparisons - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_MAX, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_MIN, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_SELECT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_IS_TRUE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.setnz(dest); - e.EndOp(dest); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_IS_FALSE, [](X64Emitter& e, Instr*& i) { - CheckBoolean(e, i->src1.value); - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.setz(dest); - e.EndOp(dest); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_COMPARE_EQ, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.sete(dest); - } else { - e.setne(dest); - } - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_COMPARE_NE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setne(dest); - } else { - e.sete(dest); - } - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_COMPARE_SLT, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setl(dest); - } else { - e.setge(dest); - } - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_COMPARE_SLE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setle(dest); - } else { - e.setg(dest); - } - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_COMPARE_SGT, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setg(dest); - } else { - e.setle(dest); - } - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_COMPARE_SGE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setge(dest); - } else { - e.setl(dest); - } - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_COMPARE_ULT, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setb(dest); - } else { - e.setae(dest); - } - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_COMPARE_ULE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setbe(dest); - } else { - e.seta(dest); - } - }); - i = e.Advance(i); - return true; - }); - - 
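// --- [Editor's aside: illustrative sketch, not part of this patch] ----------
// The COMPARE_* sequences above all funnel through CompareXX: cmp, then one
// setcc chosen by the opcode, with an `invert` path for constant-on-the-left
// operands. Reversing operand order calls for the *swapped* condition
// (setl -> setg), not the logical complement (setge) - the two differ when
// the operands compare equal, which is easy to get wrong in a shared helper:
#include <cstdint>
// dest = (a <s b): cmp a, b; setl dest.
static inline uint8_t CompareSLT(int32_t a, int32_t b) { return a < b; }
// Constant on the left forces cmp b, imm, so the condition swaps:
static inline uint8_t CompareSLTConstLeft(int32_t imm, int32_t b) {
  return b > imm;  // (imm <s b) == (b >s imm): setg, not setge
}
// -----------------------------------------------------------------------------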
table->AddSequence(OPCODE_COMPARE_UGT, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.seta(dest); - } else { - e.setbe(dest); - } - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_COMPARE_UGE, [](X64Emitter& e, Instr*& i) { - CompareXX(e, i, [](X64Emitter& e, Reg8& dest, bool invert) { - if (!invert) { - e.setae(dest); - } else { - e.setb(dest); - } - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_DID_CARRY, [](X64Emitter& e, Instr*& i) { - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.setc(dest); - e.EndOp(dest); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_DID_OVERFLOW, [](X64Emitter& e, Instr*& i) { - Reg8 dest; - e.BeginOp(i->dest, dest, REG_DEST); - e.seto(dest); - e.EndOp(dest); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_DID_SATURATE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_COMPARE_EQ, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_COMPARE_SGT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_COMPARE_SGE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_COMPARE_UGT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_COMPARE_UGE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - // -------------------------------------------------------------------------- - // Math - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_ADD, [](X64Emitter& e, Instr*& i) { - BinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.add(dest_src, src); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.add(dest_src, src); - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_ADD_CARRY, [](X64Emitter& e, Instr*& i) { - // dest = src1 + src2 + src3.i8 - if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8, SIG_TYPE_I8)) { - Reg8 dest, src1, src2; - Reg8 ca; - TernaryOpVVV(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { - e.mov(e.ah, src3); - e.sahf(); - e.adc(dest_src, src2); - }, dest, src1, src2, ca); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I16, SIG_TYPE_I8)) { - Reg16 dest, src1, src2; - Reg8 ca; - TernaryOpVVV(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { - e.mov(e.ah, src3); - e.sahf(); - e.adc(dest_src, src2); - }, dest, src1, src2, ca); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I32, SIG_TYPE_I8)) { - Reg32 dest, src1, src2; - Reg8 ca; - TernaryOpVVV(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { - e.mov(e.ah, src3); - e.sahf(); - e.adc(dest_src, src2); - }, dest, src1, src2, ca); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I64, SIG_TYPE_I8)) { - Reg64 dest, src1, src2; - Reg8 ca; - TernaryOpVVV(e, i, [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src2, const Operand& src3) { - 
e.mov(e.ah, src3); - e.sahf(); - e.adc(dest_src, src2); - }, dest, src1, src2, ca); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_SUB, [](X64Emitter& e, Instr*& i) { - BinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.sub(dest_src, src); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.sub(dest_src, src); - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_MUL, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_MUL_HI, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_DIV, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_MUL_ADD, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_MUL_SUB, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_NEG, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_ABS, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_SQRT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_RSQRT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_POW2, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_LOG2, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_DOT_PRODUCT_3, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_DOT_PRODUCT_4, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_AND, [](X64Emitter& e, Instr*& i) { - BinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.and(dest_src, src); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.and(dest_src, src); - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_OR, [](X64Emitter& e, Instr*& i) { - BinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.or(dest_src, src); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.or(dest_src, src); - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_XOR, [](X64Emitter& e, Instr*& i) { - BinaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src, const Operand& src) { - e.xor(dest_src, src); - }, - [](X64Emitter& e, Instr& i, const Reg& dest_src, uint32_t src) { - e.xor(dest_src, src); - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_NOT, [](X64Emitter& e, Instr*& i) { - UnaryOp( - e, i, - [](X64Emitter& e, Instr& i, const Reg& dest_src) { - e.not(dest_src); - }); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_SHL, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - 
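// --- [Editor's aside: illustrative sketch, not part of this patch] ----------
// The ADD_CARRY cases above use a classic trick: the incoming carry (0 or 1)
// is moved into AH and SAHF copies AH bit 0 into CF, so the following adc
// consumes the guest CA bit. The arithmetic being reproduced, in plain C++:
#include <cstdint>
static inline uint32_t AddCarry32(uint32_t a, uint32_t b, uint8_t carry_in,
                                  uint8_t* carry_out) {
  uint64_t wide = static_cast<uint64_t>(a) + b + (carry_in & 1);
  *carry_out = static_cast<uint8_t>(wide >> 32);  // CF after the adc
  return static_cast<uint32_t>(wide);             // dest = src1 + src2 + CA
}
// -----------------------------------------------------------------------------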
table->AddSequence(OPCODE_VECTOR_SHL, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_SHR, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_SHR, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_SHA, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_VECTOR_SHA, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_ROTATE_LEFT, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8, SIG_TYPE_I8C)) { - Reg8 dest; - Reg8 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest != src1) { - e.mov(dest, src1); - } - e.rol(dest, i->src2.value->constant.i8); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16, SIG_TYPE_I8C)) { - Reg8 dest; - Reg16 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest != src1) { - e.mov(dest, src1); - } - e.rol(dest, i->src2.value->constant.i8); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32, SIG_TYPE_I8C)) { - Reg8 dest; - Reg32 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest != src1) { - e.mov(dest, src1); - } - e.rol(dest, i->src2.value->constant.i8); - e.EndOp(dest, src1); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64, SIG_TYPE_I8C)) { - Reg8 dest; - Reg64 src1; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src1, 0); - if (dest != src1) { - e.mov(dest, src1); - } - e.rol(dest, i->src2.value->constant.i8); - e.EndOp(dest, src1); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_BYTE_SWAP, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_I16, SIG_TYPE_I16)) { - Reg16 d, s1; - e.BeginOp(i->dest, d, REG_DEST | REG_ABCD, - i->src1.value, s1, 0); - if (d != s1) { - e.mov(d, s1); - e.xchg(d.cvt8(), Reg8(d.getIdx() + 4)); - } else { - e.xchg(d.cvt8(), Reg8(d.getIdx() + 4)); - } - e.EndOp(d, s1); - } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_I32)) { - Reg32 d, s1; - e.BeginOp(i->dest, d, REG_DEST, - i->src1.value, s1, 0); - if (d != s1) { - e.mov(d, s1); - e.bswap(d); - } else { - e.bswap(d); - } - e.EndOp(d, s1); - } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_I64)) { - Reg64 d, s1; - e.BeginOp(i->dest, d, REG_DEST, - i->src1.value, s1, 0); - if (d != s1) { - e.mov(d, s1); - e.bswap(d); - } else { - e.bswap(d); - } - e.EndOp(d, s1); - } else { - ASSERT_INVALID_TYPE(); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_CNTLZ, [](X64Emitter& e, Instr*& i) { - if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I8)) { - Reg8 dest; - Reg8 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.bsr(dest.cvt16(), src.cvt16()); - // ZF = 1 if zero - e.mov(e.eax, 16); - e.cmovz(dest.cvt32(), e.eax); - e.sub(dest, 8); - e.xor(dest, 0x7); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I16)) { - Reg8 dest; - Reg16 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.bsr(dest.cvt16(), src); - // ZF = 1 if zero - e.mov(e.eax, 16); - e.cmovz(dest.cvt32(), e.eax); - e.xor(dest, 0xF); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I32)) { - Reg8 dest; - Reg32 src; - e.BeginOp(i->dest, dest, 
REG_DEST, - i->src1.value, src, 0); - e.bsr(dest.cvt32(), src); - // ZF = 1 if zero - e.mov(e.eax, 32); - e.cmovz(dest.cvt32(), e.eax); - e.xor(dest, 0x1F); - e.EndOp(dest, src); - } else if (i->Match(SIG_TYPE_IGNORE, SIG_TYPE_I64)) { - Reg8 dest; - Reg64 src; - e.BeginOp(i->dest, dest, REG_DEST, - i->src1.value, src, 0); - e.bsr(dest, src); - // ZF = 1 if zero - e.mov(e.eax, 64); - e.cmovz(dest.cvt32(), e.eax); - e.xor(dest, 0x3F); - e.EndOp(dest, src); - } else { - UNIMPLEMENTED_SEQ(); - } - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_INSERT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_EXTRACT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_SPLAT, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_PERMUTE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_SWIZZLE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_PACK, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - // -------------------------------------------------------------------------- - // Atomic - // -------------------------------------------------------------------------- - - table->AddSequence(OPCODE_COMPARE_EXCHANGE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_ATOMIC_EXCHANGE, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_ATOMIC_ADD, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); - - table->AddSequence(OPCODE_ATOMIC_SUB, [](X64Emitter& e, Instr*& i) { - UNIMPLEMENTED_SEQ(); - i = e.Advance(i); - return true; - }); -} diff --git a/src/alloy/backend/x64/lowering/lowering_table.cc b/src/alloy/backend/x64/lowering/lowering_table.cc deleted file mode 100644 index 6c5c8468b..000000000 --- a/src/alloy/backend/x64/lowering/lowering_table.cc +++ /dev/null @@ -1,71 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#include - -#include -#include - -using namespace alloy; -using namespace alloy::backend::x64; -using namespace alloy::backend::x64::lowering; - - -LoweringTable::LoweringTable(X64Backend* backend) : - backend_(backend) { - xe_zero_struct(lookup_, sizeof(lookup_)); -} - -LoweringTable::~LoweringTable() { - for (size_t n = 0; n < XECOUNT(lookup_); n++) { - auto entry = lookup_[n]; - while (entry) { - auto next = entry->next; - delete entry; - entry = next; - } - } -} - -int LoweringTable::Initialize() { - RegisterSequences(this); - return 0; -} - -void LoweringTable::AddSequence(hir::Opcode starting_opcode, sequence_fn_t fn) { - auto existing_entry = lookup_[starting_opcode]; - auto new_entry = new sequence_fn_entry_t(); - new_entry->fn = fn; - new_entry->next = existing_entry; - lookup_[starting_opcode] = new_entry; -} - -int LoweringTable::ProcessBlock(X64Emitter& e, hir::Block* block) { - // Process instructions. - auto instr = block->instr_head; - while (instr) { - bool processed = false; - auto entry = lookup_[instr->opcode->num]; - while (entry) { - if ((*entry->fn)(e, instr)) { - processed = true; - break; - } - entry = entry->next; - } - if (!processed) { - // No sequence found! - XELOGE("Unable to process HIR opcode %s", instr->opcode->name); - return 1; - instr = e.Advance(instr); - } - } - - return 0; -} \ No newline at end of file diff --git a/src/alloy/backend/x64/lowering/lowering_table.h b/src/alloy/backend/x64/lowering/lowering_table.h deleted file mode 100644 index f62bfd777..000000000 --- a/src/alloy/backend/x64/lowering/lowering_table.h +++ /dev/null @@ -1,58 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_TABLE_H_ -#define ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_TABLE_H_ - -#include -#include - - -namespace alloy { -namespace backend { -namespace x64 { -class X64Backend; -class X64Emitter; -namespace lowering { - - -class LoweringTable { -public: - LoweringTable(X64Backend* backend); - ~LoweringTable(); - - int Initialize(); - - int ProcessBlock(X64Emitter& e, hir::Block* block); - -public: - typedef bool(*sequence_fn_t)(X64Emitter& e, hir::Instr*& instr); - void AddSequence(hir::Opcode starting_opcode, sequence_fn_t fn); - -private: - class sequence_fn_entry_t { - public: - sequence_fn_t fn; - sequence_fn_entry_t* next; - }; - - // NOTE: this class is shared by multiple threads and is not thread safe. - // Do not modify anything after init. - X64Backend* backend_; - sequence_fn_entry_t* lookup_[hir::__OPCODE_MAX_VALUE]; -}; - - -} // namespace lowering -} // namespace x64 -} // namespace backend -} // namespace alloy - - -#endif // ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_TABLE_H_ diff --git a/src/alloy/backend/x64/lowering/sources.gypi b/src/alloy/backend/x64/lowering/sources.gypi deleted file mode 100644 index 5c710cfcc..000000000 --- a/src/alloy/backend/x64/lowering/sources.gypi +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright 2013 Ben Vanik. All Rights Reserved. 
-{ - 'sources': [ - 'lowering_sequences.cc', - 'lowering_sequences.h', - 'lowering_table.cc', - 'lowering_table.h', - ], -} diff --git a/src/alloy/backend/x64/sources.gypi b/src/alloy/backend/x64/sources.gypi index 0a3ead5a9..38167e3f1 100644 --- a/src/alloy/backend/x64/sources.gypi +++ b/src/alloy/backend/x64/sources.gypi @@ -12,9 +12,12 @@ 'x64_emitter.h', 'x64_function.cc', 'x64_function.h', - ], - - 'includes': [ - 'lowering/sources.gypi', + 'x64_sequence.inl', + 'x64_sequences.cc', + 'x64_sequences.h', + 'x64_thunk_emitter.cc', + 'x64_thunk_emitter.h', + 'x64_tracers.cc', + 'x64_tracers.h', ], } diff --git a/src/alloy/backend/x64/tracing.h b/src/alloy/backend/x64/tracing.h index 36d814d67..e6689b830 100644 --- a/src/alloy/backend/x64/tracing.h +++ b/src/alloy/backend/x64/tracing.h @@ -32,17 +32,17 @@ public: ALLOY_BACKEND_X64_ASSEMBLER_DEINIT = ALLOY_BACKEND_X64_ASSEMBLER | (2), }; - typedef struct { + typedef struct Init_s { static const uint32_t event_type = ALLOY_BACKEND_X64_INIT; } Init; - typedef struct { + typedef struct Deinit_s { static const uint32_t event_type = ALLOY_BACKEND_X64_DEINIT; } Deinit; - typedef struct { + typedef struct AssemblerInit_s { static const uint32_t event_type = ALLOY_BACKEND_X64_ASSEMBLER_INIT; } AssemblerInit; - typedef struct { + typedef struct AssemblerDeinit_s { static const uint32_t event_type = ALLOY_BACKEND_X64_ASSEMBLER_DEINIT; } AssemblerDeinit; }; diff --git a/src/alloy/backend/x64/x64_assembler.cc b/src/alloy/backend/x64/x64_assembler.cc index d4e88e621..d70afe909 100644 --- a/src/alloy/backend/x64/x64_assembler.cc +++ b/src/alloy/backend/x64/x64_assembler.cc @@ -30,7 +30,7 @@ using namespace alloy::runtime; X64Assembler::X64Assembler(X64Backend* backend) : x64_backend_(backend), - emitter_(0), + emitter_(0), allocator_(0), Assembler(backend) { } @@ -39,6 +39,7 @@ X64Assembler::~X64Assembler() { })); delete emitter_; + delete allocator_; } int X64Assembler::Initialize() { @@ -47,8 +48,8 @@ int X64Assembler::Initialize() { return result; } - emitter_ = new X64Emitter(x64_backend_, - new XbyakAllocator()); + allocator_ = new XbyakAllocator(); + emitter_ = new X64Emitter(x64_backend_, allocator_); alloy::tracing::WriteEvent(EventType::AssemblerInit({ })); @@ -65,6 +66,8 @@ int X64Assembler::Assemble( FunctionInfo* symbol_info, HIRBuilder* builder, uint32_t debug_info_flags, DebugInfo* debug_info, Function** out_function) { + SCOPE_profile_cpu_f("alloy"); + int result = 0; // Lower HIR -> x64. 
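// --- [Editor's aside: illustrative sketch, not part of this patch] ----------
// The X64Assembler hunks above fix an ownership leak: the XbyakAllocator used
// to be new'd inline in the emitter's constructor call (and deleted by the
// emitter); it is now held in an allocator_ member and deleted alongside the
// emitter. The same contract with stub types, using unique_ptr so the
// new/delete pairing is implicit:
#include <memory>
struct XbyakAllocator {};
struct X64Emitter {
  explicit X64Emitter(XbyakAllocator* allocator) : allocator_(allocator) {}
  XbyakAllocator* allocator_;  // borrowed, not owned
};
class X64Assembler {
 public:
  int Initialize() {
    allocator_ = std::make_unique<XbyakAllocator>();
    emitter_ = std::make_unique<X64Emitter>(allocator_.get());
    return 0;
  }
 private:
  // Declared before emitter_: members destroy in reverse order, so the
  // allocator outlives the emitter that borrows it.
  std::unique_ptr<XbyakAllocator> allocator_;
  std::unique_ptr<X64Emitter> emitter_;
};
// -----------------------------------------------------------------------------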
@@ -82,13 +85,15 @@ int X64Assembler::Assemble( string_buffer_.Reset(); } - X64Function* fn = new X64Function(symbol_info); - fn->set_debug_info(debug_info); - fn->Setup(machine_code, code_size); + { + X64Function* fn = new X64Function(symbol_info); + fn->set_debug_info(debug_info); + fn->Setup(machine_code, code_size); - *out_function = fn; + *out_function = fn; - result = 0; + result = 0; + } XECLEANUP: Reset(); diff --git a/src/alloy/backend/x64/x64_assembler.h b/src/alloy/backend/x64/x64_assembler.h index 3d6235254..063e19c63 100644 --- a/src/alloy/backend/x64/x64_assembler.h +++ b/src/alloy/backend/x64/x64_assembler.h @@ -21,6 +21,7 @@ namespace x64 { class X64Backend; class X64Emitter; +class XbyakAllocator; class X64Assembler : public Assembler { @@ -45,6 +46,7 @@ private: private: X64Backend* x64_backend_; X64Emitter* emitter_; + XbyakAllocator* allocator_; StringBuffer string_buffer_; }; diff --git a/src/alloy/backend/x64/x64_backend.cc b/src/alloy/backend/x64/x64_backend.cc index 560328750..40283f6d2 100644 --- a/src/alloy/backend/x64/x64_backend.cc +++ b/src/alloy/backend/x64/x64_backend.cc @@ -12,25 +12,23 @@ #include #include #include -#include -#include +#include +#include using namespace alloy; using namespace alloy::backend; using namespace alloy::backend::x64; -using namespace alloy::backend::x64::lowering; using namespace alloy::runtime; X64Backend::X64Backend(Runtime* runtime) : - code_cache_(0), lowering_table_(0), + code_cache_(0), Backend(runtime) { } X64Backend::~X64Backend() { alloy::tracing::WriteEvent(EventType::Deinit({ })); - delete lowering_table_; delete code_cache_; } @@ -40,14 +38,34 @@ int X64Backend::Initialize() { return result; } + RegisterSequences(); + + machine_info_.register_sets[0] = { + 0, + "gpr", + MachineInfo::RegisterSet::INT_TYPES, + X64Emitter::GPR_COUNT, + }; + machine_info_.register_sets[1] = { + 1, + "xmm", + MachineInfo::RegisterSet::FLOAT_TYPES | + MachineInfo::RegisterSet::VEC_TYPES, + X64Emitter::XMM_COUNT, + }; + code_cache_ = new X64CodeCache(); result = code_cache_->Initialize(); if (result) { return result; } - lowering_table_ = new LoweringTable(this); - RegisterSequences(lowering_table_); + auto allocator = new XbyakAllocator(); + auto thunk_emitter = new X64ThunkEmitter(this, allocator); + host_to_guest_thunk_ = thunk_emitter->EmitHostToGuestThunk(); + guest_to_host_thunk_ = thunk_emitter->EmitGuestToHostThunk(); + delete thunk_emitter; + delete allocator; alloy::tracing::WriteEvent(EventType::Init({ })); diff --git a/src/alloy/backend/x64/x64_backend.h b/src/alloy/backend/x64/x64_backend.h index b10f7e571..0ff3018cd 100644 --- a/src/alloy/backend/x64/x64_backend.h +++ b/src/alloy/backend/x64/x64_backend.h @@ -20,19 +20,22 @@ namespace backend { namespace x64 { class X64CodeCache; -namespace lowering { class LoweringTable; } #define ALLOY_HAS_X64_BACKEND 1 +typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1); +typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1); + class X64Backend : public Backend { public: X64Backend(runtime::Runtime* runtime); virtual ~X64Backend(); X64CodeCache* code_cache() const { return code_cache_; } - lowering::LoweringTable* lowering_table() const { return lowering_table_; } + HostToGuestThunk host_to_guest_thunk() const { return host_to_guest_thunk_; } + GuestToHostThunk guest_to_host_thunk() const { return guest_to_host_thunk_; } virtual int Initialize(); @@ -40,7 +43,8 @@ public: private: X64CodeCache* code_cache_; - lowering::LoweringTable* lowering_table_; + 
HostToGuestThunk host_to_guest_thunk_; + GuestToHostThunk guest_to_host_thunk_; }; diff --git a/src/alloy/backend/x64/x64_code_cache.cc b/src/alloy/backend/x64/x64_code_cache.cc index 7bbf91f2a..9d1c2ce60 100644 --- a/src/alloy/backend/x64/x64_code_cache.cc +++ b/src/alloy/backend/x64/x64_code_cache.cc @@ -34,14 +34,14 @@ public: const static uint32_t ESTIMATED_FN_SIZE = 512; // Size of unwind info per function. // TODO(benvanik): move this to emitter. - const static uint32_t UNWIND_INFO_SIZE = 4 + (2 * 1); + const static uint32_t UNWIND_INFO_SIZE = 4 + (2 * 1 + 2 + 2); void* fn_table_handle; RUNTIME_FUNCTION* fn_table; uint32_t fn_table_count; uint32_t fn_table_capacity; - void AddTableEntry(uint8_t* code, size_t code_size); + void AddTableEntry(uint8_t* code, size_t code_size, size_t stack_size); }; @@ -73,7 +73,10 @@ int X64CodeCache::Initialize() { return 0; } -void* X64CodeCache::PlaceCode(void* machine_code, size_t code_size) { +void* X64CodeCache::PlaceCode(void* machine_code, size_t code_size, + size_t stack_size) { + SCOPE_profile_cpu_f("alloy"); + // Add unwind info into the allocation size. Keep things 16b aligned. code_size += XEROUNDUP(X64CodeChunk::UNWIND_INFO_SIZE, 16); @@ -101,7 +104,7 @@ void* X64CodeCache::PlaceCode(void* machine_code, size_t code_size) { active_chunk_->offset += code_size; // Add entry to fn table. - active_chunk_->AddTableEntry(final_address, code_size); + active_chunk_->AddTableEntry(final_address, code_size, stack_size); UnlockMutex(lock_); @@ -156,6 +159,27 @@ typedef enum _UNWIND_OP_CODES { UWOP_SAVE_XMM128_FAR, /* info == XMM reg number, offset in next 2 slots */ UWOP_PUSH_MACHFRAME /* info == 0: no error-code, 1: error-code */ } UNWIND_CODE_OPS; +class UNWIND_REGISTER { +public: + enum _ { + RAX = 0, + RCX = 1, + RDX = 2, + RBX = 3, + RSP = 4, + RBP = 5, + RSI = 6, + RDI = 7, + R8 = 8, + R9 = 9, + R10 = 10, + R11 = 11, + R12 = 12, + R13 = 13, + R14 = 14, + R15 = 15, + }; +}; typedef union _UNWIND_CODE { struct { @@ -183,7 +207,8 @@ typedef struct _UNWIND_INFO { } UNWIND_INFO, *PUNWIND_INFO; } // namespace -void X64CodeChunk::AddTableEntry(uint8_t* code, size_t code_size) { +void X64CodeChunk::AddTableEntry(uint8_t* code, size_t code_size, + size_t stack_size) { // NOTE: we assume a chunk lock. if (fn_table_count + 1 > fn_table_capacity) { @@ -213,26 +238,57 @@ void X64CodeChunk::AddTableEntry(uint8_t* code, size_t code_size) { size_t unwind_info_offset = offset; offset += UNWIND_INFO_SIZE; - // TODO(benvanik): take as parameters? - bool has_prolog = true; - uint8_t prolog_size = 4; - uint8_t stack_bytes = 64; + if (!stack_size) { + uint8_t prolog_size = 0; - // http://msdn.microsoft.com/en-us/library/ddssxxy8.aspx - UNWIND_INFO* unwind_info = (UNWIND_INFO*)(buffer + unwind_info_offset); - unwind_info->Version = 1; - unwind_info->Flags = 0; - unwind_info->SizeOfProlog = has_prolog ? prolog_size : 0; - unwind_info->CountOfCodes = has_prolog ? 
1 : 0; - unwind_info->FrameRegister = 0; - unwind_info->FrameOffset = 0; + // http://msdn.microsoft.com/en-us/library/ddssxxy8.aspx + UNWIND_INFO* unwind_info = (UNWIND_INFO*)(buffer + unwind_info_offset); + unwind_info->Version = 1; + unwind_info->Flags = 0; + unwind_info->SizeOfProlog = 0; + unwind_info->CountOfCodes = 0; + unwind_info->FrameRegister = 0; + unwind_info->FrameOffset = 0; + } else if (stack_size <= 128) { + uint8_t prolog_size = 4; - // http://msdn.microsoft.com/en-us/library/ck9asaa9.aspx - auto& code_0 = unwind_info->UnwindCode[0]; - code_0.CodeOffset = 4; // end of instruction + 1 == offset of next instruction - code_0.UnwindOp = UWOP_ALLOC_SMALL; - code_0.OpInfo = stack_bytes / 8 - 1; - XEASSERT(stack_bytes < 128); + // http://msdn.microsoft.com/en-us/library/ddssxxy8.aspx + UNWIND_INFO* unwind_info = (UNWIND_INFO*)(buffer + unwind_info_offset); + unwind_info->Version = 1; + unwind_info->Flags = 0; + unwind_info->SizeOfProlog = prolog_size; + unwind_info->CountOfCodes = 1; + unwind_info->FrameRegister = 0; + unwind_info->FrameOffset = 0; + + // http://msdn.microsoft.com/en-us/library/ck9asaa9.aspx + size_t co = 0; + auto& unwind_code = unwind_info->UnwindCode[co++]; + unwind_code.CodeOffset = 14; // end of instruction + 1 == offset of next instruction + unwind_code.UnwindOp = UWOP_ALLOC_SMALL; + unwind_code.OpInfo = stack_size / 8 - 1; + } else { + // TODO(benvanik): take as parameters? + uint8_t prolog_size = 7; + + // http://msdn.microsoft.com/en-us/library/ddssxxy8.aspx + UNWIND_INFO* unwind_info = (UNWIND_INFO*)(buffer + unwind_info_offset); + unwind_info->Version = 1; + unwind_info->Flags = 0; + unwind_info->SizeOfProlog = prolog_size; + unwind_info->CountOfCodes = 3; + unwind_info->FrameRegister = 0; + unwind_info->FrameOffset = 0; + + // http://msdn.microsoft.com/en-us/library/ck9asaa9.aspx + size_t co = 0; + auto& unwind_code = unwind_info->UnwindCode[co++]; + unwind_code.CodeOffset = 7; // end of instruction + 1 == offset of next instruction + unwind_code.UnwindOp = UWOP_ALLOC_LARGE; + unwind_code.OpInfo = 0; + unwind_code = unwind_info->UnwindCode[co++]; + unwind_code.FrameOffset = (USHORT)(stack_size) / 8; + } // Add entry. auto& fn_entry = fn_table[fn_table_count++]; diff --git a/src/alloy/backend/x64/x64_code_cache.h b/src/alloy/backend/x64/x64_code_cache.h index 1d6140430..23ba2e639 100644 --- a/src/alloy/backend/x64/x64_code_cache.h +++ b/src/alloy/backend/x64/x64_code_cache.h @@ -30,7 +30,7 @@ public: // TODO(benvanik): keep track of code blocks // TODO(benvanik): padding/guards/etc - void* PlaceCode(void* machine_code, size_t code_size); + void* PlaceCode(void* machine_code, size_t code_size, size_t stack_size); private: const static size_t DEFAULT_CHUNK_SIZE = 4 * 1024 * 1024; diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 3d6b3cfa3..0e2d55860 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -11,9 +11,14 @@ #include #include -#include +#include +#include +#include #include #include +#include +#include +#include using namespace alloy; using namespace alloy::backend; @@ -30,22 +35,38 @@ namespace x64 { static const size_t MAX_CODE_SIZE = 1 * 1024 * 1024; +static const size_t STASH_OFFSET = 32; + +// If we are running with tracing on we have to store the EFLAGS in the stack, +// otherwise our calls out to C to print will clear it before DID_CARRY/etc +// can get the value. 
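// --- [Editor's aside: illustrative sketch, not part of this patch] ----------
// Rationale for the flag defined below: DID_CARRY/DID_OVERFLOW read CF/OF
// lazily out of EFLAGS, and any call into host C code (the tracers) is free
// to clobber the flags first. Snapshotting at the producing instruction
// (pushfq/popfq, or lahf plus seto for the cheap subset) and reading the
// copy avoids the hazard. Hypothetical names, stating just the contract:
#include <cstdint>
struct SavedFlags { uint8_t cf; uint8_t of; };  // captured right after add/sub
static inline uint8_t DidCarry(const SavedFlags& f) { return f.cf; }
static inline uint8_t DidOverflow(const SavedFlags& f) { return f.of; }
// -----------------------------------------------------------------------------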
+#define STORE_EFLAGS 1 + } // namespace x64 } // namespace backend } // namespace alloy +const uint32_t X64Emitter::gpr_reg_map_[X64Emitter::GPR_COUNT] = { + Operand::RBX, + Operand::R12, Operand::R13, Operand::R14, Operand::R15, +}; + +const uint32_t X64Emitter::xmm_reg_map_[X64Emitter::XMM_COUNT] = { + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, +}; + + X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) : + runtime_(backend->runtime()), backend_(backend), code_cache_(backend->code_cache()), allocator_(allocator), current_instr_(0), CodeGenerator(MAX_CODE_SIZE, AutoGrow, allocator) { - xe_zero_struct(®_state_, sizeof(reg_state_)); } X64Emitter::~X64Emitter() { - delete allocator_; } int X64Emitter::Initialize() { @@ -53,9 +74,11 @@ int X64Emitter::Initialize() { } int X64Emitter::Emit( - HIRBuilder* builder, + HIRBuilder* builder, uint32_t debug_info_flags, runtime::DebugInfo* debug_info, void*& out_code_address, size_t& out_code_size) { + SCOPE_profile_cpu_f("alloy"); + // Reset. if (debug_info_flags & DEBUG_INFO_SOURCE_MAP) { source_map_count_ = 0; @@ -63,14 +86,15 @@ int X64Emitter::Emit( } // Fill the generator with code. - int result = Emit(builder); + size_t stack_size = 0; + int result = Emit(builder, stack_size); if (result) { return result; } // Copy the final code to the cache and relocate it. out_code_size = getSize(); - out_code_address = Emplace(code_cache_); + out_code_address = Emplace(stack_size); // Stash source map. if (debug_info_flags & DEBUG_INFO_SOURCE_MAP) { @@ -82,13 +106,13 @@ int X64Emitter::Emit( return 0; } -void* X64Emitter::Emplace(X64CodeCache* code_cache) { +void* X64Emitter::Emplace(size_t stack_size) { // To avoid changing xbyak, we do a switcharoo here. // top_ points to the Xbyak buffer, and since we are in AutoGrow mode // it has pending relocations. We copy the top_ to our buffer, swap the // pointer, relocate, then return the original scratch pointer for use. uint8_t* old_address = top_; - void* new_address = code_cache->PlaceCode(top_, size_); + void* new_address = code_cache_->PlaceCode(top_, size_, stack_size); top_ = (uint8_t*)new_address; ready(); top_ = old_address; @@ -96,17 +120,22 @@ void* X64Emitter::Emplace(X64CodeCache* code_cache) { return new_address; } -int X64Emitter::Emit(HIRBuilder* builder) { - // These are the registers we will not be using. All others are fare game. - const uint32_t reserved_regs = - GetRegBit(rax) | - GetRegBit(rcx) | - GetRegBit(rdx) | - GetRegBit(rsp) | - GetRegBit(rbp) | - GetRegBit(rsi) | - GetRegBit(rdi) | - GetRegBit(xmm0); +int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) { + // Calculate stack size. We need to align things to their natural sizes. + // This could be much better (sort by type/etc). + auto locals = builder->locals(); + size_t stack_offset = StackLayout::GUEST_STACK_SIZE; + for (auto it = locals.begin(); it != locals.end(); ++it) { + auto slot = *it; + size_t type_size = GetTypeSize(slot->type); + // Align to natural size. + stack_offset = XEALIGN(stack_offset, type_size); + slot->set_constant((uint32_t)stack_offset); + stack_offset += type_size; + } + // Ensure 16b alignment. + stack_offset -= StackLayout::GUEST_STACK_SIZE; + stack_offset = XEALIGN(stack_offset, 16); // Function prolog. // Must be 16b aligned. @@ -120,20 +149,18 @@ int X64Emitter::Emit(HIRBuilder* builder) { // X64CodeCache, which dynamically generates exception information. // Adding or changing anything here must be matched! 
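  // Worked example (sketch; assumes StackLayout::GUEST_STACK_SIZE == 0x38):
  // a function with 20 bytes of locals pads them to 32 (16b aligned), giving
  // stack_size = 0x38 + 0x20 = 0x58. Then (0x58 + 8) % 16 == 0 holds - the 8
  // accounts for the return address our caller pushed - so rsp stays 16b
  // aligned at every nested call, as the assert below checks.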
const bool emit_prolog = true; - const size_t stack_size = 64; + const size_t stack_size = StackLayout::GUEST_STACK_SIZE + stack_offset; + XEASSERT((stack_size + 8) % 16 == 0); + out_stack_size = stack_size; + stack_size_ = stack_size; if (emit_prolog) { - mov(qword[rsp + 16], rdx); - mov(qword[rsp + 8], rcx); - sub(rsp, stack_size); - mov(qword[rsp + 8 * 0], rbx); - mov(qword[rsp + 8 * 1], r12); - mov(qword[rsp + 8 * 2], r13); - mov(qword[rsp + 8 * 3], r14); - mov(qword[rsp + 8 * 4], r15); + sub(rsp, (uint32_t)stack_size); + mov(qword[rsp + StackLayout::GUEST_RCX_HOME], rcx); + mov(qword[rsp + StackLayout::GUEST_RET_ADDR], rdx); + mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], 0); + mov(rdx, qword[rcx + 8]); // membase } - auto lowering_table = backend_->lowering_table(); - // Body. auto block = builder->first_block(); while (block) { @@ -144,17 +171,17 @@ int X64Emitter::Emit(HIRBuilder* builder) { label = label->next; } - // Reset reg allocation state. - // If we start keeping regs across blocks this needs to change. - // We mark a few active so that the allocator doesn't use them. - reg_state_.active_regs = reg_state_.live_regs = reserved_regs; - - // Add instructions. - // The table will process sequences of instructions to (try to) - // generate optimal code. - current_instr_ = block->instr_head; - if (lowering_table->ProcessBlock(*this, block)) { - return 1; + // Process instructions. + const Instr* instr = block->instr_head; + while (instr) { + const Instr* new_tail = instr; + if (!SelectSequence(*this, instr, &new_tail)) { + // No sequence found! + XEASSERTALWAYS(); + XELOGE("Unable to process HIR opcode %s", instr->opcode->name); + break; + } + instr = new_tail; } block = block->next; @@ -163,12 +190,8 @@ int X64Emitter::Emit(HIRBuilder* builder) { // Function epilog. L("epilog"); if (emit_prolog) { - mov(rbx, qword[rsp + 8 * 0]); - mov(r12, qword[rsp + 8 * 1]); - mov(r13, qword[rsp + 8 * 2]); - mov(r14, qword[rsp + 8 * 3]); - mov(r15, qword[rsp + 8 * 4]); - add(rsp, stack_size); + mov(rcx, qword[rsp + StackLayout::GUEST_RCX_HOME]); + add(rsp, (uint32_t)stack_size); } ret(); @@ -183,181 +206,398 @@ int X64Emitter::Emit(HIRBuilder* builder) { return 0; } -void X64Emitter::EvictStaleRegs() { - // NOTE: if we are getting called it's because we *need* a register. - // We must get rid of something. - - uint32_t current_ordinal = current_instr_->ordinal; - - // Remove any register with no more uses. - uint32_t new_live_regs = 0; - for (size_t n = 0; n < 32; n++) { - uint32_t bit = 1 << n; - if (bit & reg_state_.active_regs) { - // Register is active and cannot be freed. - new_live_regs |= bit; - continue; - } - if (!(bit & reg_state_.live_regs)) { - // Register is not alive - nothing to do. - continue; - } - - // Register is live, not active. Check and see if we get rid of it. - auto v = reg_state_.reg_values[n]; - if (v->last_use->ordinal < current_ordinal) { - reg_state_.reg_values[n] = NULL; - } - } - - // Hrm. We have spilled. - if (reg_state_.live_regs == new_live_regs) { - XEASSERTALWAYS(); - } - - reg_state_.live_regs = new_live_regs; -} - -void X64Emitter::FindFreeRegs( - Value* v0, uint32_t& v0_idx, uint32_t v0_flags) { - // If the value is already in a register, use it. - if (v0->reg != -1) { - // Already in a register. Mark active and return. 
- v0_idx = v0->reg; - reg_state_.active_regs |= 1 << v0_idx; - return; - } - - uint32_t avail_regs = 0; - if (IsIntType(v0->type)) { - if (v0_flags & REG_ABCD) { - avail_regs = B00001111; - } else { - avail_regs = 0xFFFF; - } - } else { - avail_regs = 0xFFFF0000; - } - uint32_t free_regs = avail_regs & ~reg_state_.live_regs; - if (!free_regs) { - // Need to evict something. - EvictStaleRegs(); - } - - // Find the first available. - // We start from the MSB so that we get the non-rNx regs that are often - // in short supply. - _BitScanReverse((DWORD*)&v0_idx, free_regs); - - reg_state_.active_regs |= 1 << v0_idx; - reg_state_.live_regs |= 1 << v0_idx; - v0->reg = v0_idx; - reg_state_.reg_values[v0_idx] = v0; -} - -void X64Emitter::FindFreeRegs( - Value* v0, uint32_t& v0_idx, uint32_t v0_flags, - Value* v1, uint32_t& v1_idx, uint32_t v1_flags) { - // TODO(benvanik): support REG_DEST reuse/etc. - // Grab all already-present registers first. - // This way we won't spill them trying to get new registers. - bool need_v0 = v0->reg == -1; - bool need_v1 = v1->reg == -1; - if (!need_v0) { - FindFreeRegs(v0, v0_idx, v0_flags); - } - if (!need_v1) { - FindFreeRegs(v1, v1_idx, v1_flags); - } - // Grab any registers we still need. These calls may evict. - if (need_v0) { - FindFreeRegs(v0, v0_idx, v0_flags); - } - if (need_v1) { - FindFreeRegs(v1, v1_idx, v1_flags); - } -} - -void X64Emitter::FindFreeRegs( - Value* v0, uint32_t& v0_idx, uint32_t v0_flags, - Value* v1, uint32_t& v1_idx, uint32_t v1_flags, - Value* v2, uint32_t& v2_idx, uint32_t v2_flags) { - // TODO(benvanik): support REG_DEST reuse/etc. - // Grab all already-present registers first. - // This way we won't spill them trying to get new registers. - bool need_v0 = v0->reg == -1; - bool need_v1 = v1->reg == -1; - bool need_v2 = v2->reg == -1; - if (!need_v0) { - FindFreeRegs(v0, v0_idx, v0_flags); - } - if (!need_v1) { - FindFreeRegs(v1, v1_idx, v1_flags); - } - if (!need_v2) { - FindFreeRegs(v2, v2_idx, v2_flags); - } - // Grab any registers we still need. These calls may evict. - if (need_v0) { - FindFreeRegs(v0, v0_idx, v0_flags); - } - if (need_v1) { - FindFreeRegs(v1, v1_idx, v1_flags); - } - if (need_v2) { - FindFreeRegs(v2, v2_idx, v2_flags); - } -} - -void X64Emitter::FindFreeRegs( - Value* v0, uint32_t& v0_idx, uint32_t v0_flags, - Value* v1, uint32_t& v1_idx, uint32_t v1_flags, - Value* v2, uint32_t& v2_idx, uint32_t v2_flags, - Value* v3, uint32_t& v3_idx, uint32_t v3_flags) { - // TODO(benvanik): support REG_DEST reuse/etc. - // Grab all already-present registers first. - // This way we won't spill them trying to get new registers. - bool need_v0 = v0->reg == -1; - bool need_v1 = v1->reg == -1; - bool need_v2 = v2->reg == -1; - bool need_v3 = v3->reg == -1; - if (!need_v0) { - FindFreeRegs(v0, v0_idx, v0_flags); - } - if (!need_v1) { - FindFreeRegs(v1, v1_idx, v1_flags); - } - if (!need_v2) { - FindFreeRegs(v2, v2_idx, v2_flags); - } - if (!need_v3) { - FindFreeRegs(v3, v3_idx, v3_flags); - } - // Grab any registers we still need. These calls may evict. 
- if (need_v0) { - FindFreeRegs(v0, v0_idx, v0_flags); - } - if (need_v1) { - FindFreeRegs(v1, v1_idx, v1_flags); - } - if (need_v2) { - FindFreeRegs(v2, v2_idx, v2_flags); - } - if (need_v3) { - FindFreeRegs(v3, v3_idx, v3_flags); - } -} - -Instr* X64Emitter::Advance(Instr* i) { - auto next = i->next; - current_instr_ = next; - return next; -} - -void X64Emitter::MarkSourceOffset(Instr* i) { +void X64Emitter::MarkSourceOffset(const Instr* i) { auto entry = source_map_arena_.Alloc(); entry->source_offset = i->src1.offset; entry->hir_offset = uint32_t(i->block->ordinal << 16) | i->ordinal; entry->code_offset = getSize(); source_map_count_++; } + +void X64Emitter::DebugBreak() { + // TODO(benvanik): notify debugger. + db(0xCC); +} + +void X64Emitter::Trap() { + // 0x0FE00014 is a 'debug print' where r3 = buffer r4 = length + // TODO(benvanik): post software interrupt to debugger. + db(0xCC); +} + +void X64Emitter::UnimplementedInstr(const hir::Instr* i) { + // TODO(benvanik): notify debugger. + db(0xCC); + XEASSERTALWAYS(); +} + +// Total size of ResolveFunctionSymbol call site in bytes. +// Used to overwrite it with nops as needed. +const size_t TOTAL_RESOLVE_SIZE = 27; +const size_t ASM_OFFSET = 2 + 2 + 8 + 2 + 8; + +// Length Assembly Byte Sequence +// ================================================================================= +// 2 bytes 66 NOP 66 90H +// 3 bytes NOP DWORD ptr [EAX] 0F 1F 00H +// 4 bytes NOP DWORD ptr [EAX + 00H] 0F 1F 40 00H +// 5 bytes NOP DWORD ptr [EAX + EAX*1 + 00H] 0F 1F 44 00 00H +// 6 bytes 66 NOP DWORD ptr [EAX + EAX*1 + 00H] 66 0F 1F 44 00 00H +// 7 bytes NOP DWORD ptr [EAX + 00000000H] 0F 1F 80 00 00 00 00H +// 8 bytes NOP DWORD ptr [EAX + EAX*1 + 00000000H] 0F 1F 84 00 00 00 00 00H +// 9 bytes 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H] 66 0F 1F 84 00 00 00 00 00H + +uint64_t ResolveFunctionSymbol(void* raw_context, uint64_t symbol_info_ptr) { + // TODO(benvanik): generate this thunk at runtime? or a shim? + auto thread_state = *reinterpret_cast(raw_context); + auto symbol_info = reinterpret_cast(symbol_info_ptr); + + // Resolve function. This will demand compile as required. + Function* fn = NULL; + thread_state->runtime()->ResolveFunction(symbol_info->address(), &fn); + XEASSERTNOTNULL(fn); + auto x64_fn = static_cast(fn); + uint64_t addr = reinterpret_cast(x64_fn->machine_code()); + + // Overwrite the call site. + // The return address points to ReloadRCX work after the call. + uint64_t return_address = reinterpret_cast(_ReturnAddress()); + #pragma pack(push, 1) + struct Asm { + uint16_t mov_rax; + uint64_t rax_constant; + uint16_t mov_rdx; + uint64_t rdx_constant; + uint16_t call_rax; + uint8_t mov_rcx[5]; + }; + #pragma pack(pop) + Asm* code = reinterpret_cast(return_address - ASM_OFFSET); + code->rax_constant = addr; + code->call_rax = 0x9066; + + // We need to return the target in rax so that it gets called. + return addr; +} + +void X64Emitter::Call(const hir::Instr* instr, runtime::FunctionInfo* symbol_info) { + auto fn = reinterpret_cast(symbol_info->function()); + // Resolve address to the function to call and store in rax. + // TODO(benvanik): caching/etc. For now this makes debugging easier. 
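+  // Call-site byte layout (sketch) while the target is still uncompiled:
+  //   48 B8 <imm64>    mov rax, ResolveFunctionSymbol
+  //   48 BA <imm64>    mov rdx, symbol_info
+  //   FF D0            call rax
+  //   48 8B 4C 24 xx   mov rcx, [rsp + GUEST_RCX_HOME]   (ReloadECX, 5b)
+  // = 27 bytes (TOTAL_RESOLVE_SIZE). The first execution resolves the
+  // target, patches rax_constant to its machine code address, and turns
+  // `call rax` into a 2b nop (66 90), so later executions load it directly.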
+ if (fn) { + mov(rax, reinterpret_cast(fn->machine_code())); + } else { + size_t start = getSize(); + // 2b + 8b constant + mov(rax, reinterpret_cast(ResolveFunctionSymbol)); + // 2b + 8b constant + mov(rdx, reinterpret_cast(symbol_info)); + // 2b + call(rax); + // 5b + ReloadECX(); + size_t total_size = getSize() - start; + XEASSERT(total_size == TOTAL_RESOLVE_SIZE); + // EDX overwritten, don't bother reloading. + } + + // Actually jump/call to rax. + if (instr->flags & CALL_TAIL) { + // Pass the callers return address over. + mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]); + + add(rsp, static_cast(stack_size())); + jmp(rax); + } else { + // Return address is from the previous SET_RETURN_ADDRESS. + mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); + call(rax); + } +} + +uint64_t ResolveFunctionAddress(void* raw_context, uint64_t target_address) { + // TODO(benvanik): generate this thunk at runtime? or a shim? + auto thread_state = *reinterpret_cast(raw_context); + + // TODO(benvanik): required? + target_address &= 0xFFFFFFFF; + + Function* fn = NULL; + thread_state->runtime()->ResolveFunction(target_address, &fn); + XEASSERTNOTNULL(fn); + auto x64_fn = static_cast(fn); + return reinterpret_cast(x64_fn->machine_code()); +} + +void X64Emitter::CallIndirect(const hir::Instr* instr, const Reg64& reg) { + // Check if return. + if (instr->flags & CALL_POSSIBLE_RETURN) { + cmp(reg.cvt32(), dword[rsp + StackLayout::GUEST_RET_ADDR]); + je("epilog", CodeGenerator::T_NEAR); + } + + // Resolve address to the function to call and store in rax. + // TODO(benvanik): caching/etc. For now this makes debugging easier. + if (reg.getIdx() != rdx.getIdx()) { + mov(rdx, reg); + } + CallNative(ResolveFunctionAddress); + + // Actually jump/call to rax. + if (instr->flags & CALL_TAIL) { + // Pass the callers return address over. + mov(rdx, qword[rsp + StackLayout::GUEST_RET_ADDR]); + + add(rsp, static_cast(stack_size())); + jmp(rax); + } else { + // Return address is from the previous SET_RETURN_ADDRESS. 
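+    // (Sketch of the pairing: a preceding SET_RETURN_ADDRESS wrote the guest
+    // return PC into GUEST_CALL_RET_ADDR; we pass it in rdx so the callee's
+    // prolog can stash it as its own GUEST_RET_ADDR for the return check.)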
+ mov(rdx, qword[rsp + StackLayout::GUEST_CALL_RET_ADDR]); + call(rax); + } +} + +uint64_t UndefinedCallExtern(void* raw_context, uint64_t symbol_info_ptr) { + auto symbol_info = reinterpret_cast(symbol_info_ptr); + XELOGW("undefined extern call to %.8X %s", + symbol_info->address(), + symbol_info->name()); + return 0; +} +void X64Emitter::CallExtern(const hir::Instr* instr, const FunctionInfo* symbol_info) { + XEASSERT(symbol_info->behavior() == FunctionInfo::BEHAVIOR_EXTERN); + if (!symbol_info->extern_handler()) { + CallNative(UndefinedCallExtern, reinterpret_cast(symbol_info)); + } else { + // rcx = context + // rdx = target host function + // r8 = arg0 + // r9 = arg1 + mov(rdx, reinterpret_cast(symbol_info->extern_handler())); + mov(r8, reinterpret_cast(symbol_info->extern_arg0())); + mov(r9, reinterpret_cast(symbol_info->extern_arg1())); + auto thunk = backend()->guest_to_host_thunk(); + mov(rax, reinterpret_cast(thunk)); + call(rax); + ReloadECX(); + ReloadEDX(); + // rax = host return + } +} + +void X64Emitter::CallNative(void* fn) { + mov(rax, reinterpret_cast(fn)); + call(rax); + ReloadECX(); + ReloadEDX(); +} + +void X64Emitter::CallNative(uint64_t(*fn)(void* raw_context)) { + mov(rax, reinterpret_cast(fn)); + call(rax); + ReloadECX(); + ReloadEDX(); +} + +void X64Emitter::CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0)) { + mov(rax, reinterpret_cast(fn)); + call(rax); + ReloadECX(); + ReloadEDX(); +} + +void X64Emitter::CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0), uint64_t arg0) { + mov(rdx, arg0); + mov(rax, reinterpret_cast(fn)); + call(rax); + ReloadECX(); + ReloadEDX(); +} + +void X64Emitter::CallNativeSafe(void* fn) { + // rcx = context + // rdx = target host function + // r8 = arg0 + // r9 = arg1 + mov(rdx, reinterpret_cast(fn)); + auto thunk = backend()->guest_to_host_thunk(); + mov(rax, reinterpret_cast(thunk)); + call(rax); + ReloadECX(); + ReloadEDX(); + // rax = host return +} + +void X64Emitter::SetReturnAddress(uint64_t value) { + mov(qword[rsp + StackLayout::GUEST_CALL_RET_ADDR], value); +} + +void X64Emitter::ReloadECX() { + mov(rcx, qword[rsp + StackLayout::GUEST_RCX_HOME]); +} + +void X64Emitter::ReloadEDX() { + mov(rdx, qword[rcx + 8]); // membase +} + +void X64Emitter::LoadEflags() { +#if STORE_EFLAGS + mov(eax, dword[rsp + STASH_OFFSET]); + push(rax); + popf(); +#else + // EFLAGS already present. +#endif // STORE_EFLAGS +} + +void X64Emitter::StoreEflags() { +#if STORE_EFLAGS + pushf(); + pop(qword[rsp + STASH_OFFSET]); +#else + // EFLAGS should have CA set? + // (so long as we don't fuck with it) +#endif // STORE_EFLAGS +} + +uint32_t X64Emitter::page_table_address() const { + uint64_t addr = runtime_->memory()->page_table(); + return static_cast(addr); +} + +bool X64Emitter::ConstantFitsIn32Reg(uint64_t v) { + if ((v & ~0x7FFFFFFF) == 0) { + // Fits under 31 bits, so just load using normal mov. + return true; + } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { + // Negative number that fits in 32bits. + return true; + } + return false; +} + +void X64Emitter::MovMem64(const RegExp& addr, uint64_t v) { + if ((v & ~0x7FFFFFFF) == 0) { + // Fits under 31 bits, so just load using normal mov. + mov(qword[addr], v); + } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { + // Negative number that fits in 32bits. + mov(qword[addr], v); + } else if (!(v >> 32)) { + // All high bits are zero. It'd be nice if we had a way to load a 32bit + // immediate without sign extending! + // TODO(benvanik): this is super common, find a better way. 
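+    // e.g. v = 0x0000000080000000: a `mov qword[addr], imm32` would
+    // sign-extend and store 0xFFFFFFFF80000000, so write the low dword and
+    // explicitly zero the high dword instead: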
+    mov(dword[addr], static_cast<uint32_t>(v));
+    mov(dword[addr + 4], 0);
+  } else {
+    // 64bit number that needs double movs.
+    mov(dword[addr], static_cast<uint32_t>(v));
+    mov(dword[addr + 4], static_cast<uint32_t>(v >> 32));
+  }
+}
+
+Address X64Emitter::GetXmmConstPtr(XmmConst id) {
+  static const vec128_t xmm_consts[] = {
+    /* XMMZero             */ vec128f(0.0f, 0.0f, 0.0f, 0.0f),
+    /* XMMOne              */ vec128f(1.0f, 1.0f, 1.0f, 1.0f),
+    /* XMMNegativeOne      */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f),
+    /* XMMMaskX16Y16       */ vec128i(0x0000FFFFu, 0xFFFF0000u, 0x00000000u, 0x00000000u),
+    /* XMMFlipX16Y16       */ vec128i(0x00008000u, 0x00000000u, 0x00000000u, 0x00000000u),
+    /* XMMFixX16Y16        */ vec128f(-32768.0f, 0.0f, 0.0f, 0.0f),
+    /* XMMNormalizeX16Y16  */ vec128f(1.0f / 32767.0f, 1.0f / (32767.0f * 65536.0f), 0.0f, 0.0f),
+    /* XMM0001             */ vec128f(0.0f, 0.0f, 0.0f, 1.0f),
+    /* XMM3301             */ vec128f(3.0f, 3.0f, 0.0f, 1.0f),
+    /* XMMSignMaskPS       */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u),
+    /* XMMSignMaskPD       */ vec128i(0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u),
+    /* XMMAbsMaskPS        */ vec128i(0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu),
+    /* XMMAbsMaskPD        */ vec128i(0xFFFFFFFFu, 0x7FFFFFFFu, 0xFFFFFFFFu, 0x7FFFFFFFu),
+    /* XMMByteSwapMask     */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),
+    /* XMMPermuteControl15 */ vec128b(15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15),
+    /* XMMPackD3DCOLOR     */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x0C000408u),
+    /* XMMUnpackD3DCOLOR   */ vec128i(0xFFFFFF0Eu, 0xFFFFFF0Du, 0xFFFFFF0Cu, 0xFFFFFF0Fu),
+    /* XMMOneOver255       */ vec128f(1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f),
+    /* XMMShiftMaskPS      */ vec128i(0x0000001Fu, 0x0000001Fu, 0x0000001Fu, 0x0000001Fu),
+    /* XMMShiftByteMask    */ vec128i(0x000000FFu, 0x000000FFu, 0x000000FFu, 0x000000FFu),
+    /* XMMUnsignedDwordMax */ vec128i(0xFFFFFFFFu, 0x00000000u, 0xFFFFFFFFu, 0x00000000u),
+    /* XMM255              */ vec128f(255.0f, 255.0f, 255.0f, 255.0f),
+    /* XMMSignMaskI8       */ vec128i(0x80808080u, 0x80808080u, 0x80808080u, 0x80808080u),
+    /* XMMSignMaskI16      */ vec128i(0x80008000u, 0x80008000u, 0x80008000u, 0x80008000u),
+    /* XMMSignMaskI32      */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u),
+    /* XMMSignMaskF32      */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u),
+  };
+  // TODO(benvanik): cache base pointer somewhere? stack? It'd be nice to
+  // prevent this move.
+  // TODO(benvanik): move to predictable location in PPCContext? could then
+  // just do rcx relative addressing with no rax overwriting.
+  mov(rax, (uint64_t)&xmm_consts[id]);
+  return ptr[rax];
+}
+
+void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
+  // http://www.agner.org/optimize/optimizing_assembly.pdf
+  // 13.4 Generating constants
+  if (!v.low && !v.high) {
+    // 0000...
+    vpxor(dest, dest);
+  } else if (v.low == ~0ull && v.high == ~0ull) {
+    // 1111...
+    vpcmpeqb(dest, dest);
+  } else {
+    // TODO(benvanik): see what other common values are.
+    // TODO(benvanik): build constant table - 99% are reused.
+    MovMem64(rsp + STASH_OFFSET, v.low);
+    MovMem64(rsp + STASH_OFFSET + 8, v.high);
+    vmovdqa(dest, ptr[rsp + STASH_OFFSET]);
+  }
+}
+
+void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, float v) {
+  union {
+    float f;
+    uint32_t i;
+  } x = { v };
+  if (!v) {
+    // 0
+    vpxor(dest, dest);
+  } else if (x.i == ~0UL) {
+    // 1111...
+    vpcmpeqb(dest, dest);
+  } else {
+    // TODO(benvanik): see what other common values are.
+    // TODO(benvanik): build constant table - 99% are reused.
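+    // Sketch: materialize the float through a GPR; e.g. 1.0f (0x3F800000)
+    // becomes `mov eax, 0x3F800000; vmovd xmm, eax`, avoiding a constant
+    // pool load: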
+ mov(eax, x.i); + vmovd(dest, eax); + } +} + +void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, double v) { + union { + double d; + uint64_t i; + } x = { v }; + if (!v) { + // 0 + vpxor(dest, dest); + } else if (x.i == ~0ULL) { + // 1111... + vpcmpeqb(dest, dest); + } else { + // TODO(benvanik): see what other common values are. + // TODO(benvanik): build constant table - 99% are reused. + mov(rax, x.i); + vmovq(dest, rax); + } +} + +Address X64Emitter::StashXmm(const Xmm& r) { + auto addr = ptr[rsp + STASH_OFFSET]; + vmovups(addr, r); + return addr; +} + +Address X64Emitter::StashXmm(const vec128_t& v) { + auto addr = ptr[rsp + STASH_OFFSET]; + LoadConstantXmm(xmm0, v); + vmovups(addr, xmm0); + return addr; +} diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index 3125d0c07..e6ea7b7b5 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -19,6 +19,9 @@ XEDECLARECLASS2(alloy, hir, HIRBuilder); XEDECLARECLASS2(alloy, hir, Instr); XEDECLARECLASS2(alloy, runtime, DebugInfo); +XEDECLARECLASS2(alloy, runtime, FunctionInfo); +XEDECLARECLASS2(alloy, runtime, Runtime); +XEDECLARECLASS2(alloy, runtime, SymbolInfo); namespace alloy { namespace backend { @@ -32,6 +35,35 @@ enum RegisterFlags { REG_ABCD = (1 << 1), }; +enum XmmConst { + XMMZero = 0, + XMMOne, + XMMNegativeOne, + XMMMaskX16Y16, + XMMFlipX16Y16, + XMMFixX16Y16, + XMMNormalizeX16Y16, + XMM0001, + XMM3301, + XMMSignMaskPS, + XMMSignMaskPD, + XMMAbsMaskPS, + XMMAbsMaskPD, + XMMByteSwapMask, + XMMPermuteControl15, + XMMPackD3DCOLOR, + XMMUnpackD3DCOLOR, + XMMOneOver255, + XMMShiftMaskPS, + XMMShiftByteMask, + XMMUnsignedDwordMax, + XMM255, + XMMSignMaskI8, + XMMSignMaskI16, + XMMSignMaskI32, + XMMSignMaskF32, +}; + // Unfortunately due to the design of xbyak we have to pass this to the ctor. class XbyakAllocator : public Xbyak::Allocator { public: @@ -43,6 +75,9 @@ public: X64Emitter(X64Backend* backend, XbyakAllocator* allocator); virtual ~X64Emitter(); + runtime::Runtime* runtime() const { return runtime_; } + X64Backend* backend() const { return backend_; } + int Initialize(); int Emit(hir::HIRBuilder* builder, @@ -50,118 +85,93 @@ public: void*& out_code_address, size_t& out_code_size); public: - template - void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags) { - uint32_t v0_idx; - FindFreeRegs(v0, v0_idx, r0_flags); - SetupReg(v0_idx, r0); + // Reserved: rsp + // Scratch: rax/rcx/rdx + // xmm0-2 (could be only xmm0 with some trickery) + // Available: rbx, r12-r15 (save to get r8-r11, rbp, rsi, rdi?) 
+ // xmm6-xmm15 (save to get xmm3-xmm5) + static const int GPR_COUNT = 5; + static const int XMM_COUNT = 10; + + static void SetupReg(const hir::Value* v, Xbyak::Reg8& r) { + auto idx = gpr_reg_map_[v->reg.index]; + r = Xbyak::Reg8(idx); } - template - void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags, - hir::Value* v1, V1& r1, uint32_t r1_flags) { - uint32_t v0_idx, v1_idx; - FindFreeRegs(v0, v0_idx, r0_flags, - v1, v1_idx, r1_flags); - SetupReg(v0_idx, r0); - SetupReg(v1_idx, r1); + static void SetupReg(const hir::Value* v, Xbyak::Reg16& r) { + auto idx = gpr_reg_map_[v->reg.index]; + r = Xbyak::Reg16(idx); } - template - void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags, - hir::Value* v1, V1& r1, uint32_t r1_flags, - hir::Value* v2, V2& r2, uint32_t r2_flags) { - uint32_t v0_idx, v1_idx, v2_idx; - FindFreeRegs(v0, v0_idx, r0_flags, - v1, v1_idx, r1_flags, - v2, v2_idx, r2_flags); - SetupReg(v0_idx, r0); - SetupReg(v1_idx, r1); - SetupReg(v2_idx, r2); + static void SetupReg(const hir::Value* v, Xbyak::Reg32& r) { + auto idx = gpr_reg_map_[v->reg.index]; + r = Xbyak::Reg32(idx); } - template - void BeginOp(hir::Value* v0, V0& r0, uint32_t r0_flags, - hir::Value* v1, V1& r1, uint32_t r1_flags, - hir::Value* v2, V2& r2, uint32_t r2_flags, - hir::Value* v3, V3& r3, uint32_t r3_flags) { - uint32_t v0_idx, v1_idx, v2_idx, v3_idx; - FindFreeRegs(v0, v0_idx, r0_flags, - v1, v1_idx, r1_flags, - v2, v2_idx, r2_flags, - v3, v3_idx, r3_flags); - SetupReg(v0_idx, r0); - SetupReg(v1_idx, r1); - SetupReg(v2_idx, r2); - SetupReg(v3_idx, r3); + static void SetupReg(const hir::Value* v, Xbyak::Reg64& r) { + auto idx = gpr_reg_map_[v->reg.index]; + r = Xbyak::Reg64(idx); } - template - void EndOp(V0& r0) { - reg_state_.active_regs = reg_state_.active_regs ^ GetRegBit(r0); - } - template - void EndOp(V0& r0, V1& r1) { - reg_state_.active_regs = reg_state_.active_regs ^ ( - GetRegBit(r0) | GetRegBit(r1)); - } - template - void EndOp(V0& r0, V1& r1, V2& r2) { - reg_state_.active_regs = reg_state_.active_regs ^ ( - GetRegBit(r0) | GetRegBit(r1) | GetRegBit(r2)); - } - template - void EndOp(V0& r0, V1& r1, V2& r2, V3& r3) { - reg_state_.active_regs = reg_state_.active_regs ^ ( - GetRegBit(r0) | GetRegBit(r1) | GetRegBit(r2) | GetRegBit(r3)); + static void SetupReg(const hir::Value* v, Xbyak::Xmm& r) { + auto idx = xmm_reg_map_[v->reg.index]; + r = Xbyak::Xmm(idx); } - void EvictStaleRegs(); + void MarkSourceOffset(const hir::Instr* i); - void FindFreeRegs(hir::Value* v0, uint32_t& v0_idx, uint32_t v0_flags); - void FindFreeRegs(hir::Value* v0, uint32_t& v0_idx, uint32_t v0_flags, - hir::Value* v1, uint32_t& v1_idx, uint32_t v1_flags); - void FindFreeRegs(hir::Value* v0, uint32_t& v0_idx, uint32_t v0_flags, - hir::Value* v1, uint32_t& v1_idx, uint32_t v1_flags, - hir::Value* v2, uint32_t& v2_idx, uint32_t v2_flags); - void FindFreeRegs(hir::Value* v0, uint32_t& v0_idx, uint32_t v0_flags, - hir::Value* v1, uint32_t& v1_idx, uint32_t v1_flags, - hir::Value* v2, uint32_t& v2_idx, uint32_t v2_flags, - hir::Value* v3, uint32_t& v3_idx, uint32_t v3_flags); + void DebugBreak(); + void Trap(); + void UnimplementedInstr(const hir::Instr* i); + void UnimplementedExtern(const hir::Instr* i); - static void SetupReg(uint32_t idx, Xbyak::Reg8& r) { r = Xbyak::Reg8(idx); } - static void SetupReg(uint32_t idx, Xbyak::Reg16& r) { r = Xbyak::Reg16(idx); } - static void SetupReg(uint32_t idx, Xbyak::Reg32& r) { r = Xbyak::Reg32(idx); } - static void SetupReg(uint32_t idx, Xbyak::Reg64& r) { r = Xbyak::Reg64(idx); } - 
static void SetupReg(uint32_t idx, Xbyak::Xmm& r) { r = Xbyak::Xmm(idx - 16); } - static uint32_t GetRegBit(const Xbyak::Reg8& r) { return 1 << r.getIdx(); } - static uint32_t GetRegBit(const Xbyak::Reg16& r) { return 1 << r.getIdx(); } - static uint32_t GetRegBit(const Xbyak::Reg32& r) { return 1 << r.getIdx(); } - static uint32_t GetRegBit(const Xbyak::Reg64& r) { return 1 << r.getIdx(); } - static uint32_t GetRegBit(const Xbyak::Xmm& r) { return 1 << (16 + r.getIdx()); } + void Call(const hir::Instr* instr, runtime::FunctionInfo* symbol_info); + void CallIndirect(const hir::Instr* instr, const Xbyak::Reg64& reg); + void CallExtern(const hir::Instr* instr, const runtime::FunctionInfo* symbol_info); + void CallNative(void* fn); + void CallNative(uint64_t(*fn)(void* raw_context)); + void CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0)); + void CallNative(uint64_t(*fn)(void* raw_context, uint64_t arg0), uint64_t arg0); + void CallNativeSafe(void* fn); + void SetReturnAddress(uint64_t value); + void ReloadECX(); + void ReloadEDX(); - hir::Instr* Advance(hir::Instr* i); + // TODO(benvanik): Label for epilog (don't use strings). - void MarkSourceOffset(hir::Instr* i); + void LoadEflags(); + void StoreEflags(); -private: - void* Emplace(X64CodeCache* code_cache); - int Emit(hir::HIRBuilder* builder); + uint32_t page_table_address() const; -private: - X64Backend* backend_; - X64CodeCache* code_cache_; - XbyakAllocator* allocator_; + // Moves a 64bit immediate into memory. + bool ConstantFitsIn32Reg(uint64_t v); + void MovMem64(const Xbyak::RegExp& addr, uint64_t v); + + Xbyak::Address GetXmmConstPtr(XmmConst id); + void LoadConstantXmm(Xbyak::Xmm dest, float v); + void LoadConstantXmm(Xbyak::Xmm dest, double v); + void LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v); + Xbyak::Address StashXmm(const Xbyak::Xmm& r); + Xbyak::Address StashXmm(const vec128_t& v); + + size_t stack_size() const { return stack_size_; } + +protected: + void* Emplace(size_t stack_size); + int Emit(hir::HIRBuilder* builder, size_t& out_stack_size); + +protected: + runtime::Runtime* runtime_; + X64Backend* backend_; + X64CodeCache* code_cache_; + XbyakAllocator* allocator_; - struct { - // Registers currently active within a begin/end op block. These - // cannot be reused. - uint32_t active_regs; - // Registers with values in them. - uint32_t live_regs; - // Current register values. 
- hir::Value* reg_values[32]; - } reg_state_; hir::Instr* current_instr_; size_t source_map_count_; Arena source_map_arena_; + + size_t stack_size_; + + static const uint32_t gpr_reg_map_[GPR_COUNT]; + static const uint32_t xmm_reg_map_[XMM_COUNT]; }; diff --git a/src/alloy/backend/x64/x64_function.cc b/src/alloy/backend/x64/x64_function.cc index c668c14f1..71452ac14 100644 --- a/src/alloy/backend/x64/x64_function.cc +++ b/src/alloy/backend/x64/x64_function.cc @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -21,7 +22,7 @@ using namespace alloy::runtime; X64Function::X64Function(FunctionInfo* symbol_info) : machine_code_(NULL), code_size_(0), - GuestFunction(symbol_info) { + Function(symbol_info) { } X64Function::~X64Function() { @@ -41,8 +42,12 @@ int X64Function::RemoveBreakpointImpl(Breakpoint* breakpoint) { return 0; } -int X64Function::CallImpl(ThreadState* thread_state) { - typedef void(*call_t)(void* raw_context, uint8_t* membase); - ((call_t)machine_code_)(thread_state->raw_context(), thread_state->memory()->membase()); +int X64Function::CallImpl(ThreadState* thread_state, uint64_t return_address) { + auto backend = (X64Backend*)thread_state->runtime()->backend(); + auto thunk = backend->host_to_guest_thunk(); + thunk( + machine_code_, + thread_state->raw_context(), + (void*)return_address); return 0; } diff --git a/src/alloy/backend/x64/x64_function.h b/src/alloy/backend/x64/x64_function.h index cf06bb6c4..0f9659ca6 100644 --- a/src/alloy/backend/x64/x64_function.h +++ b/src/alloy/backend/x64/x64_function.h @@ -20,17 +20,21 @@ namespace backend { namespace x64 { -class X64Function : public runtime::GuestFunction { +class X64Function : public runtime::Function { public: X64Function(runtime::FunctionInfo* symbol_info); virtual ~X64Function(); + void* machine_code() const { return machine_code_; } + size_t code_size() const { return code_size_; } + void Setup(void* machine_code, size_t code_size); protected: virtual int AddBreakpointImpl(runtime::Breakpoint* breakpoint); virtual int RemoveBreakpointImpl(runtime::Breakpoint* breakpoint); - virtual int CallImpl(runtime::ThreadState* thread_state); + virtual int CallImpl(runtime::ThreadState* thread_state, + uint64_t return_address); private: void* machine_code_; diff --git a/src/alloy/backend/x64/x64_sequence.inl b/src/alloy/backend/x64/x64_sequence.inl new file mode 100644 index 000000000..eae1096eb --- /dev/null +++ b/src/alloy/backend/x64/x64_sequence.inl @@ -0,0 +1,744 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + + +namespace { + +enum KeyType { + KEY_TYPE_X = OPCODE_SIG_TYPE_X, + KEY_TYPE_L = OPCODE_SIG_TYPE_L, + KEY_TYPE_O = OPCODE_SIG_TYPE_O, + KEY_TYPE_S = OPCODE_SIG_TYPE_S, + KEY_TYPE_V_I8 = OPCODE_SIG_TYPE_V + INT8_TYPE, + KEY_TYPE_V_I16 = OPCODE_SIG_TYPE_V + INT16_TYPE, + KEY_TYPE_V_I32 = OPCODE_SIG_TYPE_V + INT32_TYPE, + KEY_TYPE_V_I64 = OPCODE_SIG_TYPE_V + INT64_TYPE, + KEY_TYPE_V_F32 = OPCODE_SIG_TYPE_V + FLOAT32_TYPE, + KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE, + KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE, +}; + +#pragma pack(push, 1) +union InstrKey { + struct { + uint32_t opcode : 8; + uint32_t dest : 5; + uint32_t src1 : 5; + uint32_t src2 : 5; + uint32_t src3 : 5; + uint32_t reserved : 4; + }; + uint32_t value; + + operator uint32_t() const { + return value; + } + + InstrKey() : value(0) {} + InstrKey(uint32_t v) : value(v) {} + InstrKey(const Instr* i) : value(0) { + opcode = i->opcode->num; + uint32_t sig = i->opcode->signature; + dest = GET_OPCODE_SIG_TYPE_DEST(sig) ? OPCODE_SIG_TYPE_V + i->dest->type : 0; + src1 = GET_OPCODE_SIG_TYPE_SRC1(sig); + if (src1 == OPCODE_SIG_TYPE_V) { + src1 += i->src1.value->type; + } + src2 = GET_OPCODE_SIG_TYPE_SRC2(sig); + if (src2 == OPCODE_SIG_TYPE_V) { + src2 += i->src2.value->type; + } + src3 = GET_OPCODE_SIG_TYPE_SRC3(sig); + if (src3 == OPCODE_SIG_TYPE_V) { + src3 += i->src3.value->type; + } + } + + template + struct Construct { + static const uint32_t value = + (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23); + }; +}; +#pragma pack(pop) +static_assert(sizeof(InstrKey) <= 4, "Key must be 4 bytes"); + +template +struct CombinedStruct; +template <> +struct CombinedStruct<> {}; +template +struct CombinedStruct : T, CombinedStruct {}; + +struct OpBase {}; + +template +struct Op : OpBase { + static const KeyType key_type = KEY_TYPE; +}; + +struct VoidOp : Op { +protected: + template friend struct Op; + template friend struct I; + void Load(const Instr::Op& op) {} +}; + +struct OffsetOp : Op { + uint64_t value; +protected: + template friend struct Op; + template friend struct I; + void Load(const Instr::Op& op) { + this->value = op.offset; + } +}; + +struct SymbolOp : Op { + FunctionInfo* value; +protected: + template friend struct Op; + template friend struct I; + bool Load(const Instr::Op& op) { + this->value = op.symbol_info; + return true; + } +}; + +struct LabelOp : Op { + hir::Label* value; +protected: + template friend struct Op; + template friend struct I; + void Load(const Instr::Op& op) { + this->value = op.label; + } +}; + +template +struct ValueOp : Op, KEY_TYPE> { + typedef REG_TYPE reg_type; + static const int tag = TAG; + const Value* value; + bool is_constant; + virtual bool ConstantFitsIn32Reg() const { return true; } + const REG_TYPE& reg() const { + XEASSERT(!is_constant); + return reg_; + } + operator const REG_TYPE&() const { + return reg(); + } + bool IsEqual(const T& b) const { + if (is_constant && b.is_constant) { + return reinterpret_cast(this)->constant() == b.constant(); + } else if (!is_constant && !b.is_constant) { + return reg_.getIdx() == b.reg_.getIdx(); + } else { + return false; + } + } + bool IsEqual(const Xbyak::Reg& b) const { + if (is_constant) { + return false; + } else if (!is_constant) { + return reg_.getIdx() == b.getIdx(); + } else { + return false; + } + } + bool operator== (const T& b) const { + return IsEqual(b); + } + bool operator!= (const T& b) const { + return !IsEqual(b); + 
} + bool operator== (const Xbyak::Reg& b) const { + return IsEqual(b); + } + bool operator!= (const Xbyak::Reg& b) const { + return !IsEqual(b); + } + void Load(const Instr::Op& op) { + const Value* value = op.value; + this->value = value; + is_constant = value->IsConstant(); + if (!is_constant) { + X64Emitter::SetupReg(value, reg_); + } + } +protected: + REG_TYPE reg_; +}; + +template +struct I8 : ValueOp, KEY_TYPE_V_I8, Reg8, int8_t, TAG> { + const int8_t constant() const { + XEASSERT(is_constant); + return value->constant.i8; + } +}; +template +struct I16 : ValueOp, KEY_TYPE_V_I16, Reg16, int16_t, TAG> { + const int16_t constant() const { + XEASSERT(is_constant); + return value->constant.i16; + } +}; +template +struct I32 : ValueOp, KEY_TYPE_V_I32, Reg32, int32_t, TAG> { + const int32_t constant() const { + XEASSERT(is_constant); + return value->constant.i32; + } +}; +template +struct I64 : ValueOp, KEY_TYPE_V_I64, Reg64, int64_t, TAG> { + const int64_t constant() const { + XEASSERT(is_constant); + return value->constant.i64; + } + bool ConstantFitsIn32Reg() const override { + int64_t v = value->constant.i64; + if ((v & ~0x7FFFFFFF) == 0) { + // Fits under 31 bits, so just load using normal mov. + return true; + } else if ((v & ~0x7FFFFFFF) == ~0x7FFFFFFF) { + // Negative number that fits in 32bits. + return true; + } + return false; + } +}; +template +struct F32 : ValueOp, KEY_TYPE_V_F32, Xmm, float, TAG> { + const float constant() const { + XEASSERT(is_constant); + return value->constant.f32; + } +}; +template +struct F64 : ValueOp, KEY_TYPE_V_F64, Xmm, double, TAG> { + const double constant() const { + XEASSERT(is_constant); + return value->constant.f64; + } +}; +template +struct V128 : ValueOp, KEY_TYPE_V_V128, Xmm, vec128_t, TAG> { + const vec128_t& constant() const { + XEASSERT(is_constant); + return value->constant.v128; + } +}; + +struct TagTable { + struct { + bool valid; + Instr::Op op; + } table[16]; + + template ::type* = nullptr> + bool CheckTag(const Instr::Op& op) { + return true; + } + template ::type* = nullptr> + bool CheckTag(const Instr::Op& op) { + return true; + } + template ::type* = nullptr> + bool CheckTag(const Instr::Op& op) { + return true; + } + template ::type* = nullptr> + bool CheckTag(const Instr::Op& op) { + return true; + } + template = KEY_TYPE_V_I8>::type* = nullptr> + bool CheckTag(const Instr::Op& op) { + const Value* value = op.value; + if (T::tag == -1) { + return true; + } + if (table[T::tag].valid && + table[T::tag].op.value != value) { + return false; + } + table[T::tag].valid = true; + table[T::tag].op.value = (Value*)value; + return true; + } +}; + +template +struct DestField; +template +struct DestField { + DEST dest; +protected: + bool LoadDest(const Instr* i, TagTable& tag_table) { + Instr::Op op; + op.value = i->dest; + if (tag_table.CheckTag(op)) { + dest.Load(op); + return true; + } + return false; + } +}; +template <> +struct DestField { +protected: + bool LoadDest(const Instr* i, TagTable& tag_table) { + return true; + } +}; + +template +struct I; +template +struct I : DestField { + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + const Instr* instr; +protected: + template friend struct SequenceFields; + bool Load(const Instr* i, TagTable& tag_table) { + if (InstrKey(i).value == key && + LoadDest(i, tag_table)) { + instr = i; + return true; + } + return false; + } +}; +template +struct I : DestField { + static const hir::Opcode 
opcode = OPCODE; + static const uint32_t key = InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + const Instr* instr; + SRC1 src1; +protected: + template friend struct SequenceFields; + bool Load(const Instr* i, TagTable& tag_table) { + if (InstrKey(i).value == key && + LoadDest(i, tag_table) && + tag_table.CheckTag(i->src1)) { + instr = i; + src1.Load(i->src1); + return true; + } + return false; + } +}; +template +struct I : DestField { + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + static const KeyType src2_type = SRC2::key_type; + const Instr* instr; + SRC1 src1; + SRC2 src2; +protected: + template friend struct SequenceFields; + bool Load(const Instr* i, TagTable& tag_table) { + if (InstrKey(i).value == key && + LoadDest(i, tag_table) && + tag_table.CheckTag(i->src1) && + tag_table.CheckTag(i->src2)) { + instr = i; + src1.Load(i->src1); + src2.Load(i->src2); + return true; + } + return false; + } +}; +template +struct I : DestField { + static const hir::Opcode opcode = OPCODE; + static const uint32_t key = InstrKey::Construct::value; + static const KeyType dest_type = DEST::key_type; + static const KeyType src1_type = SRC1::key_type; + static const KeyType src2_type = SRC2::key_type; + static const KeyType src3_type = SRC3::key_type; + const Instr* instr; + SRC1 src1; + SRC2 src2; + SRC3 src3; +protected: + template friend struct SequenceFields; + bool Load(const Instr* i, TagTable& tag_table) { + if (InstrKey(i).value == key && + LoadDest(i, tag_table) && + tag_table.CheckTag(i->src1) && + tag_table.CheckTag(i->src2) && + tag_table.CheckTag(i->src3)) { + instr = i; + src1.Load(i->src1); + src2.Load(i->src2); + src3.Load(i->src3); + return true; + } + return false; + } +}; + +template +struct SequenceFields; +template +struct SequenceFields { + I1 i1; + typedef typename I1 I1Type; +protected: + template friend struct Sequence; + bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) { + if (i1.Load(i, tag_table)) { + *new_tail = i->next; + return true; + } + return false; + } +}; +template +struct SequenceFields : SequenceFields { + I2 i2; +protected: + template friend struct Sequence; + bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) { + if (SequenceFields::Check(i, tag_table, new_tail)) { + auto ni = i->next; + if (ni && i2.Load(ni, tag_table)) { + *new_tail = ni; + return i; + } + } + return false; + } +}; +template +struct SequenceFields : SequenceFields { + I3 i3; +protected: + template friend struct Sequence; + bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) { + if (SequenceFields::Check(i, tag_table, new_tail)) { + auto ni = i->next; + if (ni && i3.Load(ni, tag_table)) { + *new_tail = ni; + return i; + } + } + return false; + } +}; +template +struct SequenceFields : SequenceFields { + I4 i4; +protected: + template friend struct Sequence; + bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) { + if (SequenceFields::Check(i, tag_table, new_tail)) { + auto ni = i->next; + if (ni && i4.Load(ni, tag_table)) { + *new_tail = ni; + return i; + } + } + return false; + } +}; +template +struct SequenceFields : SequenceFields { + I5 i5; +protected: + template friend struct Sequence; + bool Check(const Instr* i, TagTable& tag_table, const Instr** new_tail) { + if 
(SequenceFields::Check(i, tag_table, new_tail)) { + auto ni = i->next; + if (ni && i5.Load(ni, tag_table)) { + *new_tail = ni; + return i; + } + } + return false; + } +}; + +template +struct Sequence { + struct EmitArgs : SequenceFields {}; + + static bool Select(X64Emitter& e, const Instr* i, const Instr** new_tail) { + EmitArgs args; + TagTable tag_table; + if (!args.Check(i, tag_table, new_tail)) { + return false; + } + SEQ::Emit(e, args); + return true; + } +}; + +template +const T GetTempReg(X64Emitter& e); +template <> +const Reg8 GetTempReg(X64Emitter& e) { + return e.al; +} +template <> +const Reg16 GetTempReg(X64Emitter& e) { + return e.ax; +} +template <> +const Reg32 GetTempReg(X64Emitter& e) { + return e.eax; +} +template <> +const Reg64 GetTempReg(X64Emitter& e) { + return e.rax; +} + +template +struct SingleSequence : public Sequence, T> { + typedef T EmitArgType; + static const uint32_t head_key = T::key; + static void Emit(X64Emitter& e, const EmitArgs& _) { + SEQ::Emit(e, _.i1); + } + + template + static void EmitUnaryOp( + X64Emitter& e, const EmitArgType& i, + const REG_FN& reg_fn) { + if (i.src1.is_constant) { + e.mov(i.dest, i.src1.constant()); + reg_fn(e, i.dest); + } else { + if (i.dest != i.src1) { + e.mov(i.dest, i.src1); + } + reg_fn(e, i.dest); + } + } + + template + static void EmitCommutativeBinaryOp( + X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + if (i.dest == i.src2) { + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src1.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.mov(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2); + } + } else if (i.src2.is_constant) { + if (i.dest == i.src1) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.mov(i.dest, i.src2.constant()); + reg_reg_fn(e, i.dest, i.src1); + } + } else { + if (i.dest == i.src1) { + reg_reg_fn(e, i.dest, i.src2); + } else if (i.dest == i.src2) { + reg_reg_fn(e, i.dest, i.src1); + } else { + e.mov(i.dest, i.src1); + reg_reg_fn(e, i.dest, i.src2); + } + } + } + template + static void EmitAssociativeBinaryOp( + X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + if (i.dest == i.src2) { + auto temp = GetTempReg(e); + e.mov(temp, i.src2); + e.mov(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, temp); + } else { + e.mov(i.dest, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2); + } + } else if (i.src2.is_constant) { + if (i.dest == i.src1) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } else { + e.mov(i.dest, i.src1); + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, temp); + } + } + } else { + if (i.dest == i.src1) { + reg_reg_fn(e, i.dest, i.src2); + } else if (i.dest == i.src2) { + auto temp = GetTempReg(e); + e.mov(temp, i.src2); + e.mov(i.dest, i.src1); + 
reg_reg_fn(e, i.dest, temp); + } else { + e.mov(i.dest, i.src1); + reg_reg_fn(e, i.dest, i.src2); + } + } + } + + template + static void EmitCommutativeBinaryXmmOp( + X64Emitter& e, const EmitArgType& i, const FN& fn) { + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + fn(e, i.dest, e.xmm0, i.src2); + } else if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + fn(e, i.dest, i.src1, e.xmm0); + } else { + fn(e, i.dest, i.src1, i.src2); + } + } + + template + static void EmitAssociativeBinaryXmmOp( + X64Emitter& e, const EmitArgType& i, const FN& fn) { + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + e.LoadConstantXmm(e.xmm0, i.src1.constant()); + fn(e, i.dest, e.xmm0, i.src2); + } else if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + fn(e, i.dest, i.src1, e.xmm0); + } else { + fn(e, i.dest, i.src1, i.src2); + } + } + + template + static void EmitCommutativeCompareOp( + X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.src2, static_cast(i.src1.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); + reg_reg_fn(e, i.src2, temp); + } + } else if (i.src2.is_constant) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.src1, static_cast(i.src2.constant())); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.src1, temp); + } + } else { + reg_reg_fn(e, i.src1, i.src2); + } + } + template + static void EmitAssociativeCompareOp( + X64Emitter& e, const EmitArgType& i, + const REG_REG_FN& reg_reg_fn, const REG_CONST_FN& reg_const_fn) { + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + if (i.src1.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, i.src2, static_cast(i.src1.constant()), true); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src1.constant()); + reg_reg_fn(e, i.dest, i.src2, temp, true); + } + } else if (i.src2.is_constant) { + if (i.src2.ConstantFitsIn32Reg()) { + reg_const_fn(e, i.dest, i.src1, static_cast(i.src2.constant()), false); + } else { + auto temp = GetTempReg(e); + e.mov(temp, i.src2.constant()); + reg_reg_fn(e, i.dest, i.src1, temp, false); + } + } else { + reg_reg_fn(e, i.dest, i.src1, i.src2, false); + } + } +}; + +static const int ANY = -1; +typedef int tag_t; +static const tag_t TAG0 = 0; +static const tag_t TAG1 = 1; +static const tag_t TAG2 = 2; +static const tag_t TAG3 = 3; +static const tag_t TAG4 = 4; +static const tag_t TAG5 = 5; +static const tag_t TAG6 = 6; +static const tag_t TAG7 = 7; + +typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*, const Instr**); + +template +void Register() { + sequence_table.insert({ T::head_key, T::Select }); +} +template +void Register() { + Register(); + Register(); +}; +#define EMITTER_OPCODE_TABLE(name, ...) \ + void Register_##name() { \ + Register<__VA_ARGS__>(); \ + } + +#define MATCH(...) 
__VA_ARGS__ +#define EMITTER(name, match) struct name : SingleSequence +#define SEQUENCE(name, match) struct name : Sequence + +} // namespace diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc new file mode 100644 index 000000000..689c8b3b3 --- /dev/null +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -0,0 +1,5119 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +// A note about vectors: +// Alloy represents vectors as xyzw pairs, with indices 0123. +// XMM registers are xyzw pairs with indices 3210, making them more like wzyx. +// This makes things somewhat confusing. It'd be nice to just shuffle the +// registers around on load/store, however certain operations require that +// data be in the right offset. +// Basically, this identity must hold: +// shuffle(vec, b00011011) -> {x,y,z,w} => {x,y,z,w} +// All indices and operations must respect that. +// +// Memory (big endian): +// [00 01 02 03] [04 05 06 07] [08 09 0A 0B] [0C 0D 0E 0F] (x, y, z, w) +// load into xmm register: +// [0F 0E 0D 0C] [0B 0A 09 08] [07 06 05 04] [03 02 01 00] (w, z, y, x) + +#include + +#include +#include +#include +#include + +using namespace alloy; +using namespace alloy::backend; +using namespace alloy::backend::x64; +using namespace alloy::hir; +using namespace alloy::runtime; + +using namespace Xbyak; + +// Utilities/types used only in this file: +#include + +namespace { +static std::unordered_multimap sequence_table; +} // namespace + + +// Selects the right byte/word/etc from a vector. We need to flip logical +// indices (0,1,2,3,4,5,6,7,...) = (3,2,1,0,7,6,5,4,...) +#define VEC128_B(n) ((n) ^ 0x3) +#define VEC128_W(n) ((n) ^ 0x1) +#define VEC128_D(n) (n) +#define VEC128_F(n) (n) + + +// ============================================================================ +// OPCODE_COMMENT +// ============================================================================ +EMITTER(COMMENT, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (IsTracingInstr()) { + auto str = reinterpret_cast(i.src1.value); + // TODO(benvanik): pass through. + // TODO(benvanik): don't just leak this memory. 
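+      // (The string lives in the per-function HIR arena while the emitted
+      // mov embeds the pointer, so take a copy that lives as long as the
+      // generated code. The copies are never freed; tolerable while COMMENT
+      // is tracing-only.)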
+ auto str_copy = xestrdupa(str); + e.mov(e.rdx, reinterpret_cast(str_copy)); + e.CallNative(TraceString); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_COMMENT, + COMMENT); + + +// ============================================================================ +// OPCODE_NOP +// ============================================================================ +EMITTER(NOP, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.nop(); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_NOP, + NOP); + + +// ============================================================================ +// OPCODE_SOURCE_OFFSET +// ============================================================================ +EMITTER(SOURCE_OFFSET, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { +#if XE_DEBUG + e.nop(); + e.nop(); + e.mov(e.eax, (uint32_t)i.src1.value); + e.nop(); + e.nop(); +#endif // XE_DEBUG + e.MarkSourceOffset(i.instr); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SOURCE_OFFSET, + SOURCE_OFFSET); + + +// ============================================================================ +// OPCODE_DEBUG_BREAK +// ============================================================================ +EMITTER(DEBUG_BREAK, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.DebugBreak(); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DEBUG_BREAK, + DEBUG_BREAK); + + +// ============================================================================ +// OPCODE_DEBUG_BREAK_TRUE +// ============================================================================ +EMITTER(DEBUG_BREAK_TRUE_I8, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +EMITTER(DEBUG_BREAK_TRUE_I16, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +EMITTER(DEBUG_BREAK_TRUE_I32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +EMITTER(DEBUG_BREAK_TRUE_I64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +EMITTER(DEBUG_BREAK_TRUE_F32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +EMITTER(DEBUG_BREAK_TRUE_F64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.DebugBreak(); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DEBUG_BREAK_TRUE, + DEBUG_BREAK_TRUE_I8, + DEBUG_BREAK_TRUE_I16, + DEBUG_BREAK_TRUE_I32, + DEBUG_BREAK_TRUE_I64, + DEBUG_BREAK_TRUE_F32, + DEBUG_BREAK_TRUE_F64); + + +// ============================================================================ +// OPCODE_TRAP +// ============================================================================ +EMITTER(TRAP, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.Trap(); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_TRAP, + TRAP); + + +// ============================================================================ +// OPCODE_TRAP_TRUE +// ============================================================================ +EMITTER(TRAP_TRUE_I8, MATCH(I>)) { + static void 
Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(); + e.L(skip); + } +}; +EMITTER(TRAP_TRUE_I16, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(); + e.L(skip); + } +}; +EMITTER(TRAP_TRUE_I32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(); + e.L(skip); + } +}; +EMITTER(TRAP_TRUE_I64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(); + e.L(skip); + } +}; +EMITTER(TRAP_TRUE_F32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(); + e.L(skip); + } +}; +EMITTER(TRAP_TRUE_F64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Trap(); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_TRAP_TRUE, + TRAP_TRUE_I8, + TRAP_TRUE_I16, + TRAP_TRUE_I32, + TRAP_TRUE_I64, + TRAP_TRUE_F32, + TRAP_TRUE_F64); + + +// ============================================================================ +// OPCODE_CALL +// ============================================================================ +EMITTER(CALL, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.Call(i.instr, i.src1.value); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CALL, + CALL); + + +// ============================================================================ +// OPCODE_CALL_TRUE +// ============================================================================ +EMITTER(CALL_TRUE_I8, MATCH(I, SymbolOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, i.src2.value); + e.L(skip); + } +}; +EMITTER(CALL_TRUE_I16, MATCH(I, SymbolOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, i.src2.value); + e.L(skip); + } +}; +EMITTER(CALL_TRUE_I32, MATCH(I, SymbolOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, i.src2.value); + e.L(skip); + } +}; +EMITTER(CALL_TRUE_I64, MATCH(I, SymbolOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, i.src2.value); + e.L(skip); + } +}; +EMITTER(CALL_TRUE_F32, MATCH(I, SymbolOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, i.src2.value); + e.L(skip); + } +}; +EMITTER(CALL_TRUE_F64, MATCH(I, SymbolOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.Call(i.instr, i.src2.value); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CALL_TRUE, + CALL_TRUE_I8, + CALL_TRUE_I16, + CALL_TRUE_I32, + CALL_TRUE_I64, + CALL_TRUE_F32, + CALL_TRUE_F64); + + +// ============================================================================ +// OPCODE_CALL_INDIRECT +// ============================================================================ +EMITTER(CALL_INDIRECT, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.CallIndirect(i.instr, 
i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CALL_INDIRECT, + CALL_INDIRECT); + + +// ============================================================================ +// OPCODE_CALL_INDIRECT_TRUE +// ============================================================================ +EMITTER(CALL_INDIRECT_TRUE_I8, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER(CALL_INDIRECT_TRUE_I16, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER(CALL_INDIRECT_TRUE_I32, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER(CALL_INDIRECT_TRUE_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER(CALL_INDIRECT_TRUE_F32, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER(CALL_INDIRECT_TRUE_F64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + Xbyak::Label skip; + e.jz(skip); + e.CallIndirect(i.instr, i.src2); + e.L(skip); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CALL_INDIRECT_TRUE, + CALL_INDIRECT_TRUE_I8, + CALL_INDIRECT_TRUE_I16, + CALL_INDIRECT_TRUE_I32, + CALL_INDIRECT_TRUE_I64, + CALL_INDIRECT_TRUE_F32, + CALL_INDIRECT_TRUE_F64); + + +// ============================================================================ +// OPCODE_CALL_EXTERN +// ============================================================================ +EMITTER(CALL_EXTERN, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.CallExtern(i.instr, i.src1.value); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CALL_EXTERN, + CALL_EXTERN); + + +// ============================================================================ +// OPCODE_RETURN +// ============================================================================ +EMITTER(RETURN, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // If this is the last instruction in the last block, just let us + // fall through. 
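+ // "epilog" is the shared epilogue label, presumably bound when the
+ // function body is finalized; an explicit jmp is only required when more
+ // code follows this RETURN.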
+ if (i.instr->next || i.instr->block->next) { + e.jmp("epilog", CodeGenerator::T_NEAR); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_RETURN, + RETURN); + + +// ============================================================================ +// OPCODE_RETURN_TRUE +// ============================================================================ +EMITTER(RETURN_TRUE_I8, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz("epilog", CodeGenerator::T_NEAR); + } +}; +EMITTER(RETURN_TRUE_I16, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz("epilog", CodeGenerator::T_NEAR); + } +}; +EMITTER(RETURN_TRUE_I32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz("epilog", CodeGenerator::T_NEAR); + } +}; +EMITTER(RETURN_TRUE_I64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz("epilog", CodeGenerator::T_NEAR); + } +}; +EMITTER(RETURN_TRUE_F32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz("epilog", CodeGenerator::T_NEAR); + } +}; +EMITTER(RETURN_TRUE_F64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz("epilog", CodeGenerator::T_NEAR); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_RETURN_TRUE, + RETURN_TRUE_I8, + RETURN_TRUE_I16, + RETURN_TRUE_I32, + RETURN_TRUE_I64, + RETURN_TRUE_F32, + RETURN_TRUE_F64); + + +// ============================================================================ +// OPCODE_SET_RETURN_ADDRESS +// ============================================================================ +EMITTER(SET_RETURN_ADDRESS, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.SetReturnAddress(i.src1.constant()); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SET_RETURN_ADDRESS, + SET_RETURN_ADDRESS); + + +// ============================================================================ +// OPCODE_BRANCH +// ============================================================================ +EMITTER(BRANCH, MATCH(I)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.jmp(i.src1.value->name, e.T_NEAR); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_BRANCH, + BRANCH); + + +// ============================================================================ +// OPCODE_BRANCH_TRUE +// ============================================================================ +EMITTER(BRANCH_TRUE_I8, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_TRUE_I16, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_TRUE_I32, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_TRUE_I64, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_TRUE_F32, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_TRUE_F64, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + 
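+ // vptest sets ZF only when every bit of (src AND src) is zero, so jnz
+ // here means "any bit set". This is a bitwise truth test: -0.0f has a
+ // nonzero bit pattern and counts as true.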
e.jnz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_BRANCH_TRUE, + BRANCH_TRUE_I8, + BRANCH_TRUE_I16, + BRANCH_TRUE_I32, + BRANCH_TRUE_I64, + BRANCH_TRUE_F32, + BRANCH_TRUE_F64); + + +// ============================================================================ +// OPCODE_BRANCH_FALSE +// ============================================================================ +EMITTER(BRANCH_FALSE_I8, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_FALSE_I16, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_FALSE_I32, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_FALSE_I64, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_FALSE_F32, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER(BRANCH_FALSE_F64, MATCH(I, LabelOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.jz(i.src2.value->name, e.T_NEAR); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_BRANCH_FALSE, + BRANCH_FALSE_I8, + BRANCH_FALSE_I16, + BRANCH_FALSE_I32, + BRANCH_FALSE_I64, + BRANCH_FALSE_F32, + BRANCH_FALSE_F64); + + +// ============================================================================ +// OPCODE_ASSIGN +// ============================================================================ +EMITTER(ASSIGN_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, i.src1); + } +}; +EMITTER(ASSIGN_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, i.src1); + } +}; +EMITTER(ASSIGN_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, i.src1); + } +}; +EMITTER(ASSIGN_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, i.src1); + } +}; +EMITTER(ASSIGN_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovaps(i.dest, i.src1); + } +}; +EMITTER(ASSIGN_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovaps(i.dest, i.src1); + } +}; +EMITTER(ASSIGN_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovaps(i.dest, i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_ASSIGN, + ASSIGN_I8, + ASSIGN_I16, + ASSIGN_I32, + ASSIGN_I64, + ASSIGN_F32, + ASSIGN_F64, + ASSIGN_V128); + + +// ============================================================================ +// OPCODE_CAST +// ============================================================================ +EMITTER(CAST_I32_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovd(i.dest, i.src1); + } +}; +EMITTER(CAST_I64_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovq(i.dest, i.src1); + } +}; +EMITTER(CAST_F32_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovd(i.dest, i.src1); + } +}; +EMITTER(CAST_F64_I64, MATCH(I, I64<>>)) { + static void 
Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovq(i.dest, i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CAST, + CAST_I32_F32, + CAST_I64_F64, + CAST_F32_I32, + CAST_F64_I64); + + +// ============================================================================ +// OPCODE_ZERO_EXTEND +// ============================================================================ +EMITTER(ZERO_EXTEND_I16_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest, i.src1); + } +}; +EMITTER(ZERO_EXTEND_I32_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest, i.src1); + } +}; +EMITTER(ZERO_EXTEND_I64_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest, i.src1); + } +}; +EMITTER(ZERO_EXTEND_I32_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest, i.src1); + } +}; +EMITTER(ZERO_EXTEND_I64_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest, i.src1); + } +}; +EMITTER(ZERO_EXTEND_I64_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest.reg().cvt32(), i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_ZERO_EXTEND, + ZERO_EXTEND_I16_I8, + ZERO_EXTEND_I32_I8, + ZERO_EXTEND_I64_I8, + ZERO_EXTEND_I32_I16, + ZERO_EXTEND_I64_I16, + ZERO_EXTEND_I64_I32); + + +// ============================================================================ +// OPCODE_SIGN_EXTEND +// ============================================================================ +EMITTER(SIGN_EXTEND_I16_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movsx(i.dest, i.src1); + } +}; +EMITTER(SIGN_EXTEND_I32_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movsx(i.dest, i.src1); + } +}; +EMITTER(SIGN_EXTEND_I64_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movsx(i.dest, i.src1); + } +}; +EMITTER(SIGN_EXTEND_I32_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movsx(i.dest, i.src1); + } +}; +EMITTER(SIGN_EXTEND_I64_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movsx(i.dest, i.src1); + } +}; +EMITTER(SIGN_EXTEND_I64_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movsxd(i.dest, i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SIGN_EXTEND, + SIGN_EXTEND_I16_I8, + SIGN_EXTEND_I32_I8, + SIGN_EXTEND_I64_I8, + SIGN_EXTEND_I32_I16, + SIGN_EXTEND_I64_I16, + SIGN_EXTEND_I64_I32); + + +// ============================================================================ +// OPCODE_TRUNCATE +// ============================================================================ +EMITTER(TRUNCATE_I8_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest.reg().cvt32(), i.src1.reg().cvt8()); + } +}; +EMITTER(TRUNCATE_I8_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest.reg().cvt32(), i.src1.reg().cvt8()); + } +}; +EMITTER(TRUNCATE_I8_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest.reg().cvt32(), i.src1.reg().cvt8()); + } +}; +EMITTER(TRUNCATE_I16_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest.reg().cvt32(), i.src1.reg().cvt16()); + } +}; +EMITTER(TRUNCATE_I16_I64, MATCH(I, I64<>>)) { + static void 
Emit(X64Emitter& e, const EmitArgType& i) { + e.movzx(i.dest.reg().cvt32(), i.src1.reg().cvt16()); + } +}; +EMITTER(TRUNCATE_I32_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, i.src1.reg().cvt32()); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_TRUNCATE, + TRUNCATE_I8_I16, + TRUNCATE_I8_I32, + TRUNCATE_I8_I64, + TRUNCATE_I16_I32, + TRUNCATE_I16_I64, + TRUNCATE_I32_I64); + + +// ============================================================================ +// OPCODE_CONVERT +// ============================================================================ +EMITTER(CONVERT_I32_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) + e.vcvtss2si(i.dest, i.src1); + } +}; +EMITTER(CONVERT_I32_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) + e.vcvttsd2si(i.dest, i.src1); + } +}; +EMITTER(CONVERT_I64_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) + e.vcvttsd2si(i.dest, i.src1); + } +}; +EMITTER(CONVERT_F32_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) + e.vcvtsi2ss(i.dest, i.src1); + } +}; +EMITTER(CONVERT_F32_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) + e.vcvtsd2ss(i.dest, i.src1); + } +}; +EMITTER(CONVERT_F64_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): saturation check? cvtt* (trunc?) + e.vcvtsi2sd(i.dest, i.src1); + } +}; +EMITTER(CONVERT_F64_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vcvtss2sd(i.dest, i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_CONVERT, + CONVERT_I32_F32, + CONVERT_I32_F64, + CONVERT_I64_F64, + CONVERT_F32_I32, + CONVERT_F32_F64, + CONVERT_F64_I64, + CONVERT_F64_F32); + + +// ============================================================================ +// OPCODE_ROUND +// ============================================================================ +EMITTER(ROUND_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case ROUND_TO_ZERO: + e.vroundss(i.dest, i.src1, B00000011); + break; + case ROUND_TO_NEAREST: + e.vroundss(i.dest, i.src1, B00000000); + break; + case ROUND_TO_MINUS_INFINITY: + e.vroundss(i.dest, i.src1, B00000001); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.vroundss(i.dest, i.src1, B00000010); + break; + } + } +}; +EMITTER(ROUND_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case ROUND_TO_ZERO: + e.vroundsd(i.dest, i.src1, B00000011); + break; + case ROUND_TO_NEAREST: + e.vroundsd(i.dest, i.src1, B00000000); + break; + case ROUND_TO_MINUS_INFINITY: + e.vroundsd(i.dest, i.src1, B00000001); + break; + case ROUND_TO_POSITIVE_INFINITY: + e.vroundsd(i.dest, i.src1, B00000010); + break; + } + } +}; +EMITTER(ROUND_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case ROUND_TO_ZERO: + e.vroundps(i.dest, i.src1, B00000011); + break; + case ROUND_TO_NEAREST: + e.vroundps(i.dest, i.src1, B00000000); + break; + case ROUND_TO_MINUS_INFINITY: + e.vroundps(i.dest, i.src1, B00000001); + break; + 
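+ // The vround* immediate's low two bits select the mode directly:
+ // 0 = nearest, 1 = toward -inf (floor), 2 = toward +inf (ceil),
+ // 3 = toward zero (truncate) - hence B00000000..B00000011 in these cases.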
case ROUND_TO_POSITIVE_INFINITY: + e.vroundps(i.dest, i.src1, B00000010); + break; + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_ROUND, + ROUND_F32, + ROUND_F64, + ROUND_V128); + + +// ============================================================================ +// OPCODE_VECTOR_CONVERT_I2F +// ============================================================================ +EMITTER(VECTOR_CONVERT_I2F, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // flags = ARITHMETIC_UNSIGNED + // TODO(benvanik): are these really the same? VC++ thinks so. + e.vcvtdq2ps(i.dest, i.src1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_CONVERT_I2F, + VECTOR_CONVERT_I2F); + + +// ============================================================================ +// OPCODE_VECTOR_CONVERT_F2I +// ============================================================================ +EMITTER(VECTOR_CONVERT_F2I, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // flags = ARITHMETIC_UNSIGNED | ARITHMETIC_UNSIGNED + // TODO(benvanik): are these really the same? VC++ thinks so. + e.vcvttps2dq(i.dest, i.src1); + if (i.instr->flags & ARITHMETIC_SATURATE) { + // TODO(benvanik): check saturation. + // In theory cvt throws if it saturates. + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_CONVERT_F2I, + VECTOR_CONVERT_F2I); + + +// ============================================================================ +// OPCODE_LOAD_VECTOR_SHL +// ============================================================================ +static vec128_t lvsl_table[17] = { + vec128b( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + vec128b( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), + vec128b( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b( 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b( 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b( 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b( 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b( 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b( 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b( 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), +}; +EMITTER(LOAD_VECTOR_SHL_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + auto sh = i.src1.constant(); + XEASSERT(sh < XECOUNT(lvsl_table)); + e.mov(e.rax, (uintptr_t)&lvsl_table[sh]); + e.vmovaps(i.dest, e.ptr[e.rax]); + } else { +#if XE_DEBUG + // We should only ever be getting values in [0,16]. Assert that. + Xbyak::Label skip; + e.cmp(i.src1, 17); + e.jb(skip); + e.Trap(); + e.L(skip); +#endif // XE_DEBUG + // TODO(benvanik): find a cheaper way of doing this. 
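+ // Each vec128_t row is 16 bytes, so scaling the shift amount by 16
+ // (shl 4) yields the byte offset of the matching table entry.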
+ e.movzx(e.rdx, i.src1); + e.shl(e.rdx, 4); + e.mov(e.rax, (uintptr_t)lvsl_table); + e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); + e.ReloadEDX(); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_LOAD_VECTOR_SHL, + LOAD_VECTOR_SHL_I8); + + +// ============================================================================ +// OPCODE_LOAD_VECTOR_SHR +// ============================================================================ +static vec128_t lvsr_table[17] = { + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b( 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b( 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b( 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b( 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b( 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b( 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b( 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), + vec128b( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), +}; +EMITTER(LOAD_VECTOR_SHR_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.src1.is_constant) { + auto sh = i.src1.constant(); + XEASSERT(sh < XECOUNT(lvsr_table)); + e.mov(e.rax, (uintptr_t)&lvsr_table[sh]); + e.vmovaps(i.dest, e.ptr[e.rax]); + } else { +#if XE_DEBUG + // We should only ever be getting values in [0,16]. Assert that. + Xbyak::Label skip; + e.cmp(i.src1, 17); + e.jb(skip); + e.Trap(); + e.L(skip); +#endif // XE_DEBUG + // TODO(benvanik): find a cheaper way of doing this. + e.movzx(e.rdx, i.src1); + e.shl(e.rdx, 4); + e.mov(e.rax, (uintptr_t)lvsr_table); + e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); + e.ReloadEDX(); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_LOAD_VECTOR_SHR, + LOAD_VECTOR_SHR_I8); + + +// ============================================================================ +// OPCODE_LOAD_CLOCK +// ============================================================================ +EMITTER(LOAD_CLOCK, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // It'd be cool to call QueryPerformanceCounter directly, but w/e. + e.CallNative(LoadClock); + e.mov(i.dest, e.rax); + } + static uint64_t LoadClock(void* raw_context) { + LARGE_INTEGER counter; + uint64_t time = 0; + if (QueryPerformanceCounter(&counter)) { + time = counter.QuadPart; + } + return time; + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_LOAD_CLOCK, + LOAD_CLOCK); + + +// ============================================================================ +// OPCODE_LOAD_LOCAL +// ============================================================================ +// Note: all types are always aligned on the stack. 
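+// Local slots live at constant offsets from rsp that were assigned when the
+// function was laid out, so each of these lowers to a plain stack move. A
+// sketch of a 32-bit store/load round trip (the 0x20 slot offset is
+// illustrative only):
+//   mov dword [rsp+0x20], eax   ; STORE_LOCAL
+//   mov eax, dword [rsp+0x20]   ; LOAD_LOCAL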
+EMITTER(LOAD_LOCAL_I8, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, e.byte[e.rsp + i.src1.constant()]); + //e.TraceLoadI8(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +EMITTER(LOAD_LOCAL_I16, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, e.word[e.rsp + i.src1.constant()]); + //e.TraceLoadI16(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +EMITTER(LOAD_LOCAL_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, e.dword[e.rsp + i.src1.constant()]); + //e.TraceLoadI32(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +EMITTER(LOAD_LOCAL_I64, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(i.dest, e.qword[e.rsp + i.src1.constant()]); + //e.TraceLoadI64(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +EMITTER(LOAD_LOCAL_F32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovss(i.dest, e.dword[e.rsp + i.src1.constant()]); + //e.TraceLoadF32(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +EMITTER(LOAD_LOCAL_F64, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovsd(i.dest, e.qword[e.rsp + i.src1.constant()]); + //e.TraceLoadF64(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +EMITTER(LOAD_LOCAL_V128, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vmovaps(i.dest, e.ptr[e.rsp + i.src1.constant()]); + //e.TraceLoadV128(DATA_LOCAL, i.src1.constant, i.dest); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_LOAD_LOCAL, + LOAD_LOCAL_I8, + LOAD_LOCAL_I16, + LOAD_LOCAL_I32, + LOAD_LOCAL_I64, + LOAD_LOCAL_F32, + LOAD_LOCAL_F64, + LOAD_LOCAL_V128); + + +// ============================================================================ +// OPCODE_STORE_LOCAL +// ============================================================================ +// Note: all types are always aligned on the stack. 
+EMITTER(STORE_LOCAL_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + //e.TraceStoreI8(DATA_LOCAL, i.src1.constant, i.src2); + e.mov(e.byte[e.rsp + i.src1.constant()], i.src2); + } +}; +EMITTER(STORE_LOCAL_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + //e.TraceStoreI16(DATA_LOCAL, i.src1.constant, i.src2); + e.mov(e.word[e.rsp + i.src1.constant()], i.src2); + } +}; +EMITTER(STORE_LOCAL_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + //e.TraceStoreI32(DATA_LOCAL, i.src1.constant, i.src2); + e.mov(e.dword[e.rsp + i.src1.constant()], i.src2); + } +}; +EMITTER(STORE_LOCAL_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + //e.TraceStoreI64(DATA_LOCAL, i.src1.constant, i.src2); + e.mov(e.qword[e.rsp + i.src1.constant()], i.src2); + } +}; +EMITTER(STORE_LOCAL_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + //e.TraceStoreF32(DATA_LOCAL, i.src1.constant, i.src2); + e.vmovss(e.dword[e.rsp + i.src1.constant()], i.src2); + } +}; +EMITTER(STORE_LOCAL_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + //e.TraceStoreF64(DATA_LOCAL, i.src1.constant, i.src2); + e.vmovsd(e.qword[e.rsp + i.src1.constant()], i.src2); + } +}; +EMITTER(STORE_LOCAL_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + //e.TraceStoreV128(DATA_LOCAL, i.src1.constant, i.src2); + e.vmovaps(e.ptr[e.rsp + i.src1.constant()], i.src2); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_STORE_LOCAL, + STORE_LOCAL_I8, + STORE_LOCAL_I16, + STORE_LOCAL_I32, + STORE_LOCAL_I64, + STORE_LOCAL_F32, + STORE_LOCAL_F64, + STORE_LOCAL_V128); + + +// ============================================================================ +// OPCODE_LOAD_CONTEXT +// ============================================================================ +// Note: all types are always aligned in the context. 
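+// rcx is assumed to hold the guest context pointer for the lifetime of the
+// emitted function (it is the first Win64 argument register), so every
+// access below reduces to a single [rcx + offset] operand.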
+RegExp ComputeContextAddress(X64Emitter& e, const OffsetOp& offset) { + return e.rcx + offset.value; +} +EMITTER(LOAD_CONTEXT_I8, MATCH(I, OffsetOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.mov(i.dest, e.byte[addr]); + if (IsTracingData()) { + e.mov(e.r8, e.byte[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextLoadI8); + } + } +}; +EMITTER(LOAD_CONTEXT_I16, MATCH(I, OffsetOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.mov(i.dest, e.word[addr]); + if (IsTracingData()) { + e.mov(e.r8, e.word[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextLoadI16); + } + } +}; +EMITTER(LOAD_CONTEXT_I32, MATCH(I, OffsetOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.mov(i.dest, e.dword[addr]); + if (IsTracingData()) { + e.mov(e.r8, e.dword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextLoadI32); + } + } +}; +EMITTER(LOAD_CONTEXT_I64, MATCH(I, OffsetOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.mov(i.dest, e.qword[addr]); + if (IsTracingData()) { + e.mov(e.r8, e.qword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextLoadI64); + } + } +}; +EMITTER(LOAD_CONTEXT_F32, MATCH(I, OffsetOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.vmovss(i.dest, e.dword[addr]); + if (IsTracingData()) { + e.lea(e.r8, e.dword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextLoadF32); + } + } +}; +EMITTER(LOAD_CONTEXT_F64, MATCH(I, OffsetOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.vmovsd(i.dest, e.qword[addr]); + if (IsTracingData()) { + e.lea(e.r8, e.qword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextLoadF64); + } + } +}; +EMITTER(LOAD_CONTEXT_V128, MATCH(I, OffsetOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + e.vmovaps(i.dest, e.ptr[addr]); + if (IsTracingData()) { + e.lea(e.r8, e.ptr[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextLoadV128); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_LOAD_CONTEXT, + LOAD_CONTEXT_I8, + LOAD_CONTEXT_I16, + LOAD_CONTEXT_I32, + LOAD_CONTEXT_I64, + LOAD_CONTEXT_F32, + LOAD_CONTEXT_F64, + LOAD_CONTEXT_V128); + + +// ============================================================================ +// OPCODE_STORE_CONTEXT +// ============================================================================ +// Note: all types are always aligned on the stack. 
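+// Offsets here index into the context, mirroring the loads above. When
+// tracing, the offset is passed in rdx and the stored value (or its
+// address) in r8 - the second and third Win64 argument registers - before
+// invoking the trace shim through CallNative.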
+EMITTER(STORE_CONTEXT_I8, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.byte[addr], i.src2.constant()); + } else { + e.mov(e.byte[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.r8, e.byte[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextStoreI8); + } + } +}; +EMITTER(STORE_CONTEXT_I16, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.word[addr], i.src2.constant()); + } else { + e.mov(e.word[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.r8, e.word[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextStoreI16); + } + } +}; +EMITTER(STORE_CONTEXT_I32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.constant()); + } else { + e.mov(e.dword[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.r8, e.dword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextStoreI32); + } + } +}; +EMITTER(STORE_CONTEXT_I64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.constant()); + } else { + e.mov(e.qword[addr], i.src2); + } + if (IsTracingData()) { + e.mov(e.r8, e.qword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextStoreI64); + } + } +}; +EMITTER(STORE_CONTEXT_F32, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.value->constant.i32); + } else { + e.vmovss(e.dword[addr], i.src2); + } + if (IsTracingData()) { + e.lea(e.r8, e.dword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextStoreF32); + } + } +}; +EMITTER(STORE_CONTEXT_F64, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.value->constant.i64); + } else { + e.vmovsd(e.qword[addr], i.src2); + } + if (IsTracingData()) { + e.lea(e.r8, e.qword[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextStoreF64); + } + } +}; +EMITTER(STORE_CONTEXT_V128, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeContextAddress(e, i.src1); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.vmovaps(e.ptr[addr], e.xmm0); + } else { + e.vmovaps(e.ptr[addr], i.src2); + } + if (IsTracingData()) { + e.lea(e.r8, e.ptr[addr]); + e.mov(e.rdx, i.src1.value); + e.CallNative(TraceContextStoreV128); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_STORE_CONTEXT, + STORE_CONTEXT_I8, + STORE_CONTEXT_I16, + STORE_CONTEXT_I32, + STORE_CONTEXT_I64, + STORE_CONTEXT_F32, + STORE_CONTEXT_F64, + STORE_CONTEXT_V128); + + +// ============================================================================ +// OPCODE_LOAD +// ============================================================================ +// Note: most *should* be aligned, but needs to be checked! +template +RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) { + if (guest.is_constant) { + // TODO(benvanik): figure out how to do this without a temp. + // Since the constant is often 0x8... 
if we tried to use that as a + // displacement it would be sign extended and mess things up. + e.mov(e.eax, static_cast(guest.constant())); + return e.rdx + e.rax; + } else { + // Clear the top 32 bits, as they are likely garbage. + // TODO(benvanik): find a way to avoid doing this. + e.mov(e.eax, guest.reg().cvt32()); + return e.rdx + e.rax; + } +} +EMITTER(LOAD_I8, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(i.dest, e.byte[addr]); + if (IsTracingData()) { + e.mov(e.r8b, i.dest); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadI8); + } + } +}; +EMITTER(LOAD_I16, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(i.dest, e.word[addr]); + if (IsTracingData()) { + e.mov(e.r8w, i.dest); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadI16); + } + } +}; +EMITTER(LOAD_I32, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(i.dest, e.dword[addr]); + if (IsTracingData()) { + e.mov(e.r8d, i.dest); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadI32); + } + } +}; +EMITTER(LOAD_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(i.dest, e.qword[addr]); + if (IsTracingData()) { + e.mov(e.r8, i.dest); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadI64); + } + } +}; +EMITTER(LOAD_F32, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.vmovss(i.dest, e.dword[addr]); + if (IsTracingData()) { + e.lea(e.r8, e.dword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadF32); + } + } +}; +EMITTER(LOAD_F64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.vmovsd(i.dest, e.qword[addr]); + if (IsTracingData()) { + e.lea(e.r8, e.qword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadF64); + } + } +}; +EMITTER(LOAD_V128, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + // TODO(benvanik): we should try to stick to movaps if possible. + e.vmovups(i.dest, e.ptr[addr]); + if (IsTracingData()) { + e.lea(e.r8, e.ptr[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryLoadV128); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_LOAD, + LOAD_I8, + LOAD_I16, + LOAD_I32, + LOAD_I64, + LOAD_F32, + LOAD_F64, + LOAD_V128); + + +// ============================================================================ +// OPCODE_STORE +// ============================================================================ +// Note: most *should* be aligned, but needs to be checked! +void EmitMarkPageDirty(X64Emitter& e, RegExp& addr) { + // 16KB pages. 
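+ // On entry eax still holds the low 32 bits of the guest address computed
+ // by ComputeMemoryAddress. addr >> 14 is the 16KB page index; masking
+ // with 0x7FFF keeps 32768 entries - one dirty byte per page over a 512MB
+ // range. e.g. guest address 0x12345678 -> page (0x12345678 >> 14) &
+ // 0x7FFF = 0x48D1, so page_table[0x48D1] is set to 1.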
+ e.shr(e.eax, 14); + e.and(e.eax, 0x7FFF); + e.mov(e.byte[e.rdx + e.rax + e.page_table_address()], 1); +} +EMITTER(STORE_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.byte[addr], i.src2.constant()); + } else { + e.mov(e.byte[addr], i.src2); + } + EmitMarkPageDirty(e, addr); + if (IsTracingData()) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.r8b, e.byte[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreI8); + } + } +}; +EMITTER(STORE_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.word[addr], i.src2.constant()); + } else { + e.mov(e.word[addr], i.src2); + } + EmitMarkPageDirty(e, addr); + if (IsTracingData()) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.r8w, e.word[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreI16); + } + } +}; +EMITTER(STORE_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.constant()); + } else { + e.mov(e.dword[addr], i.src2); + } + EmitMarkPageDirty(e, addr); + if (IsTracingData()) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.r8d, e.dword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreI32); + } + } +}; +EMITTER(STORE_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.constant()); + } else { + e.mov(e.qword[addr], i.src2); + } + EmitMarkPageDirty(e, addr); + if (IsTracingData()) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.mov(e.r8, e.qword[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreI64); + } + } +}; +EMITTER(STORE_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.mov(e.dword[addr], i.src2.value->constant.i32); + } else { + e.vmovss(e.dword[addr], i.src2); + } + EmitMarkPageDirty(e, addr); + if (IsTracingData()) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.lea(e.r8, e.ptr[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreF32); + } + } +}; +EMITTER(STORE_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.MovMem64(addr, i.src2.value->constant.i64); + } else { + e.vmovsd(e.qword[addr], i.src2); + } + EmitMarkPageDirty(e, addr); + if (IsTracingData()) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.lea(e.r8, e.ptr[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreF64); + } + } +}; +EMITTER(STORE_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.vmovaps(e.ptr[addr], e.xmm0); + } else { + e.vmovaps(e.ptr[addr], i.src2); + } + EmitMarkPageDirty(e, addr); + if (IsTracingData()) { + auto addr = ComputeMemoryAddress(e, i.src1); + e.lea(e.r8, e.ptr[addr]); + e.lea(e.rdx, e.ptr[addr]); + e.CallNative(TraceMemoryStoreV128); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_STORE, + STORE_I8, + STORE_I16, + STORE_I32, + 
STORE_I64, + STORE_F32, + STORE_F64, + STORE_V128); + + +// ============================================================================ +// OPCODE_PREFETCH +// ============================================================================ +EMITTER(PREFETCH, MATCH(I, OffsetOp>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): prefetch addr -> length. + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_PREFETCH, + PREFETCH); + + +// ============================================================================ +// OPCODE_MAX +// ============================================================================ +EMITTER(MAX_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmaxss(dest, src1, src2); + }); + } +}; +EMITTER(MAX_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmaxsd(dest, src1, src2); + }); + } +}; +EMITTER(MAX_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmaxps(dest, src1, src2); + }); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_MAX, + MAX_F32, + MAX_F64, + MAX_V128); + + +// ============================================================================ +// OPCODE_MIN +// ============================================================================ +EMITTER(MIN_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vminss(dest, src1, src2); + }); + } +}; +EMITTER(MIN_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vminsd(dest, src1, src2); + }); + } +}; +EMITTER(MIN_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vminps(dest, src1, src2); + }); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_MIN, + MIN_F32, + MIN_F64, + MIN_V128); + + +// ============================================================================ +// OPCODE_SELECT +// ============================================================================ +// dest = src1 ? 
src2 : src3 +// TODO(benvanik): match compare + select sequences, as often it's something +// like SELECT(VECTOR_COMPARE_SGE(a, b), a, b) +EMITTER(SELECT_I8, MATCH(I, I8<>, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.cmovnz(i.dest.reg().cvt32(), i.src2.reg().cvt32()); + e.cmovz(i.dest.reg().cvt32(), i.src3.reg().cvt32()); + } +}; +EMITTER(SELECT_I16, MATCH(I, I8<>, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.cmovnz(i.dest.reg().cvt32(), i.src2.reg().cvt32()); + e.cmovz(i.dest.reg().cvt32(), i.src3.reg().cvt32()); + } +}; +EMITTER(SELECT_I32, MATCH(I, I8<>, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.cmovnz(i.dest, i.src2); + e.cmovz(i.dest, i.src3); + } +}; +EMITTER(SELECT_I64, MATCH(I, I8<>, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.cmovnz(i.dest, i.src2); + e.cmovz(i.dest, i.src3); + } +}; +EMITTER(SELECT_F32, MATCH(I, I8<>, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): find a shorter sequence. + // xmm0 = src1 != 0 ? 1111... : 0000.... + e.movzx(e.eax, i.src1); + e.vmovd(e.xmm1, e.eax); + e.vxorps(e.xmm0, e.xmm0); + e.vcmpneqss(e.xmm0, e.xmm1); + e.vpand(e.xmm1, e.xmm0, i.src2); + e.vpandn(i.dest, e.xmm0, i.src3); + e.vpor(i.dest, e.xmm1); + } +}; +EMITTER(SELECT_F64, MATCH(I, I8<>, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // xmm0 = src1 != 0 ? 1111... : 0000.... + e.movzx(e.eax, i.src1); + e.vmovd(e.xmm1, e.eax); + e.vxorpd(e.xmm0, e.xmm0); + e.vcmpneqsd(e.xmm0, e.xmm1); + e.vpand(e.xmm1, e.xmm0, i.src2); + e.vpandn(i.dest, e.xmm0, i.src3); + e.vpor(i.dest, e.xmm1); + } +}; +EMITTER(SELECT_V128, MATCH(I, I8<>, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): find a shorter sequence. + // xmm0 = src1 != 0 ? 1111... : 0000.... 
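+ // Branchless blend: dest = (mask & src2) | (~mask & src3), with
+ // vpbroadcastd splatting the condition mask across all four lanes so one
+ // select covers every element.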
+ e.movzx(e.eax, i.src1); + e.vmovd(e.xmm1, e.eax); + e.vpbroadcastd(e.xmm1, e.xmm1); + e.vxorps(e.xmm0, e.xmm0); + e.vcmpneqps(e.xmm0, e.xmm1); + e.vpand(e.xmm1, e.xmm0, i.src2); + e.vpandn(i.dest, e.xmm0, i.src3); + e.vpor(i.dest, e.xmm1); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SELECT, + SELECT_I8, + SELECT_I16, + SELECT_I32, + SELECT_I64, + SELECT_F32, + SELECT_F64, + SELECT_V128); + + +// ============================================================================ +// OPCODE_IS_TRUE +// ============================================================================ +EMITTER(IS_TRUE_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER(IS_TRUE_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER(IS_TRUE_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER(IS_TRUE_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER(IS_TRUE_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER(IS_TRUE_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER(IS_TRUE_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.setnz(i.dest); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_IS_TRUE, + IS_TRUE_I8, + IS_TRUE_I16, + IS_TRUE_I32, + IS_TRUE_I64, + IS_TRUE_F32, + IS_TRUE_F64, + IS_TRUE_V128); + + +// ============================================================================ +// OPCODE_IS_FALSE +// ============================================================================ +EMITTER(IS_FALSE_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER(IS_FALSE_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER(IS_FALSE_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER(IS_FALSE_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.test(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER(IS_FALSE_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER(IS_FALSE_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER(IS_FALSE_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vptest(i.src1, i.src1); + e.setz(i.dest); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_IS_FALSE, + IS_FALSE_I8, + IS_FALSE_I16, + IS_FALSE_I32, + IS_FALSE_I64, + IS_FALSE_F32, + IS_FALSE_F64, + IS_FALSE_V128); + + +// ============================================================================ +// OPCODE_COMPARE_EQ +// ============================================================================ +EMITTER(COMPARE_EQ_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + 
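+ // The first lambda handles the reg,reg form and the second the reg,imm
+ // form; since equality is commutative, the helper is free to swap the
+ // operand order when only the left operand is constant.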
[](X64Emitter& e, const Reg8& src1, const Reg8& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg8& src1, int32_t constant) { e.cmp(src1, constant); }); + e.sete(i.dest); + } +}; +EMITTER(COMPARE_EQ_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg16& src1, const Reg16& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg16& src1, int32_t constant) { e.cmp(src1, constant); }); + e.sete(i.dest); + } +}; +EMITTER(COMPARE_EQ_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg32& src1, const Reg32& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg32& src1, int32_t constant) { e.cmp(src1, constant); }); + e.sete(i.dest); + } +}; +EMITTER(COMPARE_EQ_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg64& src1, const Reg64& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg64& src1, int32_t constant) { e.cmp(src1, constant); }); + e.sete(i.dest); + } +}; +EMITTER(COMPARE_EQ_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vcomiss(i.src1, i.src2); + e.sete(i.dest); + } +}; +EMITTER(COMPARE_EQ_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vcomisd(i.src1, i.src2); + e.sete(i.dest); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_COMPARE_EQ, + COMPARE_EQ_I8, + COMPARE_EQ_I16, + COMPARE_EQ_I32, + COMPARE_EQ_I64, + COMPARE_EQ_F32, + COMPARE_EQ_F64); + + +// ============================================================================ +// OPCODE_COMPARE_NE +// ============================================================================ +EMITTER(COMPARE_NE_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg8& src1, const Reg8& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg8& src1, int32_t constant) { e.cmp(src1, constant); }); + e.setne(i.dest); + } +}; +EMITTER(COMPARE_NE_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg16& src1, const Reg16& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg16& src1, int32_t constant) { e.cmp(src1, constant); }); + e.setne(i.dest); + } +}; +EMITTER(COMPARE_NE_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg32& src1, const Reg32& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg32& src1, int32_t constant) { e.cmp(src1, constant); }); + e.setne(i.dest); + } +}; +EMITTER(COMPARE_NE_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeCompareOp( + e, i, + [](X64Emitter& e, const Reg64& src1, const Reg64& src2) { e.cmp(src1, src2); }, + [](X64Emitter& e, const Reg64& src1, int32_t constant) { e.cmp(src1, constant); }); + e.setne(i.dest); + } +}; +EMITTER(COMPARE_NE_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vcomiss(i.src1, i.src2); + e.setne(i.dest); + } +}; +EMITTER(COMPARE_NE_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vcomisd(i.src1, i.src2); + 
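+ // Note: vcomisd reports an unordered (NaN) result with ZF=1, so these
+ // sete/setne-based sequences treat NaN operands as comparing equal.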
e.setne(i.dest); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_COMPARE_NE, + COMPARE_NE_I8, + COMPARE_NE_I16, + COMPARE_NE_I32, + COMPARE_NE_I64, + COMPARE_NE_F32, + COMPARE_NE_F64); + + +// ============================================================================ +// OPCODE_COMPARE_* +// ============================================================================ +#define EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, type, reg_type) \ + EMITTER(COMPARE_##op##_##type, MATCH(I, type<>, type<>>)) { \ + static void Emit(X64Emitter& e, const EmitArgType& i) { \ + EmitAssociativeCompareOp( \ + e, i, \ + [](X64Emitter& e, const Reg8& dest, const reg_type& src1, const reg_type& src2, bool inverse) { \ + e.cmp(src1, src2); \ + if (!inverse) { e.instr(dest); } else { e.inverse_instr(dest); } \ + }, \ + [](X64Emitter& e, const Reg8& dest, const reg_type& src1, int32_t constant, bool inverse) { \ + e.cmp(src1, constant); \ + if (!inverse) { e.instr(dest); } else { e.inverse_instr(dest); } \ + }); \ + } \ + }; +#define EMITTER_ASSOCIATIVE_COMPARE_XX(op, instr, inverse_instr) \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, I8, Reg8); \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, I16, Reg16); \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, I32, Reg32); \ + EMITTER_ASSOCIATIVE_COMPARE_INT(op, instr, inverse_instr, I64, Reg64); \ + EMITTER_OPCODE_TABLE( \ + OPCODE_COMPARE_##op##, \ + COMPARE_##op##_I8, \ + COMPARE_##op##_I16, \ + COMPARE_##op##_I32, \ + COMPARE_##op##_I64); +EMITTER_ASSOCIATIVE_COMPARE_XX(SLT, setl, setge); +EMITTER_ASSOCIATIVE_COMPARE_XX(SLE, setle, setg); +EMITTER_ASSOCIATIVE_COMPARE_XX(SGT, setg, setle); +EMITTER_ASSOCIATIVE_COMPARE_XX(SGE, setge, setl); +EMITTER_ASSOCIATIVE_COMPARE_XX(ULT, setb, setae); +EMITTER_ASSOCIATIVE_COMPARE_XX(ULE, setbe, seta); +EMITTER_ASSOCIATIVE_COMPARE_XX(UGT, seta, setbe); +EMITTER_ASSOCIATIVE_COMPARE_XX(UGE, setae, setb); + +// http://x86.renejeschke.de/html/file_module_x86_id_288.html +#define EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(op, instr) \ + EMITTER(COMPARE_##op##_F32, MATCH(I, F32<>, F32<>>)) { \ + static void Emit(X64Emitter& e, const EmitArgType& i) { \ + e.vcomiss(i.src1, i.src2); \ + e.instr(i.dest); \ + } \ + }; \ + EMITTER(COMPARE_##op##_F64, MATCH(I, F64<>, F64<>>)) { \ + static void Emit(X64Emitter& e, const EmitArgType& i) { \ + if (i.src1.is_constant) { \ + e.LoadConstantXmm(e.xmm0, i.src1.constant()); \ + e.vcomisd(e.xmm0, i.src2); \ + } else if (i.src2.is_constant) { \ + e.LoadConstantXmm(e.xmm0, i.src2.constant()); \ + e.vcomisd(i.src1, e.xmm0); \ + } else { \ + e.vcomisd(i.src1, i.src2); \ + } \ + e.instr(i.dest); \ + } \ + }; \ + EMITTER_OPCODE_TABLE( \ + OPCODE_COMPARE_##op##_FLT, \ + COMPARE_##op##_F32, \ + COMPARE_##op##_F64); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SLT, setb); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SLE, setbe); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SGT, seta); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(SGE, setae); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(ULT, setb); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(ULE, setbe); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(UGT, seta); +EMITTER_ASSOCIATIVE_COMPARE_FLT_XX(UGE, setae); + + +// ============================================================================ +// OPCODE_DID_CARRY +// ============================================================================ +// TODO(benvanik): salc/setalc +// https://code.google.com/p/corkami/wiki/x86oddities +EMITTER(DID_CARRY_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + 
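+ // LoadEflags is assumed to restore host flags stashed by the preceding
+ // arithmetic op so that setc can observe its carry; a constant source
+ // would have produced no flags at all, hence the assert.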
XEASSERT(!i.src1.is_constant); + e.LoadEflags(); + e.setc(i.dest); + } +}; +EMITTER(DID_CARRY_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.src1.is_constant); + e.LoadEflags(); + e.setc(i.dest); + } +}; +EMITTER(DID_CARRY_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.src1.is_constant); + e.LoadEflags(); + e.setc(i.dest); + } +}; +EMITTER(DID_CARRY_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.src1.is_constant); + e.LoadEflags(); + e.setc(i.dest); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DID_CARRY, + DID_CARRY_I8, + DID_CARRY_I16, + DID_CARRY_I32, + DID_CARRY_I64); + + +// ============================================================================ +// OPCODE_DID_OVERFLOW +// ============================================================================ +EMITTER(DID_OVERFLOW, MATCH(I>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.LoadEflags(); + e.seto(i.dest); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DID_OVERFLOW, + DID_OVERFLOW); + + +// ============================================================================ +// OPCODE_DID_SATURATE +// ============================================================================ +EMITTER(DID_SATURATE, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): implement saturation check (VECTOR_ADD, etc). + e.xor(i.dest, i.dest); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_DID_SATURATE, + DID_SATURATE); + + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_EQ +// ============================================================================ +EMITTER(VECTOR_COMPARE_EQ_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp(e, i, + [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpeqb(dest, src1, src2); + break; + case INT16_TYPE: + e.vpcmpeqw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpcmpeqd(dest, src1, src2); + break; + case FLOAT32_TYPE: + e.vcmpeqps(dest, src1, src2); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_COMPARE_EQ, + VECTOR_COMPARE_EQ_V128); + + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_SGT +// ============================================================================ +EMITTER(VECTOR_COMPARE_SGT_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryXmmOp(e, i, + [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + switch (i.instr->flags) { + case INT8_TYPE: + e.vpcmpgtb(dest, src1, src2); + break; + case INT16_TYPE: + e.vpcmpgtw(dest, src1, src2); + break; + case INT32_TYPE: + e.vpcmpgtd(dest, src1, src2); + break; + case FLOAT32_TYPE: + e.vcmpgtps(dest, src1, src2); + break; + } + }); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_COMPARE_SGT, + VECTOR_COMPARE_SGT_V128); + + +// ============================================================================ +// OPCODE_VECTOR_COMPARE_SGE +// ============================================================================ +EMITTER(VECTOR_COMPARE_SGE_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAssociativeBinaryXmmOp(e, i, + [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + switch (i.instr->flags) { + case 
INT8_TYPE:
+          e.vpcmpeqb(e.xmm0, src1, src2);
+          e.vpcmpgtb(dest, src1, src2);
+          e.vpor(dest, e.xmm0);
+          break;
+        case INT16_TYPE:
+          e.vpcmpeqw(e.xmm0, src1, src2);
+          e.vpcmpgtw(dest, src1, src2);
+          e.vpor(dest, e.xmm0);
+          break;
+        case INT32_TYPE:
+          e.vpcmpeqd(e.xmm0, src1, src2);
+          e.vpcmpgtd(dest, src1, src2);
+          e.vpor(dest, e.xmm0);
+          break;
+        case FLOAT32_TYPE:
+          e.vcmpgeps(dest, src1, src2);
+          break;
+        }
+      });
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_VECTOR_COMPARE_SGE,
+    VECTOR_COMPARE_SGE_V128);
+
+
+// ============================================================================
+// OPCODE_VECTOR_COMPARE_UGT
+// ============================================================================
+EMITTER(VECTOR_COMPARE_UGT_V128, MATCH(I<OPCODE_VECTOR_COMPARE_UGT, V128<>, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    Xbyak::Address sign_addr = e.ptr[e.rax];  // dummy
+    switch (i.instr->flags) {
+      case INT8_TYPE:
+        sign_addr = e.GetXmmConstPtr(XMMSignMaskI8);
+        break;
+      case INT16_TYPE:
+        sign_addr = e.GetXmmConstPtr(XMMSignMaskI16);
+        break;
+      case INT32_TYPE:
+        sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
+        break;
+      case FLOAT32_TYPE:
+        sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
+        break;
+    }
+    // No unsigned vpcmpgt*, so bias both operands by the sign mask first:
+    // (a ^ 0x80..) signed> (b ^ 0x80..)  ==  a unsigned> b.
+    if (i.src1.is_constant) {
+      // TODO(benvanik): make this constant.
+      e.LoadConstantXmm(e.xmm0, i.src1.constant());
+      e.vpxor(e.xmm0, sign_addr);
+    } else {
+      e.vpxor(e.xmm0, i.src1, sign_addr);
+    }
+    if (i.src2.is_constant) {
+      // TODO(benvanik): make this constant.
+      e.LoadConstantXmm(e.xmm1, i.src2.constant());
+      e.vpxor(e.xmm1, sign_addr);
+    } else {
+      e.vpxor(e.xmm1, i.src2, sign_addr);
+    }
+    switch (i.instr->flags) {
+      case INT8_TYPE:
+        e.vpcmpgtb(i.dest, e.xmm0, e.xmm1);
+        break;
+      case INT16_TYPE:
+        e.vpcmpgtw(i.dest, e.xmm0, e.xmm1);
+        break;
+      case INT32_TYPE:
+        e.vpcmpgtd(i.dest, e.xmm0, e.xmm1);
+        break;
+      case FLOAT32_TYPE:
+        e.vcmpgtps(i.dest, e.xmm0, e.xmm1);
+        break;
+    }
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_VECTOR_COMPARE_UGT,
+    VECTOR_COMPARE_UGT_V128);
+
+
+// ============================================================================
+// OPCODE_VECTOR_COMPARE_UGE
+// ============================================================================
+EMITTER(VECTOR_COMPARE_UGE_V128, MATCH(I<OPCODE_VECTOR_COMPARE_UGE, V128<>, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    // Same sign-bias trick as UGT above, plus an equality OR for >=.
+    Xbyak::Address sign_addr = e.ptr[e.rax];  // dummy
+    switch (i.instr->flags) {
+      case INT8_TYPE:
+        sign_addr = e.GetXmmConstPtr(XMMSignMaskI8);
+        break;
+      case INT16_TYPE:
+        sign_addr = e.GetXmmConstPtr(XMMSignMaskI16);
+        break;
+      case INT32_TYPE:
+        sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
+        break;
+      case FLOAT32_TYPE:
+        sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
+        break;
+    }
+    if (i.src1.is_constant) {
+      // TODO(benvanik): make this constant.
+      e.LoadConstantXmm(e.xmm0, i.src1.constant());
+      e.vpxor(e.xmm0, sign_addr);
+    } else {
+      e.vpxor(e.xmm0, i.src1, sign_addr);
+    }
+    if (i.src2.is_constant) {
+      // TODO(benvanik): make this constant.
+      e.LoadConstantXmm(e.xmm1, i.src2.constant());
+      e.vpxor(e.xmm1, sign_addr);
+    } else {
+      e.vpxor(e.xmm1, i.src2, sign_addr);
+    }
+    switch (i.instr->flags) {
+      case INT8_TYPE:
+        e.vpcmpeqb(e.xmm2, e.xmm0, e.xmm1);
+        e.vpcmpgtb(i.dest, e.xmm0, e.xmm1);
+        e.vpor(i.dest, e.xmm2);
+        break;
+      case INT16_TYPE:
+        e.vpcmpeqw(e.xmm2, e.xmm0, e.xmm1);
+        e.vpcmpgtw(i.dest, e.xmm0, e.xmm1);
+        e.vpor(i.dest, e.xmm2);
+        break;
+      case INT32_TYPE:
+        e.vpcmpeqd(e.xmm2, e.xmm0, e.xmm1);
+        e.vpcmpgtd(i.dest, e.xmm0, e.xmm1);
+        e.vpor(i.dest, e.xmm2);
+        break;
+      case FLOAT32_TYPE:
+        e.vcmpgeps(i.dest, e.xmm0, e.xmm1);
+        break;
+    }
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_VECTOR_COMPARE_UGE,
+    VECTOR_COMPARE_UGE_V128);
+
+
+// ============================================================================
+// OPCODE_ADD
+// ============================================================================
+// TODO(benvanik): put dest/src1|2 together.
+template <typename SEQ, typename REG, typename ARGS>
+void EmitAddXX(X64Emitter& e, const ARGS& i) {
+  SEQ::EmitCommutativeBinaryOp(
+      e, i,
+      [](X64Emitter& e, const REG& dest_src, const REG& src) { e.add(dest_src, src); },
+      [](X64Emitter& e, const REG& dest_src, int32_t constant) { e.add(dest_src, constant); });
+  if (i.instr->flags & ARITHMETIC_SET_CARRY) {
+    // CF is set if carried.
+    e.StoreEflags();
+  }
+}
+EMITTER(ADD_I8, MATCH(I<OPCODE_ADD, I8<>, I8<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitAddXX<ADD_I8, Reg8>(e, i);
+  }
+};
+EMITTER(ADD_I16, MATCH(I<OPCODE_ADD, I16<>, I16<>, I16<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitAddXX<ADD_I16, Reg16>(e, i);
+  }
+};
+EMITTER(ADD_I32, MATCH(I<OPCODE_ADD, I32<>, I32<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitAddXX<ADD_I32, Reg32>(e, i);
+  }
+};
+EMITTER(ADD_I64, MATCH(I<OPCODE_ADD, I64<>, I64<>, I64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitAddXX<ADD_I64, Reg64>(e, i);
+  }
+};
+EMITTER(ADD_F32, MATCH(I<OPCODE_ADD, F32<>, F32<>, F32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitCommutativeBinaryXmmOp(e, i,
+        [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
+          e.vaddss(dest, src1, src2);
+        });
+  }
+};
+EMITTER(ADD_F64, MATCH(I<OPCODE_ADD, F64<>, F64<>, F64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitCommutativeBinaryXmmOp(e, i,
+        [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
+          e.vaddsd(dest, src1, src2);
+        });
+  }
+};
+EMITTER(ADD_V128, MATCH(I<OPCODE_ADD, V128<>, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitCommutativeBinaryXmmOp(e, i,
+        [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
+          e.vaddps(dest, src1, src2);
+        });
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_ADD,
+    ADD_I8,
+    ADD_I16,
+    ADD_I32,
+    ADD_I64,
+    ADD_F32,
+    ADD_F64,
+    ADD_V128);
+
+
+// ============================================================================
+// OPCODE_ADD_CARRY
+// ============================================================================
+// TODO(benvanik): put dest/src1|2 together.
+template <typename SEQ, typename REG, typename ARGS>
+void EmitAddCarryXX(X64Emitter& e, const ARGS& i) {
+  // TODO(benvanik): faster setting? we could probably do some fun math tricks
+  // here to get the carry flag set.
+  if (i.src3.is_constant) {
+    if (i.src3.constant()) {
+      e.stc();
+    } else {
+      e.clc();
+    }
+  } else {
+    if (i.src3.reg().getIdx() <= 4) {
+      // Can move from A/B/C/DX to AH.
+      e.mov(e.ah, i.src3.reg().cvt8());
+    } else {
+      e.mov(e.al, i.src3);
+      e.mov(e.ah, e.al);
+    }
+    e.sahf();
+  }
+  if (i.src1.is_constant && i.src2.is_constant) {
+    auto ab = i.src1.constant() + i.src2.constant();
+    if (!ab) {
+      // mov rather than the usual xor trick: xor would clear the CF that was
+      // just staged above, before the adc consumes it.
+      e.mov(i.dest, 0);
+    } else {
+      e.mov(i.dest, ab);
+    }
+    e.adc(i.dest, 0);
+  } else {
+    SEQ::EmitCommutativeBinaryOp(
+        e, i, [](X64Emitter& e, const REG& dest_src, const REG& src) {
+          e.adc(dest_src, src);
+        }, [](X64Emitter& e, const REG& dest_src, int32_t constant) {
+          e.adc(dest_src, constant);
+        });
+  }
+  if (i.instr->flags & ARITHMETIC_SET_CARRY) {
+    // CF is set if carried.
+    e.StoreEflags();
+  }
+}
+EMITTER(ADD_CARRY_I8, MATCH(I<OPCODE_ADD_CARRY, I8<>, I8<>, I8<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitAddCarryXX<ADD_CARRY_I8, Reg8>(e, i);
+  }
+};
+EMITTER(ADD_CARRY_I16, MATCH(I<OPCODE_ADD_CARRY, I16<>, I16<>, I16<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitAddCarryXX<ADD_CARRY_I16, Reg16>(e, i);
+  }
+};
+EMITTER(ADD_CARRY_I32, MATCH(I<OPCODE_ADD_CARRY, I32<>, I32<>, I32<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitAddCarryXX<ADD_CARRY_I32, Reg32>(e, i);
+  }
+};
+EMITTER(ADD_CARRY_I64, MATCH(I<OPCODE_ADD_CARRY, I64<>, I64<>, I64<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitAddCarryXX<ADD_CARRY_I64, Reg64>(e, i);
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_ADD_CARRY,
+    ADD_CARRY_I8,
+    ADD_CARRY_I16,
+    ADD_CARRY_I32,
+    ADD_CARRY_I64);
+
+
+// ============================================================================
+// OPCODE_VECTOR_ADD
+// ============================================================================
+EMITTER(VECTOR_ADD, MATCH(I<OPCODE_VECTOR_ADD, V128<>, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitCommutativeBinaryXmmOp(e, i,
+        [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) {
+          const TypeName part_type = static_cast<TypeName>(i.instr->flags & 0xFF);
+          const uint32_t arithmetic_flags = i.instr->flags >> 8;
+          bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED);
+          bool saturate = !!(arithmetic_flags & ARITHMETIC_SATURATE);
+          switch (part_type) {
+            case INT8_TYPE:
+              if (saturate) {
+                // TODO(benvanik): trace DID_SATURATE
+                if (is_unsigned) {
+                  e.vpaddusb(dest, src1, src2);
+                } else {
+                  e.vpaddsb(dest, src1, src2);
+                }
+              } else {
+                e.vpaddb(dest, src1, src2);
+              }
+              break;
+            case INT16_TYPE:
+              if (saturate) {
+                // TODO(benvanik): trace DID_SATURATE
+                if (is_unsigned) {
+                  e.vpaddusw(dest, src1, src2);
+                } else {
+                  e.vpaddsw(dest, src1, src2);
+                }
+              } else {
+                e.vpaddw(dest, src1, src2);
+              }
+              break;
+            case INT32_TYPE:
+              if (saturate) {
+                if (is_unsigned) {
+                  // We reuse all these temps...
+                  XEASSERT(src1 != e.xmm0 && src1 != e.xmm1 && src1 != e.xmm2);
+                  XEASSERT(src2 != e.xmm0 && src2 != e.xmm1 && src2 != e.xmm2);
+                  // Clamp to 0xFFFFFFFF.
+                  // Wish there was a vpaddusd...
+                  // | A | B | C | D |
+                  // |     B |     D |
+                  e.vpsllq(e.xmm0, src1, 32);
+                  e.vpsllq(e.xmm1, src2, 32);
+                  e.vpsrlq(e.xmm0, 32);
+                  e.vpsrlq(e.xmm1, 32);
+                  e.vpaddq(e.xmm0, e.xmm1);
+                  e.vpcmpgtq(e.xmm0, e.GetXmmConstPtr(XMMUnsignedDwordMax));
+                  e.vpsllq(e.xmm0, 32);
+                  e.vpsrlq(e.xmm0, 32);
+                  // |     A |     C |
+                  e.vpsrlq(e.xmm1, src1, 32);
+                  e.vpsrlq(e.xmm2, src2, 32);
+                  e.vpaddq(e.xmm1, e.xmm2);
+                  e.vpcmpgtq(e.xmm1, e.GetXmmConstPtr(XMMUnsignedDwordMax));
+                  e.vpsllq(e.xmm1, 32);
+                  // xmm0 = mask, with saturated dwords == 111...
+                  e.vpor(e.xmm0, e.xmm1);
+                  e.vpaddd(dest, src1, src2);
+                  // dest.f[n] = xmm0.f[n] ? 0xFFFFFFFF : dest.f[n];
+                  e.vblendvps(dest, dest, e.xmm0, e.xmm0);
+                } else {
+                  XEASSERTALWAYS();
+                }
+              } else {
+                e.vpaddd(dest, src1, src2);
+              }
+              break;
+            case FLOAT32_TYPE:
+              e.vaddps(dest, src1, src2);
+              break;
+            default: XEASSERTALWAYS(); break;
+          }
+        });
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_VECTOR_ADD,
+    VECTOR_ADD);
+
+
+// ============================================================================
+// OPCODE_SUB
+// ============================================================================
+// TODO(benvanik): put dest/src1|2 together.
+template <typename SEQ, typename REG, typename ARGS>
+void EmitSubXX(X64Emitter& e, const ARGS& i) {
+  if (i.instr->flags & ARITHMETIC_SET_CARRY) {
+    // TODO(benvanik): faster way of doing sub with CF set?
+    SEQ::EmitAssociativeBinaryOp(
+        e, i,
+        [](X64Emitter& e, const REG& dest_src, const REG& src) {
+          auto temp = GetTempReg<REG>(e);
+          e.mov(temp, src);
+          e.not(temp);
+          e.stc();
+          e.adc(dest_src, temp);
+        },
+        [](X64Emitter& e, const REG& dest_src, int32_t constant) {
+          auto temp = GetTempReg<REG>(e);
+          e.mov(temp, constant);
+          e.not(temp);
+          e.stc();
+          e.adc(dest_src, temp);
+        });
+    e.StoreEflags();
+  } else {
+    SEQ::EmitAssociativeBinaryOp(
+        e, i,
+        [](X64Emitter& e, const REG& dest_src, const REG& src) { e.sub(dest_src, src); },
+        [](X64Emitter& e, const REG& dest_src, int32_t constant) { e.sub(dest_src, constant); });
+  }
+}
+EMITTER(SUB_I8, MATCH(I<OPCODE_SUB, I8<>, I8<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitSubXX<SUB_I8, Reg8>(e, i);
+  }
+};
+EMITTER(SUB_I16, MATCH(I<OPCODE_SUB, I16<>, I16<>, I16<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitSubXX<SUB_I16, Reg16>(e, i);
+  }
+};
+EMITTER(SUB_I32, MATCH(I<OPCODE_SUB, I32<>, I32<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitSubXX<SUB_I32, Reg32>(e, i);
+  }
+};
+EMITTER(SUB_I64, MATCH(I<OPCODE_SUB, I64<>, I64<>, I64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitSubXX<SUB_I64, Reg64>(e, i);
+  }
+};
+EMITTER(SUB_F32, MATCH(I<OPCODE_SUB, F32<>, F32<>, F32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    XEASSERT(!i.instr->flags);
+    EmitAssociativeBinaryXmmOp(e, i,
+        [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
+          e.vsubss(dest, src1, src2);
+        });
+  }
+};
+EMITTER(SUB_F64, MATCH(I<OPCODE_SUB, F64<>, F64<>, F64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    XEASSERT(!i.instr->flags);
+    EmitAssociativeBinaryXmmOp(e, i,
+        [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
+          e.vsubsd(dest, src1, src2);
+        });
+  }
+};
+EMITTER(SUB_V128, MATCH(I<OPCODE_SUB, V128<>, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    XEASSERT(!i.instr->flags);
+    EmitAssociativeBinaryXmmOp(e, i,
+        [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) {
+          e.vsubps(dest, src1, src2);
+        });
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_SUB,
+    SUB_I8,
+    SUB_I16,
+    SUB_I32,
+    SUB_I64,
+    SUB_F32,
+    SUB_F64,
+    SUB_V128);
+
+
+// ============================================================================
+// OPCODE_MUL
+// ============================================================================
+// Sign doesn't matter here, as we don't use the high bits.
+// We exploit mulx here to avoid creating too much register pressure.
+EMITTER(MUL_I8, MATCH(I<OPCODE_MUL, I8<>, I8<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    // dest hi, dest low = src * edx
+    // TODO(benvanik): place src2 in edx?
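+    // Note on mulx (BMI2) as used below: mulx hi, lo, src computes
+    // hi:lo = edx/rdx * src without touching EFLAGS and with freely chosen
+    // output registers, which is why these sequences stage one operand in
+    // edx and never save/restore flags. Roughly, for the 32-bit case:
+    //   mov edx, src2; mulx edx, dest, src1  =>  dest = low32(src1 * src2)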
+ if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + e.movzx(e.edx, i.src2); + e.mov(e.eax, static_cast(i.src1.constant())); + e.mulx(e.edx, i.dest.reg().cvt32(), e.eax); + } else if (i.src2.is_constant) { + e.movzx(e.edx, i.src1); + e.mov(e.eax, static_cast(i.src2.constant())); + e.mulx(e.edx, i.dest.reg().cvt32(), e.eax); + } else { + e.movzx(e.edx, i.src2); + e.mulx(e.edx, i.dest.reg().cvt32(), i.src1.reg().cvt32()); + } + } +}; +EMITTER(MUL_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // dest hi, dest low = src * edx + // TODO(benvanik): place src2 in edx? + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + e.movzx(e.edx, i.src2); + e.mov(e.ax, static_cast(i.src1.constant())); + e.mulx(e.edx, i.dest.reg().cvt32(), e.eax); + } else if (i.src2.is_constant) { + e.movzx(e.edx, i.src1); + e.mov(e.ax, static_cast(i.src2.constant())); + e.mulx(e.edx, i.dest.reg().cvt32(), e.eax); + } else { + e.movzx(e.edx, i.src2); + e.mulx(e.edx, i.dest.reg().cvt32(), i.src1.reg().cvt32()); + } + e.ReloadEDX(); + } +}; +EMITTER(MUL_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // dest hi, dest low = src * edx + // TODO(benvanik): place src2 in edx? + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + e.mov(e.edx, i.src2); + e.mov(e.eax, i.src1.constant()); + e.mulx(e.edx, i.dest, e.eax); + } else if (i.src2.is_constant) { + e.mov(e.edx, i.src1); + e.mov(e.eax, i.src2.constant()); + e.mulx(e.edx, i.dest, e.eax); + } else { + e.mov(e.edx, i.src2); + e.mulx(e.edx, i.dest, i.src1); + } + e.ReloadEDX(); + } +}; +EMITTER(MUL_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // dest hi, dest low = src * rdx + // TODO(benvanik): place src2 in edx? + if (i.src1.is_constant) { + XEASSERT(!i.src2.is_constant); + e.mov(e.rdx, i.src2); + e.mov(e.rax, i.src1.constant()); + e.mulx(e.rdx, i.dest, e.rax); + } else if (i.src2.is_constant) { + e.mov(e.rdx, i.src1); + e.mov(e.rax, i.src2.constant()); + e.mulx(e.rdx, i.dest, e.rax); + } else { + e.mov(e.rdx, i.src2); + e.mulx(e.rdx, i.dest, i.src1); + } + e.ReloadEDX(); + } +}; +EMITTER(MUL_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulss(dest, src1, src2); + }); + } +}; +EMITTER(MUL_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulsd(dest, src1, src2); + }); + } +}; +EMITTER(MUL_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulps(dest, src1, src2); + }); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_MUL, + MUL_I8, + MUL_I16, + MUL_I32, + MUL_I64, + MUL_F32, + MUL_F64, + MUL_V128); + + +// ============================================================================ +// OPCODE_MUL_HI +// ============================================================================ +EMITTER(MUL_HI_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + // TODO(benvanik): place src1 in eax? 
still need to sign extend + e.movzx(e.edx, i.src1); + e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32()); + } else { + e.mov(e.al, i.src1); + if (i.src2.is_constant) { + e.mov(e.al, i.src2.constant()); + e.imul(e.al); + } else { + e.imul(i.src2); + } + e.mov(i.dest, e.ah); + } + e.ReloadEDX(); + } +}; +EMITTER(MUL_HI_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + // TODO(benvanik): place src1 in eax? still need to sign extend + e.movzx(e.edx, i.src1); + e.mulx(i.dest.reg().cvt32(), e.eax, i.src2.reg().cvt32()); + } else { + e.mov(e.ax, i.src1); + if (i.src2.is_constant) { + e.mov(e.dx, i.src2.constant()); + e.imul(e.dx); + } else { + e.imul(i.src2); + } + e.mov(i.dest, e.dx); + } + e.ReloadEDX(); + } +}; +EMITTER(MUL_HI_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + // TODO(benvanik): place src1 in eax? still need to sign extend + e.mov(e.edx, i.src1); + if (i.src2.is_constant) { + e.mov(e.eax, i.src2.constant()); + e.mulx(i.dest, e.edx, e.eax); + } else { + e.mulx(i.dest, e.edx, i.src2); + } + } else { + e.mov(e.eax, i.src1); + if (i.src2.is_constant) { + e.mov(e.edx, i.src2.constant()); + e.imul(e.edx); + } else { + e.imul(i.src2); + } + e.mov(i.dest, e.edx); + } + e.ReloadEDX(); + } +}; +EMITTER(MUL_HI_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + // TODO(benvanik): place src1 in eax? still need to sign extend + e.mov(e.rdx, i.src1); + if (i.src2.is_constant) { + e.mov(e.rax, i.src2.constant()); + e.mulx(i.dest, e.rdx, e.rax); + } else { + e.mulx(i.dest, e.rax, i.src2); + } + } else { + e.mov(e.rax, i.src1); + if (i.src2.is_constant) { + e.mov(e.rdx, i.src2.constant()); + e.imul(e.rdx); + } else { + e.imul(i.src2); + } + e.mov(i.dest, e.rdx); + } + e.ReloadEDX(); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_MUL_HI, + MUL_HI_I8, + MUL_HI_I16, + MUL_HI_I32, + MUL_HI_I64); + + +// ============================================================================ +// OPCODE_DIV +// ============================================================================ +// TODO(benvanik): optimize common constant cases. +// TODO(benvanik): simplify code! +EMITTER(DIV_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // NOTE: RDX clobbered. + bool clobbered_rcx = false; + if (i.src2.is_constant) { + XEASSERT(!i.src1.is_constant); + clobbered_rcx = true; + e.mov(e.cl, i.src2.constant()); + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.movzx(e.ax, i.src1); + e.div(e.cl); + } else { + e.movsx(e.ax, i.src1); + e.idiv(e.cl); + } + } else { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + e.mov(e.ax, static_cast(i.src1.constant())); + } else { + e.movzx(e.ax, i.src1); + } + e.div(i.src2); + } else { + if (i.src1.is_constant) { + e.mov(e.ax, static_cast(i.src1.constant())); + } else { + e.movsx(e.ax, i.src1); + } + e.idiv(i.src2); + } + } + e.mov(i.dest, e.al); + if (clobbered_rcx) { + e.ReloadECX(); + } + e.ReloadEDX(); + } +}; +EMITTER(DIV_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // NOTE: RDX clobbered. 
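+    // Background for these div/idiv sequences: x86 divides the double-width
+    // pair dx:ax (edx:eax, rdx:rax) by the operand, leaving the quotient in
+    // ax and the remainder in dx - hence the RDX clobber noted above.
+    // Unsigned paths zero the upper half; signed paths sign-extend src1
+    // into it, e.g. eax = -7 -> edx = eax >> 31 = 0xFFFFFFFF, then
+    // idiv by 2 -> eax = -3 (truncation toward zero).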
+ bool clobbered_rcx = false; + if (i.src2.is_constant) { + XEASSERT(!i.src1.is_constant); + clobbered_rcx = true; + e.mov(e.cx, i.src2.constant()); + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.mov(e.ax, i.src1); + // Zero upper bits. + e.xor(e.dx, e.dx); + e.div(e.cx); + } else { + e.mov(e.ax, i.src1); + // Set dx to sign bit of src1 (dx:ax = dx:ax / src). + e.mov(e.dx, e.ax); + e.sar(e.dx, 15); + e.idiv(e.cx); + } + } else { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + e.mov(e.ax, i.src1.constant()); + } else { + e.mov(e.ax, i.src1); + } + // Zero upper bits. + e.xor(e.dx, e.dx); + e.div(i.src2); + } else { + if (i.src1.is_constant) { + e.mov(e.ax, i.src1.constant()); + } else { + e.mov(e.ax, i.src1); + } + // Set dx to sign bit of src1 (dx:ax = dx:ax / src). + e.mov(e.dx, e.ax); + e.sar(e.dx, 15); + e.idiv(i.src2); + } + } + e.mov(i.dest, e.ax); + if (clobbered_rcx) { + e.ReloadECX(); + } + e.ReloadEDX(); + } +}; +EMITTER(DIV_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // NOTE: RDX clobbered. + bool clobbered_rcx = false; + if (i.src2.is_constant) { + XEASSERT(!i.src1.is_constant); + clobbered_rcx = true; + e.mov(e.ecx, i.src2.constant()); + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.mov(e.eax, i.src1); + // Zero upper bits. + e.xor(e.edx, e.edx); + e.div(e.ecx); + } else { + e.mov(e.eax, i.src1); + // Set dx to sign bit of src1 (dx:ax = dx:ax / src). + e.mov(e.edx, e.eax); + e.sar(e.edx, 31); + e.idiv(e.ecx); + } + } else { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + e.mov(e.eax, i.src1.constant()); + } else { + e.mov(e.eax, i.src1); + } + // Zero upper bits. + e.xor(e.edx, e.edx); + e.div(i.src2); + } else { + if (i.src1.is_constant) { + e.mov(e.eax, i.src1.constant()); + } else { + e.mov(e.eax, i.src1); + } + // Set dx to sign bit of src1 (dx:ax = dx:ax / src). + e.mov(e.edx, e.eax); + e.sar(e.edx, 31); + e.idiv(i.src2); + } + } + e.mov(i.dest, e.eax); + if (clobbered_rcx) { + e.ReloadECX(); + } + e.ReloadEDX(); + } +}; +EMITTER(DIV_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // NOTE: RDX clobbered. + bool clobbered_rcx = false; + if (i.src2.is_constant) { + XEASSERT(!i.src1.is_constant); + clobbered_rcx = true; + e.mov(e.rcx, i.src2.constant()); + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + e.mov(e.rax, i.src1); + // Zero upper bits. + e.xor(e.rdx, e.rdx); + e.div(e.rcx); + } else { + e.mov(e.rax, i.src1); + // Set dx to sign bit of src1 (dx:ax = dx:ax / src). + e.mov(e.rdx, e.rax); + e.sar(e.rdx, 63); + e.idiv(e.rcx); + } + } else { + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + if (i.src1.is_constant) { + e.mov(e.rax, i.src1.constant()); + } else { + e.mov(e.rax, i.src1); + } + // Zero upper bits. + e.xor(e.rdx, e.rdx); + e.div(i.src2); + } else { + if (i.src1.is_constant) { + e.mov(e.rax, i.src1.constant()); + } else { + e.mov(e.rax, i.src1); + } + // Set dx to sign bit of src1 (dx:ax = dx:ax / src). 
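+        // sar by width-1 smears the sign bit across the register (all zeros
+        // or all ones), i.e. sign extension into rdx; cdq/cqo would do this
+        // in one instruction, the mov+sar form just stays uniform across
+        // the 16/32/64-bit variants.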
+ e.mov(e.rdx, e.rax); + e.sar(e.rdx, 63); + e.idiv(i.src2); + } + } + e.mov(i.dest, e.rax); + if (clobbered_rcx) { + e.ReloadECX(); + } + e.ReloadEDX(); + } +}; +EMITTER(DIV_F32, MATCH(I, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + EmitAssociativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vdivss(dest, src1, src2); + }); + } +}; +EMITTER(DIV_F64, MATCH(I, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + EmitAssociativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vdivsd(dest, src1, src2); + }); + } +}; +EMITTER(DIV_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + EmitAssociativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vdivps(dest, src1, src2); + }); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DIV, + DIV_I8, + DIV_I16, + DIV_I32, + DIV_I64, + DIV_F32, + DIV_F64, + DIV_V128); + + +// ============================================================================ +// OPCODE_MUL_ADD +// ============================================================================ +// d = 1 * 2 + 3 +// $0 = $1x$0 + $2 +// TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. +// dest could be src2 or src3 - need to ensure it's not before overwriting dest +// perhaps use other 132/213/etc +EMITTER(MUL_ADD_F32, MATCH(I, F32<>, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.dest == i.src1) { + e.vfmadd213ss(i.dest, i.src2, i.src3); + } else { + if (i.dest != i.src2 && i.dest != i.src3) { + e.vmovss(i.dest, i.src1); + e.vfmadd213ss(i.dest, i.src2, i.src3); + } else { + e.vmovss(e.xmm0, i.src1); + e.vfmadd213ss(e.xmm0, i.src2, i.src3); + e.vmovss(i.dest, e.xmm0); + } + } + } +}; +EMITTER(MUL_ADD_F64, MATCH(I, F64<>, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.dest == i.src1) { + e.vfmadd213sd(i.dest, i.src2, i.src3); + } else { + if (i.dest != i.src2 && i.dest != i.src3) { + e.vmovsd(i.dest, i.src1); + e.vfmadd213sd(i.dest, i.src2, i.src3); + } else { + e.vmovsd(e.xmm0, i.src1); + e.vfmadd213sd(e.xmm0, i.src2, i.src3); + e.vmovsd(i.dest, e.xmm0); + } + } + } +}; +EMITTER(MUL_ADD_V128, MATCH(I, V128<>, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.dest == i.src1) { + e.vfmadd213ps(i.dest, i.src2, i.src3); + } else { + if (i.dest != i.src2 && i.dest != i.src3) { + e.vmovdqa(i.dest, i.src1); + e.vfmadd213ps(i.dest, i.src2, i.src3); + } else { + e.vmovdqa(e.xmm0, i.src1); + e.vfmadd213ps(e.xmm0, i.src2, i.src3); + e.vmovdqa(i.dest, e.xmm0); + } + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_MUL_ADD, + MUL_ADD_F32, + MUL_ADD_F64, + MUL_ADD_V128); + + +// ============================================================================ +// OPCODE_MUL_SUB +// ============================================================================ +// d = 1 * 2 - 3 +// $0 = $2x$0 - $3 +// TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. 
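+// For reference, the FMA operand-order naming used in these sequences:
+// vfmadd213ss a, b, c computes a = (b * a) + c, the 231 form computes
+// a = (b * c) + a, and the 132 form computes a = (a * c) + b; the digits
+// say which operands are multiplied and which is added, with the first
+// operand always receiving the result.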
+// dest could be src2 or src3 - need to ensure it's not before overwriting dest +// perhaps use other 132/213/etc +EMITTER(MUL_SUB_F32, MATCH(I, F32<>, F32<>, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.dest == i.src1) { + e.vfmsub213ss(i.dest, i.src2, i.src3); + } else { + if (i.dest != i.src2 && i.dest != i.src3) { + e.vmovss(i.dest, i.src1); + e.vfmsub213ss(i.dest, i.src2, i.src3); + } else { + e.vmovss(e.xmm0, i.src1); + e.vfmsub213ss(e.xmm0, i.src2, i.src3); + e.vmovss(i.dest, e.xmm0); + } + } + } +}; +EMITTER(MUL_SUB_F64, MATCH(I, F64<>, F64<>, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.dest == i.src1) { + e.vfmsub213sd(i.dest, i.src2, i.src3); + } else { + if (i.dest != i.src2 && i.dest != i.src3) { + e.vmovsd(i.dest, i.src1); + e.vfmsub213sd(i.dest, i.src2, i.src3); + } else { + e.vmovsd(e.xmm0, i.src1); + e.vfmsub213sd(e.xmm0, i.src2, i.src3); + e.vmovsd(i.dest, e.xmm0); + } + } + } +}; +EMITTER(MUL_SUB_V128, MATCH(I, V128<>, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + if (i.dest == i.src1) { + e.vfmsub213ps(i.dest, i.src2, i.src3); + } else { + if (i.dest != i.src2 && i.dest != i.src3) { + e.vmovdqa(i.dest, i.src1); + e.vfmsub213ps(i.dest, i.src2, i.src3); + } else { + e.vmovdqa(e.xmm0, i.src1); + e.vfmsub213ps(e.xmm0, i.src2, i.src3); + e.vmovdqa(i.dest, e.xmm0); + } + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_MUL_SUB, + MUL_SUB_F32, + MUL_SUB_F64, + MUL_SUB_V128); + + +// ============================================================================ +// OPCODE_NEG +// ============================================================================ +// TODO(benvanik): put dest/src1 together. +template +void EmitNegXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitUnaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src) { e.neg(dest_src); }); +} +EMITTER(NEG_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitNegXX(e, i); + } +}; +EMITTER(NEG_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitNegXX(e, i); + } +}; +EMITTER(NEG_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitNegXX(e, i); + } +}; +EMITTER(NEG_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitNegXX(e, i); + } +}; +EMITTER(NEG_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPS)); + } +}; +EMITTER(NEG_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vxorpd(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPD)); + } +}; +EMITTER(NEG_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERT(!i.instr->flags); + e.vxorps(i.dest, i.src1, e.GetXmmConstPtr(XMMSignMaskPS)); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_NEG, + NEG_I8, + NEG_I16, + NEG_I32, + NEG_I64, + NEG_F32, + NEG_F64, + NEG_V128); + + +// ============================================================================ +// OPCODE_ABS +// ============================================================================ +EMITTER(ABS_F32, MATCH(I, F32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS)); + } +}; +EMITTER(ABS_F64, MATCH(I, F64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPD)); + } +}; +EMITTER(ABS_V128, MATCH(I, 
V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vpand(i.dest, i.src1, e.GetXmmConstPtr(XMMAbsMaskPS));
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_ABS,
+    ABS_F32,
+    ABS_F64,
+    ABS_V128);
+
+
+// ============================================================================
+// OPCODE_SQRT
+// ============================================================================
+EMITTER(SQRT_F32, MATCH(I<OPCODE_SQRT, F32<>, F32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vsqrtss(i.dest, i.src1);
+  }
+};
+EMITTER(SQRT_F64, MATCH(I<OPCODE_SQRT, F64<>, F64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vsqrtsd(i.dest, i.src1);
+  }
+};
+EMITTER(SQRT_V128, MATCH(I<OPCODE_SQRT, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vsqrtps(i.dest, i.src1);
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_SQRT,
+    SQRT_F32,
+    SQRT_F64,
+    SQRT_V128);
+
+
+// ============================================================================
+// OPCODE_RSQRT
+// ============================================================================
+EMITTER(RSQRT_F32, MATCH(I<OPCODE_RSQRT, F32<>, F32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vrsqrtss(i.dest, i.src1);
+  }
+};
+EMITTER(RSQRT_F64, MATCH(I<OPCODE_RSQRT, F64<>, F64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    // No double-precision rsqrt, so round-trip through single precision.
+    e.vcvtsd2ss(i.dest, i.src1);
+    e.vrsqrtss(i.dest, i.dest);
+    e.vcvtss2sd(i.dest, i.dest);
+  }
+};
+EMITTER(RSQRT_V128, MATCH(I<OPCODE_RSQRT, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.vrsqrtps(i.dest, i.src1);
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_RSQRT,
+    RSQRT_F32,
+    RSQRT_F64,
+    RSQRT_V128);
+
+
+// ============================================================================
+// OPCODE_POW2
+// ============================================================================
+// TODO(benvanik): use approx here:
+// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
+EMITTER(POW2_F32, MATCH(I<OPCODE_POW2, F32<>, F32<>>)) {
+  static __m128 EmulatePow2(__m128 src) {
+    float result = static_cast<float>(pow(2, src.m128_f32[0]));
+    return _mm_load_ss(&result);
+  }
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    XEASSERTALWAYS();
+    e.lea(e.r8, e.StashXmm(i.src1));
+    e.CallNativeSafe(EmulatePow2);
+    e.vmovaps(i.dest, e.xmm0);
+  }
+};
+EMITTER(POW2_F64, MATCH(I<OPCODE_POW2, F64<>, F64<>>)) {
+  static __m128d EmulatePow2(__m128d src) {
+    double result = pow(2, src.m128d_f64[0]);
+    return _mm_load_sd(&result);
+  }
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    XEASSERTALWAYS();
+    e.lea(e.r8, e.StashXmm(i.src1));
+    e.CallNativeSafe(EmulatePow2);
+    e.vmovaps(i.dest, e.xmm0);
+  }
+};
+EMITTER(POW2_V128, MATCH(I<OPCODE_POW2, V128<>, V128<>>)) {
+  static __m128 EmulatePow2(__m128 src) {
+    __m128 result;
+    for (size_t i = 0; i < 4; ++i) {
+      result.m128_f32[i] = static_cast<float>(pow(2, src.m128_f32[i]));
+    }
+    return result;
+  }
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.lea(e.r8, e.StashXmm(i.src1));
+    e.CallNativeSafe(EmulatePow2);
+    e.vmovaps(i.dest, e.xmm0);
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_POW2,
+    POW2_F32,
+    POW2_F64,
+    POW2_V128);
+
+
+// ============================================================================
+// OPCODE_LOG2
+// ============================================================================
+// TODO(benvanik): use approx here:
+// http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
+// TODO(benvanik): this emulated fn destroys all xmm registers! don't do it!
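+// How these emulated fallbacks work, roughly: StashXmm spills the source
+// vector to a scratch slot, lea materializes its address in r8 (an argument
+// register), CallNativeSafe presumably saves/restores enough JIT state to
+// call the host helper, and the result comes back in xmm0 per the native
+// ABI, copied out with vmovaps. Correct but slow - hence the approx TODOs.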
+EMITTER(LOG2_F32, MATCH(I, F32<>>)) { + static __m128 EmulateLog2(__m128 src) { + float result = log2(src.m128_f32[0]); + return _mm_load_ss(&result); + } + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + e.lea(e.r8, e.StashXmm(i.src1)); + e.CallNativeSafe(EmulateLog2); + e.vmovaps(i.dest, e.xmm0); + } +}; +EMITTER(LOG2_F64, MATCH(I, F64<>>)) { + static __m128d EmulateLog2(__m128d src) { + double result = log2(src.m128d_f64[0]); + return _mm_load_sd(&result); + } + static void Emit(X64Emitter& e, const EmitArgType& i) { + XEASSERTALWAYS(); + e.lea(e.r8, e.StashXmm(i.src1)); + e.CallNativeSafe(EmulateLog2); + e.vmovaps(i.dest, e.xmm0); + } +}; +EMITTER(LOG2_V128, MATCH(I, V128<>>)) { + static __m128 EmulateLog2(__m128 src) { + __m128 result; + for (size_t i = 0; i < 4; ++i) { + result.m128_f32[i] = log2(src.m128_f32[i]); + } + return result; + } + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.lea(e.r8, e.StashXmm(i.src1)); + e.CallNativeSafe(EmulateLog2); + e.vmovaps(i.dest, e.xmm0); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_LOG2, + LOG2_F32, + LOG2_F64, + LOG2_V128); + + +// ============================================================================ +// OPCODE_DOT_PRODUCT_3 +// ============================================================================ +EMITTER(DOT_PRODUCT_3_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + // TODO(benvanik): apparently this is very slow - find alternative? + e.vdpps(dest, src1, src2, B01110001); + }); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DOT_PRODUCT_3, + DOT_PRODUCT_3_V128); + + +// ============================================================================ +// OPCODE_DOT_PRODUCT_4 +// ============================================================================ +EMITTER(DOT_PRODUCT_4_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // http://msdn.microsoft.com/en-us/library/bb514054(v=vs.90).aspx + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + // TODO(benvanik): apparently this is very slow - find alternative? + e.vdpps(dest, src1, src2, B11110001); + }); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_DOT_PRODUCT_4, + DOT_PRODUCT_4_V128); + + +// ============================================================================ +// OPCODE_AND +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. 
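+// Note: e.and/e.or/e.xor/e.not in the helpers below are ordinary member
+// names. MSVC accepts them, but standard C++ reserves and/or/xor/not as
+// alternative tokens for &&/||/^/!, which is why xbyak also offers the
+// and_()/or_()/xor_()/not_() spellings for conforming compilers.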
+template +void EmitAndXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const REG& src) { e.and(dest_src, src); }, + [](X64Emitter& e, const REG& dest_src, int32_t constant) { e.and(dest_src, constant); }); +} +EMITTER(AND_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +EMITTER(AND_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +EMITTER(AND_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +EMITTER(AND_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitAndXX(e, i); + } +}; +EMITTER(AND_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vpand(dest, src1, src2); + }); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_AND, + AND_I8, + AND_I16, + AND_I32, + AND_I64, + AND_V128); + + +// ============================================================================ +// OPCODE_OR +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. +template +void EmitOrXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const REG& src) { e.or(dest_src, src); }, + [](X64Emitter& e, const REG& dest_src, int32_t constant) { e.or(dest_src, constant); }); +} +EMITTER(OR_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +EMITTER(OR_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +EMITTER(OR_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +EMITTER(OR_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitOrXX(e, i); + } +}; +EMITTER(OR_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vpor(dest, src1, src2); + }); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_OR, + OR_I8, + OR_I16, + OR_I32, + OR_I64, + OR_V128); + + +// ============================================================================ +// OPCODE_XOR +// ============================================================================ +// TODO(benvanik): put dest/src1|2 together. 
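+// For reference, the rough shape of the EmitCommutativeBinaryOp helper these
+// sequences lean on (a sketch - the real helper elsewhere in the backend
+// also routes constants through the second lambda):
+//   if (dest == src2) swap(src1, src2);   // legal, the op commutes
+//   if (dest != src1) mov(dest, src1);
+//   op(dest, src2_or_immediate);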
+template +void EmitXorXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitCommutativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const REG& src) { e.xor(dest_src, src); }, + [](X64Emitter& e, const REG& dest_src, int32_t constant) { e.xor(dest_src, constant); }); +} +EMITTER(XOR_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +EMITTER(XOR_I16, MATCH(I, I16<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +EMITTER(XOR_I32, MATCH(I, I32<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +EMITTER(XOR_I64, MATCH(I, I64<>, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitXorXX(e, i); + } +}; +EMITTER(XOR_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp(e, i, + [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vpxor(dest, src1, src2); + }); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_XOR, + XOR_I8, + XOR_I16, + XOR_I32, + XOR_I64, + XOR_V128); + + +// ============================================================================ +// OPCODE_NOT +// ============================================================================ +// TODO(benvanik): put dest/src1 together. +template +void EmitNotXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitUnaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src) { e.not(dest_src); }); +} +EMITTER(NOT_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +EMITTER(NOT_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +EMITTER(NOT_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +EMITTER(NOT_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitNotXX(e, i); + } +}; +EMITTER(NOT_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // dest = src ^ 0xFFFF... + e.vpxor(i.dest, i.src1, e.GetXmmConstPtr(XMMOne)); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_NOT, + NOT_I8, + NOT_I16, + NOT_I32, + NOT_I64, + NOT_V128); + + +// ============================================================================ +// OPCODE_SHL +// ============================================================================ +// TODO(benvanik): optimize common shifts. 
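+// The variable-count paths below use the BMI2 shift forms (shlx/shrx/sarx):
+// they take the count in any register rather than only cl and do not write
+// EFLAGS, so nothing needs saving around them. Constant counts keep the
+// classic shl/shr/sar imm8 encoding. Either way the hardware masks the
+// count to the operand width (5 bits for 32-bit, 6 bits for 64-bit).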
+template +void EmitShlXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitAssociativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const Reg8& src) { + if (dest_src.getBit() == 64) { + e.shlx(dest_src.cvt64(), dest_src.cvt64(), src.cvt64()); + } else { + e.shlx(dest_src.cvt32(), dest_src.cvt32(), src.cvt32()); + } + }, [](X64Emitter& e, const REG& dest_src, int8_t constant) { + e.shl(dest_src, constant); + }); +} +EMITTER(SHL_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +EMITTER(SHL_I16, MATCH(I, I16<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +EMITTER(SHL_I32, MATCH(I, I32<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +EMITTER(SHL_I64, MATCH(I, I64<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShlXX(e, i); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SHL, + SHL_I8, + SHL_I16, + SHL_I32, + SHL_I64); + + +// ============================================================================ +// OPCODE_SHR +// ============================================================================ +// TODO(benvanik): optimize common shifts. +template +void EmitShrXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitAssociativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const Reg8& src) { + if (dest_src.getBit() == 64) { + e.shrx(dest_src.cvt64(), dest_src.cvt64(), src.cvt64()); + } else if (dest_src.getBit() == 32) { + e.shrx(dest_src.cvt32(), dest_src.cvt32(), src.cvt32()); + } else { + e.movzx(dest_src.cvt32(), dest_src); + e.shrx(dest_src.cvt32(), dest_src.cvt32(), src.cvt32()); + } + }, [](X64Emitter& e, const REG& dest_src, int8_t constant) { + e.shr(dest_src, constant); + }); +} +EMITTER(SHR_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShrXX(e, i); + } +}; +EMITTER(SHR_I16, MATCH(I, I16<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShrXX(e, i); + } +}; +EMITTER(SHR_I32, MATCH(I, I32<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShrXX(e, i); + } +}; +EMITTER(SHR_I64, MATCH(I, I64<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitShrXX(e, i); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SHR, + SHR_I8, + SHR_I16, + SHR_I32, + SHR_I64); + + +// ============================================================================ +// OPCODE_SHA +// ============================================================================ +// TODO(benvanik): optimize common shifts. 
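+// There are no 8/16-bit forms of shrx/sarx, so the narrow cases widen
+// first: movzx (logical) or movsx (arithmetic) extends into a 32-bit
+// register, the 32-bit shift runs, and callers only consume the low bits.
+// Sketch for an i8 arithmetic shift: 0x80 -> movsx -> 0xFFFFFF80 ->
+// sar 1 -> 0xFFFFFFC0, low byte 0xC0, i.e. -128 >> 1 == -64 as expected.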
+template +void EmitSarXX(X64Emitter& e, const ARGS& i) { + SEQ::EmitAssociativeBinaryOp( + e, i, + [](X64Emitter& e, const REG& dest_src, const Reg8& src) { + if (dest_src.getBit() == 64) { + e.sarx(dest_src.cvt64(), dest_src.cvt64(), src.cvt64()); + } else if (dest_src.getBit() == 32) { + e.sarx(dest_src.cvt32(), dest_src.cvt32(), src.cvt32()); + } else { + e.movsx(dest_src.cvt32(), dest_src); + e.sarx(dest_src.cvt32(), dest_src.cvt32(), src.cvt32()); + } + }, [](X64Emitter& e, const REG& dest_src, int8_t constant) { + e.sar(dest_src, constant); + }); +} +EMITTER(SHA_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitSarXX(e, i); + } +}; +EMITTER(SHA_I16, MATCH(I, I16<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitSarXX(e, i); + } +}; +EMITTER(SHA_I32, MATCH(I, I32<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitSarXX(e, i); + } +}; +EMITTER(SHA_I64, MATCH(I, I64<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitSarXX(e, i); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_SHA, + SHA_I8, + SHA_I16, + SHA_I32, + SHA_I64); + + +// ============================================================================ +// OPCODE_VECTOR_SHL +// ============================================================================ +EMITTER(VECTOR_SHL_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + XEASSERTALWAYS(); + break; + } + } + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 16 - n; ++n) { + if (shamt.b16[n] != shamt.b16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same. + uint8_t sh = shamt.b16[0] & 0x7; + if (!sh) { + // No shift? + e.vmovaps(i.dest, i.src1); + } else { + // Even bytes. + e.vpsrlw(e.xmm0, i.src1, 8); + e.vpsllw(e.xmm0, sh + 8); + // Odd bytes. + e.vpsllw(i.dest, i.src1, 8); + e.vpsrlw(i.dest, 8 - sh); + // Mix. + e.vpor(i.dest, e.xmm0); + } + } else { + // Counts differ, so pre-mask and load constant. + XEASSERTALWAYS(); + } + } else { + // Fully variable shift. + // TODO(benvanik): find a better sequence. 
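+      // Plan for the sequence below: AVX2 has per-dword variable shifts
+      // (vpsllvd) but nothing per-byte, so each of the four byte positions
+      // in a dword is isolated with a byte mask, shifted as its own dword
+      // lane, shifted back into position, and OR'd into the result - four
+      // passes covering all 16 bytes.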
+ Xmm temp = i.dest; + if (i.dest == i.src1 || i.dest == i.src2) { + temp = e.xmm2; + } + auto byte_mask = e.GetXmmConstPtr(XMMShiftByteMask); + // AABBCCDD|EEFFGGHH|IIJJKKLL|MMNNOOPP + // DD| HH| LL| PP + e.vpand(e.xmm0, i.src1, byte_mask); + e.vpand(e.xmm1, i.src2, byte_mask); + e.vpsllvd(temp, e.xmm0, e.xmm1); + // CC | GG | KK | OO + e.vpsrld(e.xmm0, i.src1, 8); + e.vpand(e.xmm0, byte_mask); + e.vpsrld(e.xmm1, i.src2, 8); + e.vpand(e.xmm1, byte_mask); + e.vpsllvd(e.xmm0, e.xmm0, e.xmm1); + e.vpslld(e.xmm0, 8); + e.vpor(temp, e.xmm0); + // BB | FF | JJ | NN + e.vpsrld(e.xmm0, i.src1, 16); + e.vpand(e.xmm0, byte_mask); + e.vpsrld(e.xmm1, i.src2, 16); + e.vpand(e.xmm1, byte_mask); + e.vpsllvd(e.xmm0, e.xmm0, e.xmm1); + e.vpslld(e.xmm0, 16); + e.vpor(temp, e.xmm0); + // AA |EE |II |MM + e.vpsrld(e.xmm0, i.src1, 24); + e.vpand(e.xmm0, byte_mask); + e.vpsrld(e.xmm1, i.src2, 24); + e.vpand(e.xmm1, byte_mask); + e.vpsllvd(e.xmm0, e.xmm0, e.xmm1); + e.vpslld(e.xmm0, 24); + e.vpor(i.dest, temp, e.xmm0); + } + } + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.s8[n] != shamt.s8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsllw. + e.vpsllw(i.dest, i.src1, shamt.s8[0] & 0xF); + } else { + // Counts differ, so pre-mask and load constant. + XEASSERTALWAYS(); + } + } else { + // Fully variable shift. + XEASSERTALWAYS(); + } + } + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.i4[n] != shamt.i4[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpslld. + e.vpslld(i.dest, i.src1, shamt.b16[0] & 0x1F); + } else { + // Counts differ, so pre-mask and load constant. + vec128_t masked = i.src2.constant(); + for (size_t n = 0; n < 4; ++n) { + masked.i4[n] &= 0x1F; + } + e.LoadConstantXmm(e.xmm0, masked); + e.vpsllvd(i.dest, i.src1, e.xmm0); + } + } else { + // Fully variable shift. + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsllvd(i.dest, i.src1, e.xmm0); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_SHL, + VECTOR_SHL_V128); + + +// ============================================================================ +// OPCODE_VECTOR_SHR +// ============================================================================ +EMITTER(VECTOR_SHR_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT8_TYPE: + EmitInt8(e, i); + break; + case INT16_TYPE: + EmitInt16(e, i); + break; + case INT32_TYPE: + EmitInt32(e, i); + break; + default: + XEASSERTALWAYS(); + break; + } + } + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 16 - n; ++n) { + if (shamt.b16[n] != shamt.b16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same. + uint8_t sh = shamt.b16[0] & 0x7; + if (!sh) { + // No shift? + e.vmovaps(i.dest, i.src1); + } else { + // Even bytes. + e.vpsllw(e.xmm0, i.src1, 8); + e.vpsrlw(e.xmm0, sh + 8); + // Odd bytes. 
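+          // Same even/odd-byte trick as VECTOR_SHL above: there is no
+          // vector shift narrower than 16 bits, so the even and odd bytes
+          // are shifted separately within their words (using the +-8 count
+          // bias to position them) and then recombined with vpor.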
+ e.vpsrlw(i.dest, i.src1, 8); + e.vpsllw(i.dest, 8 - sh); + // Mix. + e.vpor(i.dest, e.xmm0); + } + } else { + // Counts differ, so pre-mask and load constant. + XEASSERTALWAYS(); + } + } else { + // Fully variable shift. + XEASSERTALWAYS(); + } + } + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.s8[n] != shamt.s8[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsllw. + e.vpsrlw(i.dest, i.src1, shamt.s8[0] & 0xF); + } else { + // Counts differ, so pre-mask and load constant. + XEASSERTALWAYS(); + } + } else { + // Fully variable shift. + XEASSERTALWAYS(); + } + } + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.i4[n] != shamt.i4[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpslld. + e.vpsrld(i.dest, i.src1, shamt.b16[0] & 0x1F); + } else { + // Counts differ, so pre-mask and load constant. + vec128_t masked = i.src2.constant(); + for (size_t n = 0; n < 4; ++n) { + masked.i4[n] &= 0x1F; + } + e.LoadConstantXmm(e.xmm0, masked); + e.vpsrlvd(i.dest, i.src1, e.xmm0); + } + } else { + // Fully variable shift. + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsrlvd(i.dest, i.src1, e.xmm0); + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_SHR, + VECTOR_SHR_V128); + + +// ============================================================================ +// OPCODE_VECTOR_SHA +// ============================================================================ +EMITTER(VECTOR_SHA_V128, MATCH(I, V128<>, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + switch (i.instr->flags) { + case INT32_TYPE: + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsravd(i.dest, i.src1, e.xmm0); + break; + default: + XEASSERTALWAYS(); + break; + } + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_VECTOR_SHA, + VECTOR_SHA_V128); + + +// ============================================================================ +// OPCODE_ROTATE_LEFT +// ============================================================================ +// TODO(benvanik): put dest/src1 together, src2 in cl. +template +void EmitRotateLeftXX(X64Emitter& e, const ARGS& i) { + if (i.src2.is_constant) { + // Constant rotate. + if (i.dest != i.src1) { + if (i.src1.is_constant) { + e.mov(i.dest, i.src1.constant()); + } else { + e.mov(i.dest, i.src1); + } + } + e.rol(i.dest, i.src2.constant()); + } else { + // Variable rotate. 
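+    // Variable rol takes its count only in cl, so the count is staged there
+    // first; ReloadECX afterwards restores whatever the backend keeps
+    // cached in rcx. (Constant counts use the imm8 form in the branch
+    // above.)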
+ if (i.src2.reg().getIdx() != e.cl.getIdx()) { + e.mov(e.cl, i.src2); + } + if (i.dest != i.src1) { + if (i.src1.is_constant) { + e.mov(i.dest, i.src1.constant()); + } else { + e.mov(i.dest, i.src1); + } + } + e.rol(i.dest, e.cl); + e.ReloadECX(); + } +} +EMITTER(ROTATE_LEFT_I8, MATCH(I, I8<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitRotateLeftXX(e, i); + } +}; +EMITTER(ROTATE_LEFT_I16, MATCH(I, I16<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitRotateLeftXX(e, i); + } +}; +EMITTER(ROTATE_LEFT_I32, MATCH(I, I32<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitRotateLeftXX(e, i); + } +}; +EMITTER(ROTATE_LEFT_I64, MATCH(I, I64<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitRotateLeftXX(e, i); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_ROTATE_LEFT, + ROTATE_LEFT_I8, + ROTATE_LEFT_I16, + ROTATE_LEFT_I32, + ROTATE_LEFT_I64); + + +// ============================================================================ +// OPCODE_BYTE_SWAP +// ============================================================================ +// TODO(benvanik): put dest/src1 together. +EMITTER(BYTE_SWAP_I16, MATCH(I, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitUnaryOp( + e, i, + [](X64Emitter& e, const Reg16& dest_src) { e.ror(dest_src, 8); }); + } +}; +EMITTER(BYTE_SWAP_I32, MATCH(I, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitUnaryOp( + e, i, + [](X64Emitter& e, const Reg32& dest_src) { e.bswap(dest_src); }); + } +}; +EMITTER(BYTE_SWAP_I64, MATCH(I, I64<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitUnaryOp( + e, i, + [](X64Emitter& e, const Reg64& dest_src) { e.bswap(dest_src); }); + } +}; +EMITTER(BYTE_SWAP_V128, MATCH(I, V128<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): find a way to do this without the memory load. + e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMByteSwapMask)); + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_BYTE_SWAP, + BYTE_SWAP_I16, + BYTE_SWAP_I32, + BYTE_SWAP_I64, + BYTE_SWAP_V128); + + +// ============================================================================ +// OPCODE_CNTLZ +// ============================================================================ +EMITTER(CNTLZ_I8, MATCH(I, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // No 8bit lzcnt, so do 16 and sub 8. 
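+    // Worked example: src1 = 0x20 -> movzx -> 0x0020; lzcnt16 gives 10
+    // leading zeros, and subtracting the 8 phantom high bits leaves the
+    // expected 2 for the byte.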
+    e.movzx(i.dest.reg().cvt16(), i.src1);
+    e.lzcnt(i.dest.reg().cvt16(), i.dest.reg().cvt16());
+    e.sub(i.dest, 8);
+  }
+};
+EMITTER(CNTLZ_I16, MATCH(I<OPCODE_CNTLZ, I8<>, I16<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    // lzcnt operand sizes must match, so count into the 16-bit view of dest.
+    e.lzcnt(i.dest.reg().cvt16(), i.src1);
+  }
+};
+EMITTER(CNTLZ_I32, MATCH(I<OPCODE_CNTLZ, I8<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.lzcnt(i.dest.reg().cvt32(), i.src1);
+  }
+};
+EMITTER(CNTLZ_I64, MATCH(I<OPCODE_CNTLZ, I8<>, I64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    e.lzcnt(i.dest.reg().cvt64(), i.src1);
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_CNTLZ,
+    CNTLZ_I8,
+    CNTLZ_I16,
+    CNTLZ_I32,
+    CNTLZ_I64);
+
+
+// ============================================================================
+// OPCODE_INSERT
+// ============================================================================
+
+
+// ============================================================================
+// OPCODE_EXTRACT
+// ============================================================================
+// TODO(benvanik): sequence extract/splat:
+//   v0.i32 = extract v0.v128, 0
+//   v0.v128 = splat v0.i32
+// This can be a single broadcast.
+EMITTER(EXTRACT_I8, MATCH(I<OPCODE_EXTRACT, I8<>, V128<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      e.vpextrb(i.dest.reg().cvt32(), i.src1, VEC128_B(i.src2.constant()));
+    } else {
+      XEASSERTALWAYS();
+      // TODO(benvanik): try out hlide's version:
+      // e.mov(e.eax, 0x80808003);
+      // e.xor(e.al, i.src2);
+      // e.and(e.al, 15);
+      // e.vmovd(e.xmm0, e.eax);
+      // e.vpshufb(e.xmm0, i.src1, e.xmm0);
+      // e.vmovd(i.dest.reg().cvt32(), e.xmm0);
+    }
+  }
+};
+EMITTER(EXTRACT_I16, MATCH(I<OPCODE_EXTRACT, I16<>, V128<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      e.vpextrw(i.dest.reg().cvt32(), i.src1, VEC128_W(i.src2.constant()));
+    } else {
+      // hlide's version, being tried out here:
+      e.mov(e.al, i.src2);
+      e.xor(e.al, 0x1);
+      e.mov(e.ah, e.al);
+      e.add(e.ah, 1);
+      e.vmovd(e.xmm0, e.eax);
+      e.vpshufb(e.xmm0, i.src1, e.xmm0);
+      e.vmovd(i.dest.reg().cvt32(), e.xmm0);
+    }
+  }
+};
+EMITTER(EXTRACT_I32, MATCH(I<OPCODE_EXTRACT, I32<>, V128<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    static vec128_t extract_table_32[4] = {
+      vec128b( 3,  2,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+      vec128b( 7,  6,  5,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+      vec128b(11, 10,  9,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+      vec128b(15, 14, 13, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0),
+    };
+    if (i.src2.is_constant) {
+      if (i.src2.constant() == 0) {
+        e.vmovd(i.dest, i.src1);
+      } else {
+        e.vpextrd(i.dest, i.src1, VEC128_D(i.src2.constant()));
+      }
+    } else {
+      // TODO(benvanik): try out hlide's version:
+      // e.mov(e.eax, 3);
+      // e.and(e.al, i.src2);       // eax = [(i&3), 0, 0, 0]
+      // e.imul(e.eax, 0x04040404); // [(i&3)*4, (i&3)*4, (i&3)*4, (i&3)*4]
+      // e.add(e.eax, 0x00010203);  // [((i&3)*4)+3, ((i&3)*4)+2, ((i&3)*4)+1, ((i&3)*4)+0]
+      // e.vmovd(e.xmm0, e.eax);
+      // e.vpshufb(e.xmm0, i.src1, e.xmm0);
+      // e.vmovd(i.dest.reg().cvt32(), e.xmm0);
+      // Get the desired word in xmm0, then extract that.
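+      // The index is clamped to 0-3 and scaled by 16 (sizeof(vec128_t)) so
+      // rax offsets into extract_table_32; each table row is a vpshufb mask
+      // that moves the selected word into lane 0 for the final vpextrd.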
+      e.xor(e.rax, e.rax);
+      e.mov(e.al, i.src2);
+      e.and(e.al, 0x03);
+      e.shl(e.al, 4);
+      e.mov(e.rdx, reinterpret_cast<uint64_t>(extract_table_32));
+      e.vmovaps(e.xmm0, e.ptr[e.rdx + e.rax]);
+      e.vpshufb(e.xmm0, i.src1, e.xmm0);
+      e.vpextrd(i.dest, e.xmm0, 0);
+      e.ReloadEDX();
+    }
+  }
+};
+EMITTER(EXTRACT_F32, MATCH(I<OPCODE_EXTRACT, F32<>, V128<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      e.vextractps(i.dest, i.src1, VEC128_F(i.src2.constant()));
+    } else {
+      XEASSERTALWAYS();
+      // TODO(benvanik): try out hlide's version:
+      // e.mov(e.eax, 3);
+      // e.and(e.al, i.src2);       // eax = [(i&3), 0, 0, 0]
+      // e.imul(e.eax, 0x04040404); // [(i&3)*4, (i&3)*4, (i&3)*4, (i&3)*4]
+      // e.add(e.eax, 0x00010203);  // [((i&3)*4)+3, ((i&3)*4)+2, ((i&3)*4)+1, ((i&3)*4)+0]
+      // e.vmovd(e.xmm0, e.eax);
+      // e.vpshufb(e.xmm0, i.src1, e.xmm0);
+      // e.vmovd(i.dest, e.xmm0);
+    }
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_EXTRACT,
+    EXTRACT_I8,
+    EXTRACT_I16,
+    EXTRACT_I32,
+    EXTRACT_F32);
+
+
+// ============================================================================
+// OPCODE_SPLAT
+// ============================================================================
+EMITTER(SPLAT_I8, MATCH(I<OPCODE_SPLAT, V128<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src1.is_constant) {
+      // TODO(benvanik): faster constant splats.
+      e.mov(e.al, i.src1.constant());
+      e.vmovd(e.xmm0, e.eax);
+      e.vpbroadcastb(i.dest, e.xmm0);
+    } else {
+      e.vmovd(e.xmm0, i.src1.reg().cvt32());
+      e.vpbroadcastb(i.dest, e.xmm0);
+    }
+  }
+};
+EMITTER(SPLAT_I16, MATCH(I<OPCODE_SPLAT, V128<>, I16<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src1.is_constant) {
+      // TODO(benvanik): faster constant splats.
+      e.mov(e.ax, i.src1.constant());
+      e.vmovd(e.xmm0, e.eax);
+      e.vpbroadcastw(i.dest, e.xmm0);
+    } else {
+      e.vmovd(e.xmm0, i.src1.reg().cvt32());
+      e.vpbroadcastw(i.dest, e.xmm0);
+    }
+  }
+};
+EMITTER(SPLAT_I32, MATCH(I<OPCODE_SPLAT, V128<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src1.is_constant) {
+      // TODO(benvanik): faster constant splats.
+      e.mov(e.eax, i.src1.constant());
+      e.vmovd(e.xmm0, e.eax);
+      e.vpbroadcastd(i.dest, e.xmm0);
+    } else {
+      e.vmovd(e.xmm0, i.src1);
+      e.vpbroadcastd(i.dest, e.xmm0);
+    }
+  }
+};
+EMITTER(SPLAT_F32, MATCH(I<OPCODE_SPLAT, V128<>, F32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src1.is_constant) {
+      // TODO(benvanik): faster constant splats.
+      e.mov(e.eax, i.src1.value->constant.i32);
+      e.vmovd(e.xmm0, e.eax);
+      e.vbroadcastss(i.dest, e.xmm0);
+    } else {
+      e.vbroadcastss(i.dest, i.src1);
+    }
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_SPLAT,
+    SPLAT_I8,
+    SPLAT_I16,
+    SPLAT_I32,
+    SPLAT_F32);
+
+
+// ============================================================================
+// OPCODE_PERMUTE
+// ============================================================================
+EMITTER(PERMUTE_I32, MATCH(I<OPCODE_PERMUTE, V128<>, I32<>, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    // Permute words between src2 and src3.
+    // TODO(benvanik): check src3 for zero. if 0, we can use pshufb.
+    if (i.src1.is_constant) {
+      uint32_t control = i.src1.constant();
+      // Shuffle things into the right places in dest & xmm0,
+      // then we blend them together.
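+      // Each byte of the control word selects one of the eight words across
+      // src2/src3: the low two bits pick the word within a source and bit 2
+      // picks the source, so the bytes fold down into a vpshufd immediate
+      // plus a vpblendd mask.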
+      uint32_t src_control =
+          (((control >> 24) & 0x3) << 0) |
+          (((control >> 16) & 0x3) << 2) |
+          (((control >> 8) & 0x3) << 4) |
+          (((control >> 0) & 0x3) << 6);
+      uint32_t blend_control =
+          (((control >> 26) & 0x1) << 0) |
+          (((control >> 18) & 0x1) << 1) |
+          (((control >> 10) & 0x1) << 2) |
+          (((control >> 2) & 0x1) << 3);
+      // TODO(benvanik): if src2/src3 are constants, shuffle now!
+      Xmm src2;
+      if (i.src2.is_constant) {
+        src2 = e.xmm1;
+        e.LoadConstantXmm(src2, i.src2.constant());
+      } else {
+        src2 = i.src2;
+      }
+      Xmm src3;
+      if (i.src3.is_constant) {
+        src3 = e.xmm2;
+        e.LoadConstantXmm(src3, i.src3.constant());
+      } else {
+        src3 = i.src3;
+      }
+      if (i.dest != src3) {
+        e.vpshufd(i.dest, src2, src_control);
+        e.vpshufd(e.xmm0, src3, src_control);
+        e.vpblendd(i.dest, e.xmm0, blend_control);
+      } else {
+        e.vmovaps(e.xmm0, src3);
+        e.vpshufd(i.dest, src2, src_control);
+        e.vpshufd(e.xmm0, e.xmm0, src_control);
+        e.vpblendd(i.dest, e.xmm0, blend_control);
+      }
+    } else {
+      // Permute by non-constant.
+      XEASSERTALWAYS();
+    }
+  }
+};
+EMITTER(PERMUTE_V128, MATCH(I<OPCODE_PERMUTE, V128<>, V128<>, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    // TODO(benvanik): find out how to do this with only one temp register!
+    // Permute bytes between src2 and src3.
+    if (i.src3.value->IsConstantZero()) {
+      // Permuting with src2/zero, so just shuffle/mask.
+      if (i.src2.value->IsConstantZero()) {
+        // src2 & src3 are zero, so result will always be zero.
+        e.vpxor(i.dest, i.dest);
+      } else {
+        // Control mask needs to be shuffled.
+        if (i.src1.is_constant) {
+          e.LoadConstantXmm(e.xmm0, i.src1.constant());
+          e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMByteSwapMask));
+        } else {
+          e.vpshufb(e.xmm0, i.src1, e.GetXmmConstPtr(XMMByteSwapMask));
+        }
+        if (i.src2.is_constant) {
+          e.LoadConstantXmm(i.dest, i.src2.constant());
+          e.vpshufb(i.dest, i.dest, e.xmm0);
+        } else {
+          e.vpshufb(i.dest, i.src2, e.xmm0);
+        }
+        // Build a mask with values in src2 having 0 and values in src3 having 1.
+        e.vpcmpgtb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPermuteControl15));
+        e.vpandn(i.dest, e.xmm0, i.dest);
+      }
+    } else {
+      // General permute.
+      // Control mask needs to be shuffled.
+      if (i.src1.is_constant) {
+        e.LoadConstantXmm(e.xmm2, i.src1.constant());
+        e.vpshufb(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMByteSwapMask));
+      } else {
+        e.vpshufb(e.xmm2, i.src1, e.GetXmmConstPtr(XMMByteSwapMask));
+      }
+      Xmm src2_shuf = e.xmm0;
+      if (i.src2.value->IsConstantZero()) {
+        e.vpxor(src2_shuf, src2_shuf);
+      } else if (i.src2.is_constant) {
+        e.LoadConstantXmm(src2_shuf, i.src2.constant());
+        e.vpshufb(src2_shuf, src2_shuf, e.xmm2);
+      } else {
+        e.vpshufb(src2_shuf, i.src2, e.xmm2);
+      }
+      Xmm src3_shuf = e.xmm1;
+      if (i.src3.value->IsConstantZero()) {
+        e.vpxor(src3_shuf, src3_shuf);
+      } else if (i.src3.is_constant) {
+        e.LoadConstantXmm(src3_shuf, i.src3.constant());
+        e.vpshufb(src3_shuf, src3_shuf, e.xmm2);
+      } else {
+        e.vpshufb(src3_shuf, i.src3, e.xmm2);
+      }
+      // Build a mask with values in src2 having 0 and values in src3 having 1.
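+      // Control bytes 0-15 select from src2 and 16-31 select from src3;
+      // comparing each control byte against 15 yields 0xFF exactly where
+      // src3 should win, which vpblendvb then uses as a per-byte selector
+      // to merge the two shuffled halves.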
+      e.vpcmpgtb(i.dest, e.xmm2, e.GetXmmConstPtr(XMMPermuteControl15));
+      e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest);
+    }
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_PERMUTE,
+    PERMUTE_I32,
+    PERMUTE_V128);
+
+
+// ============================================================================
+// OPCODE_SWIZZLE
+// ============================================================================
+EMITTER(SWIZZLE, MATCH(I<OPCODE_SWIZZLE, V128<>, V128<>, OffsetOp>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    auto element_type = i.instr->flags;
+    if (element_type == INT8_TYPE) {
+      XEASSERTALWAYS();
+    } else if (element_type == INT16_TYPE) {
+      XEASSERTALWAYS();
+    } else if (element_type == INT32_TYPE || element_type == FLOAT32_TYPE) {
+      uint8_t swizzle_mask = static_cast<uint8_t>(i.src2.value);
+      swizzle_mask =
+          (((swizzle_mask >> 6) & 0x3) << 0) |
+          (((swizzle_mask >> 4) & 0x3) << 2) |
+          (((swizzle_mask >> 2) & 0x3) << 4) |
+          (((swizzle_mask >> 0) & 0x3) << 6);
+      e.vpshufd(i.dest, i.src1, swizzle_mask);
+    } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) {
+      XEASSERTALWAYS();
+    } else {
+      XEASSERTALWAYS();
+    }
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_SWIZZLE,
+    SWIZZLE);
+
+
+// ============================================================================
+// OPCODE_PACK
+// ============================================================================
+EMITTER(PACK, MATCH(I<OPCODE_PACK, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    switch (i.instr->flags) {
+      case PACK_TYPE_D3DCOLOR:
+        EmitD3DCOLOR(e, i);
+        break;
+      case PACK_TYPE_FLOAT16_2:
+        EmitFLOAT16_2(e, i);
+        break;
+      case PACK_TYPE_FLOAT16_4:
+        EmitFLOAT16_4(e, i);
+        break;
+      case PACK_TYPE_SHORT_2:
+        EmitSHORT_2(e, i);
+        break;
+      case PACK_TYPE_S8_IN_16_LO:
+        EmitS8_IN_16_LO(e, i);
+        break;
+      case PACK_TYPE_S8_IN_16_HI:
+        EmitS8_IN_16_HI(e, i);
+        break;
+      case PACK_TYPE_S16_IN_32_LO:
+        EmitS16_IN_32_LO(e, i);
+        break;
+      case PACK_TYPE_S16_IN_32_HI:
+        EmitS16_IN_32_HI(e, i);
+        break;
+      default: XEASSERTALWAYS(); break;
+    }
+  }
+  static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) {
+    // RGBA (XYZW) -> ARGB (WXYZ)
+    // float r = roundf(((src1.x < 0) ? 0 : ((1 < src1.x) ? 1 : src1.x)) * 255);
+    // float g = roundf(((src1.y < 0) ? 0 : ((1 < src1.y) ? 1 : src1.y)) * 255);
+    // float b = roundf(((src1.z < 0) ? 0 : ((1 < src1.z) ? 1 : src1.z)) * 255);
+    // float a = roundf(((src1.w < 0) ? 0 : ((1 < src1.w) ? 1 : src1.w)) * 255);
+    // dest.iw = ((uint32_t)a << 24) |
+    //           ((uint32_t)r << 16) |
+    //           ((uint32_t)g << 8) |
+    //           ((uint32_t)b);
+    // f2i(clamp(src, 0, 1) * 255)
+    e.vpxor(e.xmm0, e.xmm0);
+    if (i.src1.is_constant) {
+      e.LoadConstantXmm(e.xmm1, i.src1.constant());
+      e.vmaxps(e.xmm0, e.xmm1);
+    } else {
+      e.vmaxps(e.xmm0, i.src1);
+    }
+    e.vminps(e.xmm0, e.GetXmmConstPtr(XMMOne));
+    e.vmulps(e.xmm0, e.GetXmmConstPtr(XMM255));
+    e.vcvttps2dq(e.xmm0, e.xmm0);
+    e.vpshufb(i.dest, e.xmm0, e.GetXmmConstPtr(XMMPackD3DCOLOR));
+  }
+  static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) {
+    // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx
+    // dest = [(src1.x | src1.y), 0, 0, 0]
+    // 0|0|0|0|W|Z|Y|X
+    e.vcvtps2ph(e.xmm0, i.src1, B00000011);
+    // Y|X|W|Z|0|0|0|0
+    e.vpshufd(e.xmm0, e.xmm0, B00011011);
+    // Shuffle to X|Y|Z|W|0|0|0|0
+    e.vpshufhw(e.xmm0, e.xmm0, B10110001);
+    // Select just X|Y
+    e.vxorps(i.dest, i.dest);
+    e.vpblendw(i.dest, e.xmm0, B11000000);
+  }
+  static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) {
+    // dest = [(src1.x | src1.y), (src1.z | src1.w), 0, 0]
+    // 0|0|0|0|W|Z|Y|X
+    e.vcvtps2ph(e.xmm0, i.src1, B00000011);
+    // Y|X|W|Z|0|0|0|0
+    e.vpshufd(e.xmm0, e.xmm0, B00011011);
+    // Shuffle to X|Y|Z|W|0|0|0|0
+    e.vpshufhw(e.xmm0, e.xmm0, B10110001);
+    // Select just X|Y|Z|W
+    e.vxorps(i.dest, i.dest);
+    e.vpblendw(i.dest, e.xmm0, B11110000);
+  }
+  static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) {
+    XEASSERTALWAYS();
+  }
+  static void EmitS8_IN_16_LO(X64Emitter& e, const EmitArgType& i) {
+    XEASSERTALWAYS();
+  }
+  static void EmitS8_IN_16_HI(X64Emitter& e, const EmitArgType& i) {
+    XEASSERTALWAYS();
+  }
+  static void EmitS16_IN_32_LO(X64Emitter& e, const EmitArgType& i) {
+    XEASSERTALWAYS();
+  }
+  static void EmitS16_IN_32_HI(X64Emitter& e, const EmitArgType& i) {
+    XEASSERTALWAYS();
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_PACK,
+    PACK);
+
+
+// ============================================================================
+// OPCODE_UNPACK
+// ============================================================================
+EMITTER(UNPACK, MATCH(I<OPCODE_UNPACK, V128<>, V128<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    switch (i.instr->flags) {
+      case PACK_TYPE_D3DCOLOR:
+        EmitD3DCOLOR(e, i);
+        break;
+      case PACK_TYPE_FLOAT16_2:
+        EmitFLOAT16_2(e, i);
+        break;
+      case PACK_TYPE_FLOAT16_4:
+        EmitFLOAT16_4(e, i);
+        break;
+      case PACK_TYPE_SHORT_2:
+        EmitSHORT_2(e, i);
+        break;
+      case PACK_TYPE_S8_IN_16_LO:
+        EmitS8_IN_16_LO(e, i);
+        break;
+      case PACK_TYPE_S8_IN_16_HI:
+        EmitS8_IN_16_HI(e, i);
+        break;
+      case PACK_TYPE_S16_IN_32_LO:
+        EmitS16_IN_32_LO(e, i);
+        break;
+      case PACK_TYPE_S16_IN_32_HI:
+        EmitS16_IN_32_HI(e, i);
+        break;
+      default: XEASSERTALWAYS(); break;
+    }
+  }
+  static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) {
+    // ARGB (WXYZ) -> RGBA (XYZW)
+    // XMLoadColor
+    // int32_t src = (int32_t)src1.iw;
+    // dest.f4[0] = (float)((src >> 16) & 0xFF) * (1.0f / 255.0f);
+    // dest.f4[1] = (float)((src >> 8) & 0xFF) * (1.0f / 255.0f);
+    // dest.f4[2] = (float)(src & 0xFF) * (1.0f / 255.0f);
+    // dest.f4[3] = (float)((src >> 24) & 0xFF) * (1.0f / 255.0f);
+    if (i.src1.is_constant) {
+      // Constant inputs are not handled yet; just produce zero.
+      e.vpxor(i.dest, i.dest);
+      return;
+    }
+
+    // src = ZZYYXXWW
+    // unpack to 000000ZZ,000000YY,000000XX,000000WW
+    e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMUnpackD3DCOLOR));
+    // int -> float
+    e.vcvtdq2ps(i.dest, i.dest);
+    // mult by 1/255
+    e.vmulps(i.dest, e.GetXmmConstPtr(XMMOneOver255));
+  }
+  static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) {
+    // 1 bit sign, 5 bit exponent, 10 bit mantissa
+    // D3D10 half float format
+    // TODO(benvanik): http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx
+    // Use _mm_cvtph_ps -- requires the F16C extension, which our AVX2
+    // (Haswell) baseline always provides.
+    // Unpacking half floats: http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
+    // Packing half floats: https://gist.github.com/rygorous/2156668
+    // Load source, move from tight pack of X16Y16.... to X16...Y16...
+    // Also zero out the high end.
+    // TODO(benvanik): special case constant unpacks that just get 0/1/etc.
+
+    // sx = src.iw >> 16;
+    // sy = src.iw & 0xFFFF;
+    // dest = { XMConvertHalfToFloat(sx),
+    //          XMConvertHalfToFloat(sy),
+    //          0.0,
+    //          1.0 };
+    e.vcvtph2ps(i.dest, i.src1);
+    e.vpshufd(i.dest, i.dest, B10100100);
+    e.vpor(i.dest, e.GetXmmConstPtr(XMM0001));
+  }
+  static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) {
+    // src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0]
+    e.vcvtph2ps(i.dest, i.src1);
+  }
+  static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) {
+    // (VD.x) = 3.0 + (VB.x>>16)*2^-22
+    // (VD.y) = 3.0 + (VB.x)*2^-22
+    // (VD.z) = 0.0
+    // (VD.w) = 1.0
+
+    // XMLoadShortN2 plus 3,3,0,1 (for some reason)
+    // src is (xx,xx,xx,VALUE)
+    // (VALUE,VALUE,VALUE,VALUE)
+    if (i.src1.is_constant) {
+      if (i.src1.value->IsConstantZero()) {
+        e.vpxor(i.dest, i.dest);
+      } else {
+        // TODO(benvanik): check other common constants.
+        e.LoadConstantXmm(i.dest, i.src1.constant());
+        // Broadcast from the loaded value, not the (non-register) constant.
+        e.vbroadcastss(i.dest, i.dest);
+      }
+    } else {
+      e.vbroadcastss(i.dest, i.src1);
+    }
+    // (VALUE&0xFFFF,VALUE&0xFFFF0000,0,0)
+    e.vandps(i.dest, e.GetXmmConstPtr(XMMMaskX16Y16));
+    // Sign extend.
+    e.vxorps(i.dest, e.GetXmmConstPtr(XMMFlipX16Y16));
+    // Convert int->float.
+    e.cvtpi2ps(i.dest, e.StashXmm(i.dest));
+    // 0x8000 to undo sign.
+    e.vaddps(i.dest, e.GetXmmConstPtr(XMMFixX16Y16));
+    // Normalize.
+    e.vmulps(i.dest, e.GetXmmConstPtr(XMMNormalizeX16Y16));
+    // Clamp.
+    e.vmaxps(i.dest, e.GetXmmConstPtr(XMMNegativeOne));
+    // Add 3,3,0,1.
+    e.vaddps(i.dest, e.GetXmmConstPtr(XMM3301));
+  }
+  static void EmitS8_IN_16_LO(X64Emitter& e, const EmitArgType& i) {
+    XEASSERTALWAYS();
+  }
+  static void EmitS8_IN_16_HI(X64Emitter& e, const EmitArgType& i) {
+    XEASSERTALWAYS();
+  }
+  static void EmitS16_IN_32_LO(X64Emitter& e, const EmitArgType& i) {
+    XEASSERTALWAYS();
+  }
+  static void EmitS16_IN_32_HI(X64Emitter& e, const EmitArgType& i) {
+    XEASSERTALWAYS();
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_UNPACK,
+    UNPACK);
+
+
+// ============================================================================
+// OPCODE_COMPARE_EXCHANGE
+// ============================================================================
+
+
+// ============================================================================
+// OPCODE_ATOMIC_EXCHANGE
+// ============================================================================
+// Note that the address we use here is a real, host address!
+// This is weird, and should be fixed.
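+// The sequence materializes the new value in dest and then does a lock xchg
+// against [src1]; xchg leaves the old memory value in dest, which is exactly
+// the return value ATOMIC_EXCHANGE wants. (lock is implicit for xchg with a
+// memory operand, but being explicit costs nothing.)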
+template <typename SEQ, typename REG, typename ARGS>
+void EmitAtomicExchangeXX(X64Emitter& e, const ARGS& i) {
+  if (i.dest == i.src1) {
+    e.mov(e.rax, i.src1);
+    if (i.dest != i.src2) {
+      if (i.src2.is_constant) {
+        e.mov(i.dest, i.src2.constant());
+      } else {
+        e.mov(i.dest, i.src2);
+      }
+    }
+    e.lock();
+    // Size the memory operand off the register so the 8/16/32/64-bit
+    // variants all encode correctly.
+    e.xchg(e.ptr[e.rax], i.dest);
+  } else {
+    if (i.dest != i.src2) {
+      if (i.src2.is_constant) {
+        e.mov(i.dest, i.src2.constant());
+      } else {
+        e.mov(i.dest, i.src2);
+      }
+    }
+    e.lock();
+    e.xchg(e.ptr[i.src1.reg()], i.dest);
+  }
+}
+EMITTER(ATOMIC_EXCHANGE_I8, MATCH(I<OPCODE_ATOMIC_EXCHANGE, I8<>, I64<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitAtomicExchangeXX<ATOMIC_EXCHANGE_I8, Reg8>(e, i);
+  }
+};
+EMITTER(ATOMIC_EXCHANGE_I16, MATCH(I<OPCODE_ATOMIC_EXCHANGE, I16<>, I64<>, I16<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitAtomicExchangeXX<ATOMIC_EXCHANGE_I16, Reg16>(e, i);
+  }
+};
+EMITTER(ATOMIC_EXCHANGE_I32, MATCH(I<OPCODE_ATOMIC_EXCHANGE, I32<>, I64<>, I32<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitAtomicExchangeXX<ATOMIC_EXCHANGE_I32, Reg32>(e, i);
+  }
+};
+EMITTER(ATOMIC_EXCHANGE_I64, MATCH(I<OPCODE_ATOMIC_EXCHANGE, I64<>, I64<>, I64<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitAtomicExchangeXX<ATOMIC_EXCHANGE_I64, Reg64>(e, i);
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_ATOMIC_EXCHANGE,
+    ATOMIC_EXCHANGE_I8,
+    ATOMIC_EXCHANGE_I16,
+    ATOMIC_EXCHANGE_I32,
+    ATOMIC_EXCHANGE_I64);
+
+
+// ============================================================================
+// OPCODE_ATOMIC_ADD
+// ============================================================================
+
+
+// ============================================================================
+// OPCODE_ATOMIC_SUB
+// ============================================================================
+
+
+
+
+//SEQUENCE(ADD_ADD_BRANCH, MATCH(
+//    I<OPCODE_ADD, I32<>, I32<>, I32C<>>,
+//    I<OPCODE_ADD, I32<>, I32<V1>, I32C<>>,
+//    I<OPCODE_BRANCH_TRUE, VoidOp, I32<V2>>)) {
+//  static void Emit(X64Emitter& e, const EmitArgs& _) {
+//  }
+//};
+
+
+
+void alloy::backend::x64::RegisterSequences() {
+#define REGISTER_EMITTER_OPCODE_TABLE(opcode) Register_##opcode()
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMMENT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_NOP);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SOURCE_OFFSET);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DEBUG_BREAK_TRUE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_TRAP);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_TRAP_TRUE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CALL);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CALL_EXTERN);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_RETURN);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SET_RETURN_ADDRESS);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_BRANCH);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_BRANCH_FALSE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ASSIGN);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CAST);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ZERO_EXTEND);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SIGN_EXTEND);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_TRUNCATE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CONVERT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ROUND);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_I2F);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHL);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOAD_VECTOR_SHR);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOAD_CLOCK);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOAD_LOCAL);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_STORE_LOCAL);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOAD_CONTEXT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_STORE_CONTEXT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOAD);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_STORE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_PREFETCH);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MAX);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MIN);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SELECT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_IS_TRUE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_IS_FALSE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_EQ);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_NE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SLT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SLE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SGT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SGE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_ULT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_ULE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_UGT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_UGE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SLT_FLT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SLE_FLT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SGT_FLT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_SGE_FLT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_ULT_FLT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_ULE_FLT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_UGT_FLT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_UGE_FLT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DID_CARRY);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DID_OVERFLOW);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DID_SATURATE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_EQ);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_SGE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_COMPARE_UGE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ADD);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ADD_CARRY);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ADD);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SUB);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MUL);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MUL_HI);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DIV);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_NEG);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ABS);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SQRT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_RSQRT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_POW2);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_LOG2);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_3);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_DOT_PRODUCT_4);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_AND);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_OR);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_XOR);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_NOT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SHL);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SHR);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SHA);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ROTATE_LEFT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_BYTE_SWAP);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CNTLZ);
+  //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_INSERT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_EXTRACT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SPLAT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_PERMUTE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SWIZZLE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_PACK);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_UNPACK);
+  //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_COMPARE_EXCHANGE);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_EXCHANGE);
+  //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_ADD);
+  //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_SUB);
+}
+
+bool alloy::backend::x64::SelectSequence(X64Emitter& e, const Instr* i, const Instr** new_tail) {
+  const InstrKey key(i);
+  const auto its = sequence_table.equal_range(key);
+  for (auto it = its.first; it != its.second; ++it) {
+    if (it->second(e, i, new_tail)) {
+      return true;
+    }
+  }
+  XELOGE("No sequence match for variant %s", i->opcode->name);
+  return false;
+}
diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.h b/src/alloy/backend/x64/x64_sequences.h
similarity index 59%
rename from src/alloy/backend/x64/lowering/lowering_sequences.h
rename to src/alloy/backend/x64/x64_sequences.h
index 634d52f47..5a77e9987 100644
--- a/src/alloy/backend/x64/lowering/lowering_sequences.h
+++ b/src/alloy/backend/x64/x64_sequences.h
@@ -2,32 +2,32 @@
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
-* Copyright 2013 Ben Vanik. All rights reserved.                             *
+* Copyright 2014 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */
 
-#ifndef ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_SEQUENCES_H_
-#define ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_SEQUENCES_H_
+#ifndef ALLOY_BACKEND_X64_X64_SEQUENCES_H_
+#define ALLOY_BACKEND_X64_X64_SEQUENCES_H_
 
 #include <alloy/core.h>
 
-#include <alloy/hir/hir_builder.h>
+XEDECLARECLASS2(alloy, hir, Instr);
 
 namespace alloy {
 namespace backend {
 namespace x64 {
-namespace lowering {
 
-class LoweringTable;
-
-void RegisterSequences(LoweringTable* table);
+class X64Emitter;
+
+
+void RegisterSequences();
+bool SelectSequence(X64Emitter& e, const hir::Instr* i, const hir::Instr** new_tail);
 
-}  // namespace lowering
 }  // namespace x64
 }  // namespace backend
 }  // namespace alloy
 
-#endif  // ALLOY_BACKEND_X64_X64_LOWERING_LOWERING_SEQUENCES_H_
+#endif  // ALLOY_BACKEND_X64_X64_SEQUENCES_H_
diff --git a/src/alloy/backend/x64/x64_thunk_emitter.cc b/src/alloy/backend/x64/x64_thunk_emitter.cc
new file mode 100644
index 000000000..0e1922581
--- /dev/null
+++ b/src/alloy/backend/x64/x64_thunk_emitter.cc
@@ -0,0 +1,145 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include <alloy/backend/x64/x64_thunk_emitter.h>
+
+#include <third_party/xbyak/xbyak/xbyak.h>
+
+
+using namespace alloy;
+using namespace alloy::backend;
+using namespace alloy::backend::x64;
+
+using namespace Xbyak;
+
+
+X64ThunkEmitter::X64ThunkEmitter(
+    X64Backend* backend, XbyakAllocator* allocator) :
+    X64Emitter(backend, allocator) {
+}
+
+X64ThunkEmitter::~X64ThunkEmitter() {
+}
+
+HostToGuestThunk X64ThunkEmitter::EmitHostToGuestThunk() {
+  // rcx = target
+  // rdx = arg0
+  // r8  = arg1
+
+  const size_t stack_size = StackLayout::THUNK_STACK_SIZE;
+  // rsp + 0 = return address
+  mov(qword[rsp + 8 * 3], r8);
+  mov(qword[rsp + 8 * 2], rdx);
+  mov(qword[rsp + 8 * 1], rcx);
+  sub(rsp, stack_size);
+
+  mov(qword[rsp + 48], rbx);
+  mov(qword[rsp + 56], rcx);
+  mov(qword[rsp + 64], rbp);
+  mov(qword[rsp + 72], rsi);
+  mov(qword[rsp + 80], rdi);
+  mov(qword[rsp + 88], r12);
+  mov(qword[rsp + 96], r13);
+  mov(qword[rsp + 104], r14);
+  mov(qword[rsp + 112], r15);
+
+  /*movaps(ptr[rsp + 128], xmm6);
+  movaps(ptr[rsp + 144], xmm7);
+  movaps(ptr[rsp + 160], xmm8);
+  movaps(ptr[rsp + 176], xmm9);
+  movaps(ptr[rsp + 192], xmm10);
+  movaps(ptr[rsp + 208], xmm11);
+  movaps(ptr[rsp + 224], xmm12);
+  movaps(ptr[rsp + 240], xmm13);
+  movaps(ptr[rsp + 256], xmm14);
+  movaps(ptr[rsp + 272], xmm15);*/
+
+  mov(rax, rcx);
+  mov(rcx, rdx);
+  mov(rdx, r8);
+  call(rax);
+
+  /*movaps(xmm6, ptr[rsp + 128]);
+  movaps(xmm7, ptr[rsp + 144]);
+  movaps(xmm8, ptr[rsp + 160]);
+  movaps(xmm9, ptr[rsp + 176]);
+  movaps(xmm10, ptr[rsp + 192]);
+  movaps(xmm11, ptr[rsp + 208]);
+  movaps(xmm12, ptr[rsp + 224]);
+  movaps(xmm13, ptr[rsp + 240]);
+  movaps(xmm14, ptr[rsp + 256]);
+  movaps(xmm15, ptr[rsp + 272]);*/
+
+  mov(rbx, qword[rsp + 48]);
+  mov(rcx, qword[rsp + 56]);
+  mov(rbp, qword[rsp + 64]);
+  mov(rsi, qword[rsp + 72]);
+  mov(rdi, qword[rsp + 80]);
+  mov(r12, qword[rsp + 88]);
+  mov(r13, qword[rsp + 96]);
+  mov(r14, qword[rsp + 104]);
+  mov(r15, qword[rsp + 112]);
+
+  add(rsp, stack_size);
+  mov(rcx, qword[rsp + 8 * 1]);
+  mov(rdx, qword[rsp + 8 * 2]);
+  mov(r8, qword[rsp + 8 * 3]);
+  ret();
+
+  void* fn = Emplace(stack_size);
+  return (HostToGuestThunk)fn;
+}
+
+GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
+  // rcx = context
+  // rdx = target function
+  // r8  = arg0
+  // r9  = arg1
+
+  const size_t stack_size = StackLayout::THUNK_STACK_SIZE;
+  // rsp + 0 = return address
+  mov(qword[rsp + 8 * 2], rdx);
+  mov(qword[rsp + 8 * 1], rcx);
+  sub(rsp, stack_size);
+
+  mov(qword[rsp + 48], rbx);
+  mov(qword[rsp + 56], rcx);
+  mov(qword[rsp + 64], rbp);
+  mov(qword[rsp + 72], rsi);
+  mov(qword[rsp + 80], rdi);
+  mov(qword[rsp + 88], r12);
+  mov(qword[rsp + 96], r13);
+  mov(qword[rsp + 104], r14);
+  mov(qword[rsp + 112], r15);
+
+  // TODO(benvanik): save things? XMM0-5?
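+  // Shift the arguments into Win64 calling-convention slots for the host
+  // callee: rcx (context) stays put, arg0/arg1 move down from r8/r9 into
+  // rdx/r8, and the target pointer is called through rax.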
+
+  mov(rax, rdx);
+  mov(rdx, r8);
+  mov(r8, r9);
+  call(rax);
+
+  mov(rbx, qword[rsp + 48]);
+  mov(rcx, qword[rsp + 56]);
+  mov(rbp, qword[rsp + 64]);
+  mov(rsi, qword[rsp + 72]);
+  mov(rdi, qword[rsp + 80]);
+  mov(r12, qword[rsp + 88]);
+  mov(r13, qword[rsp + 96]);
+  mov(r14, qword[rsp + 104]);
+  mov(r15, qword[rsp + 112]);
+
+  add(rsp, stack_size);
+  mov(rcx, qword[rsp + 8 * 1]);
+  mov(rdx, qword[rsp + 8 * 2]);
+  ret();
+
+  void* fn = Emplace(stack_size);
+  return (GuestToHostThunk)fn;
+}
diff --git a/src/alloy/backend/x64/x64_thunk_emitter.h b/src/alloy/backend/x64/x64_thunk_emitter.h
new file mode 100644
index 000000000..ae9c7b967
--- /dev/null
+++ b/src/alloy/backend/x64/x64_thunk_emitter.h
@@ -0,0 +1,147 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef ALLOY_BACKEND_X64_X64_THUNK_EMITTER_H_
+#define ALLOY_BACKEND_X64_X64_THUNK_EMITTER_H_
+
+#include <alloy/core.h>
+#include <alloy/backend/x64/x64_backend.h>
+#include <alloy/backend/x64/x64_emitter.h>
+
+
+namespace alloy {
+namespace backend {
+namespace x64 {
+
+
+/**
+ * Stack Layout
+ * ----------------------------
+ * NOTE: stack must always be 16b aligned.
+ *
+ * Thunk stack:
+ *  +------------------+
+ *  | arg temp, 3 * 8  | rsp + 0
+ *  |                  |
+ *  |                  |
+ *  +------------------+
+ *  | scratch, 16b     | rsp + 32
+ *  |                  |
+ *  +------------------+
+ *  | rbx              | rsp + 48
+ *  +------------------+
+ *  | rcx / context    | rsp + 56
+ *  +------------------+
+ *  | rbp              | rsp + 64
+ *  +------------------+
+ *  | rsi              | rsp + 72
+ *  +------------------+
+ *  | rdi              | rsp + 80
+ *  +------------------+
+ *  | r12              | rsp + 88
+ *  +------------------+
+ *  | r13              | rsp + 96
+ *  +------------------+
+ *  | r14              | rsp + 104
+ *  +------------------+
+ *  | r15              | rsp + 112
+ *  +------------------+
+ *  | (return address) | rsp + 120
+ *  +------------------+
+ *  | (rcx home)       | rsp + 128
+ *  +------------------+
+ *  | (rdx home)       | rsp + 136
+ *  +------------------+
+ *
+ *
+ * TODO:
+ *  +------------------+
+ *  | xmm6             | rsp + 128
+ *  |                  |
+ *  +------------------+
+ *  | xmm7             | rsp + 144
+ *  |                  |
+ *  +------------------+
+ *  | xmm8             | rsp + 160
+ *  |                  |
+ *  +------------------+
+ *  | xmm9             | rsp + 176
+ *  |                  |
+ *  +------------------+
+ *  | xmm10            | rsp + 192
+ *  |                  |
+ *  +------------------+
+ *  | xmm11            | rsp + 208
+ *  |                  |
+ *  +------------------+
+ *  | xmm12            | rsp + 224
+ *  |                  |
+ *  +------------------+
+ *  | xmm13            | rsp + 240
+ *  |                  |
+ *  +------------------+
+ *  | xmm14            | rsp + 256
+ *  |                  |
+ *  +------------------+
+ *  | xmm15            | rsp + 272
+ *  |                  |
+ *  +------------------+
+ *
+ * Guest stack:
+ *  +------------------+
+ *  | arg temp, 3 * 8  | rsp + 0
+ *  |                  |
+ *  |                  |
+ *  +------------------+
+ *  | scratch, 32b     | rsp + 32
+ *  |                  |
+ *  +------------------+
+ *  | rcx / context    | rsp + 64
+ *  +------------------+
+ *  | guest ret addr   | rsp + 72
+ *  +------------------+
+ *  | call ret addr    | rsp + 80
+ *  +------------------+
+ *    ... locals ...
+ * +------------------+ + * | (return address) | + * +------------------+ + * + */ + +class StackLayout { +public: + const static size_t THUNK_STACK_SIZE = 120; + + const static size_t GUEST_STACK_SIZE = 88; + const static size_t GUEST_RCX_HOME = 64; + const static size_t GUEST_RET_ADDR = 72; + const static size_t GUEST_CALL_RET_ADDR = 80; +}; + + +class X64ThunkEmitter : public X64Emitter { +public: + X64ThunkEmitter(X64Backend* backend, XbyakAllocator* allocator); + virtual ~X64ThunkEmitter(); + + // Call a generated function, saving all stack parameters. + HostToGuestThunk EmitHostToGuestThunk(); + + // Function that guest code can call to transition into host code. + GuestToHostThunk EmitGuestToHostThunk(); +}; + + +} // namespace x64 +} // namespace backend +} // namespace alloy + + +#endif // XENIA_CPU_X64_X64_THUNK_EMITTER_H_ diff --git a/src/alloy/backend/x64/x64_tracers.cc b/src/alloy/backend/x64/x64_tracers.cc new file mode 100644 index 000000000..0ebb699cb --- /dev/null +++ b/src/alloy/backend/x64/x64_tracers.cc @@ -0,0 +1,200 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include + +using namespace alloy; +using namespace alloy::backend::x64; +using namespace alloy::runtime; + +namespace alloy { +namespace backend { +namespace x64 { + +#define ITRACE 0 +#define DTRACE 0 + +#define TARGET_THREAD 1 + +#define IFLUSH() fflush(stdout) +#define IPRINT if (thread_state->thread_id() == TARGET_THREAD) printf +#define DFLUSH() fflush(stdout) +#define DPRINT DFLUSH(); if (thread_state->thread_id() == TARGET_THREAD) printf + +uint32_t GetTracingMode() { + uint32_t mode = 0; +#if ITRACE + mode |= TRACING_INSTR; +#endif // ITRACE +#if DTRACE + mode |= TRACING_DATA; +#endif // DTRACE + return mode; +} + +void TraceString(void* raw_context, const char* str) { + auto thread_state = *((ThreadState**)raw_context); + IPRINT("XE[t] :%d: %s\n", thread_state->thread_id(), str); + IFLUSH(); +} + +void TraceContextLoadI8(void* raw_context, uint64_t offset, uint8_t value) { + auto thread_state = *((ThreadState**)raw_context); + DPRINT("%d (%X) = ctx i8 +%d\n", (int8_t)value, value, offset); +} +void TraceContextLoadI16(void* raw_context, uint64_t offset, uint16_t value) { + auto thread_state = *((ThreadState**)raw_context); + DPRINT("%d (%X) = ctx i16 +%d\n", (int16_t)value, value, offset); +} +void TraceContextLoadI32(void* raw_context, uint64_t offset, uint32_t value) { + auto thread_state = *((ThreadState**)raw_context); + DPRINT("%d (%X) = ctx i32 +%d\n", (int32_t)value, value, offset); +} +void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value) { + auto thread_state = *((ThreadState**)raw_context); + DPRINT("%lld (%llX) = ctx i64 +%d\n", (int64_t)value, value, offset); +} +void TraceContextLoadF32(void* raw_context, uint64_t offset, __m128 value) { + auto thread_state = *((ThreadState**)raw_context); + DPRINT("%e (%X) = ctx f32 +%d\n", value.m128_f32[0], value.m128_i32[0], offset); +} +void TraceContextLoadF64(void* raw_context, uint64_t offset, __m128 value) { + auto thread_state = *((ThreadState**)raw_context); + union { + double d; + uint64_t x; + } f; 
+  f.x = value.m128_i64[0];
+  DPRINT("%lle (%llX) = ctx f64 +%d\n", f.d, value.m128_i64[0], offset);
+}
+void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = ctx v128 +%d\n",
+         value.m128_f32[0], value.m128_f32[1], value.m128_f32[2], value.m128_f32[3],
+         value.m128_i32[0], value.m128_i32[1], value.m128_i32[2], value.m128_i32[3],
+         offset);
+}
+
+void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("ctx i8 +%d = %d (%X)\n", offset, (int8_t)value, value);
+}
+void TraceContextStoreI16(void* raw_context, uint64_t offset, uint16_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("ctx i16 +%d = %d (%X)\n", offset, (int16_t)value, value);
+}
+void TraceContextStoreI32(void* raw_context, uint64_t offset, uint32_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("ctx i32 +%d = %d (%X)\n", offset, (int32_t)value, value);
+}
+void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("ctx i64 +%d = %lld (%llX)\n", offset, (int64_t)value, value);
+}
+void TraceContextStoreF32(void* raw_context, uint64_t offset, __m128 value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  // Note: float value first, hex second, to match the format string.
+  DPRINT("ctx f32 +%d = %e (%X)\n", offset, value.m128_f32[0], value.m128_i32[0]);
+}
+void TraceContextStoreF64(void* raw_context, uint64_t offset, __m128 value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  union {
+    double d;
+    uint64_t x;
+  } f;
+  f.x = value.m128_i64[0];
+  DPRINT("ctx f64 +%d = %lle (%llX)\n", offset, f.d, value.m128_i64[0]);
+}
+void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128 value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("ctx v128 +%d = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", offset,
+         value.m128_f32[0], value.m128_f32[1], value.m128_f32[2], value.m128_f32[3],
+         value.m128_i32[0], value.m128_i32[1], value.m128_i32[2], value.m128_i32[3]);
+}
+
+void TraceMemoryLoadI8(void* raw_context, uint64_t address, uint8_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("%d (%X) = load.i8 %.8X\n", (int8_t)value, value, address);
+}
+void TraceMemoryLoadI16(void* raw_context, uint64_t address, uint16_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("%d (%X) = load.i16 %.8X\n", (int16_t)value, value, address);
+}
+void TraceMemoryLoadI32(void* raw_context, uint64_t address, uint32_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("%d (%X) = load.i32 %.8X\n", (int32_t)value, value, address);
+}
+void TraceMemoryLoadI64(void* raw_context, uint64_t address, uint64_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("%lld (%llX) = load.i64 %.8X\n", (int64_t)value, value, address);
+}
+void TraceMemoryLoadF32(void* raw_context, uint64_t address, __m128 value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("%e (%X) = load.f32 %.8X\n", value.m128_f32[0], value.m128_i32[0], address);
+}
+void TraceMemoryLoadF64(void* raw_context, uint64_t address, __m128 value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  union {
+    double d;
+    uint64_t x;
+  } f;
+  f.x = value.m128_i64[0];
+  DPRINT("%lle (%llX) = load.f64 %.8X\n", f.d, value.m128_i64[0], address);
+}
+void TraceMemoryLoadV128(void* raw_context, uint64_t address, __m128 value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = load.v128 %.8X\n",
+         value.m128_f32[0], value.m128_f32[1], value.m128_f32[2], value.m128_f32[3],
+         value.m128_i32[0], value.m128_i32[1], value.m128_i32[2], value.m128_i32[3],
+         address);
+}
+
+void TraceMemoryStoreI8(void* raw_context, uint64_t address, uint8_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("store.i8 %.8X = %d (%X)\n", address, (int8_t)value, value);
+}
+void TraceMemoryStoreI16(void* raw_context, uint64_t address, uint16_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("store.i16 %.8X = %d (%X)\n", address, (int16_t)value, value);
+}
+void TraceMemoryStoreI32(void* raw_context, uint64_t address, uint32_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
  DPRINT("store.i32 %.8X = %d (%X)\n", address, (int32_t)value, value);
+}
+void TraceMemoryStoreI64(void* raw_context, uint64_t address, uint64_t value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("store.i64 %.8X = %lld (%llX)\n", address, (int64_t)value, value);
+}
+void TraceMemoryStoreF32(void* raw_context, uint64_t address, __m128 value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("store.f32 %.8X = %e (%X)\n", address, value.m128_f32[0], value.m128_i32[0]);
+}
+void TraceMemoryStoreF64(void* raw_context, uint64_t address, __m128 value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  union {
+    double d;
+    uint64_t x;
+  } f;
+  f.x = value.m128_i64[0];
+  DPRINT("store.f64 %.8X = %lle (%llX)\n", address, f.d, value.m128_i64[0]);
+}
+void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value) {
+  auto thread_state = *((ThreadState**)raw_context);
+  DPRINT("store.v128 %.8X = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", address,
+         value.m128_f32[0], value.m128_f32[1], value.m128_f32[2], value.m128_f32[3],
+         value.m128_i32[0], value.m128_i32[1], value.m128_i32[2], value.m128_i32[3]);
+}
+
+
+}  // namespace x64
+}  // namespace backend
+}  // namespace alloy
diff --git a/src/alloy/backend/x64/x64_tracers.h b/src/alloy/backend/x64/x64_tracers.h
new file mode 100644
index 000000000..64c788ff3
--- /dev/null
+++ b/src/alloy/backend/x64/x64_tracers.h
@@ -0,0 +1,85 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef ALLOY_BACKEND_X64_X64_TRACERS_H_
+#define ALLOY_BACKEND_X64_X64_TRACERS_H_
+
+#include <alloy/core.h>
+
+#if XE_LIKE_WIN32
+#include <xmmintrin.h>
+#else
+typedef union __declspec(align(16)) __m128 {
+  float m128_f32[4];
+  uint64_t m128_u64[2];
+  int8_t m128_i8[16];
+  int16_t m128_i16[8];
+  int32_t m128_i32[4];
+  int64_t m128_i64[2];
+  uint8_t m128_u8[16];
+  uint16_t m128_u16[8];
+  uint32_t m128_u32[4];
+} __m128;
+#endif
+
+
+namespace alloy {
+namespace backend {
+namespace x64 {
+class X64Emitter;
+
+enum TracingMode {
+  TRACING_INSTR = (1 << 1),
+  TRACING_DATA  = (1 << 2),
+};
+
+uint32_t GetTracingMode();
+inline bool IsTracingInstr() { return (GetTracingMode() & TRACING_INSTR) != 0; }
+inline bool IsTracingData() { return (GetTracingMode() & TRACING_DATA) != 0; }
+
+void TraceString(void* raw_context, const char* str);
+
+void TraceContextLoadI8(void* raw_context, uint64_t offset, uint8_t value);
+void TraceContextLoadI16(void* raw_context, uint64_t offset, uint16_t value);
+void TraceContextLoadI32(void* raw_context, uint64_t offset, uint32_t value);
+void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value);
+void TraceContextLoadF32(void* raw_context, uint64_t offset, __m128 value);
+void TraceContextLoadF64(void* raw_context, uint64_t offset, __m128 value);
+void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value);
+
+void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value);
+void TraceContextStoreI16(void* raw_context, uint64_t offset, uint16_t value);
+void TraceContextStoreI32(void* raw_context, uint64_t offset, uint32_t value);
+void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value);
+void TraceContextStoreF32(void* raw_context, uint64_t offset, __m128 value);
+void TraceContextStoreF64(void* raw_context, uint64_t offset, __m128 value);
+void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128 value);
+
+void TraceMemoryLoadI8(void* raw_context, uint64_t address, uint8_t value);
+void TraceMemoryLoadI16(void* raw_context, uint64_t address, uint16_t value);
+void TraceMemoryLoadI32(void* raw_context, uint64_t address, uint32_t value);
+void TraceMemoryLoadI64(void* raw_context, uint64_t address, uint64_t value);
+void TraceMemoryLoadF32(void* raw_context, uint64_t address, __m128 value);
+void TraceMemoryLoadF64(void* raw_context, uint64_t address, __m128 value);
+void TraceMemoryLoadV128(void* raw_context, uint64_t address, __m128 value);
+
+void TraceMemoryStoreI8(void* raw_context, uint64_t address, uint8_t value);
+void TraceMemoryStoreI16(void* raw_context, uint64_t address, uint16_t value);
+void TraceMemoryStoreI32(void* raw_context, uint64_t address, uint32_t value);
+void TraceMemoryStoreI64(void* raw_context, uint64_t address, uint64_t value);
+void TraceMemoryStoreF32(void* raw_context, uint64_t address, __m128 value);
+void TraceMemoryStoreF64(void* raw_context, uint64_t address, __m128 value);
+void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value);
+
+}  // namespace x64
+}  // namespace backend
+}  // namespace alloy
+
+
+#endif  // ALLOY_BACKEND_X64_X64_TRACERS_H_
diff --git a/src/alloy/compiler/compiler.cc b/src/alloy/compiler/compiler.cc
index 07e786ade..62c6e5a4b 100644
--- a/src/alloy/compiler/compiler.cc
+++ b/src/alloy/compiler/compiler.cc
@@ -20,6 +20,8 @@ using namespace alloy::runtime;
 
 Compiler::Compiler(Runtime* runtime) :
     runtime_(runtime) {
+  scratch_arena_ = new Arena();
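+  // The scratch arena is shared by every pass and reset before each pass
+  // runs (see Compile below), so nothing allocated from it may outlive a
+  // single pass.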
+
   alloy::tracing::WriteEvent(EventType::Init({
   }));
 }
@@ -32,6 +34,8 @@ Compiler::~Compiler() {
     delete pass;
   }
 
+  delete scratch_arena_;
+
   alloy::tracing::WriteEvent(EventType::Deinit({
   }));
 }
@@ -45,10 +49,13 @@ void Compiler::Reset() {
 }
 
 int Compiler::Compile(HIRBuilder* builder) {
+  SCOPE_profile_cpu_f("alloy");
+
   // TODO(benvanik): sophisticated stuff. Run passes in parallel, run until they
   // stop changing things, etc.
   for (auto it = passes_.begin(); it != passes_.end(); ++it) {
     CompilerPass* pass = *it;
+    scratch_arena_->Reset();
     if (pass->Run(builder)) {
       return 1;
     }
diff --git a/src/alloy/compiler/compiler.h b/src/alloy/compiler/compiler.h
index ae6b48455..d2874cceb 100644
--- a/src/alloy/compiler/compiler.h
+++ b/src/alloy/compiler/compiler.h
@@ -28,6 +28,7 @@ public:
   ~Compiler();
 
   runtime::Runtime* runtime() const { return runtime_; }
+  Arena* scratch_arena() const { return scratch_arena_; }
 
   void AddPass(CompilerPass* pass);
 
@@ -37,6 +38,7 @@ public:
 
 private:
   runtime::Runtime* runtime_;
+  Arena* scratch_arena_;
 
   typedef std::vector<CompilerPass*> PassList;
   PassList passes_;
diff --git a/src/alloy/compiler/compiler_pass.cc b/src/alloy/compiler/compiler_pass.cc
index 535bcb490..59f71902c 100644
--- a/src/alloy/compiler/compiler_pass.cc
+++ b/src/alloy/compiler/compiler_pass.cc
@@ -27,3 +27,7 @@ int CompilerPass::Initialize(Compiler* compiler) {
   compiler_ = compiler;
   return 0;
 }
+
+Arena* CompilerPass::scratch_arena() const {
+  return compiler_->scratch_arena();
+}
diff --git a/src/alloy/compiler/compiler_pass.h b/src/alloy/compiler/compiler_pass.h
index 1ed1b8144..4ba38b6c4 100644
--- a/src/alloy/compiler/compiler_pass.h
+++ b/src/alloy/compiler/compiler_pass.h
@@ -32,6 +32,9 @@ public:
 
   virtual int Run(hir::HIRBuilder* builder) = 0;
 
+protected:
+  Arena* scratch_arena() const;
+
 protected:
   runtime::Runtime* runtime_;
   Compiler* compiler_;
diff --git a/src/alloy/compiler/compiler_passes.h b/src/alloy/compiler/compiler_passes.h
index 200159ac2..20ec91c66 100644
--- a/src/alloy/compiler/compiler_passes.h
+++ b/src/alloy/compiler/compiler_passes.h
@@ -11,11 +11,15 @@
 #define ALLOY_COMPILER_COMPILER_PASSES_H_
 
 #include <alloy/compiler/passes/constant_propagation_pass.h>
+#include <alloy/compiler/passes/control_flow_analysis_pass.h>
 #include <alloy/compiler/passes/context_promotion_pass.h>
+#include <alloy/compiler/passes/data_flow_analysis_pass.h>
 #include <alloy/compiler/passes/dead_code_elimination_pass.h>
+
 //#include <alloy/compiler/passes/dead_store_elimination_pass.h>
 #include <alloy/compiler/passes/finalization_pass.h>
-//#include <alloy/compiler/passes/register_allocation_pass.h>
+#include <alloy/compiler/passes/register_allocation_pass.h>
 #include <alloy/compiler/passes/simplification_pass.h>
+#include <alloy/compiler/passes/validation_pass.h>
 #include <alloy/compiler/passes/value_reduction_pass.h>
 
 // TODO:
@@ -134,5 +138,42 @@
 //     store_context +302, v5
 //     branch_true v5, ...
 //
+// - X86Canonicalization
+//   For various opcodes add copies/commute the arguments to match x86
+//   operand semantics. This makes code generation easier and if done
+//   before register allocation can prevent a lot of extra shuffling in
+//   the emitted code.
+//
+//   Example:
+//   <block>:
+//     v0 = ...
+//     v1 = ...
+//     v2 = add v0, v1 <-- v1 now unused
+//   Becomes:
+//     v0 = ...
+//     v1 = ...
+//     v1 = add v1, v0 <-- src1 = dest/src, so reuse for both
+//                         by commuting and setting dest = src1
+//
+// - RegisterAllocation
+//   Given a machine description (register classes, counts) run over values
+//   and assign them to registers, adding spills as needed. It should be
+//   possible to directly emit code from this form.
+//
+//   Example:
+//   <block>:
+//     v0 = load_context +0
+//     v1 = load_context +1
+//     v0 = add v0, v1
+//     ...
+//     v2 = mul v0, v1
+//   Becomes:
+//     reg0 = load_context +0
+//     reg1 = load_context +1
+//     reg2 = add reg0, reg1
+//     store_local +123, reg2 <-- spill inserted
+//     ...
+//     reg0 = load_local +123 <-- load inserted
+//     reg0 = mul reg0, reg1
 
 #endif  // ALLOY_COMPILER_COMPILER_PASSES_H_
diff --git a/src/alloy/compiler/passes/constant_propagation_pass.cc b/src/alloy/compiler/passes/constant_propagation_pass.cc
index 0bf269334..13001a904 100644
--- a/src/alloy/compiler/passes/constant_propagation_pass.cc
+++ b/src/alloy/compiler/passes/constant_propagation_pass.cc
@@ -9,6 +9,9 @@
 
 #include <alloy/compiler/passes/constant_propagation_pass.h>
+#include <alloy/runtime/function.h>
+#include <alloy/runtime/runtime.h>
+
 using namespace alloy;
 using namespace alloy::compiler;
 using namespace alloy::compiler::passes;
@@ -23,6 +26,8 @@ ConstantPropagationPass::~ConstantPropagationPass() {
 }
 
 int ConstantPropagationPass::Run(HIRBuilder* builder) {
+  SCOPE_profile_cpu_f("alloy");
+
   // Once ContextPromotion has run there will likely be a whole slew of
   // constants that can be pushed through the function.
   // Example:
@@ -41,6 +46,14 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) {
   //   v1 = add 1000, 1000
   //   store_context +200, 2000
   // A DCE run after this should clean up any of the values no longer needed.
+  //
+  // Special care needs to be taken with paired instructions. For example,
+  // DID_CARRY needs to be set as a constant:
+  //   v1 = sub.2 20, 1
+  //   v2 = did_carry v1
+  // should become:
+  //   v1 = 19
+  //   v2 = 0
 
   Block* block = builder->first_block();
   while (block) {
@@ -79,6 +92,17 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) {
           }
         }
         break;
+      case OPCODE_CALL_INDIRECT:
+        if (i->src1.value->IsConstant()) {
+          runtime::FunctionInfo* symbol_info;
+          if (runtime_->LookupFunctionInfo(
+              (uint32_t)i->src1.value->constant.i32, &symbol_info)) {
+            break;
+          }
+          i->Replace(&OPCODE_CALL_info, i->flags);
+          i->src1.symbol_info = symbol_info;
+        }
+        break;
       case OPCODE_CALL_INDIRECT_TRUE:
         if (i->src1.value->IsConstant()) {
           if (i->src1.value->IsConstantTrue()) {
@@ -179,20 +203,112 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) {
         break;
 
       // TODO(benvanik): compares
+      case OPCODE_COMPARE_EQ:
+        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          bool value = i->src1.value->IsConstantEQ(i->src2.value);
+          i->dest->set_constant(value);
+          i->Remove();
+        }
+        break;
+      case OPCODE_COMPARE_NE:
+        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          bool value = i->src1.value->IsConstantNE(i->src2.value);
+          i->dest->set_constant(value);
+          i->Remove();
+        }
+        break;
+      case OPCODE_COMPARE_SLT:
+        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          bool value = i->src1.value->IsConstantSLT(i->src2.value);
+          i->dest->set_constant(value);
+          i->Remove();
+        }
+        break;
+      case OPCODE_COMPARE_SLE:
+        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          bool value = i->src1.value->IsConstantSLE(i->src2.value);
+          i->dest->set_constant(value);
+          i->Remove();
+        }
+        break;
+      case OPCODE_COMPARE_SGT:
+        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          bool value = i->src1.value->IsConstantSGT(i->src2.value);
+          i->dest->set_constant(value);
+          i->Remove();
+        }
+        break;
+      case OPCODE_COMPARE_SGE:
+        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          bool value = i->src1.value->IsConstantSGE(i->src2.value);
+          i->dest->set_constant(value);
+          i->Remove();
+        }
+        break;
+      case OPCODE_COMPARE_ULT:
+        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          bool value = i->src1.value->IsConstantULT(i->src2.value);
+          i->dest->set_constant(value);
+          i->Remove();
+        }
+        break;
+      case OPCODE_COMPARE_ULE:
+        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          bool value = i->src1.value->IsConstantULE(i->src2.value);
+          i->dest->set_constant(value);
+          i->Remove();
+        }
+        break;
+      case OPCODE_COMPARE_UGT:
+        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          bool value = i->src1.value->IsConstantUGT(i->src2.value);
+          i->dest->set_constant(value);
+          i->Remove();
+        }
+        break;
+      case OPCODE_COMPARE_UGE:
+        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          bool value = i->src1.value->IsConstantUGE(i->src2.value);
+          i->dest->set_constant(value);
+          i->Remove();
+        }
+        break;
+
+      case OPCODE_DID_CARRY:
+        XEASSERT(!i->src1.value->IsConstant());
+        break;
+      case OPCODE_DID_OVERFLOW:
+        XEASSERT(!i->src1.value->IsConstant());
+        break;
+      case OPCODE_DID_SATURATE:
+        XEASSERT(!i->src1.value->IsConstant());
+        break;
 
       case OPCODE_ADD:
         if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
           v->set_from(i->src1.value);
-          v->Add(i->src2.value);
+          bool did_carry = v->Add(i->src2.value);
+          bool propagate_carry = !!(i->flags & ARITHMETIC_SET_CARRY);
           i->Remove();
+
+          // If carry is set, find the DID_CARRY and fix it.
+          if (propagate_carry) {
+            PropagateCarry(v, did_carry);
+          }
         }
         break;
-      // TODO(benvanik): ADD_CARRY
+      // TODO(benvanik): ADD_CARRY (w/ ARITHMETIC_SET_CARRY)
       case OPCODE_SUB:
         if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
           v->set_from(i->src1.value);
-          v->Sub(i->src2.value);
+          bool did_carry = v->Sub(i->src2.value);
+          bool propagate_carry = !!(i->flags & ARITHMETIC_SET_CARRY);
           i->Remove();
+
+          // If carry is set, find the DID_CARRY and fix it.
+          if (propagate_carry) {
+            PropagateCarry(v, did_carry);
+          }
         }
         break;
       case OPCODE_MUL:
@@ -298,6 +414,13 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) {
           i->Remove();
         }
        break;
+      case OPCODE_CNTLZ:
+        if (i->src1.value->IsConstant()) {
+          v->set_zero(v->type);
+          v->CountLeadingZeros(i->src1.value);
+          i->Remove();
+        }
+        break;
       // TODO(benvanik): INSERT/EXTRACT
       // TODO(benvanik): SPLAT/PERMUTE/SWIZZLE
       case OPCODE_SPLAT:
@@ -314,3 +437,16 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) {
 
   return 0;
 }
+
+void ConstantPropagationPass::PropagateCarry(hir::Value* v, bool did_carry) {
+  auto next = v->use_head;
+  while (next) {
+    auto use = next;
+    next = use->next;
+    if (use->instr->opcode == &OPCODE_DID_CARRY_info) {
+      // Replace carry value.
+      use->instr->dest->set_constant(did_carry ? 1 : 0);
1 : 0); + use->instr->Remove(); + } + } +} diff --git a/src/alloy/compiler/passes/constant_propagation_pass.h b/src/alloy/compiler/passes/constant_propagation_pass.h index ce705522b..2220394ad 100644 --- a/src/alloy/compiler/passes/constant_propagation_pass.h +++ b/src/alloy/compiler/passes/constant_propagation_pass.h @@ -26,6 +26,7 @@ public: virtual int Run(hir::HIRBuilder* builder); private: + void PropagateCarry(hir::Value* v, bool did_carry); }; diff --git a/src/alloy/compiler/passes/context_promotion_pass.cc b/src/alloy/compiler/passes/context_promotion_pass.cc index a5123486b..dc225aea6 100644 --- a/src/alloy/compiler/passes/context_promotion_pass.cc +++ b/src/alloy/compiler/passes/context_promotion_pass.cc @@ -9,6 +9,8 @@ #include +#include + #include #include @@ -20,6 +22,10 @@ using namespace alloy::hir; using namespace alloy::runtime; +DEFINE_bool(store_all_context_values, false, + "Don't strip dead context stores to aid in debugging."); + + ContextPromotionPass::ContextPromotionPass() : context_values_size_(0), context_values_(0), CompilerPass() { @@ -45,6 +51,8 @@ int ContextPromotionPass::Initialize(Compiler* compiler) { } int ContextPromotionPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Like mem2reg, but because context memory is unaliasable it's easier to // check and convert LoadContext/StoreContext into value operations. // Example of load->value promotion: @@ -69,10 +77,12 @@ int ContextPromotionPass::Run(HIRBuilder* builder) { } // Remove all dead stores. - block = builder->first_block(); - while (block) { - RemoveDeadStoresBlock(block); - block = block->next; + if (!FLAGS_store_all_context_values) { + block = builder->first_block(); + while (block) { + RemoveDeadStoresBlock(block); + block = block->next; + } } return 0; diff --git a/src/alloy/compiler/passes/control_flow_analysis_pass.cc b/src/alloy/compiler/passes/control_flow_analysis_pass.cc new file mode 100644 index 000000000..5cf6ea6a6 --- /dev/null +++ b/src/alloy/compiler/passes/control_flow_analysis_pass.cc @@ -0,0 +1,69 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include + +using namespace alloy; +using namespace alloy::backend; +using namespace alloy::compiler; +using namespace alloy::compiler::passes; +using namespace alloy::frontend; +using namespace alloy::hir; +using namespace alloy::runtime; + + +ControlFlowAnalysisPass::ControlFlowAnalysisPass() : + CompilerPass() { +} + +ControlFlowAnalysisPass::~ControlFlowAnalysisPass() { +} + +int ControlFlowAnalysisPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + + // TODO(benvanik): reset edges for all blocks? Needed to be re-runnable. + + // Add edges. 
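+  // A block may end with more than one branch instruction (for example a
+  // BRANCH_TRUE followed by an unconditional BRANCH), so we walk backwards
+  // from the tail until the first non-branch and record one edge per branch
+  // target label.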
+ auto block = builder->first_block(); + while (block) { + auto instr = block->instr_tail; + while (instr) { + if ((instr->opcode->flags & OPCODE_FLAG_BRANCH) == 0) { + break; + } + if (instr->opcode == &OPCODE_BRANCH_info) { + auto label = instr->src1.label; + builder->AddEdge(block, label->block, Edge::UNCONDITIONAL); + } else if (instr->opcode == &OPCODE_BRANCH_TRUE_info || + instr->opcode == &OPCODE_BRANCH_FALSE_info) { + auto label = instr->src2.label; + builder->AddEdge(block, label->block, 0); + } + instr = instr->prev; + } + block = block->next; + } + + // Mark dominators. + block = builder->first_block(); + while (block) { + if (block->incoming_edge_head && + !block->incoming_edge_head->incoming_next) { + block->incoming_edge_head->flags |= Edge::DOMINATES; + } + block = block->next; + } + + return 0; +} diff --git a/src/alloy/compiler/passes/control_flow_analysis_pass.h b/src/alloy/compiler/passes/control_flow_analysis_pass.h new file mode 100644 index 000000000..c639db5cb --- /dev/null +++ b/src/alloy/compiler/passes/control_flow_analysis_pass.h @@ -0,0 +1,37 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef ALLOY_COMPILER_PASSES_CONTROL_FLOW_ANALYSIS_PASS_H_ +#define ALLOY_COMPILER_PASSES_CONTROL_FLOW_ANALYSIS_PASS_H_ + +#include + + +namespace alloy { +namespace compiler { +namespace passes { + + +class ControlFlowAnalysisPass : public CompilerPass { +public: + ControlFlowAnalysisPass(); + virtual ~ControlFlowAnalysisPass(); + + virtual int Run(hir::HIRBuilder* builder); + +private: +}; + + +} // namespace passes +} // namespace compiler +} // namespace alloy + + +#endif // ALLOY_COMPILER_PASSES_CONTROL_FLOW_ANALYSIS_PASS_H_ diff --git a/src/alloy/compiler/passes/data_flow_analysis_pass.cc b/src/alloy/compiler/passes/data_flow_analysis_pass.cc new file mode 100644 index 000000000..209410016 --- /dev/null +++ b/src/alloy/compiler/passes/data_flow_analysis_pass.cc @@ -0,0 +1,203 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include + +#pragma warning(push) +#pragma warning(disable : 4244) +#pragma warning(disable : 4267) +#include +#pragma warning(pop) + +using namespace alloy; +using namespace alloy::backend; +using namespace alloy::compiler; +using namespace alloy::compiler::passes; +using namespace alloy::frontend; +using namespace alloy::hir; +using namespace alloy::runtime; + + +DataFlowAnalysisPass::DataFlowAnalysisPass() : + CompilerPass() { +} + +DataFlowAnalysisPass::~DataFlowAnalysisPass() { +} + +int DataFlowAnalysisPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + + // Linearize blocks so that we can detect cycles and propagate dependencies. 
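+  // Ordinals give the blocks a linear order. During the reverse walk in
+  // AnalyzeFlow an outgoing edge to a higher ordinal is a forward edge whose
+  // incoming set has already been computed; an edge to a lower ordinal is a
+  // back edge (a loop) and is skipped when unioning successor sets.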
+ uint32_t block_count = LinearizeBlocks(builder); + + // Analyze value flow and add locals as needed. + AnalyzeFlow(builder, block_count); + + return 0; +} + +uint32_t DataFlowAnalysisPass::LinearizeBlocks(HIRBuilder* builder) { + // TODO(benvanik): actually do this - we cheat now knowing that they are in + // sequential order. + uint32_t block_ordinal = 0; + auto block = builder->first_block(); + while (block) { + block->ordinal = block_ordinal++; + block = block->next; + } + return block_ordinal; +} + +void DataFlowAnalysisPass::AnalyzeFlow(HIRBuilder* builder, + uint32_t block_count) { + uint32_t max_value_estimate = + builder->max_value_ordinal() + 1 + block_count * 4; + + // Stash for value map. We may want to maintain this during building. + auto arena = builder->arena(); + Value** value_map = (Value**)arena->Alloc( + sizeof(Value*) * max_value_estimate); + + // Allocate incoming bitvectors for use by blocks. We don't need outgoing + // because they are only used during the block iteration. + // Mapped by block ordinal. + // TODO(benvanik): cache this list, grow as needed, etc. + auto incoming_bitvectors = (llvm::BitVector**)arena->Alloc( + sizeof(llvm::BitVector*) * block_count); + for (auto n = 0u; n < block_count; n++) { + incoming_bitvectors[n] = new llvm::BitVector(max_value_estimate); + } + + // Walk blocks in reverse and calculate incoming/outgoing values. + auto block = builder->last_block(); + while (block) { + // Allocate bitsets based on max value number. + block->incoming_values = incoming_bitvectors[block->ordinal]; + auto& incoming_values = *block->incoming_values; + + // Walk instructions and gather up incoming values. + auto instr = block->instr_head; + while (instr) { + uint32_t signature = instr->opcode->signature; +#define SET_INCOMING_VALUE(v) \ + if (v->def && v->def->block != block) { \ + incoming_values.set(v->ordinal); \ + } \ + XEASSERT(v->ordinal < max_value_estimate); \ + value_map[v->ordinal] = v; + if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) { + SET_INCOMING_VALUE(instr->src1.value); + } + if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) { + SET_INCOMING_VALUE(instr->src2.value); + } + if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) { + SET_INCOMING_VALUE(instr->src3.value); + } +#undef SET_INCOMING_VALUE + instr = instr->next; + } + + // Add all successor incoming values to our outgoing, as we need to + // pass them through. + llvm::BitVector outgoing_values(max_value_estimate); + auto outgoing_edge = block->outgoing_edge_head; + while (outgoing_edge) { + if (outgoing_edge->dest->ordinal > block->ordinal) { + outgoing_values |= *outgoing_edge->dest->incoming_values; + } + outgoing_edge = outgoing_edge->outgoing_next; + } + incoming_values |= outgoing_values; + + // Add stores for all outgoing values. + auto outgoing_ordinal = outgoing_values.find_first(); + while (outgoing_ordinal != -1) { + Value* src_value = value_map[outgoing_ordinal]; + XEASSERTNOTNULL(src_value); + if (!src_value->local_slot) { + src_value->local_slot = builder->AllocLocal(src_value->type); + } + builder->StoreLocal(src_value->local_slot, src_value); + + // If we are in the block the value was defined in: + if (src_value->def->block == block) { + // Move the store to right after the def, or as soon after + // as we can (respecting PAIRED flags). 
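+        // (Paired instructions such as DID_CARRY must stay adjacent to their
+        //  producer, so we skip over any OPCODE_FLAG_PAIRED_PREV instructions
+        //  before inserting the store.)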
+ auto def_next = src_value->def->next; + while (def_next && def_next->opcode->flags & OPCODE_FLAG_PAIRED_PREV) { + def_next = def_next->next; + } + XEASSERTNOTNULL(def_next); + builder->last_instr()->MoveBefore(def_next); + + // We don't need it in the incoming list. + incoming_values.reset(outgoing_ordinal); + } else { + // Eh, just throw at the end, before the first branch. + auto tail = block->instr_tail; + while (tail && tail->opcode->flags & OPCODE_FLAG_BRANCH) { + tail = tail->prev; + } + XEASSERTNOTZERO(tail); + builder->last_instr()->MoveBefore(tail->next); + } + + outgoing_ordinal = outgoing_values.find_next(outgoing_ordinal); + } + + // Add loads for all incoming values and rename them in the block. + auto incoming_ordinal = incoming_values.find_first(); + while (incoming_ordinal != -1) { + Value* src_value = value_map[incoming_ordinal]; + XEASSERTNOTNULL(src_value); + if (!src_value->local_slot) { + src_value->local_slot = builder->AllocLocal(src_value->type); + } + Value* local_value = builder->LoadLocal(src_value->local_slot); + builder->last_instr()->MoveBefore(block->instr_head); + + // Swap uses of original value with the local value. + auto instr = block->instr_head; + while (instr) { + uint32_t signature = instr->opcode->signature; + if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) { + if (instr->src1.value == src_value) { + instr->set_src1(local_value); + } + } + if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) { + if (instr->src2.value == src_value) { + instr->set_src2(local_value); + } + } + if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) { + if (instr->src3.value == src_value) { + instr->set_src3(local_value); + } + } + instr = instr->next; + } + + incoming_ordinal = incoming_values.find_next(incoming_ordinal); + } + + block = block->prev; + } + + // Cleanup bitvectors. + for (auto n = 0u; n < block_count; n++) { + delete incoming_bitvectors[n]; + } +} diff --git a/src/alloy/compiler/passes/data_flow_analysis_pass.h b/src/alloy/compiler/passes/data_flow_analysis_pass.h new file mode 100644 index 000000000..d19dc6e1c --- /dev/null +++ b/src/alloy/compiler/passes/data_flow_analysis_pass.h @@ -0,0 +1,39 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
*
+ ******************************************************************************
+ */
+
+#ifndef ALLOY_COMPILER_PASSES_DATA_FLOW_ANALYSIS_PASS_H_
+#define ALLOY_COMPILER_PASSES_DATA_FLOW_ANALYSIS_PASS_H_
+
+#include <alloy/compiler/compiler_pass.h>
+
+
+namespace alloy {
+namespace compiler {
+namespace passes {
+
+
+class DataFlowAnalysisPass : public CompilerPass {
+public:
+  DataFlowAnalysisPass();
+  virtual ~DataFlowAnalysisPass();
+
+  virtual int Run(hir::HIRBuilder* builder);
+
+private:
+  uint32_t LinearizeBlocks(hir::HIRBuilder* builder);
+  void AnalyzeFlow(hir::HIRBuilder* builder, uint32_t block_count);
+};
+
+
+} // namespace passes
+} // namespace compiler
+} // namespace alloy
+
+
+#endif // ALLOY_COMPILER_PASSES_DATA_FLOW_ANALYSIS_PASS_H_
diff --git a/src/alloy/compiler/passes/dead_code_elimination_pass.cc b/src/alloy/compiler/passes/dead_code_elimination_pass.cc
index a9b7c7bdb..afb8d87b2 100644
--- a/src/alloy/compiler/passes/dead_code_elimination_pass.cc
+++ b/src/alloy/compiler/passes/dead_code_elimination_pass.cc
@@ -23,6 +23,8 @@ DeadCodeEliminationPass::~DeadCodeEliminationPass() {
 }
 
 int DeadCodeEliminationPass::Run(HIRBuilder* builder) {
+  SCOPE_profile_cpu_f("alloy");
+
   // ContextPromotion/DSE will likely leave around a lot of dead statements.
   // Code generated for comparison/testing produces many unused statements and
   // with proper use analysis it should be possible to remove most of them:
@@ -59,20 +61,21 @@ int DeadCodeEliminationPass::Run(HIRBuilder* builder) {
   // all removed ops with NOP and then do a single pass that removes them
   // all.
 
-  bool any_removed = false;
+  bool any_instr_removed = false;
+  bool any_locals_removed = false;
   Block* block = builder->first_block();
   while (block) {
+    // Walk instructions in reverse.
     Instr* i = block->instr_tail;
     while (i) {
-      Instr* prev = i->prev;
+      auto prev = i->prev;
 
-      const OpcodeInfo* opcode = i->opcode;
-      uint32_t signature = opcode->signature;
+      auto opcode = i->opcode;
       if (!(opcode->flags & OPCODE_FLAG_VOLATILE) &&
           i->dest && !i->dest->use_head) {
         // Has no uses and is not volatile. This instruction can die!
         MakeNopRecursive(i);
-        any_removed = true;
+        any_instr_removed = true;
       } else if (opcode == &OPCODE_ASSIGN_info) {
         // Assignment. These are useless, so just try to remove by completely
         // replacing the value.
@@ -82,11 +85,31 @@ int DeadCodeEliminationPass::Run(HIRBuilder* builder) {
       i = prev;
     }
 
+    // Walk instructions forward.
+    i = block->instr_head;
+    while (i) {
+      auto next = i->next;
+
+      auto opcode = i->opcode;
+      if (opcode == &OPCODE_STORE_LOCAL_info) {
+        // Check to see if there are any intervening uses between the store
+        // and its paired load. If not, the pair can be removed (the local is
+        // just passing a value through the function).
+        // We do this after the previous pass so that removed code doesn't keep
+        // the local alive.
+        if (!CheckLocalUse(i)) {
+          any_locals_removed = true;
+        }
+      }
+
+      i = next;
+    }
+
     block = block->next;
   }
 
   // Remove all nops.
-  if (any_removed) {
+  if (any_instr_removed) {
     Block* block = builder->first_block();
     while (block) {
       Instr* i = block->instr_head;
@@ -102,6 +125,21 @@ int DeadCodeEliminationPass::Run(HIRBuilder* builder) {
     }
   }
 
+  // Remove any locals that no longer have uses.
+  if (any_locals_removed) {
+    // TODO(benvanik): local removal/dealloc.
+    auto& locals = builder->locals();
+    for (auto it = locals.begin(); it != locals.end();) {
+      auto value = *it;
+      if (!value->use_head) {
+        // Unused, can be removed; erase returns the next valid iterator.
+        it = locals.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
+
   return 0;
 }
 
@@ -150,3 +188,24 @@ void DeadCodeEliminationPass::ReplaceAssignment(Instr* i) {
 
   i->Remove();
 }
+
+bool DeadCodeEliminationPass::CheckLocalUse(Instr* i) {
+  auto slot = i->src1.value;
+  auto src = i->src2.value;
+
+  auto use = src->use_head;
+  if (use) {
+    auto use_instr = use->instr;
+    if (use_instr->opcode != &OPCODE_LOAD_LOCAL_info) {
+      // A valid use (probably). Keep it.
+      return true;
+    }
+
+    // Load/store are paired. They can both be removed.
+    use_instr->Remove();
+  }
+
+  i->Remove();
+
+  return false;
+}
diff --git a/src/alloy/compiler/passes/dead_code_elimination_pass.h b/src/alloy/compiler/passes/dead_code_elimination_pass.h
index 9a8cfc43a..9c3100f8c 100644
--- a/src/alloy/compiler/passes/dead_code_elimination_pass.h
+++ b/src/alloy/compiler/passes/dead_code_elimination_pass.h
@@ -28,6 +28,7 @@ public:
 private:
   void MakeNopRecursive(hir::Instr* i);
   void ReplaceAssignment(hir::Instr* i);
+  bool CheckLocalUse(hir::Instr* i);
 };
 
diff --git a/src/alloy/compiler/passes/finalization_pass.cc b/src/alloy/compiler/passes/finalization_pass.cc
index 3fa3fc1b6..e6358f242 100644
--- a/src/alloy/compiler/passes/finalization_pass.cc
+++ b/src/alloy/compiler/passes/finalization_pass.cc
@@ -30,6 +30,8 @@ FinalizationPass::~FinalizationPass() {
 }
 
 int FinalizationPass::Run(HIRBuilder* builder) {
+  SCOPE_profile_cpu_f("alloy");
+
   // Process the HIR and prepare it for lowering.
   // After this is done the HIR should be ready for emitting.
 
@@ -44,9 +46,9 @@ int FinalizationPass::Run(HIRBuilder* builder) {
     auto label = block->label_head;
     while (label) {
       if (!label->name) {
-        char* name = (char*)arena->Alloc(6 + 4 + 1);
-        xestrcpya(name, 6 + 1, "_label");
-        char* part = _itoa(label->id, name + 6, 10);
+        const size_t label_len = 6 + 4 + 1;
+        char* name = (char*)arena->Alloc(label_len);
+        xesnprintfa(name, label_len, "_label%d", label->id);
         label->name = name;
       }
       label = label->next;
diff --git a/src/alloy/compiler/passes/register_allocation_pass.cc b/src/alloy/compiler/passes/register_allocation_pass.cc
new file mode 100644
index 000000000..7c3a0a7a9
--- /dev/null
+++ b/src/alloy/compiler/passes/register_allocation_pass.cc
@@ -0,0 +1,539 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include <alloy/compiler/passes/register_allocation_pass.h>
+
+#include <algorithm>
+
+using namespace alloy;
+using namespace alloy::backend;
+using namespace alloy::compiler;
+using namespace alloy::compiler::passes;
+using namespace alloy::hir;
+
+
+#define ASSERT_NO_CYCLES 0
+
+
+RegisterAllocationPass::RegisterAllocationPass(
+    const MachineInfo* machine_info) :
+    machine_info_(machine_info),
+    CompilerPass() {
+  // Initialize register sets.
+  // TODO(benvanik): rewrite in a way that makes sense - this is terrible.
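+  // The backend publishes its allocatable registers through MachineInfo.
+  // The shape consumed below is roughly (a sketch based on the fields used
+  // here, not the full definition):
+  //   struct RegisterSet {
+  //     uint32_t types;  // Bitmask of INT_TYPES/FLOAT_TYPES/VEC_TYPES.
+  //     uint32_t count;  // Allocatable registers; count == 0 ends the list.
+  //   };
+  // Each RegisterSetUsage tracks availability and upcoming uses for one set.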
+ auto mi_sets = machine_info->register_sets; + xe_zero_struct(&usage_sets_, sizeof(usage_sets_)); + uint32_t n = 0; + while (mi_sets[n].count) { + auto& mi_set = mi_sets[n]; + auto usage_set = new RegisterSetUsage(); + usage_sets_.all_sets[n] = usage_set; + usage_set->count = mi_set.count; + usage_set->set = &mi_set; + if (mi_set.types & MachineInfo::RegisterSet::INT_TYPES) { + usage_sets_.int_set = usage_set; + } + if (mi_set.types & MachineInfo::RegisterSet::FLOAT_TYPES) { + usage_sets_.float_set = usage_set; + } + if (mi_set.types & MachineInfo::RegisterSet::VEC_TYPES) { + usage_sets_.vec_set = usage_set; + } + n++; + } +} + +RegisterAllocationPass::~RegisterAllocationPass() { + for (size_t n = 0; n < XECOUNT(usage_sets_.all_sets); n++) { + if (!usage_sets_.all_sets[n]) { + break; + } + delete usage_sets_.all_sets[n]; + } +} + +int RegisterAllocationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + + // Simple per-block allocator that operates on SSA form. + // Registers do not move across blocks, though this could be + // optimized with some intra-block analysis (dominators/etc). + // Really, it'd just be nice to have someone who knew what they + // were doing lower SSA and do this right. + + uint32_t block_ordinal = 0; + uint32_t instr_ordinal = 0; + auto block = builder->first_block(); + while (block) { + // Sequential block ordinals. + block->ordinal = block_ordinal++; + + // Reset all state. + PrepareBlockState(); + + // Renumber all instructions in the block. This is required so that + // we can sort the usage pointers below. + auto instr = block->instr_head; + while (instr) { + // Sequential global instruction ordinals. + instr->ordinal = instr_ordinal++; + instr = instr->next; + } + + instr = block->instr_head; + while (instr) { + const OpcodeInfo* info = instr->opcode; + uint32_t signature = info->signature; + + // Update the register use heaps. + AdvanceUses(instr); + + // Check sources for retirement. If any are unused after this instruction + // we can eagerly evict them to speed up register allocation. + // Since X64 (and other platforms) can often take advantage of dest==src1 + // register mappings we track retired src1 so that we can attempt to + // reuse it. + // NOTE: these checks require that the usage list be sorted! + bool has_preferred_reg = false; + RegAssignment preferred_reg = { 0 }; + if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V && + !instr->src1.value->IsConstant()) { + if (!instr->src1_use->next) { + // Pull off preferred register. We will try to reuse this for the + // dest. + has_preferred_reg = true; + preferred_reg = instr->src1.value->reg; + XEASSERTNOTNULL(preferred_reg.set); + } + } + + if (GET_OPCODE_SIG_TYPE_DEST(signature) == OPCODE_SIG_TYPE_V) { + // Must not have been set already. + XEASSERTNULL(instr->dest->reg.set); + + // Sort the usage list. We depend on this in future uses of this variable. + SortUsageList(instr->dest); + + // If we have a preferred register, use that. + // This way we can help along the stupid X86 two opcode instructions. + bool allocated; + if (has_preferred_reg) { + // Allocate with the given preferred register. If the register is in + // the wrong set it will not be reused. + allocated = TryAllocateRegister(instr->dest, preferred_reg); + } else { + // Allocate a register. This will either reserve a free one or + // spill and reuse an active one. + allocated = TryAllocateRegister(instr->dest); + } + if (!allocated) { + // Failed to allocate register -- need to spill and try again. 
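+          // (A spill stores the victim value to a stack local and frees its
+          //  register; the reload is inserted lazily right before the value's
+          //  next use. See SpillOneRegister below.)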
+ // We spill only those registers we aren't using. + if (!SpillOneRegister(builder, instr->dest->type)) { + // Unable to spill anything - this shouldn't happen. + XELOGE("Unable to spill any registers"); + XEASSERTALWAYS(); + return 1; + } + + // Demand allocation. + if (!TryAllocateRegister(instr->dest)) { + // Boned. + XELOGE("Register allocation failed"); + XEASSERTALWAYS(); + return 1; + } + } + } + + instr = instr->next; + } + block = block->next; + } + + return 0; +} + +void RegisterAllocationPass::DumpUsage(const char* name) { +#if 0 + fprintf(stdout, "\n%s:\n", name); + for (size_t i = 0; i < XECOUNT(usage_sets_.all_sets); ++i) { + auto usage_set = usage_sets_.all_sets[i]; + if (usage_set) { + fprintf(stdout, "set %s:\n", usage_set->set->name); + fprintf(stdout, " avail: %s\n", usage_set->availability.to_string().c_str()); + fprintf(stdout, " upcoming uses:\n"); + for (auto it = usage_set->upcoming_uses.begin(); + it != usage_set->upcoming_uses.end(); ++it) { + fprintf(stdout, " v%d, used at %d\n", + it->value->ordinal, + it->use->instr->ordinal); + } + } + } + fflush(stdout); +#endif +} + + +void RegisterAllocationPass::PrepareBlockState() { + for (size_t i = 0; i < XECOUNT(usage_sets_.all_sets); ++i) { + auto usage_set = usage_sets_.all_sets[i]; + if (usage_set) { + usage_set->availability.set(); + usage_set->upcoming_uses.clear(); + } + } + DumpUsage("PrepareBlockState"); +} + +void RegisterAllocationPass::AdvanceUses(Instr* instr) { + for (size_t i = 0; i < XECOUNT(usage_sets_.all_sets); ++i) { + auto usage_set = usage_sets_.all_sets[i]; + if (!usage_set) { + break; + } + auto& upcoming_uses = usage_set->upcoming_uses; + for (auto it = upcoming_uses.begin(); it != upcoming_uses.end();) { + if (!it->use) { + // No uses at all - we can remove right away. + // This comes up from instructions where the dest is never used, + // like the ATOMIC ops. + MarkRegAvailable(it->value->reg); + it = upcoming_uses.erase(it); + continue; + } + if (it->use->instr != instr) { + // Not yet at this instruction. + ++it; + continue; + } + // The use is from this instruction. + if (!it->use->next) { + // Last use of the value. We can retire it now. + MarkRegAvailable(it->value->reg); + it = upcoming_uses.erase(it); + } else { + // Used again. Push back the next use. + // Note that we may be used multiple times this instruction, so + // eat those. + auto next_use = it->use->next; + while (next_use->next && next_use->instr == instr) { + next_use = next_use->next; + } + // Remove the iterator. 
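+        // (Subtle: the erase below frees a slot, so the emplace_back cannot
+        //  trigger a reallocation and the iterator returned by erase stays
+        //  valid for the next loop step.)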
+ auto value = it->value; + it = upcoming_uses.erase(it); + upcoming_uses.emplace_back(value, next_use); + } + } + } + DumpUsage("AdvanceUses"); +} + +bool RegisterAllocationPass::IsRegInUse(const RegAssignment& reg) { + RegisterSetUsage* usage_set; + if (reg.set == usage_sets_.int_set->set) { + usage_set = usage_sets_.int_set; + } else if (reg.set == usage_sets_.float_set->set) { + usage_set = usage_sets_.float_set; + } else { + usage_set = usage_sets_.vec_set; + } + return !usage_set->availability.test(reg.index); +} + +RegisterAllocationPass::RegisterSetUsage* +RegisterAllocationPass::MarkRegUsed(const RegAssignment& reg, + Value* value, Value::Use* use) { + auto usage_set = RegisterSetForValue(value); + usage_set->availability.set(reg.index, false); + usage_set->upcoming_uses.emplace_back(value, use); + DumpUsage("MarkRegUsed"); + return usage_set; +} + +RegisterAllocationPass::RegisterSetUsage* +RegisterAllocationPass::MarkRegAvailable(const hir::RegAssignment& reg) { + RegisterSetUsage* usage_set; + if (reg.set == usage_sets_.int_set->set) { + usage_set = usage_sets_.int_set; + } else if (reg.set == usage_sets_.float_set->set) { + usage_set = usage_sets_.float_set; + } else { + usage_set = usage_sets_.vec_set; + } + usage_set->availability.set(reg.index, true); + return usage_set; +} + +bool RegisterAllocationPass::TryAllocateRegister( + Value* value, const RegAssignment& preferred_reg) { + // If the preferred register matches type and is available, use it. + auto usage_set = RegisterSetForValue(value); + if (usage_set->set == preferred_reg.set) { + // Check if available. + if (!IsRegInUse(preferred_reg)) { + // Mark as in-use and return. Best case. + MarkRegUsed(preferred_reg, value, value->use_head); + value->reg = preferred_reg; + return true; + } + } + + // Otherwise, fallback to allocating like normal. + return TryAllocateRegister(value); +} + +bool RegisterAllocationPass::TryAllocateRegister(Value* value) { + // Get the set this register is in. + RegisterSetUsage* usage_set = RegisterSetForValue(value); + + // Find the first free register, if any. + // We have to ensure it's a valid one (in our count). + unsigned long first_unused = 0; + bool all_used = _BitScanForward(&first_unused, usage_set->availability.to_ulong()) == 0; + if (!all_used && first_unused < usage_set->count) { + // Available! Use it!. + value->reg.set = usage_set->set; + value->reg.index = first_unused; + MarkRegUsed(value->reg, value, value->use_head); + return true; + } + + // None available! Spill required. + return false; +} + +bool RegisterAllocationPass::SpillOneRegister( + HIRBuilder* builder, TypeName required_type) { + // Get the set that we will be picking from. + RegisterSetUsage* usage_set; + if (required_type <= INT64_TYPE) { + usage_set = usage_sets_.int_set; + } else if (required_type <= FLOAT64_TYPE) { + usage_set = usage_sets_.float_set; + } else { + usage_set = usage_sets_.vec_set; + } + + DumpUsage("SpillOneRegister (pre)"); + // Pick the one with the furthest next use. 
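+  // (Evicting the value whose next use is furthest away is the classic
+  //  Belady/furthest-first spill heuristic: it frees the register for the
+  //  longest stretch of upcoming instructions.)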
+ XEASSERT(!usage_set->upcoming_uses.empty()); + auto furthest_usage = std::max_element( + usage_set->upcoming_uses.begin(), usage_set->upcoming_uses.end(), + RegisterUsage::Comparer()); + Value* spill_value = furthest_usage->value; + Value::Use* prev_use = furthest_usage->use->prev; + Value::Use* next_use = furthest_usage->use; + XEASSERTNOTNULL(next_use); + usage_set->upcoming_uses.erase(furthest_usage); + DumpUsage("SpillOneRegister (post)"); + const auto reg = spill_value->reg; + + // We know the spill_value use list is sorted, so we can cut it right now. + // This makes it easier down below. + auto new_head_use = next_use; + + // Allocate local. + if (spill_value->local_slot) { + // Value is already assigned a slot. Since we allocate in order and this is + // all SSA we know the stored value will be exactly what we want. Yay, + // we can prevent the redundant store! + // In fact, we may even want to pin this spilled value so that we always + // use the spilled value and prevent the need for more locals. + } else { + // Allocate a local slot. + spill_value->local_slot = builder->AllocLocal(spill_value->type); + + // Add store. + builder->StoreLocal(spill_value->local_slot, spill_value); + auto spill_store = builder->last_instr(); + auto spill_store_use = spill_store->src2_use; + XEASSERTNULL(spill_store_use->prev); + if (prev_use && prev_use->instr->opcode->flags & OPCODE_FLAG_PAIRED_PREV) { + // Instruction is paired. This is bad. We will insert the spill after the + // paired instruction. + XEASSERTNOTNULL(prev_use->instr->next); + spill_store->MoveBefore(prev_use->instr->next); + + // Update last use. + spill_value->last_use = spill_store; + } else if (prev_use) { + // We insert the store immediately before the previous use. + // If we were smarter we could then re-run allocation and reuse the register + // once dropped. + spill_store->MoveBefore(prev_use->instr); + + // Update last use. + spill_value->last_use = prev_use->instr; + } else { + // This is the first use, so the only thing we have is the define. + // Move the store to right after that. + spill_store->MoveBefore(spill_value->def->next); + + // Update last use. + spill_value->last_use = spill_store; + } + } + +#if ASSERT_NO_CYCLES + builder->AssertNoCycles(); + spill_value->def->block->AssertNoCycles(); +#endif // ASSERT_NO_CYCLES + + // Add load. + // Inserted immediately before the next use. Since by definition the next + // use is after the instruction requesting the spill we know we haven't + // done allocation for that code yet and can let that be handled + // automatically when we get to it. + auto new_value = builder->LoadLocal(spill_value->local_slot); + auto spill_load = builder->last_instr(); + spill_load->MoveBefore(next_use->instr); + // Note: implicit first use added. + +#if ASSERT_NO_CYCLES + builder->AssertNoCycles(); + spill_value->def->block->AssertNoCycles(); +#endif // ASSERT_NO_CYCLES + + // Set the local slot of the new value to our existing one. This way we will + // reuse that same memory if needed. + new_value->local_slot = spill_value->local_slot; + + // Rename all future uses of the SSA value to the new value as loaded + // from the local. + // We can quickly do this by walking the use list. Because the list is + // already sorted we know we are going to end up with a sorted list. 
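+  // (Every use from new_head_use onward sits at or after the reload point,
+  //  so retargeting src1/src2/src3 at new_value cannot introduce a use that
+  //  precedes its definition.)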
+ auto walk_use = new_head_use; + auto new_use_tail = walk_use; + while (walk_use) { + auto next_walk_use = walk_use->next; + auto instr = walk_use->instr; + + uint32_t signature = instr->opcode->signature; + if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) { + if (instr->src1.value == spill_value) { + instr->set_src1(new_value); + } + } + if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) { + if (instr->src2.value == spill_value) { + instr->set_src2(new_value); + } + } + if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) { + if (instr->src3.value == spill_value) { + instr->set_src3(new_value); + } + } + + walk_use = next_walk_use; + if (walk_use) { + new_use_tail = walk_use; + } + } + new_value->last_use = new_use_tail->instr; + + // Update tracking. + MarkRegAvailable(reg); + + return true; +} + +RegisterAllocationPass::RegisterSetUsage* +RegisterAllocationPass::RegisterSetForValue( + const Value* value) { + if (value->type <= INT64_TYPE) { + return usage_sets_.int_set; + } else if (value->type <= FLOAT64_TYPE) { + return usage_sets_.float_set; + } else { + return usage_sets_.vec_set; + } +} + +namespace { +int CompareValueUse(const Value::Use* a, const Value::Use* b) { + return a->instr->ordinal - b->instr->ordinal; +} +} // namespace +void RegisterAllocationPass::SortUsageList(Value* value) { + // Modified in-place linked list sort from: + // http://www.chiark.greenend.org.uk/~sgtatham/algorithms/listsort.c + if (!value->use_head) { + return; + } + Value::Use* head = value->use_head; + Value::Use* tail = nullptr; + int insize = 1; + while (true) { + auto p = head; + head = nullptr; + tail = nullptr; + // count number of merges we do in this pass + int nmerges = 0; + while (p) { + // there exists a merge to be done + nmerges++; + // step 'insize' places along from p + auto q = p; + int psize = 0; + for (int i = 0; i < insize; i++) { + psize++; + q = q->next; + if (!q) break; + } + // if q hasn't fallen off end, we have two lists to merge + int qsize = insize; + // now we have two lists; merge them + while (psize > 0 || (qsize > 0 && q)) { + // decide whether next element of merge comes from p or q + Value::Use* e = nullptr; + if (psize == 0) { + // p is empty; e must come from q + e = q; q = q->next; qsize--; + } else if (qsize == 0 || !q) { + // q is empty; e must come from p + e = p; p = p->next; psize--; + } else if (CompareValueUse(p, q) <= 0) { + // First element of p is lower (or same); e must come from p + e = p; p = p->next; psize--; + } else { + // First element of q is lower; e must come from q + e = q; q = q->next; qsize--; + } + // add the next element to the merged list + if (tail) { + tail->next = e; + } else { + head = e; + } + // Maintain reverse pointers in a doubly linked list. 
+ e->prev = tail; + tail = e; + } + // now p has stepped 'insize' places along, and q has too + p = q; + } + if (tail) { + tail->next = nullptr; + } + // If we have done only one merge, we're finished + if (nmerges <= 1) { + // allow for nmerges==0, the empty list case + break; + } + // Otherwise repeat, merging lists twice the size + insize *= 2; + } + + value->use_head = head; + value->last_use = tail->instr; +} diff --git a/src/alloy/compiler/passes/register_allocation_pass.h b/src/alloy/compiler/passes/register_allocation_pass.h new file mode 100644 index 000000000..aa5943aea --- /dev/null +++ b/src/alloy/compiler/passes/register_allocation_pass.h @@ -0,0 +1,89 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef ALLOY_COMPILER_PASSES_REGISTER_ALLOCATION_PASS_H_ +#define ALLOY_COMPILER_PASSES_REGISTER_ALLOCATION_PASS_H_ + +#include +#include +#include + +#include +#include + + +namespace alloy { +namespace compiler { +namespace passes { + + +class RegisterAllocationPass : public CompilerPass { +public: + RegisterAllocationPass(const backend::MachineInfo* machine_info); + virtual ~RegisterAllocationPass(); + + virtual int Run(hir::HIRBuilder* builder); + +private: + // TODO(benvanik): rewrite all this set shit -- too much indirection, the + // complexity is not needed. + struct RegisterUsage { + hir::Value* value; + hir::Value::Use* use; + RegisterUsage() : value(nullptr), use(nullptr) {} + RegisterUsage(hir::Value* value_, hir::Value::Use* use_) + : value(value_), use(use_) {} + struct Comparer : std::binary_function { + bool operator()(const RegisterUsage& a, const RegisterUsage& b) const { + return a.use->instr->ordinal < b.use->instr->ordinal; + } + }; + }; + struct RegisterSetUsage { + const backend::MachineInfo::RegisterSet* set = nullptr; + uint32_t count = 0; + std::bitset<32> availability = 0; + // TODO(benvanik): another data type. 
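+    // (A heap keyed on next-use ordinal would avoid the linear max_element
+    //  scan that SpillOneRegister does when picking a victim.)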
+ std::vector upcoming_uses; + }; + + void DumpUsage(const char* name); + void PrepareBlockState(); + void AdvanceUses(hir::Instr* instr); + bool IsRegInUse(const hir::RegAssignment& reg); + RegisterSetUsage* MarkRegUsed(const hir::RegAssignment& reg, + hir::Value* value, hir::Value::Use* use); + RegisterSetUsage* MarkRegAvailable(const hir::RegAssignment& reg); + + bool TryAllocateRegister(hir::Value* value, + const hir::RegAssignment& preferred_reg); + bool TryAllocateRegister(hir::Value* value); + bool SpillOneRegister(hir::HIRBuilder* builder, hir::TypeName required_type); + + RegisterSetUsage* RegisterSetForValue(const hir::Value* value); + + void SortUsageList(hir::Value* value); + +private: + const backend::MachineInfo* machine_info_; + struct { + RegisterSetUsage* int_set = nullptr; + RegisterSetUsage* float_set = nullptr; + RegisterSetUsage* vec_set = nullptr; + RegisterSetUsage* all_sets[3]; + } usage_sets_; +}; + + +} // namespace passes +} // namespace compiler +} // namespace alloy + + +#endif // ALLOY_COMPILER_PASSES_REGISTER_ALLOCATION_PASS_H_ diff --git a/src/alloy/compiler/passes/simplification_pass.cc b/src/alloy/compiler/passes/simplification_pass.cc index 14cea8681..7fc53c940 100644 --- a/src/alloy/compiler/passes/simplification_pass.cc +++ b/src/alloy/compiler/passes/simplification_pass.cc @@ -23,6 +23,8 @@ SimplificationPass::~SimplificationPass() { } int SimplificationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + EliminateConversions(builder); SimplifyAssignments(builder); return 0; diff --git a/src/alloy/compiler/passes/sources.gypi b/src/alloy/compiler/passes/sources.gypi index 251e6350a..ed16920ad 100644 --- a/src/alloy/compiler/passes/sources.gypi +++ b/src/alloy/compiler/passes/sources.gypi @@ -5,14 +5,22 @@ 'constant_propagation_pass.h', 'context_promotion_pass.cc', 'context_promotion_pass.h', + 'control_flow_analysis_pass.cc', + 'control_flow_analysis_pass.h', + 'data_flow_analysis_pass.cc', + 'data_flow_analysis_pass.h', 'dead_code_elimination_pass.cc', 'dead_code_elimination_pass.h', 'finalization_pass.cc', 'finalization_pass.h', #'dead_store_elimination_pass.cc', #'dead_store_elimination_pass.h', + 'register_allocation_pass.cc', + 'register_allocation_pass.h', 'simplification_pass.cc', 'simplification_pass.h', + 'validation_pass.cc', + 'validation_pass.h', 'value_reduction_pass.cc', 'value_reduction_pass.h', ], diff --git a/src/alloy/compiler/passes/validation_pass.cc b/src/alloy/compiler/passes/validation_pass.cc new file mode 100644 index 000000000..265c82fe9 --- /dev/null +++ b/src/alloy/compiler/passes/validation_pass.cc @@ -0,0 +1,101 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +#include +#include +#include + +using namespace alloy; +using namespace alloy::backend; +using namespace alloy::compiler; +using namespace alloy::compiler::passes; +using namespace alloy::frontend; +using namespace alloy::hir; +using namespace alloy::runtime; + + +ValidationPass::ValidationPass() : + CompilerPass() { +} + +ValidationPass::~ValidationPass() { +} + +int ValidationPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + + StringBuffer str; + builder->Dump(&str); + printf(str.GetString()); + fflush(stdout); + str.Reset(); + + auto block = builder->first_block(); + while (block) { + auto label = block->label_head; + while (label) { + XEASSERT(label->block == block); + if (label->block != block) { + return 1; + } + label = label->next; + } + + auto instr = block->instr_head; + while (instr) { + if (ValidateInstruction(block, instr)) { + return 1; + } + instr = instr->next; + } + + block = block->next; + } + + return 0; +} + +int ValidationPass::ValidateInstruction(Block* block, Instr* instr) { + XEASSERT(instr->block == block); + if (instr->block != block) { + return 1; + } + + uint32_t signature = instr->opcode->signature; + if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) { + if (ValidateValue(block, instr, instr->src1.value)) { + return 1; + } + } + if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) { + if (ValidateValue(block, instr, instr->src2.value)) { + return 1; + } + } + if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) { + if (ValidateValue(block, instr, instr->src3.value)) { + return 1; + } + } + + return 0; +} + +int ValidationPass::ValidateValue(Block* block, Instr* instr, Value* value) { + //if (value->def) { + // auto def = value->def; + // XEASSERT(def->block == block); + // if (def->block != block) { + // return 1; + // } + //} + return 0; +} diff --git a/src/alloy/compiler/passes/validation_pass.h b/src/alloy/compiler/passes/validation_pass.h new file mode 100644 index 000000000..a9f0c8f9a --- /dev/null +++ b/src/alloy/compiler/passes/validation_pass.h @@ -0,0 +1,39 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef ALLOY_COMPILER_PASSES_VALIDATION_PASS_H_ +#define ALLOY_COMPILER_PASSES_VALIDATION_PASS_H_ + +#include + + +namespace alloy { +namespace compiler { +namespace passes { + + +class ValidationPass : public CompilerPass { +public: + ValidationPass(); + virtual ~ValidationPass(); + + virtual int Run(hir::HIRBuilder* builder); + +private: + int ValidateInstruction(hir::Block* block, hir::Instr* instr); + int ValidateValue(hir::Block* block, hir::Instr* instr, hir::Value* value); +}; + + +} // namespace passes +} // namespace compiler +} // namespace alloy + + +#endif // ALLOY_COMPILER_PASSES_VALIDATION_PASS_H_ diff --git a/src/alloy/compiler/passes/value_reduction_pass.cc b/src/alloy/compiler/passes/value_reduction_pass.cc index 78367f35a..94453e294 100644 --- a/src/alloy/compiler/passes/value_reduction_pass.cc +++ b/src/alloy/compiler/passes/value_reduction_pass.cc @@ -13,7 +13,11 @@ #include #include -#include +#pragma warning(push) +#pragma warning(disable : 4244) +#pragma warning(disable : 4267) +#include +#pragma warning(pop) using namespace alloy; using namespace alloy::backend; @@ -49,10 +53,11 @@ void ValueReductionPass::ComputeLastUse(Value* value) { } int ValueReductionPass::Run(HIRBuilder* builder) { + SCOPE_profile_cpu_f("alloy"); + // Walk each block and reuse variable ordinals as much as possible. - // Let's hope this is enough. - std::bitset<1024> ordinals; + llvm::BitVector ordinals(builder->max_value_ordinal()); auto block = builder->first_block(); while (block) { @@ -74,34 +79,40 @@ int ValueReductionPass::Run(HIRBuilder* builder) { OpcodeSignatureType src1_type = GET_OPCODE_SIG_TYPE_SRC1(info->signature); OpcodeSignatureType src2_type = GET_OPCODE_SIG_TYPE_SRC2(info->signature); OpcodeSignatureType src3_type = GET_OPCODE_SIG_TYPE_SRC3(info->signature); - if (src1_type == OPCODE_SIG_TYPE_V && !instr->src1.value->IsConstant()) { + if (src1_type == OPCODE_SIG_TYPE_V) { auto v = instr->src1.value; if (!v->last_use) { ComputeLastUse(v); } if (v->last_use == instr) { // Available. - ordinals.set(v->ordinal, false); + if (!instr->src1.value->IsConstant()) { + ordinals.reset(v->ordinal); + } } } - if (src2_type == OPCODE_SIG_TYPE_V && !instr->src2.value->IsConstant()) { + if (src2_type == OPCODE_SIG_TYPE_V) { auto v = instr->src2.value; if (!v->last_use) { ComputeLastUse(v); } if (v->last_use == instr) { // Available. - ordinals.set(v->ordinal, false); + if (!instr->src2.value->IsConstant()) { + ordinals.reset(v->ordinal); + } } } - if (src3_type == OPCODE_SIG_TYPE_V && !instr->src3.value->IsConstant()) { + if (src3_type == OPCODE_SIG_TYPE_V) { auto v = instr->src3.value; if (!v->last_use) { ComputeLastUse(v); } if (v->last_use == instr) { // Available. - ordinals.set(v->ordinal, false); + if (!instr->src3.value->IsConstant()) { + ordinals.reset(v->ordinal); + } } } if (dest_type == OPCODE_SIG_TYPE_V) { @@ -109,7 +120,7 @@ int ValueReductionPass::Run(HIRBuilder* builder) { // source value ordinal. auto v = instr->dest; // Find a lower ordinal. 
- for (auto n = 0; n < ordinals.size(); n++) { + for (auto n = 0u; n < ordinals.size(); n++) { if (!ordinals.test(n)) { ordinals.set(n); v->ordinal = n; diff --git a/src/alloy/compiler/tracing.h b/src/alloy/compiler/tracing.h index 04da6d9ee..85d99992a 100644 --- a/src/alloy/compiler/tracing.h +++ b/src/alloy/compiler/tracing.h @@ -27,10 +27,10 @@ public: ALLOY_COMPILER_DEINIT = ALLOY_COMPILER | (2), }; - typedef struct { + typedef struct Init_s { static const uint32_t event_type = ALLOY_COMPILER_INIT; } Init; - typedef struct { + typedef struct Deinit_s { static const uint32_t event_type = ALLOY_COMPILER_DEINIT; } Deinit; }; diff --git a/src/alloy/core.h b/src/alloy/core.h index d61b3dd2c..3beb11ba4 100644 --- a/src/alloy/core.h +++ b/src/alloy/core.h @@ -44,7 +44,33 @@ typedef struct XECACHEALIGN vec128_s { uint64_t high; }; }; + + bool operator== (const vec128_s& b) const { + return low == b.low && high == b.high; + } } vec128_t; +XEFORCEINLINE vec128_t vec128i(uint32_t x, uint32_t y, uint32_t z, uint32_t w) { + vec128_t v; + v.i4[0] = x; v.i4[1] = y; v.i4[2] = z; v.i4[3] = w; + return v; +} +XEFORCEINLINE vec128_t vec128f(float x, float y, float z, float w) { + vec128_t v; + v.f4[0] = x; v.f4[1] = y; v.f4[2] = z; v.f4[3] = w; + return v; +} +XEFORCEINLINE vec128_t vec128b( + uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, + uint8_t y0, uint8_t y1, uint8_t y2, uint8_t y3, + uint8_t z0, uint8_t z1, uint8_t z2, uint8_t z3, + uint8_t w0, uint8_t w1, uint8_t w2, uint8_t w3) { + vec128_t v; + v.b16[0] = x3; v.b16[1] = x2; v.b16[2] = x1; v.b16[3] = x0; + v.b16[4] = y3; v.b16[5] = y2; v.b16[6] = y1; v.b16[7] = y0; + v.b16[8] = z3; v.b16[9] = z2; v.b16[10] = z1; v.b16[11] = z0; + v.b16[12] = w3; v.b16[13] = w2; v.b16[14] = w1; v.b16[15] = w0; + return v; +} } // namespace alloy diff --git a/src/alloy/delegate.h b/src/alloy/delegate.h index e6ad2fcd1..176ff4b6b 100644 --- a/src/alloy/delegate.h +++ b/src/alloy/delegate.h @@ -11,6 +11,7 @@ #define ALLOY_DELEGATE_H_ #include +#include #include #include diff --git a/src/alloy/frontend/ppc/ppc_context.h b/src/alloy/frontend/ppc/ppc_context.h index 92d6d2877..5bc5f159e 100644 --- a/src/alloy/frontend/ppc/ppc_context.h +++ b/src/alloy/frontend/ppc/ppc_context.h @@ -67,6 +67,8 @@ typedef struct XECACHEALIGN64 PPCContext_s { // Must be stored at 0x0 for now. // TODO(benvanik): find a nice way to describe this to the JIT. runtime::ThreadState* thread_state; + // TODO(benvanik): this is getting nasty. Must be here. + uint8_t* membase; // Most frequently used registers first. uint64_t r[32]; // General purpose registers @@ -196,7 +198,6 @@ typedef struct XECACHEALIGN64 PPCContext_s { // Runtime-specific data pointer. Used on callbacks to get access to the // current runtime and its data. - uint8_t* membase; runtime::Runtime* runtime; volatile int suspend_flag; diff --git a/src/alloy/frontend/ppc/ppc_disasm.cc b/src/alloy/frontend/ppc/ppc_disasm.cc index 99e325fa6..aa823a972 100644 --- a/src/alloy/frontend/ppc/ppc_disasm.cc +++ b/src/alloy/frontend/ppc/ppc_disasm.cc @@ -115,7 +115,7 @@ void Disasm_X_RA_RB(InstrData& i, StringBuffer* str) { i.X.RA, i.X.RB); } void Disasm_XO_RT_RA_RB(InstrData& i, StringBuffer* str) { - str->Append("%*s%s%s r%d, r%d", i.XO.Rc ? -7 : -8, i.type->name, + str->Append("%*s%s%s r%d, r%d, r%d", i.XO.Rc ? -7 : -8, i.type->name, i.XO.OE ? "o" : "", i.XO.Rc ? "." 
: "", i.XO.RT, i.XO.RA, i.XO.RB); } @@ -266,7 +266,7 @@ void Disasm_dcbz(InstrData& i, StringBuffer* str) { } void Disasm_fcmp(InstrData& i, StringBuffer* str) { - str->Append("%-8s cr%d, r%d, r%d", i.type->name, + str->Append("%-8s cr%d, f%d, f%d", i.type->name, i.X.RT >> 2, i.X.RA, i.X.RB); } diff --git a/src/alloy/frontend/ppc/ppc_emit_altivec.cc b/src/alloy/frontend/ppc/ppc_emit_altivec.cc index d5a77c400..1a985d1ae 100644 --- a/src/alloy/frontend/ppc/ppc_emit_altivec.cc +++ b/src/alloy/frontend/ppc/ppc_emit_altivec.cc @@ -105,6 +105,10 @@ Value* CalculateEA_0(PPCHIRBuilder& f, uint32_t ra, uint32_t rb); // } +unsigned int xerotl(unsigned int value, unsigned int shift) { + XEASSERT(shift < 32); + return shift == 0 ? value : ((value << shift) | (value >> (32 - shift))); +} XEEMITTER(dst, 0x7C0002AC, XDSS)(PPCHIRBuilder& f, InstrData& i) { XEINSTRNOTIMPLEMENTED(); @@ -1797,7 +1801,7 @@ XEEMITTER(vpkd3d128, VX128_4(6, 1552), VX128_4)(PPCHIRBuilder& f, InstrData // http://hlssmod.net/he_code/public/pixelwriter.h // control = prev:0123 | new:4567 uint32_t control = 0x00010203; // original - uint32_t src = _rotl(0x04050607, shift * 8); + uint32_t src = xerotl(0x04050607, shift * 8); uint32_t mask = 0; switch (pack) { case 1: // VPACK_32 diff --git a/src/alloy/frontend/ppc/ppc_emit_alu.cc b/src/alloy/frontend/ppc/ppc_emit_alu.cc index 7144e7eb6..ce023eb85 100644 --- a/src/alloy/frontend/ppc/ppc_emit_alu.cc +++ b/src/alloy/frontend/ppc/ppc_emit_alu.cc @@ -643,20 +643,20 @@ XEEMITTER(cmpli, 0x28000000, D )(PPCHIRBuilder& f, InstrData& i) { XEEMITTER(andx, 0x7C000038, X )(PPCHIRBuilder& f, InstrData& i) { // RA <- (RS) & (RB) Value* ra = f.And(f.LoadGPR(i.X.RT), f.LoadGPR(i.X.RB)); + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } XEEMITTER(andcx, 0x7C000078, X )(PPCHIRBuilder& f, InstrData& i) { // RA <- (RS) & ¬(RB) Value* ra = f.And(f.LoadGPR(i.X.RT), f.Not(f.LoadGPR(i.X.RB))); + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -665,8 +665,8 @@ XEEMITTER(andix, 0x70000000, D )(PPCHIRBuilder& f, InstrData& i) { Value* ra = f.And( f.LoadGPR(i.D.RT), f.LoadConstant((uint64_t)i.D.DS)); - f.UpdateCR(0, ra); f.StoreGPR(i.D.RA, ra); + f.UpdateCR(0, ra); return 0; } @@ -675,8 +675,8 @@ XEEMITTER(andisx, 0x74000000, D )(PPCHIRBuilder& f, InstrData& i) { Value* ra = f.And( f.LoadGPR(i.D.RT), f.LoadConstant((uint64_t(i.D.DS) << 16))); - f.UpdateCR(0, ra); f.StoreGPR(i.D.RA, ra); + f.UpdateCR(0, ra); return 0; } @@ -688,10 +688,10 @@ XEEMITTER(cntlzdx, 0x7C000074, X )(PPCHIRBuilder& f, InstrData& i) { // RA <- n Value* v = f.CountLeadingZeros(f.LoadGPR(i.X.RT)); v = f.ZeroExtend(v, INT64_TYPE); + f.StoreGPR(i.X.RA, v); if (i.X.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.X.RA, v); return 0; } @@ -704,10 +704,10 @@ XEEMITTER(cntlzwx, 0x7C000034, X )(PPCHIRBuilder& f, InstrData& i) { Value* v = f.CountLeadingZeros( f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE)); v = f.ZeroExtend(v, INT64_TYPE); + f.StoreGPR(i.X.RA, v); if (i.X.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.X.RA, v); return 0; } @@ -715,10 +715,10 @@ XEEMITTER(eqvx, 0x7C000238, X )(PPCHIRBuilder& f, InstrData& i) { // RA <- (RS) == (RB) Value* ra = f.Xor(f.LoadGPR(i.X.RT), f.LoadGPR(i.X.RB)); ra = f.Not(ra); + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -728,10 +728,10 @@ XEEMITTER(extsbx, 0x7C000774, X )(PPCHIRBuilder& f, InstrData& i) { // RA[0:55] <- i56.s Value* rt = f.LoadGPR(i.X.RT); rt = 
f.SignExtend(f.Truncate(rt, INT8_TYPE), INT64_TYPE); + f.StoreGPR(i.X.RA, rt); if (i.X.Rc) { f.UpdateCR(0, rt); } - f.StoreGPR(i.X.RA, rt); return 0; } @@ -741,10 +741,10 @@ XEEMITTER(extshx, 0x7C000734, X )(PPCHIRBuilder& f, InstrData& i) { // RA[0:47] <- 48.s Value* rt = f.LoadGPR(i.X.RT); rt = f.SignExtend(f.Truncate(rt, INT16_TYPE), INT64_TYPE); + f.StoreGPR(i.X.RA, rt); if (i.X.Rc) { f.UpdateCR(0, rt); } - f.StoreGPR(i.X.RA, rt); return 0; } @@ -754,10 +754,10 @@ XEEMITTER(extswx, 0x7C0007B4, X )(PPCHIRBuilder& f, InstrData& i) { // RA[0:31] <- i32.s Value* rt = f.LoadGPR(i.X.RT); rt = f.SignExtend(f.Truncate(rt, INT32_TYPE), INT64_TYPE); + f.StoreGPR(i.X.RA, rt); if (i.X.Rc) { f.UpdateCR(0, rt); } - f.StoreGPR(i.X.RA, rt); return 0; } @@ -767,10 +767,10 @@ XEEMITTER(nandx, 0x7C0003B8, X )(PPCHIRBuilder& f, InstrData& i) { f.LoadGPR(i.X.RT), f.LoadGPR(i.X.RB)); ra = f.Not(ra); + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -780,10 +780,10 @@ XEEMITTER(norx, 0x7C0000F8, X )(PPCHIRBuilder& f, InstrData& i) { f.LoadGPR(i.X.RT), f.LoadGPR(i.X.RB)); ra = f.Not(ra); + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -803,10 +803,10 @@ XEEMITTER(orx, 0x7C000378, X )(PPCHIRBuilder& f, InstrData& i) { f.LoadGPR(i.X.RT), f.LoadGPR(i.X.RB)); } + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -815,10 +815,10 @@ XEEMITTER(orcx, 0x7C000338, X )(PPCHIRBuilder& f, InstrData& i) { Value* ra = f.Or( f.LoadGPR(i.X.RT), f.Not(f.LoadGPR(i.X.RB))); + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -849,10 +849,10 @@ XEEMITTER(xorx, 0x7C000278, X )(PPCHIRBuilder& f, InstrData& i) { Value* ra = f.Xor( f.LoadGPR(i.X.RT), f.LoadGPR(i.X.RB)); + f.StoreGPR(i.X.RA, ra); if (i.X.Rc) { f.UpdateCR(0, ra); } - f.StoreGPR(i.X.RA, ra); return 0; } @@ -895,10 +895,10 @@ XEEMITTER(rld, 0x78000000, MDS)(PPCHIRBuilder& f, InstrData& i) { if (m != 0xFFFFFFFFFFFFFFFF) { v = f.And(v, f.LoadConstant(m)); } + f.StoreGPR(i.MD.RA, v); if (i.MD.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.MD.RA, v); return 0; } else if (i.MD.idx == 1) { // XEEMITTER(rldicrx, 0x78000004, MD ) @@ -922,10 +922,10 @@ XEEMITTER(rld, 0x78000000, MDS)(PPCHIRBuilder& f, InstrData& i) { v = f.And(v, f.LoadConstant(m)); } } + f.StoreGPR(i.MD.RA, v); if (i.MD.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.MD.RA, v); return 0; } else if (i.MD.idx == 2) { // XEEMITTER(rldicx, 0x78000008, MD ) @@ -959,10 +959,10 @@ XEEMITTER(rld, 0x78000000, MDS)(PPCHIRBuilder& f, InstrData& i) { f.And(v, f.LoadConstant(m)), f.And(ra, f.LoadConstant(~m))); } + f.StoreGPR(i.MD.RA, v); if (i.MD.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.MD.RA, v); return 0; } else { XEINSTRNOTIMPLEMENTED(); @@ -987,10 +987,10 @@ XEEMITTER(rlwimix, 0x50000000, M )(PPCHIRBuilder& f, InstrData& i) { } v = f.ZeroExtend(v, INT64_TYPE); v = f.Or(v, f.And(f.LoadGPR(i.M.RA), f.LoadConstant((~(uint64_t)m)))); + f.StoreGPR(i.M.RA, v); if (i.M.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.M.RA, v); return 0; } @@ -1014,10 +1014,10 @@ XEEMITTER(rlwinmx, 0x54000000, M )(PPCHIRBuilder& f, InstrData& i) { v = f.And(v, f.LoadConstant((uint32_t)XEMASK(i.M.MB + 32, i.M.ME + 32))); } v = f.ZeroExtend(v, INT64_TYPE); + f.StoreGPR(i.M.RA, v); if (i.M.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.M.RA, v); return 0; } @@ -1027,7 +1027,8 @@ XEEMITTER(rlwnmx, 0x5C000000, M )(PPCHIRBuilder& f, InstrData& i) { // m <- MASK(MB+32, ME+32) 
// RA <- r & m Value* v = f.Truncate(f.LoadGPR(i.M.RT), INT32_TYPE); - Value* sh = f.And(f.LoadGPR(i.M.SH), f.LoadConstant(0x1F)); + Value* sh = f.And(f.Truncate(f.LoadGPR(i.M.SH), INT32_TYPE), + f.LoadConstant(0x1F)); v = f.RotateLeft(v, sh); // Compiler sometimes masks with 0xFFFFFFFF (identity) - avoid the work here // as our truncation/zero-extend does it for us. @@ -1035,10 +1036,10 @@ XEEMITTER(rlwnmx, 0x5C000000, M )(PPCHIRBuilder& f, InstrData& i) { v = f.And(v, f.LoadConstant((uint32_t)XEMASK(i.M.MB + 32, i.M.ME + 32))); } v = f.ZeroExtend(v, INT64_TYPE); + f.StoreGPR(i.M.RA, v); if (i.M.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.M.RA, v); return 0; } @@ -1145,7 +1146,7 @@ XEEMITTER(sradx, 0x7C000634, X )(PPCHIRBuilder& f, InstrData& i) { // CA is set to 1 if the low-order 32 bits of (RS) contain a negative number // and any 1-bits are shifted out of position 63; otherwise CA is set to 0. // We already have ca set to indicate the pos 63 bit, now just and in sign. - ca = f.And(ca, f.Shr(v, 63)); + ca = f.And(ca, f.Truncate(f.Shr(v, 63), INT8_TYPE)); f.StoreCA(ca); f.StoreGPR(i.X.RA, v); @@ -1173,15 +1174,15 @@ XEEMITTER(sradix, 0x7C000674, XS )(PPCHIRBuilder& f, InstrData& i) { XEASSERT(sh); uint64_t mask = XEMASK(64 - sh, 63); Value* ca = f.And( - f.Shr(v, 63), + f.Truncate(f.Shr(v, 63), INT8_TYPE), f.IsTrue(f.And(v, f.LoadConstant(mask)))); f.StoreCA(ca); v = f.Sha(v, sh); + f.StoreGPR(i.XS.RA, v); if (i.XS.Rc) { f.UpdateCR(0, v); } - f.StoreGPR(i.XS.RA, v); return 0; } @@ -1197,12 +1198,12 @@ XEEMITTER(srawx, 0x7C000630, X )(PPCHIRBuilder& f, InstrData& i) { Value* v = f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE); Value* sh = f.And( f.Truncate(f.LoadGPR(i.X.RB), INT32_TYPE), - f.LoadConstant((int8_t)0x7F)); + f.LoadConstant(0x7F)); // CA is set if any bits are shifted out of the right and if the result // is negative. Value* mask = f.Not(f.Shl(f.LoadConstant(-1), sh)); Value* ca = f.And( - f.Shr(v, 31), + f.Truncate(f.Shr(v, 31), INT8_TYPE), f.IsTrue(f.And(v, mask))); f.StoreCA(ca); v = f.Sha(v, sh), @@ -1234,8 +1235,8 @@ XEEMITTER(srawix, 0x7C000670, X )(PPCHIRBuilder& f, InstrData& i) { // is negative. uint32_t mask = (uint32_t)XEMASK(64 - i.X.RB, 63); ca = f.And( - f.Shr(v, 31), - f.ZeroExtend(f.IsTrue(f.And(v, f.LoadConstant(mask))), INT32_TYPE)); + f.Truncate(f.Shr(v, 31), INT8_TYPE), + f.IsTrue(f.And(v, f.LoadConstant(mask)))); v = f.Sha(v, (int8_t)i.X.RB), v = f.SignExtend(v, INT64_TYPE); diff --git a/src/alloy/frontend/ppc/ppc_emit_control.cc b/src/alloy/frontend/ppc/ppc_emit_control.cc index 83a50a2c4..0365c849b 100644 --- a/src/alloy/frontend/ppc/ppc_emit_control.cc +++ b/src/alloy/frontend/ppc/ppc_emit_control.cc @@ -35,6 +35,7 @@ int InstrEmit_branch( // be correct for returns. if (lk) { Value* return_address = f.LoadConstant(cia + 4); + f.SetReturnAddress(return_address); f.StoreLR(return_address); } @@ -104,6 +105,10 @@ int InstrEmit_branch( // // TODO(benvanik): evaluate hint here. // c.je(e.GetReturnLabel(), kCondHintLikely); //} +#if 0 + // This breaks longjump, as that uses blr with a non-return lr. + // It'd be nice to move SET_RETURN_ADDRESS semantics up into context + // so that we can just use this. if (!lk && nia_is_lr) { // Return (most likely). // TODO(benvanik): test? ReturnCheck()? @@ -116,7 +121,14 @@ int InstrEmit_branch( f.Return(); } } else { +#else + { +#endif // Jump to pointer. 
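+    // (The return fast-path above stays disabled because longjmp uses blr
+    //  with a non-return LR; instead blr-style indirect jumps are tagged
+    //  CALL_POSSIBLE_RETURN and the runtime decides at dispatch time.)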
+ bool likely_return = !lk && nia_is_lr; + if (likely_return) { + call_flags |= CALL_POSSIBLE_RETURN; + } if (cond) { if (!expect_true) { cond = f.IsFalse(cond); @@ -380,8 +392,8 @@ XEEMITTER(mcrf, 0x4C000000, XL )(PPCHIRBuilder& f, InstrData& i) { // System linkage (A-24) XEEMITTER(sc, 0x44000002, SC )(PPCHIRBuilder& f, InstrData& i) { - XEINSTRNOTIMPLEMENTED(); - return 1; + f.CallExtern(f.symbol_info()); + return 0; } diff --git a/src/alloy/frontend/ppc/ppc_emit_memory.cc b/src/alloy/frontend/ppc/ppc_emit_memory.cc index ab810f6b2..738090abf 100644 --- a/src/alloy/frontend/ppc/ppc_emit_memory.cc +++ b/src/alloy/frontend/ppc/ppc_emit_memory.cc @@ -891,7 +891,8 @@ XEEMITTER(stfiwx, 0x7C0007AE, X )(PPCHIRBuilder& f, InstrData& i) { // EA <- b + (RB) // MEM(EA, 4) <- (FRS)[32:63] Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); - f.Store(ea, f.ByteSwap(f.Cast(f.LoadFPR(i.X.RT), INT32_TYPE))); + f.Store(ea, f.ByteSwap( + f.Truncate(f.Cast(f.LoadFPR(i.X.RT), INT64_TYPE), INT32_TYPE))); return 0; } diff --git a/src/alloy/frontend/ppc/ppc_hir_builder.cc b/src/alloy/frontend/ppc/ppc_hir_builder.cc index 501fc3f15..a8bec8435 100644 --- a/src/alloy/frontend/ppc/ppc_hir_builder.cc +++ b/src/alloy/frontend/ppc/ppc_hir_builder.cc @@ -9,6 +9,7 @@ #include +#include #include #include #include @@ -43,6 +44,8 @@ void PPCHIRBuilder::Reset() { } int PPCHIRBuilder::Emit(FunctionInfo* symbol_info, bool with_debug_info) { + SCOPE_profile_cpu_f("alloy"); + Memory* memory = frontend_->memory(); const uint8_t* p = memory->membase(); @@ -125,10 +128,10 @@ int PPCHIRBuilder::Emit(FunctionInfo* symbol_info, bool with_debug_info) { typedef int (*InstrEmitter)(PPCHIRBuilder& f, InstrData& i); InstrEmitter emit = (InstrEmitter)i.type->emit; - /*if (i.address == FLAGS_break_on_instruction) { + if (i.address == FLAGS_break_on_instruction) { Comment("--break-on-instruction target"); DebugBreak(); - }*/ + } if (!i.type->emit || emit(*this, i)) { XELOGCPU("Unimplemented instr %.8X %.8X %s", @@ -239,18 +242,18 @@ void PPCHIRBuilder::UpdateCR( void PPCHIRBuilder::UpdateCR( uint32_t n, Value* lhs, Value* rhs, bool is_signed) { - Value* lt; - Value* gt; if (is_signed) { - lt = CompareSLT(lhs, rhs); - gt = CompareSGT(lhs, rhs); + Value* lt = CompareSLT(lhs, rhs); + StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 0, lt); + Value* gt = CompareSGT(lhs, rhs); + StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 1, gt); } else { - lt = CompareULT(lhs, rhs); - gt = CompareUGT(lhs, rhs); + Value* lt = CompareULT(lhs, rhs); + StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 0, lt); + Value* gt = CompareUGT(lhs, rhs); + StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 1, gt); } Value* eq = CompareEQ(lhs, rhs); - StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 0, lt); - StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 1, gt); StoreContext(offsetof(PPCContext, cr0) + (4 * n) + 2, eq); // Value* so = AllocValue(UINT8_TYPE); @@ -279,6 +282,7 @@ Value* PPCHIRBuilder::LoadCA() { } void PPCHIRBuilder::StoreCA(Value* value) { + XEASSERT(value->type == INT8_TYPE); StoreContext(offsetof(PPCContext, xer_ca), value); } @@ -287,6 +291,7 @@ Value* PPCHIRBuilder::LoadSAT() { } void PPCHIRBuilder::StoreSAT(Value* value) { + value = Truncate(value, INT8_TYPE); StoreContext(offsetof(PPCContext, vscr_sat), value); } diff --git a/src/alloy/frontend/ppc/ppc_scanner.cc b/src/alloy/frontend/ppc/ppc_scanner.cc index f75229b9e..9658bd595 100644 --- a/src/alloy/frontend/ppc/ppc_scanner.cc +++ b/src/alloy/frontend/ppc/ppc_scanner.cc @@ -38,6 +38,8 @@ 
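The UpdateCR rewrite just above stores each compare result into the context immediately after computing it, instead of batching the three stores at the end; presumably that keeps each flag's live range one instruction long for the register-allocation work elsewhere in this patch. The offsets it writes assume the byte-per-bit CR layout sketched here (the context struct is a stand-in, only the arithmetic matters):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Eight CR fields, four bytes each, in the order lt, gt, eq, so; field n's
// bit k therefore lives at offsetof(ctx, cr0) + 4*n + k.
struct FakePPCContext {
  uint64_t xer_ca;     // unrelated field, just so offsetof is non-zero
  uint8_t cr0[4 * 8];  // 8 fields x {lt, gt, eq, so}
};

static size_t cr_bit_offset(unsigned n, unsigned k) {
  return offsetof(FakePPCContext, cr0) + 4 * n + k;  // k: 0=lt 1=gt 2=eq 3=so
}

int main() {
  std::printf("cr0.lt at +%zu, cr6.eq at +%zu\n",
              cr_bit_offset(0, 0), cr_bit_offset(6, 2));
  return 0;
}
```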
bool PPCScanner::IsRestGprLr(uint64_t address) { } int PPCScanner::FindExtents(FunctionInfo* symbol_info) { + SCOPE_profile_cpu_f("alloy"); + // This is a simple basic block analyizer. It walks the start address to the // end address looking for branches. Each span of instructions between // branches is considered a basic block. When the last blr (that has no @@ -286,6 +288,8 @@ int PPCScanner::FindExtents(FunctionInfo* symbol_info) { } std::vector PPCScanner::FindBlocks(FunctionInfo* symbol_info) { + SCOPE_profile_cpu_f("alloy"); + Memory* memory = frontend_->memory(); const uint8_t* p = memory->membase(); diff --git a/src/alloy/frontend/ppc/ppc_translator.cc b/src/alloy/frontend/ppc/ppc_translator.cc index 9f82c9827..4f879336c 100644 --- a/src/alloy/frontend/ppc/ppc_translator.cc +++ b/src/alloy/frontend/ppc/ppc_translator.cc @@ -38,20 +38,38 @@ PPCTranslator::PPCTranslator(PPCFrontend* frontend) : assembler_ = backend->CreateAssembler(); assembler_->Initialize(); + bool validate = FLAGS_validate_hir; + + // Build the CFG first. + compiler_->AddPass(new passes::ControlFlowAnalysisPass()); + // Passes are executed in the order they are added. Multiple of the same // pass type may be used. + if (validate) compiler_->AddPass(new passes::ValidationPass()); compiler_->AddPass(new passes::ContextPromotionPass()); + if (validate) compiler_->AddPass(new passes::ValidationPass()); compiler_->AddPass(new passes::SimplificationPass()); - // TODO(benvanik): run repeatedly? + if (validate) compiler_->AddPass(new passes::ValidationPass()); compiler_->AddPass(new passes::ConstantPropagationPass()); - //compiler_->AddPass(new passes::TypePropagationPass()); - //compiler_->AddPass(new passes::ByteSwapEliminationPass()); + if (validate) compiler_->AddPass(new passes::ValidationPass()); compiler_->AddPass(new passes::SimplificationPass()); + if (validate) compiler_->AddPass(new passes::ValidationPass()); //compiler_->AddPass(new passes::DeadStoreEliminationPass()); + //if (validate) compiler_->AddPass(new passes::ValidationPass()); compiler_->AddPass(new passes::DeadCodeEliminationPass()); + if (validate) compiler_->AddPass(new passes::ValidationPass()); - // Removes all unneeded variables. Try not to add new ones after this. - compiler_->AddPass(new passes::ValueReductionPass()); + //// Removes all unneeded variables. Try not to add new ones after this. + //compiler_->AddPass(new passes::ValueReductionPass()); + //if (validate) compiler_->AddPass(new passes::ValidationPass()); + + // Register allocation for the target backend. + // Will modify the HIR to add loads/stores. + // This should be the last pass before finalization, as after this all + // registers are assigned and ready to be emitted. + compiler_->AddPass(new passes::RegisterAllocationPass( + backend->machine_info())); + if (validate) compiler_->AddPass(new passes::ValidationPass()); // Must come last. The HIR is not really HIR after this. compiler_->AddPass(new passes::FinalizationPass()); @@ -68,6 +86,8 @@ int PPCTranslator::Translate( FunctionInfo* symbol_info, uint32_t debug_info_flags, Function** out_function) { + SCOPE_profile_cpu_f("alloy"); + // Scan the function to find its extents. We only need to do this if we // haven't already been provided with them from some other source. 
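The translator's pass list above interleaves an optional ValidationPass after every real pass, so a broken HIR invariant is caught right next to the pass that introduced it, and it pins RegisterAllocationPass as the last pass before FinalizationPass. A generic sketch of that wiring (the pass names are stand-ins for the compiler's real types):

```cpp
#include <memory>
#include <vector>

struct Pass {
  virtual ~Pass() {}
  virtual int Run() = 0;  // 0 on success, matching the compiler's convention
};
struct ValidationPass : Pass {
  int Run() override { /* walk the HIR, assert invariants */ return 0; }
};

class PassList {
 public:
  explicit PassList(bool validate) : validate_(validate) {}
  void Add(Pass* pass) {
    passes_.emplace_back(pass);
    // Mirrors the FLAGS_validate_hir interleaving above.
    if (validate_) passes_.emplace_back(new ValidationPass());
  }
  int RunAll() {
    for (auto& p : passes_) {
      if (int rc = p->Run()) return rc;  // stop at the first failing pass
    }
    return 0;
  }
 private:
  bool validate_;
  std::vector<std::unique_ptr<Pass>> passes_;
};

int main() {
  struct SimplificationPass : Pass { int Run() override { return 0; } };
  PassList passes(/*validate=*/true);
  passes.Add(new SimplificationPass());  // implicitly followed by validation
  return passes.RunAll();
}
```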
if (!symbol_info->has_end_address()) { diff --git a/src/alloy/frontend/tracing.h b/src/alloy/frontend/tracing.h index 61aadb949..ad9e8dae7 100644 --- a/src/alloy/frontend/tracing.h +++ b/src/alloy/frontend/tracing.h @@ -27,10 +27,10 @@ public: ALLOY_FRONTEND_DEINIT = ALLOY_FRONTEND | (2), }; - typedef struct { + typedef struct Init_s { static const uint32_t event_type = ALLOY_FRONTEND_INIT; } Init; - typedef struct { + typedef struct Deinit_s { static const uint32_t event_type = ALLOY_FRONTEND_DEINIT; } Deinit; }; diff --git a/src/alloy/hir/block.cc b/src/alloy/hir/block.cc new file mode 100644 index 000000000..ebace67fa --- /dev/null +++ b/src/alloy/hir/block.cc @@ -0,0 +1,39 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include + +using namespace alloy; +using namespace alloy::hir; + + +void Block::AssertNoCycles() { + Instr* hare = instr_head; + Instr* tortoise = instr_head; + if (!hare) { + return; + } + while (hare = hare->next) { + if (hare == tortoise) { + // Cycle! + XEASSERTALWAYS(); + } + hare = hare->next; + if (hare == tortoise) { + // Cycle! + XEASSERTALWAYS(); + } + tortoise = tortoise->next; + if (!hare || !tortoise) { + return; + } + } +} diff --git a/src/alloy/hir/block.h b/src/alloy/hir/block.h index 1cb6d6414..f60dd83c5 100644 --- a/src/alloy/hir/block.h +++ b/src/alloy/hir/block.h @@ -12,15 +12,37 @@ #include +XEDECLARECLASS1(llvm, BitVector); + namespace alloy { namespace hir { +class Block; class HIRBuilder; class Instr; class Label; +class Edge { +public: + enum EdgeFlags { + UNCONDITIONAL = (1 << 0), + DOMINATES = (1 << 1), + }; +public: + Edge* outgoing_next; + Edge* outgoing_prev; + Edge* incoming_next; + Edge* incoming_prev; + + Block* src; + Block* dest; + + uint32_t flags; +}; + + class Block { public: Arena* arena; @@ -28,6 +50,10 @@ public: Block* next; Block* prev; + Edge* incoming_edge_head; + Edge* outgoing_edge_head; + llvm::BitVector* incoming_values; + Label* label_head; Label* label_tail; @@ -35,6 +61,8 @@ public: Instr* instr_tail; uint16_t ordinal; + + void AssertNoCycles(); }; diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc index 99d5649d1..158e08224 100644 --- a/src/alloy/hir/hir_builder.cc +++ b/src/alloy/hir/hir_builder.cc @@ -41,6 +41,7 @@ void HIRBuilder::Reset() { attributes_ = 0; next_label_id_ = 0; next_value_ordinal_ = 0; + locals_.clear(); block_head_ = block_tail_ = NULL; current_block_ = NULL; #if XE_DEBUG @@ -50,6 +51,8 @@ void HIRBuilder::Reset() { } int HIRBuilder::Finalize() { + SCOPE_profile_cpu_f("alloy"); + // Scan blocks in order and add fallthrough branches. These are needed for // analysis passes to work. We may have also added blocks out of order and // need to ensure they fall through in the right order. @@ -71,7 +74,7 @@ int HIRBuilder::Finalize() { // No following block. // Sometimes VC++ generates functions with bl at the end even if they // will never return. Just add a return to satisfy things. 
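The new Block::AssertNoCycles is Floyd's tortoise-and-hare check: the hare walks two links per step, the tortoise one, and they can only meet again if the list loops back on itself. Here is the same algorithm standalone, returning a bool instead of asserting; it also uses the conventional guarded loop, since the patch's bare `while (hare = hare->next)` works but trips assignment-in-condition warnings:

```cpp
#include <cassert>

struct Node { Node* next; };

bool HasCycle(Node* head) {
  Node* tortoise = head;
  Node* hare = head;
  while (hare && hare->next) {
    hare = hare->next->next;    // two steps
    tortoise = tortoise->next;  // one step
    if (hare == tortoise) return true;  // met again: the list loops
  }
  return false;  // fell off the end: acyclic
}

int main() {
  Node c{nullptr}, b{&c}, a{&b};
  assert(!HasCycle(&a));
  c.next = &a;  // close the loop
  assert(HasCycle(&a));
  return 0;
}
```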
- XELOGW("Fall-through out of the function."); + //XELOGW("Fall-through out of the function."); Trap(); Return(); current_block_ = NULL; @@ -91,7 +94,7 @@ void HIRBuilder::DumpValue(StringBuffer* str, Value* value) { case INT8_TYPE: str->Append("%X", value->constant.i8); break; case INT16_TYPE: str->Append("%X", value->constant.i16); break; case INT32_TYPE: str->Append("%X", value->constant.i32); break; - case INT64_TYPE: str->Append("%X", value->constant.i64); break; + case INT64_TYPE: str->Append("%llX", value->constant.i64); break; case FLOAT32_TYPE: str->Append("%F", value->constant.f32); break; case FLOAT64_TYPE: str->Append("%F", value->constant.f64); break; case VEC128_TYPE: str->Append("(%F,%F,%F,%F)", @@ -107,6 +110,9 @@ void HIRBuilder::DumpValue(StringBuffer* str, Value* value) { }; str->Append("v%d.%s", value->ordinal, type_names[value->type]); } + if (value->reg.index != -1) { + str->Append("<%s%d>", value->reg.set->name, value->reg.index); + } } void HIRBuilder::DumpOp( @@ -137,10 +143,19 @@ void HIRBuilder::DumpOp( } void HIRBuilder::Dump(StringBuffer* str) { + SCOPE_profile_cpu_f("alloy"); + if (attributes_) { str->Append("; attributes = %.8X\n", attributes_); } + for (auto it = locals_.begin(); it != locals_.end(); ++it) { + auto local = *it; + str->Append(" ; local "); + DumpValue(str, local); + str->Append("\n"); + } + uint32_t block_ordinal = 0; Block* block = block_head_; while (block) { @@ -161,6 +176,39 @@ void HIRBuilder::Dump(StringBuffer* str) { label = label->next; } + Edge* incoming_edge = block->incoming_edge_head; + while (incoming_edge) { + auto src_label = incoming_edge->src->label_head; + if (src_label && src_label->name) { + str->Append(" ; in: %s", src_label->name); + } else if (src_label) { + str->Append(" ; in: label%d", src_label->id); + } else { + str->Append(" ; in: ", + incoming_edge->src->ordinal); + } + str->Append(", dom:%d, uncond:%d\n", + (incoming_edge->flags & Edge::DOMINATES) ? 1 : 0, + (incoming_edge->flags & Edge::UNCONDITIONAL) ? 1 : 0); + incoming_edge = incoming_edge->incoming_next; + } + Edge* outgoing_edge = block->outgoing_edge_head; + while (outgoing_edge) { + auto dest_label = outgoing_edge->dest->label_head; + if (dest_label && dest_label->name) { + str->Append(" ; out: %s", dest_label->name); + } else if (dest_label) { + str->Append(" ; out: label%d", dest_label->id); + } else { + str->Append(" ; out: ", + outgoing_edge->dest->ordinal); + } + str->Append(", dom:%d, uncond:%d\n", + (outgoing_edge->flags & Edge::DOMINATES) ? 1 : 0, + (outgoing_edge->flags & Edge::UNCONDITIONAL) ? 1 : 0); + outgoing_edge = outgoing_edge->outgoing_next; + } + Instr* i = block->instr_head; while (i) { if (i->opcode->flags & OPCODE_FLAG_HIDE) { @@ -208,6 +256,29 @@ void HIRBuilder::Dump(StringBuffer* str) { } } +void HIRBuilder::AssertNoCycles() { + Block* hare = block_head_; + Block* tortoise = block_head_; + if (!hare) { + return; + } + while (hare = hare->next) { + if (hare == tortoise) { + // Cycle! + XEASSERTALWAYS(); + } + hare = hare->next; + if (hare == tortoise) { + // Cycle! 
+ XEASSERTALWAYS(); + } + tortoise = tortoise->next; + if (!hare || !tortoise) { + return; + } + } +} + Block* HIRBuilder::current_block() const { return current_block_; } @@ -303,6 +374,7 @@ void HIRBuilder::InsertLabel(Label* label, Instr* prev_instr) { block_tail_ = new_block; } new_block->label_head = new_block->label_tail = label; + new_block->incoming_edge_head = new_block->outgoing_edge_head = NULL; label->block = new_block; label->prev = label->next = NULL; @@ -319,8 +391,7 @@ void HIRBuilder::InsertLabel(Label* label, Instr* prev_instr) { new_block->instr_tail = old_prev_tail; } - for (auto instr = new_block->instr_head; instr != new_block->instr_tail; - instr = instr->next) { + for (auto instr = new_block->instr_head; instr; instr = instr->next) { instr->block = new_block; } @@ -342,6 +413,19 @@ void HIRBuilder::ResetLabelTags() { } } +void HIRBuilder::AddEdge(Block* src, Block* dest, uint32_t flags) { + Edge* edge = arena_->Alloc(); + edge->src = src; + edge->dest = dest; + edge->flags = flags; + edge->outgoing_prev = NULL; + edge->outgoing_next = src->outgoing_edge_head; + src->outgoing_edge_head = edge; + edge->incoming_prev = NULL; + edge->incoming_next = dest->incoming_edge_head; + dest->incoming_edge_head = edge; +} + Block* HIRBuilder::AppendBlock() { Block* block = arena_->Alloc(); block->arena = arena_; @@ -356,6 +440,7 @@ Block* HIRBuilder::AppendBlock() { } current_block_ = block; block->label_head = block->label_tail = NULL; + block->incoming_edge_head = block->outgoing_edge_head = NULL; block->instr_head = block->instr_tail = NULL; return block; } @@ -398,6 +483,7 @@ Instr* HIRBuilder::AppendInstr( if (!block->instr_head) { block->instr_head = instr; } + instr->ordinal = -1; instr->block = block; instr->opcode = &opcode_info; instr->flags = flags; @@ -420,8 +506,10 @@ Value* HIRBuilder::AllocValue(TypeName type) { value->def = NULL; value->use_head = NULL; value->last_use = NULL; + value->local_slot = NULL; value->tag = NULL; - value->reg = -1; + value->reg.set = NULL; + value->reg.index = -1; return value; } @@ -434,8 +522,10 @@ Value* HIRBuilder::CloneValue(Value* source) { value->def = NULL; value->use_head = NULL; value->last_use = NULL; + value->local_slot = NULL; value->tag = NULL; - value->reg = -1; + value->reg.set = NULL; + value->reg.index = -1; return value; } @@ -557,6 +647,13 @@ void HIRBuilder::CallIndirectTrue( EndBlock(); } +void HIRBuilder::CallExtern(FunctionInfo* symbol_info) { + Instr* i = AppendInstr(OPCODE_CALL_EXTERN_info, 0); + i->src1.symbol_info = symbol_info; + i->src2.value = i->src3.value = NULL; + EndBlock(); +} + void HIRBuilder::Return() { Instr* i = AppendInstr(OPCODE_RETURN_info, 0); i->src1.value = i->src2.value = i->src3.value = NULL; @@ -578,6 +675,12 @@ void HIRBuilder::ReturnTrue(Value* cond) { EndBlock(); } +void HIRBuilder::SetReturnAddress(Value* value) { + Instr* i = AppendInstr(OPCODE_SET_RETURN_ADDRESS_info, 0); + i->set_src1(value); + i->src2.value = i->src3.value = NULL; +} + void HIRBuilder::Branch(Label* label, uint32_t branch_flags) { Instr* i = AppendInstr(OPCODE_BRANCH_info, branch_flags); i->src1.label = label; @@ -870,6 +973,28 @@ Value* HIRBuilder::LoadClock() { return i->dest; } +Value* HIRBuilder::AllocLocal(TypeName type) { + Value* slot = AllocValue(type); + locals_.push_back(slot); + return slot; +} + +Value* HIRBuilder::LoadLocal(Value* slot) { + Instr* i = AppendInstr( + OPCODE_LOAD_LOCAL_info, 0, + AllocValue(slot->type)); + i->set_src1(slot); + i->src2.value = i->src3.value = NULL; + return i->dest; +} + 
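AddEdge threads one Edge object onto two intrusive lists at once: head insertion into the source block's outgoing list and the destination block's incoming list. The prev pointers exist in the struct but are left NULL by this routine, which is fine as long as the lists are only walked through the next links, as Dump does. A heap-allocated miniature (the real code allocates from the arena):

```cpp
#include <cstdint>

struct Block;

struct Edge {
  Edge* outgoing_next; Edge* outgoing_prev;
  Edge* incoming_next; Edge* incoming_prev;
  Block* src; Block* dest;
  uint32_t flags;
};

struct Block {
  Edge* incoming_edge_head = nullptr;
  Edge* outgoing_edge_head = nullptr;
};

Edge* AddEdge(Block* src, Block* dest, uint32_t flags) {
  Edge* e = new Edge{};
  e->src = src; e->dest = dest; e->flags = flags;
  e->outgoing_next = src->outgoing_edge_head;   // push on src's out-list
  src->outgoing_edge_head = e;
  e->incoming_next = dest->incoming_edge_head;  // push on dest's in-list
  dest->incoming_edge_head = e;
  return e;
}

int main() {
  Block a, b;
  AddEdge(&a, &b, /*flags=*/1);
  // The single edge is visible from both ends.
  return a.outgoing_edge_head == b.incoming_edge_head ? 0 : 1;
}
```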
+void HIRBuilder::StoreLocal(Value* slot, Value* value) { + Instr* i = AppendInstr(OPCODE_STORE_LOCAL_info, 0); + i->set_src1(slot); + i->set_src2(value); + i->src3.value = NULL; +} + Value* HIRBuilder::LoadContext(size_t offset, TypeName type) { Instr* i = AppendInstr( OPCODE_LOAD_CONTEXT_info, 0, @@ -1631,16 +1756,19 @@ Value* HIRBuilder::Extract(Value* value, Value* index, TypeName target_type) { // TODO(benvanik): could do some of this as constants. + Value* trunc_index = index->type != INT8_TYPE ? + Truncate(index, INT8_TYPE) : index; + Instr* i = AppendInstr( OPCODE_EXTRACT_info, 0, AllocValue(target_type)); i->set_src1(value); - i->set_src2(ZeroExtend(index, INT64_TYPE)); + i->set_src2(trunc_index); i->src3.value = NULL; return i->dest; } -Value* HIRBuilder::Extract(Value* value, uint64_t index, +Value* HIRBuilder::Extract(Value* value, uint8_t index, TypeName target_type) { return Extract(value, LoadConstant(index), target_type); } diff --git a/src/alloy/hir/hir_builder.h b/src/alloy/hir/hir_builder.h index 05fa632de..6568a5a49 100644 --- a/src/alloy/hir/hir_builder.h +++ b/src/alloy/hir/hir_builder.h @@ -35,13 +35,19 @@ public: virtual int Finalize(); void Dump(StringBuffer* str); + void AssertNoCycles(); Arena* arena() const { return arena_; } uint32_t attributes() const { return attributes_; } void set_attributes(uint32_t value) { attributes_ = value; } + std::vector& locals() { return locals_; } + + uint32_t max_value_ordinal() const { return next_value_ordinal_; } + Block* first_block() const { return block_head_; } + Block* last_block() const { return block_tail_; } Block* current_block() const; Instr* last_instr() const; @@ -50,12 +56,11 @@ public: void InsertLabel(Label* label, Instr* prev_instr); void ResetLabelTags(); + void AddEdge(Block* src, Block* dest, uint32_t flags); + // static allocations: // Value* AllocStatic(size_t length); - // stack allocations: - // Value* AllocLocal(TypeName type); - void Comment(const char* format, ...); void Nop(); @@ -74,8 +79,10 @@ public: uint32_t call_flags = 0); void CallIndirect(Value* value, uint32_t call_flags = 0); void CallIndirectTrue(Value* cond, Value* value, uint32_t call_flags = 0); + void CallExtern(runtime::FunctionInfo* symbol_info); void Return(); void ReturnTrue(Value* cond); + void SetReturnAddress(Value* value); void Branch(Label* label, uint32_t branch_flags = 0); void Branch(Block* block, uint32_t branch_flags = 0); @@ -115,6 +122,10 @@ public: Value* LoadClock(); + Value* AllocLocal(TypeName type); + Value* LoadLocal(Value* slot); + void StoreLocal(Value* slot, Value* value); + Value* LoadContext(size_t offset, TypeName type); void StoreContext(size_t offset, Value* value); @@ -186,7 +197,7 @@ public: Value* Insert(Value* value, Value* index, Value* part); Value* Insert(Value* value, uint64_t index, Value* part); Value* Extract(Value* value, Value* index, TypeName target_type); - Value* Extract(Value* value, uint64_t index, TypeName target_type); + Value* Extract(Value* value, uint8_t index, TypeName target_type); // i8->i16/i32/... (i8|i8 / i8|i8|i8|i8 / ...) 
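The new locals API (AllocLocal / LoadLocal / StoreLocal) makes stack slots explicit HIR objects: a slot is itself a Value that LOAD_LOCAL and STORE_LOCAL reference through src1. A toy stand-in for the builder, just enough to show the intended call pattern, with spill-around-a-call as a hypothetical use case:

```cpp
#include <cstdio>
#include <vector>

// Toy builder: a local slot is just another Value; loads/stores name it.
struct Value { int type; int ordinal; };

struct Builder {
  std::vector<Value*> locals;  // surfaced via locals() for later passes
  int next_ordinal = 0;
  Value* NewValue(int type) { return new Value{type, next_ordinal++}; }
  Value* AllocLocal(int type) {
    Value* slot = NewValue(type);
    locals.push_back(slot);
    return slot;
  }
  void StoreLocal(Value* slot, Value* v) {
    std::printf("store_local v%d <- v%d\n", slot->ordinal, v->ordinal);
  }
  Value* LoadLocal(Value* slot) {
    std::printf("load_local v%d\n", slot->ordinal);
    return NewValue(slot->type);
  }
};

int main() {
  Builder f;
  Value* v = f.NewValue(/*INT64_TYPE*/ 3);
  Value* slot = f.AllocLocal(v->type);  // reserve a stack slot
  f.StoreLocal(slot, v);                // hypothetical spill before a call
  f.LoadLocal(slot);                    // reload afterwards
  return 0;  // (the sketch leaks; the real builder uses an arena)
}
```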
// i8/i16/i32 -> vec128 Value* Splat(Value* value, TypeName target_type); @@ -229,6 +240,8 @@ protected: uint32_t next_label_id_; uint32_t next_value_ordinal_; + std::vector locals_; + Block* block_head_; Block* block_tail_; Block* current_block_; diff --git a/src/alloy/hir/instr.cc b/src/alloy/hir/instr.cc index 35349f28e..dc489ef4b 100644 --- a/src/alloy/hir/instr.cc +++ b/src/alloy/hir/instr.cc @@ -48,17 +48,34 @@ void Instr::set_src3(Value* value) { src3_use = value ? value->AddUse(block->arena, this) : NULL; } -bool Instr::Match(SignatureType dest_req, - SignatureType src1_req, - SignatureType src2_req, - SignatureType src3_req) const { - #define TO_SIG_TYPE(v) \ - (v ? (v->IsConstant() ? SignatureType((v->type + 1) | SIG_TYPE_C) : SignatureType(v->type + 1)) : SIG_TYPE_X) - return - ((dest_req == SIG_TYPE_IGNORE) || (dest_req == TO_SIG_TYPE(dest))) && - ((src1_req == SIG_TYPE_IGNORE) || (src1_req == TO_SIG_TYPE(src1.value))) && - ((src2_req == SIG_TYPE_IGNORE) || (src2_req == TO_SIG_TYPE(src2.value))) && - ((src3_req == SIG_TYPE_IGNORE) || (src3_req == TO_SIG_TYPE(src3.value))); +void Instr::MoveBefore(Instr* other) { + if (next == other) { + return; + } + + // Remove from current location. + if (prev) { + prev->next = next; + } else { + block->instr_head = next; + } + if (next) { + next->prev = prev; + } else { + block->instr_tail = prev; + } + + // Insert into new location. + block = other->block; + next = other; + prev = other->prev; + other->prev = this; + if (prev) { + prev->next = this; + } + if (other == block->instr_head) { + block->instr_head = this; + } } void Instr::Replace(const OpcodeInfo* opcode, uint16_t flags) { diff --git a/src/alloy/hir/instr.h b/src/alloy/hir/instr.h index 42b3c36bf..b128c534a 100644 --- a/src/alloy/hir/instr.h +++ b/src/alloy/hir/instr.h @@ -24,26 +24,6 @@ namespace hir { class Block; class Label; -enum SignatureType { - SIG_TYPE_X = 0, - SIG_TYPE_I8 = 1, - SIG_TYPE_I16 = 2, - SIG_TYPE_I32 = 3, - SIG_TYPE_I64 = 4, - SIG_TYPE_F32 = 5, - SIG_TYPE_F64 = 6, - SIG_TYPE_V128 = 7, - SIG_TYPE_C = (1 << 3), - SIG_TYPE_I8C = SIG_TYPE_C | SIG_TYPE_I8, - SIG_TYPE_I16C = SIG_TYPE_C | SIG_TYPE_I16, - SIG_TYPE_I32C = SIG_TYPE_C | SIG_TYPE_I32, - SIG_TYPE_I64C = SIG_TYPE_C | SIG_TYPE_I64, - SIG_TYPE_F32C = SIG_TYPE_C | SIG_TYPE_F32, - SIG_TYPE_F64C = SIG_TYPE_C | SIG_TYPE_F64, - SIG_TYPE_V128C = SIG_TYPE_C | SIG_TYPE_V128, - SIG_TYPE_IGNORE = 0xFF, -}; - class Instr { public: Block* block; @@ -52,7 +32,7 @@ public: const OpcodeInfo* opcode; uint16_t flags; - uint16_t ordinal; + uint32_t ordinal; typedef union { runtime::FunctionInfo* symbol_info; @@ -74,11 +54,7 @@ public: void set_src2(Value* value); void set_src3(Value* value); - bool Match(SignatureType dest = SIG_TYPE_X, - SignatureType src1 = SIG_TYPE_X, - SignatureType src2 = SIG_TYPE_X, - SignatureType src3 = SIG_TYPE_X) const; - + void MoveBefore(Instr* other); void Replace(const OpcodeInfo* opcode, uint16_t flags); void Remove(); }; diff --git a/src/alloy/hir/opcodes.h b/src/alloy/hir/opcodes.h index 9fdcd311e..b52e7b55d 100644 --- a/src/alloy/hir/opcodes.h +++ b/src/alloy/hir/opcodes.h @@ -18,7 +18,8 @@ namespace hir { enum CallFlags { - CALL_TAIL = (1 << 1), + CALL_TAIL = (1 << 1), + CALL_POSSIBLE_RETURN = (1 << 2), }; enum BranchFlags { BRANCH_LIKELY = (1 << 1), @@ -94,8 +95,10 @@ enum Opcode { OPCODE_CALL_TRUE, OPCODE_CALL_INDIRECT, OPCODE_CALL_INDIRECT_TRUE, + OPCODE_CALL_EXTERN, OPCODE_RETURN, OPCODE_RETURN_TRUE, + OPCODE_SET_RETURN_ADDRESS, OPCODE_BRANCH, OPCODE_BRANCH_TRUE, @@ -116,6 +119,9 @@ 
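Instr::MoveBefore, added in the instr.cc hunk above, is standard doubly-linked list surgery: unlink from the current position, patching the block's head/tail as needed, then splice in directly ahead of the target. The same operation on a plain list, with an assertion-checked example:

```cpp
#include <cassert>

struct Node {
  Node* prev = nullptr;
  Node* next = nullptr;
};

struct List {
  Node* head = nullptr;
  Node* tail = nullptr;
  void Append(Node* n) {
    n->prev = tail; n->next = nullptr;
    if (tail) tail->next = n; else head = n;
    tail = n;
  }
  void MoveBefore(Node* n, Node* other) {
    if (n->next == other) return;  // already in place
    // Unlink, fixing head/tail.
    if (n->prev) n->prev->next = n->next; else head = n->next;
    if (n->next) n->next->prev = n->prev; else tail = n->prev;
    // Relink directly ahead of other.
    n->prev = other->prev;
    n->next = other;
    if (other->prev) other->prev->next = n; else head = n;
    other->prev = n;
  }
};

int main() {
  Node a, b, c;
  List l;
  l.Append(&a); l.Append(&b); l.Append(&c);
  l.MoveBefore(&c, &a);  // order becomes c, a, b
  assert(l.head == &c && c.next == &a && l.tail == &b);
  return 0;
}
```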
enum Opcode { OPCODE_LOAD_CLOCK, + OPCODE_LOAD_LOCAL, + OPCODE_STORE_LOCAL, + OPCODE_LOAD_CONTEXT, OPCODE_STORE_CONTEXT, @@ -201,6 +207,7 @@ enum OpcodeFlags { OPCODE_FLAG_VOLATILE = (1 << 4), OPCODE_FLAG_IGNORE = (1 << 5), OPCODE_FLAG_HIDE = (1 << 6), + OPCODE_FLAG_PAIRED_PREV = (1 << 7), }; enum OpcodeSignatureType { diff --git a/src/alloy/hir/opcodes.inl b/src/alloy/hir/opcodes.inl index abdea12db..deb789675 100644 --- a/src/alloy/hir/opcodes.inl +++ b/src/alloy/hir/opcodes.inl @@ -11,566 +11,590 @@ DEFINE_OPCODE( OPCODE_COMMENT, "comment", - OPCODE_SIG_X, - OPCODE_FLAG_IGNORE); + OPCODE_SIG_X_O, + OPCODE_FLAG_IGNORE) DEFINE_OPCODE( OPCODE_NOP, "nop", OPCODE_SIG_X, - OPCODE_FLAG_IGNORE); + OPCODE_FLAG_IGNORE) DEFINE_OPCODE( OPCODE_SOURCE_OFFSET, "source_offset", OPCODE_SIG_X_O, - OPCODE_FLAG_IGNORE | OPCODE_FLAG_HIDE); + OPCODE_FLAG_IGNORE | OPCODE_FLAG_HIDE) DEFINE_OPCODE( OPCODE_DEBUG_BREAK, "debug_break", OPCODE_SIG_X, - OPCODE_FLAG_VOLATILE); + OPCODE_FLAG_VOLATILE) DEFINE_OPCODE( OPCODE_DEBUG_BREAK_TRUE, "debug_break_true", OPCODE_SIG_X_V, - OPCODE_FLAG_VOLATILE); + OPCODE_FLAG_VOLATILE) DEFINE_OPCODE( OPCODE_TRAP, "trap", OPCODE_SIG_X, - OPCODE_FLAG_VOLATILE); + OPCODE_FLAG_VOLATILE) DEFINE_OPCODE( OPCODE_TRAP_TRUE, "trap_true", OPCODE_SIG_X_V, - OPCODE_FLAG_VOLATILE); + OPCODE_FLAG_VOLATILE) DEFINE_OPCODE( OPCODE_CALL, "call", OPCODE_SIG_X_S, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_CALL_TRUE, "call_true", OPCODE_SIG_X_V_S, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_CALL_INDIRECT, "call_indirect", OPCODE_SIG_X_V, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_CALL_INDIRECT_TRUE, "call_indirect_true", OPCODE_SIG_X_V_V, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) + +DEFINE_OPCODE( + OPCODE_CALL_EXTERN, + "call_extern", + OPCODE_SIG_X_S, + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_RETURN, "return", OPCODE_SIG_X, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_RETURN_TRUE, "return_true", OPCODE_SIG_X_V, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) + +DEFINE_OPCODE( + OPCODE_SET_RETURN_ADDRESS, + "set_return_address", + OPCODE_SIG_X_V, + 0) DEFINE_OPCODE( OPCODE_BRANCH, "branch", OPCODE_SIG_X_L, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_BRANCH_TRUE, "branch_true", OPCODE_SIG_X_V_L, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_BRANCH_FALSE, "branch_false", OPCODE_SIG_X_V_L, - OPCODE_FLAG_BRANCH); + OPCODE_FLAG_BRANCH) DEFINE_OPCODE( OPCODE_ASSIGN, "assign", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_CAST, "cast", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_ZERO_EXTEND, "zero_extend", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SIGN_EXTEND, "sign_extend", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_TRUNCATE, "truncate", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_CONVERT, "convert", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_ROUND, "round", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_CONVERT_I2F, "vector_convert_i2f", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_CONVERT_F2I, "vector_convert_f2i", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_LOAD_VECTOR_SHL, "load_vector_shl", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_LOAD_VECTOR_SHR, "load_vector_shr", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_LOAD_CLOCK, "load_clock", OPCODE_SIG_V, - 0); + 0) + +DEFINE_OPCODE( + OPCODE_LOAD_LOCAL, + "load_local", + OPCODE_SIG_V_V, + 0) + +DEFINE_OPCODE( + OPCODE_STORE_LOCAL, + "store_local", + 
OPCODE_SIG_X_V_V, + 0) DEFINE_OPCODE( OPCODE_LOAD_CONTEXT, "load_context", OPCODE_SIG_V_O, - 0); + 0) DEFINE_OPCODE( OPCODE_STORE_CONTEXT, "store_context", OPCODE_SIG_X_O_V, - 0); + 0) DEFINE_OPCODE( OPCODE_LOAD, "load", OPCODE_SIG_V_V, - OPCODE_FLAG_MEMORY); + OPCODE_FLAG_MEMORY) DEFINE_OPCODE( OPCODE_STORE, "store", OPCODE_SIG_X_V_V, - OPCODE_FLAG_MEMORY); + OPCODE_FLAG_MEMORY) DEFINE_OPCODE( OPCODE_PREFETCH, "prefetch", OPCODE_SIG_X_V_O, - 0); + 0) DEFINE_OPCODE( OPCODE_MAX, "max", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_MIN, "min", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SELECT, "select", OPCODE_SIG_V_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_IS_TRUE, "is_true", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_IS_FALSE, "is_false", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_EQ, "compare_eq", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_COMPARE_NE, "compare_ne", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_COMPARE_SLT, "compare_slt", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_SLE, "compare_sle", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_SGT, "compare_sgt", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_SGE, "compare_sge", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_ULT, "compare_ult", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_ULE, "compare_ule", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_UGT, "compare_ugt", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_UGE, "compare_uge", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_DID_CARRY, "did_carry", OPCODE_SIG_V_V, - 0); + OPCODE_FLAG_PAIRED_PREV) DEFINE_OPCODE( OPCODE_DID_OVERFLOW, "did_overflow", OPCODE_SIG_V_V, - 0); + OPCODE_FLAG_PAIRED_PREV) DEFINE_OPCODE( OPCODE_DID_SATURATE, "did_saturate", OPCODE_SIG_V_V, - 0); + OPCODE_FLAG_PAIRED_PREV) DEFINE_OPCODE( OPCODE_VECTOR_COMPARE_EQ, "vector_compare_eq", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_VECTOR_COMPARE_SGT, "vector_compare_sgt", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_COMPARE_SGE, "vector_compare_sge", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_COMPARE_UGT, "vector_compare_ugt", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_COMPARE_UGE, "vector_compare_uge", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_ADD, "add", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_ADD_CARRY, "add_carry", OPCODE_SIG_V_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + 0) DEFINE_OPCODE( OPCODE_VECTOR_ADD, "vector_add", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_SUB, "sub", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_MUL, "mul", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_MUL_HI, "mul_hi", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_DIV, "div", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_MUL_ADD, "mul_add", OPCODE_SIG_V_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_MUL_SUB, "mul_sub", OPCODE_SIG_V_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_NEG, "neg", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_ABS, "abs", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SQRT, "sqrt", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_RSQRT, "rsqrt", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( 
OPCODE_POW2, "pow2", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_LOG2, "log2", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_DOT_PRODUCT_3, "dot_product_3", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_DOT_PRODUCT_4, "dot_product_4", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_AND, "and", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_OR, "or", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_XOR, "xor", OPCODE_SIG_V_V_V, - OPCODE_FLAG_COMMUNATIVE); + OPCODE_FLAG_COMMUNATIVE) DEFINE_OPCODE( OPCODE_NOT, "not", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SHL, "shl", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_SHL, "vector_shl", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SHR, "shr", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_SHR, "vector_shr", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SHA, "sha", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_VECTOR_SHA, "vector_sha", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_ROTATE_LEFT, "rotate_left", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_BYTE_SWAP, "byte_swap", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_CNTLZ, "cntlz", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_INSERT, "insert", OPCODE_SIG_V_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_EXTRACT, "extract", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SPLAT, "splat", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_PERMUTE, "permute", OPCODE_SIG_V_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_SWIZZLE, "swizzle", OPCODE_SIG_V_V_O, - 0); + 0) DEFINE_OPCODE( OPCODE_PACK, "pack", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_UNPACK, "unpack", OPCODE_SIG_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_COMPARE_EXCHANGE, "compare_exchange", OPCODE_SIG_V_V_V_V, - OPCODE_FLAG_VOLATILE); + OPCODE_FLAG_VOLATILE) DEFINE_OPCODE( OPCODE_ATOMIC_EXCHANGE, "atomic_exchange", OPCODE_SIG_V_V_V, - OPCODE_FLAG_VOLATILE); + OPCODE_FLAG_VOLATILE) DEFINE_OPCODE( OPCODE_ATOMIC_ADD, "atomic_add", OPCODE_SIG_V_V_V, - 0); + 0) DEFINE_OPCODE( OPCODE_ATOMIC_SUB, "atomic_sub", OPCODE_SIG_V_V_V, - 0); + 0) diff --git a/src/alloy/hir/sources.gypi b/src/alloy/hir/sources.gypi index 948b43dd8..1ea2d7783 100644 --- a/src/alloy/hir/sources.gypi +++ b/src/alloy/hir/sources.gypi @@ -1,6 +1,7 @@ # Copyright 2013 Ben Vanik. All Rights Reserved. 
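The sweeping `);` to `)` change across opcodes.inl turns each DEFINE_OPCODE entry into a bare macro invocation, which is the usual X-macro discipline: the file is presumably re-included under different DEFINE_OPCODE definitions, and some expansions (enum entries, table initializers) cannot tolerate a stray semicolon. The pattern in miniature, with a made-up three-opcode list:

```cpp
#include <cstdio>

// One list, no punctuation of its own; each expansion supplies its own.
#define OPCODE_LIST(X) \
  X(NOP,  "nop")       \
  X(ADD,  "add")       \
  X(LOAD, "load")

// Expansion 1: an enum.
#define AS_ENUM(id, name) OPCODE_##id,
enum Opcode { OPCODE_LIST(AS_ENUM) OPCODE_MAX_VALUE };
#undef AS_ENUM

// Expansion 2: a name table; a trailing ';' inside the list would break this.
#define AS_NAME(id, name) name,
static const char* kOpcodeNames[] = { OPCODE_LIST(AS_NAME) };
#undef AS_NAME

int main() {
  std::printf("%d opcodes, first is %s\n",
              (int)OPCODE_MAX_VALUE, kOpcodeNames[0]);
  return 0;
}
```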
{ 'sources': [ + 'block.cc', 'block.h', 'hir_builder.cc', 'hir_builder.h', diff --git a/src/alloy/hir/value.cc b/src/alloy/hir/value.cc index a684c6f2b..0f723e943 100644 --- a/src/alloy/hir/value.cc +++ b/src/alloy/hir/value.cc @@ -187,19 +187,26 @@ void Value::Round(RoundMode round_mode) { XEASSERTALWAYS(); } -void Value::Add(Value* other) { +bool Value::Add(Value* other) { + #define CHECK_DID_CARRY(v1, v2) (((uint64_t)v2) > ~((uint64_t)v1)) + #define ADD_DID_CARRY(a, b) CHECK_DID_CARRY(a, b) XEASSERT(type == other->type); + bool did_carry = false; switch (type) { case INT8_TYPE: + did_carry = ADD_DID_CARRY(constant.i8, other->constant.i8); constant.i8 += other->constant.i8; break; case INT16_TYPE: + did_carry = ADD_DID_CARRY(constant.i16, other->constant.i16); constant.i16 += other->constant.i16; break; case INT32_TYPE: + did_carry = ADD_DID_CARRY(constant.i32, other->constant.i32); constant.i32 += other->constant.i32; break; case INT64_TYPE: + did_carry = ADD_DID_CARRY(constant.i64, other->constant.i64); constant.i64 += other->constant.i64; break; case FLOAT32_TYPE: @@ -212,21 +219,28 @@ void Value::Add(Value* other) { XEASSERTALWAYS(); break; } + return did_carry; } -void Value::Sub(Value* other) { +bool Value::Sub(Value* other) { + #define SUB_DID_CARRY(a, b) (b > a) XEASSERT(type == other->type); + bool did_carry = false; switch (type) { case INT8_TYPE: + did_carry = SUB_DID_CARRY(constant.i8, other->constant.i8); constant.i8 -= other->constant.i8; break; case INT16_TYPE: + did_carry = SUB_DID_CARRY(constant.i16, other->constant.i16); constant.i16 -= other->constant.i16; break; case INT32_TYPE: + did_carry = SUB_DID_CARRY(constant.i32, other->constant.i32); constant.i32 -= other->constant.i32; break; case INT64_TYPE: + did_carry = SUB_DID_CARRY(constant.i64, other->constant.i64); constant.i64 -= other->constant.i64; break; case FLOAT32_TYPE: @@ -239,6 +253,7 @@ void Value::Sub(Value* other) { XEASSERTALWAYS(); break; } + return did_carry; } void Value::Mul(Value* other) { @@ -560,6 +575,26 @@ void Value::ByteSwap() { } } +void Value::CountLeadingZeros(const Value* other) { + switch (other->type) { + case INT8_TYPE: + constant.i8 = static_cast(__lzcnt16(other->constant.i8) - 8); + break; + case INT16_TYPE: + constant.i8 = static_cast(__lzcnt16(other->constant.i16)); + break; + case INT32_TYPE: + constant.i8 = static_cast(__lzcnt(other->constant.i32)); + break; + case INT64_TYPE: + constant.i8 = static_cast(__lzcnt64(other->constant.i64)); + break; + default: + XEASSERTALWAYS(); + break; + } +} + bool Value::Compare(Opcode opcode, Value* other) { // TODO(benvanik): big matrix. 
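Value::Add and Value::Sub now report whether the constant-folded operation carried, using the predicate from CHECK_DID_CARRY: unsigned a + b wraps exactly when b > ~a, because ~a is the headroom left below the maximum. Subtraction borrows exactly when b > a. Both checks in isolation, with no wider type needed:

```cpp
#include <cstdint>
#include <cstdio>

// a + b overflows 64 bits iff b > UINT64_MAX - a, and UINT64_MAX - a == ~a.
static bool AddDidCarry(uint64_t a, uint64_t b) { return b > ~a; }

// a - b borrows iff the subtrahend is larger.
static bool SubDidCarry(uint64_t a, uint64_t b) { return b > a; }

int main() {
  std::printf("%d\n", AddDidCarry(~0ull, 1));  // 1: wraps to 0
  std::printf("%d\n", AddDidCarry(5, 10));     // 0
  std::printf("%d\n", SubDidCarry(3, 7));      // 1: 3 - 7 borrows
  return 0;
}
```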
XEASSERTALWAYS(); diff --git a/src/alloy/hir/value.h b/src/alloy/hir/value.h index f1d4b1d37..3c4e82619 100644 --- a/src/alloy/hir/value.h +++ b/src/alloy/hir/value.h @@ -11,6 +11,7 @@ #define ALLOY_HIR_VALUE_H_ #include +#include #include @@ -34,7 +35,32 @@ enum TypeName { }; static bool IsIntType(TypeName type_name) { - return type_name < 4; + return type_name <= INT64_TYPE; +} +static bool IsFloatType(TypeName type_name) { + return type_name == FLOAT32_TYPE || type_name == FLOAT64_TYPE; +} +static bool IsVecType(TypeName type_name) { + return type_name == VEC128_TYPE; +} +static size_t GetTypeSize(TypeName type_name) { + switch (type_name) { + case INT8_TYPE: + return 1; + case INT16_TYPE: + return 2; + case INT32_TYPE: + return 4; + case INT64_TYPE: + return 8; + case FLOAT32_TYPE: + return 4; + case FLOAT64_TYPE: + return 8; + default: + case VEC128_TYPE: + return 16; + } } enum ValueFlags { @@ -42,6 +68,10 @@ enum ValueFlags { VALUE_IS_ALLOCATED = (1 << 2), // Used by backends. Do not set. }; +struct RegAssignment { + const backend::MachineInfo::RegisterSet* set; + int32_t index; +}; class Value { public: @@ -65,13 +95,14 @@ public: TypeName type; uint32_t flags; - uint32_t reg; + RegAssignment reg; ConstantValue constant; Instr* def; Use* use_head; // NOTE: for performance reasons this is not maintained during construction. Instr* last_use; + Value* local_slot; // TODO(benvanik): remove to shrink size. void* tag; @@ -158,25 +189,26 @@ public: } bool IsConstantTrue() const { if (type == VEC128_TYPE) { - return false; + XEASSERTALWAYS(); } return (flags & VALUE_IS_CONSTANT) && !!constant.i64; } bool IsConstantFalse() const { if (type == VEC128_TYPE) { - return false; + XEASSERTALWAYS(); } return (flags & VALUE_IS_CONSTANT) && !constant.i64; } bool IsConstantZero() const { if (type == VEC128_TYPE) { - return false; + return (flags & VALUE_IS_CONSTANT) && + !constant.v128.low && !constant.v128.high; } return (flags & VALUE_IS_CONSTANT) && !constant.i64; } bool IsConstantEQ(Value* other) const { if (type == VEC128_TYPE) { - return false; + XEASSERTALWAYS(); } return (flags & VALUE_IS_CONSTANT) && (other->flags & VALUE_IS_CONSTANT) && @@ -184,12 +216,156 @@ public: } bool IsConstantNE(Value* other) const { if (type == VEC128_TYPE) { - return false; + XEASSERTALWAYS(); } return (flags & VALUE_IS_CONSTANT) && (other->flags & VALUE_IS_CONSTANT) && constant.i64 != other->constant.i64; } + bool IsConstantSLT(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return constant.i8 < other->constant.i8; + case INT16_TYPE: + return constant.i16 < other->constant.i16; + case INT32_TYPE: + return constant.i32 < other->constant.i32; + case INT64_TYPE: + return constant.i64 < other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 < other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 < other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } + bool IsConstantSLE(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return constant.i8 <= other->constant.i8; + case INT16_TYPE: + return constant.i16 <= other->constant.i16; + case INT32_TYPE: + return constant.i32 <= other->constant.i32; + case INT64_TYPE: + return constant.i64 <= other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 <= other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 <= other->constant.f64; + default: XEASSERTALWAYS(); 
return false; + } + } + bool IsConstantSGT(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return constant.i8 > other->constant.i8; + case INT16_TYPE: + return constant.i16 > other->constant.i16; + case INT32_TYPE: + return constant.i32 > other->constant.i32; + case INT64_TYPE: + return constant.i64 > other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 > other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 > other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } + bool IsConstantSGE(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return constant.i8 >= other->constant.i8; + case INT16_TYPE: + return constant.i16 >= other->constant.i16; + case INT32_TYPE: + return constant.i32 >= other->constant.i32; + case INT64_TYPE: + return constant.i64 >= other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 >= other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 >= other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } + bool IsConstantULT(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return (uint8_t)constant.i8 < (uint8_t)other->constant.i8; + case INT16_TYPE: + return (uint16_t)constant.i16 < (uint16_t)other->constant.i16; + case INT32_TYPE: + return (uint32_t)constant.i32 < (uint32_t)other->constant.i32; + case INT64_TYPE: + return (uint64_t)constant.i64 < (uint64_t)other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 < other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 < other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } + bool IsConstantULE(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return (uint8_t)constant.i8 <= (uint8_t)other->constant.i8; + case INT16_TYPE: + return (uint16_t)constant.i16 <= (uint16_t)other->constant.i16; + case INT32_TYPE: + return (uint32_t)constant.i32 <= (uint32_t)other->constant.i32; + case INT64_TYPE: + return (uint64_t)constant.i64 <= (uint64_t)other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 <= other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 <= other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } + bool IsConstantUGT(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return (uint8_t)constant.i8 > (uint8_t)other->constant.i8; + case INT16_TYPE: + return (uint16_t)constant.i16 > (uint16_t)other->constant.i16; + case INT32_TYPE: + return (uint32_t)constant.i32 > (uint32_t)other->constant.i32; + case INT64_TYPE: + return (uint64_t)constant.i64 > (uint64_t)other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 > other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 > other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } + bool IsConstantUGE(Value* other) const { + XEASSERT(flags & VALUE_IS_CONSTANT && other->flags & VALUE_IS_CONSTANT); + switch (type) { + case INT8_TYPE: + return (uint8_t)constant.i8 >= (uint8_t)other->constant.i8; + case INT16_TYPE: + return (uint16_t)constant.i16 >= (uint16_t)other->constant.i16; + case INT32_TYPE: + return (uint32_t)constant.i32 >= (uint32_t)other->constant.i32; + case INT64_TYPE: + return 
(uint64_t)constant.i64 >= (uint64_t)other->constant.i64; + case FLOAT32_TYPE: + return constant.f32 >= other->constant.f32; + case FLOAT64_TYPE: + return constant.f64 >= other->constant.f64; + default: XEASSERTALWAYS(); return false; + } + } uint32_t AsUint32(); uint64_t AsUint64(); @@ -199,8 +375,8 @@ public: void Truncate(TypeName target_type); void Convert(TypeName target_type, RoundMode round_mode); void Round(RoundMode round_mode); - void Add(Value* other); - void Sub(Value* other); + bool Add(Value* other); + bool Sub(Value* other); void Mul(Value* other); void Div(Value* other); static void MulAdd(Value* dest, Value* value1, Value* value2, Value* value3); @@ -217,6 +393,7 @@ public: void Shr(Value* other); void Sha(Value* other); void ByteSwap(); + void CountLeadingZeros(const Value* other); bool Compare(Opcode opcode, Value* other); }; diff --git a/src/alloy/memory.cc b/src/alloy/memory.cc index 4948d22d6..2933392dd 100644 --- a/src/alloy/memory.cc +++ b/src/alloy/memory.cc @@ -9,14 +9,22 @@ #include +#if !XE_LIKE_WIN32 +#include +#endif + using namespace alloy; Memory::Memory() : - membase_(0) { + membase_(0), reserve_address_(0) { +#if XE_LIKE_WIN32 SYSTEM_INFO si; GetSystemInfo(&si); system_page_size_ = si.dwPageSize; +#else + system_page_size_ = getpagesize(); +#endif } Memory::~Memory() { diff --git a/src/alloy/memory.h b/src/alloy/memory.h index 9fa8c11fd..72719cc4a 100644 --- a/src/alloy/memory.h +++ b/src/alloy/memory.h @@ -34,6 +34,8 @@ public: }; inline uint32_t* reserve_address() { return &reserve_address_; } + virtual uint64_t page_table() const = 0; + virtual int Initialize(); void Zero(uint64_t address, size_t size); @@ -43,6 +45,15 @@ public: uint64_t SearchAligned(uint64_t start, uint64_t end, const uint32_t* values, size_t value_count); + virtual uint8_t LoadI8(uint64_t address) = 0; + virtual uint16_t LoadI16(uint64_t address) = 0; + virtual uint32_t LoadI32(uint64_t address) = 0; + virtual uint64_t LoadI64(uint64_t address) = 0; + virtual void StoreI8(uint64_t address, uint8_t value) = 0; + virtual void StoreI16(uint64_t address, uint16_t value) = 0; + virtual void StoreI32(uint64_t address, uint32_t value) = 0; + virtual void StoreI64(uint64_t address, uint64_t value) = 0; + virtual uint64_t HeapAlloc( uint64_t base_address, size_t size, uint32_t flags, uint32_t alignment = 0x20) = 0; diff --git a/src/alloy/runtime/debug_info.cc b/src/alloy/runtime/debug_info.cc index b7b060ef8..ad6056eec 100644 --- a/src/alloy/runtime/debug_info.cc +++ b/src/alloy/runtime/debug_info.cc @@ -62,7 +62,7 @@ SourceMapEntry* DebugInfo::LookupHIROffset(uint64_t offset) { SourceMapEntry* DebugInfo::LookupCodeOffset(uint64_t offset) { // TODO(benvanik): binary search? We know the list is sorted by code order. 
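The IsConstantU* helpers above cast both sides to the same-width unsigned type before comparing because the constants live in signed fields, and a plain signed compare would rank 0x80 below 0x7F at 8 bits. The difference in two lines:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  int8_t a = static_cast<int8_t>(0x80);  // -128 signed, 128 unsigned
  int8_t b = 0x7F;                       // 127 either way
  std::printf("signed:   a < b -> %d\n", a < b);  // 1
  std::printf("unsigned: a < b -> %d\n",
              static_cast<uint8_t>(a) < static_cast<uint8_t>(b));  // 0
  return 0;
}
```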
- for (int n = source_map_count_ - 1; n >= 0; n--) { + for (int64_t n = source_map_count_ - 1; n >= 0; n--) { auto entry = &source_map_[n]; if (entry->code_offset <= offset) { return entry; diff --git a/src/alloy/runtime/entry_table.cc b/src/alloy/runtime/entry_table.cc index cf6da5d70..ebec56ea4 100644 --- a/src/alloy/runtime/entry_table.cc +++ b/src/alloy/runtime/entry_table.cc @@ -75,6 +75,8 @@ Entry::Status EntryTable::GetOrCreate(uint64_t address, Entry** out_entry) { } std::vector EntryTable::FindWithAddress(uint64_t address) { + SCOPE_profile_cpu_f("alloy"); + std::vector fns; LockMutex(lock_); for (auto it = map_.begin(); it != map_.end(); ++it) { diff --git a/src/alloy/runtime/entry_table.h b/src/alloy/runtime/entry_table.h index e9f1ca9f2..acbabc26e 100644 --- a/src/alloy/runtime/entry_table.h +++ b/src/alloy/runtime/entry_table.h @@ -47,7 +47,7 @@ public: private: // TODO(benvanik): replace with a better data structure. Mutex* lock_; - typedef std::tr1::unordered_map EntryMap; + typedef std::unordered_map EntryMap; EntryMap map_; }; diff --git a/src/alloy/runtime/function.cc b/src/alloy/runtime/function.cc index c09d3b929..2dd0ddce5 100644 --- a/src/alloy/runtime/function.cc +++ b/src/alloy/runtime/function.cc @@ -17,8 +17,9 @@ using namespace alloy; using namespace alloy::runtime; -Function::Function(Type type, uint64_t address) : - type_(type), address_(address), debug_info_(0) { +Function::Function(FunctionInfo* symbol_info) : + address_(symbol_info->address()), + symbol_info_(symbol_info), debug_info_(0) { // TODO(benvanik): create on demand? lock_ = AllocMutex(); } @@ -72,48 +73,34 @@ Breakpoint* Function::FindBreakpoint(uint64_t address) { return result; } -int Function::Call(ThreadState* thread_state) { +int Function::Call(ThreadState* thread_state, uint64_t return_address) { + SCOPE_profile_cpu_f("alloy"); + ThreadState* original_thread_state = ThreadState::Get(); if (original_thread_state != thread_state) { ThreadState::Bind(thread_state); } - int result = CallImpl(thread_state); + + int result = 0; + + if (symbol_info_->behavior() == FunctionInfo::BEHAVIOR_EXTERN) { + auto handler = symbol_info_->extern_handler(); + if (handler) { + handler(thread_state->raw_context(), + symbol_info_->extern_arg0(), + symbol_info_->extern_arg1()); + } else { + XELOGW("undefined extern call to %.8X %s", + symbol_info_->address(), + symbol_info_->name()); + result = 1; + } + } else { + CallImpl(thread_state, return_address); + } + if (original_thread_state != thread_state) { ThreadState::Bind(original_thread_state); } return result; } - -ExternFunction::ExternFunction( - uint64_t address, Handler handler, void* arg0, void* arg1) : - name_(0), - handler_(handler), arg0_(arg0), arg1_(arg1), - Function(Function::EXTERN_FUNCTION, address) { -} - -ExternFunction::~ExternFunction() { - if (name_) { - xe_free(name_); - } -} - -void ExternFunction::set_name(const char* name) { - name_ = xestrdupa(name); -} - -int ExternFunction::CallImpl(ThreadState* thread_state) { - if (!handler_) { - XELOGW("undefined extern call to %.8X %s", address(), name()); - return 0; - } - handler_(thread_state->raw_context(), arg0_, arg1_); - return 0; -} - -GuestFunction::GuestFunction(FunctionInfo* symbol_info) : - symbol_info_(symbol_info), - Function(Function::USER_FUNCTION, symbol_info->address()) { -} - -GuestFunction::~GuestFunction() { -} diff --git a/src/alloy/runtime/function.h b/src/alloy/runtime/function.h index d150f91a6..22f4df0aa 100644 --- a/src/alloy/runtime/function.h +++ 
b/src/alloy/runtime/function.h @@ -24,17 +24,11 @@ class ThreadState; class Function { public: - enum Type { - UNKNOWN_FUNCTION = 0, - EXTERN_FUNCTION, - USER_FUNCTION, - }; -public: - Function(Type type, uint64_t address); + Function(FunctionInfo* symbol_info); virtual ~Function(); - Type type() const { return type_; } uint64_t address() const { return address_; } + FunctionInfo* symbol_info() const { return symbol_info_; } DebugInfo* debug_info() const { return debug_info_; } void set_debug_info(DebugInfo* debug_info) { debug_info_ = debug_info; } @@ -42,17 +36,18 @@ public: int AddBreakpoint(Breakpoint* breakpoint); int RemoveBreakpoint(Breakpoint* breakpoint); - int Call(ThreadState* thread_state); + int Call(ThreadState* thread_state, uint64_t return_address); protected: Breakpoint* FindBreakpoint(uint64_t address); virtual int AddBreakpointImpl(Breakpoint* breakpoint) { return 0; } virtual int RemoveBreakpointImpl(Breakpoint* breakpoint) { return 0; } - virtual int CallImpl(ThreadState* thread_state) = 0; + virtual int CallImpl(ThreadState* thread_state, + uint64_t return_address) = 0; protected: - Type type_; uint64_t address_; + FunctionInfo* symbol_info_; DebugInfo* debug_info_; // TODO(benvanik): move elsewhere? DebugData? @@ -61,43 +56,6 @@ protected: }; -class ExternFunction : public Function { -public: - typedef void(*Handler)(void* context, void* arg0, void* arg1); -public: - ExternFunction(uint64_t address, Handler handler, void* arg0, void* arg1); - virtual ~ExternFunction(); - - const char* name() const { return name_; } - void set_name(const char* name); - - Handler handler() const { return handler_; } - void* arg0() const { return arg0_; } - void* arg1() const { return arg1_; } - -protected: - virtual int CallImpl(ThreadState* thread_state); - -protected: - char* name_; - Handler handler_; - void* arg0_; - void* arg1_; -}; - - -class GuestFunction : public Function { -public: - GuestFunction(FunctionInfo* symbol_info); - virtual ~GuestFunction(); - - FunctionInfo* symbol_info() const { return symbol_info_; } - -protected: - FunctionInfo* symbol_info_; -}; - - } // namespace runtime } // namespace alloy diff --git a/src/alloy/runtime/module.cc b/src/alloy/runtime/module.cc index ea056e0dd..5e38c3902 100644 --- a/src/alloy/runtime/module.cc +++ b/src/alloy/runtime/module.cc @@ -161,6 +161,8 @@ SymbolInfo::Status Module::DefineVariable(VariableInfo* symbol_info) { } void Module::ForEachFunction(std::function callback) { + SCOPE_profile_cpu_f("alloy"); + LockMutex(lock_); for (auto it = list_.begin(); it != list_.end(); ++it) { SymbolInfo* symbol_info = *it; @@ -174,6 +176,8 @@ void Module::ForEachFunction(std::function callback) { void Module::ForEachFunction(size_t since, size_t& version, std::function callback) { + SCOPE_profile_cpu_f("alloy"); + LockMutex(lock_); size_t count = list_.size(); version = count; diff --git a/src/alloy/runtime/module.h b/src/alloy/runtime/module.h index 005e325a1..c05e009ca 100644 --- a/src/alloy/runtime/module.h +++ b/src/alloy/runtime/module.h @@ -62,7 +62,7 @@ protected: private: // TODO(benvanik): replace with a better data structure. 
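function.cc/.h replace the ExternFunction subclass with a behavior flag on FunctionInfo: Call checks BEHAVIOR_EXTERN and dispatches to the host handler installed by SetupExtern, falling through to the JITed body otherwise. (One thing worth a second look in the patch: the guest branch invokes CallImpl without assigning its return code to result.) The dispatch shape in miniature, with stand-in types mirroring SetupExtern/extern_handler:

```cpp
#include <cstdio>

typedef void (*ExternHandler)(void* context, void* arg0, void* arg1);

struct FunctionInfo {
  enum Behavior { BEHAVIOR_DEFAULT, BEHAVIOR_EXTERN };
  Behavior behavior = BEHAVIOR_DEFAULT;
  ExternHandler handler = nullptr;
  void* arg0 = nullptr;
  void* arg1 = nullptr;
  void SetupExtern(ExternHandler h, void* a0, void* a1) {
    behavior = BEHAVIOR_EXTERN; handler = h; arg0 = a0; arg1 = a1;
  }
};

int Call(FunctionInfo* info, void* thread_context) {
  if (info->behavior == FunctionInfo::BEHAVIOR_EXTERN) {
    if (!info->handler) return 1;  // undefined extern: fail the call
    info->handler(thread_context, info->arg0, info->arg1);
    return 0;
  }
  // ... otherwise run the JITed guest body (CallImpl in the real code) ...
  return 0;
}

int main() {
  FunctionInfo fi;
  fi.SetupExtern([](void*, void*, void*) { std::puts("extern hit"); },
                 nullptr, nullptr);
  return Call(&fi, nullptr);
}
```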
Mutex* lock_; - typedef std::tr1::unordered_map SymbolMap; + typedef std::unordered_map SymbolMap; SymbolMap map_; typedef std::vector SymbolList; SymbolList list_; diff --git a/src/alloy/runtime/register_access.h b/src/alloy/runtime/register_access.h deleted file mode 100644 index 21e3f1549..000000000 --- a/src/alloy/runtime/register_access.h +++ /dev/null @@ -1,38 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef ALLOY_RUNTIME_REGISTER_ACCESS_H_ -#define ALLOY_RUNTIME_REGISTER_ACCESS_H_ - -#include - - -namespace alloy { -namespace runtime { - -typedef bool (*RegisterHandlesCallback)(void* context, uint64_t addr); -typedef uint64_t (*RegisterReadCallback)(void* context, uint64_t addr); -typedef void (*RegisterWriteCallback)(void* context, uint64_t addr, - uint64_t value); - -typedef struct RegisterAccessCallbacks_s { - void* context; - RegisterHandlesCallback handles; - RegisterReadCallback read; - RegisterWriteCallback write; - - RegisterAccessCallbacks_s* next; -} RegisterAccessCallbacks; - - -} // namespace runtime -} // namespace alloy - - -#endif // ALLOY_RUNTIME_REGISTER_ACCESS_H_ diff --git a/src/alloy/runtime/runtime.cc b/src/alloy/runtime/runtime.cc index 8a49a1bc4..7be29dce1 100644 --- a/src/alloy/runtime/runtime.cc +++ b/src/alloy/runtime/runtime.cc @@ -25,8 +25,7 @@ DEFINE_string(runtime_backend, "any", Runtime::Runtime(Memory* memory) : - memory_(memory), debugger_(0), backend_(0), frontend_(0), - access_callbacks_(0) { + memory_(memory), debugger_(0), backend_(0), frontend_(0) { tracing::Initialize(); modules_lock_ = AllocMutex(10000); } @@ -41,14 +40,6 @@ Runtime::~Runtime() { UnlockMutex(modules_lock_); FreeMutex(modules_lock_); - RegisterAccessCallbacks* cbs = access_callbacks_; - while (cbs) { - RegisterAccessCallbacks* next = cbs->next; - delete cbs; - cbs = next; - } - access_callbacks_ = NULL; - delete frontend_; delete backend_; delete debugger_; @@ -64,11 +55,6 @@ int Runtime::Initialize(Frontend* frontend, Backend* backend) { // Must be initialized by subclass before calling into this. XEASSERTNOTNULL(memory_); - int result = memory_->Initialize(); - if (result) { - return result; - } - // Create debugger first. Other types hook up to it. 
debugger_ = new Debugger(this); @@ -91,10 +77,10 @@ int Runtime::Initialize(Frontend* frontend, Backend* backend) { #endif // ALLOY_HAS_IVM_BACKEND if (FLAGS_runtime_backend == "any") { #if defined(ALLOY_HAS_X64_BACKEND) && ALLOY_HAS_X64_BACKEND - /*if (!backend) { + if (!backend) { backend = new alloy::backend::x64::X64Backend( this); - }*/ + } #endif // ALLOY_HAS_X64_BACKEND #if defined(ALLOY_HAS_IVM_BACKEND) && ALLOY_HAS_IVM_BACKEND if (!backend) { @@ -111,7 +97,7 @@ int Runtime::Initialize(Frontend* frontend, Backend* backend) { backend_ = backend; frontend_ = frontend; - result = backend_->Initialize(); + int result = backend_->Initialize(); if (result) { return result; } @@ -159,6 +145,8 @@ std::vector Runtime::FindFunctionsWithAddress(uint64_t address) { } int Runtime::ResolveFunction(uint64_t address, Function** out_function) { + SCOPE_profile_cpu_f("alloy"); + *out_function = NULL; Entry* entry; Entry::Status status = entry_table_.GetOrCreate(address, &entry); @@ -192,6 +180,8 @@ int Runtime::ResolveFunction(uint64_t address, Function** out_function) { int Runtime::LookupFunctionInfo( uint64_t address, FunctionInfo** out_symbol_info) { + SCOPE_profile_cpu_f("alloy"); + *out_symbol_info = NULL; // TODO(benvanik): fast reject invalid addresses/log errors. @@ -220,6 +210,8 @@ int Runtime::LookupFunctionInfo( int Runtime::LookupFunctionInfo(Module* module, uint64_t address, FunctionInfo** out_symbol_info) { + SCOPE_profile_cpu_f("alloy"); + // Atomic create/lookup symbol in module. // If we get back the NEW flag we must declare it now. FunctionInfo* symbol_info = NULL; @@ -241,6 +233,8 @@ int Runtime::LookupFunctionInfo(Module* module, uint64_t address, int Runtime::DemandFunction( FunctionInfo* symbol_info, Function** out_function) { + SCOPE_profile_cpu_f("alloy"); + *out_function = NULL; // Lock function for generation. 
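SCOPE_profile_cpu_f shows up in every hot path this patch touches. Its expansion is not in the diff, so this is only a guess at the shape: an RAII object that opens a profiler scope named after the enclosing function under a category string ("alloy", "apu") and closes it on every exit path. A stand-in that just prints durations:

```cpp
#include <chrono>
#include <cstdio>

struct ScopeTimer {
  const char* cat; const char* name;
  std::chrono::steady_clock::time_point t0;
  ScopeTimer(const char* c, const char* n)
      : cat(c), name(n), t0(std::chrono::steady_clock::now()) {}
  ~ScopeTimer() {  // runs on every return path out of the scope
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
        std::chrono::steady_clock::now() - t0).count();
    std::printf("[%s] %s: %lld us\n", cat, name, (long long)us);
  }
};
// Hypothetical expansion; the real macro feeds the project's profiler.
#define SCOPE_profile_cpu_f(cat) ScopeTimer _scope_timer_(cat, __FUNCTION__)

int ResolveFunction() {
  SCOPE_profile_cpu_f("alloy");  // timed from here to any return
  return 0;
}

int main() { return ResolveFunction(); }
```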
If it's already being generated @@ -273,11 +267,3 @@ int Runtime::DemandFunction( return 0; } - -void Runtime::AddRegisterAccessCallbacks( - const RegisterAccessCallbacks& callbacks) { - RegisterAccessCallbacks* cbs = new RegisterAccessCallbacks(); - xe_copy_struct(cbs, &callbacks, sizeof(callbacks)); - cbs->next = access_callbacks_; - access_callbacks_ = cbs; -} diff --git a/src/alloy/runtime/runtime.h b/src/alloy/runtime/runtime.h index 3ccd82fb6..a6c506fc5 100644 --- a/src/alloy/runtime/runtime.h +++ b/src/alloy/runtime/runtime.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -38,9 +37,6 @@ public: Debugger* debugger() const { return debugger_; } frontend::Frontend* frontend() const { return frontend_; } backend::Backend* backend() const { return backend_; } - RegisterAccessCallbacks* access_callbacks() const { - return access_callbacks_; - } int Initialize(frontend::Frontend* frontend, backend::Backend* backend = 0); @@ -55,9 +51,6 @@ public: FunctionInfo** out_symbol_info); int ResolveFunction(uint64_t address, Function** out_function); - void AddRegisterAccessCallbacks( - const RegisterAccessCallbacks& callbacks); - //uint32_t CreateCallback(void (*callback)(void* data), void* data); private: @@ -74,8 +67,6 @@ protected: EntryTable entry_table_; Mutex* modules_lock_; ModuleList modules_; - - RegisterAccessCallbacks* access_callbacks_; }; diff --git a/src/alloy/runtime/sources.gypi b/src/alloy/runtime/sources.gypi index be12e8f4e..399580ec0 100644 --- a/src/alloy/runtime/sources.gypi +++ b/src/alloy/runtime/sources.gypi @@ -15,7 +15,6 @@ 'module.h', 'raw_module.cc', 'raw_module.h', - 'register_access.h', 'runtime.cc', 'runtime.h', 'symbol_info.cc', diff --git a/src/alloy/runtime/symbol_info.cc b/src/alloy/runtime/symbol_info.cc index 3a486840d..e87727b3a 100644 --- a/src/alloy/runtime/symbol_info.cc +++ b/src/alloy/runtime/symbol_info.cc @@ -34,11 +34,19 @@ void SymbolInfo::set_name(const char* name) { FunctionInfo::FunctionInfo(Module* module, uint64_t address) : end_address_(0), behavior_(BEHAVIOR_DEFAULT), function_(0), SymbolInfo(SymbolInfo::TYPE_FUNCTION, module, address) { + xe_zero_struct(&extern_info_, sizeof(extern_info_)); } FunctionInfo::~FunctionInfo() { } +void FunctionInfo::SetupExtern(ExternHandler handler, void* arg0, void* arg1) { + behavior_ = BEHAVIOR_EXTERN; + extern_info_.handler = handler; + extern_info_.arg0 = arg0; + extern_info_.arg1 = arg1; +} + VariableInfo::VariableInfo(Module* module, uint64_t address) : SymbolInfo(SymbolInfo::TYPE_VARIABLE, module, address) { } diff --git a/src/alloy/runtime/symbol_info.h b/src/alloy/runtime/symbol_info.h index c91fda40a..8d2a964e7 100644 --- a/src/alloy/runtime/symbol_info.h +++ b/src/alloy/runtime/symbol_info.h @@ -63,6 +63,7 @@ public: BEHAVIOR_PROLOG, BEHAVIOR_EPILOG, BEHAVIOR_EPILOG_RETURN, + BEHAVIOR_EXTERN, }; public: @@ -79,10 +80,21 @@ public: Function* function() const { return function_; } void set_function(Function* value) { function_ = value; } + typedef void(*ExternHandler)(void* context, void* arg0, void* arg1); + void SetupExtern(ExternHandler handler, void* arg0, void* arg1); + ExternHandler extern_handler() const { return extern_info_.handler; } + void* extern_arg0() const { return extern_info_.arg0; } + void* extern_arg1() const { return extern_info_.arg1; } + private: uint64_t end_address_; Behavior behavior_; Function* function_; + struct { + ExternHandler handler; + void* arg0; + void* arg1; + } extern_info_; }; class VariableInfo : public SymbolInfo { diff --git 
a/src/alloy/runtime/thread_state.cc b/src/alloy/runtime/thread_state.cc index 32edf177e..84add8bce 100644 --- a/src/alloy/runtime/thread_state.cc +++ b/src/alloy/runtime/thread_state.cc @@ -64,6 +64,5 @@ ThreadState* ThreadState::Get() { } uint32_t ThreadState::GetThreadID() { - XEASSERT(thread_state_); return thread_state_->thread_id_; } diff --git a/src/alloy/runtime/tracing.h b/src/alloy/runtime/tracing.h index 005562d07..262662b90 100644 --- a/src/alloy/runtime/tracing.h +++ b/src/alloy/runtime/tracing.h @@ -40,46 +40,46 @@ public: ALLOY_RUNTIME_MEMORY_HEAP_FREE = ALLOY_RUNTIME_MEMORY | (4), }; - typedef struct { + typedef struct Init_s { static const uint32_t event_type = ALLOY_RUNTIME_INIT; } Init; - typedef struct { + typedef struct Deinit_s { static const uint32_t event_type = ALLOY_RUNTIME_DEINIT; } Deinit; - typedef struct { + typedef struct ThreadInit_s { static const uint32_t event_type = ALLOY_RUNTIME_THREAD_INIT; } ThreadInit; - typedef struct { + typedef struct ThreadDeinit_s { static const uint32_t event_type = ALLOY_RUNTIME_THREAD_DEINIT; } ThreadDeinit; - typedef struct { + typedef struct MemoryInit_s { static const uint32_t event_type = ALLOY_RUNTIME_MEMORY_INIT; // map of memory, etc? } MemoryInit; - typedef struct { + typedef struct MemoryDeinit_s { static const uint32_t event_type = ALLOY_RUNTIME_MEMORY_DEINIT; } MemoryDeinit; - typedef struct { + typedef struct MemoryHeapInit_s { static const uint32_t event_type = ALLOY_RUNTIME_MEMORY_HEAP_INIT; uint32_t heap_id; uint64_t low_address; uint64_t high_address; uint32_t is_physical; } MemoryHeapInit; - typedef struct { + typedef struct MemoryHeapDeinit_s { static const uint32_t event_type = ALLOY_RUNTIME_MEMORY_HEAP_DEINIT; uint32_t heap_id; } MemoryHeapDeinit; - typedef struct { + typedef struct MemoryHeapAlloc_s { static const uint32_t event_type = ALLOY_RUNTIME_MEMORY_HEAP_ALLOC; uint32_t heap_id; uint32_t flags; uint64_t address; size_t size; } MemoryHeapAlloc; - typedef struct { + typedef struct MemoryHeapFree_s { static const uint32_t event_type = ALLOY_RUNTIME_MEMORY_HEAP_FREE; uint32_t heap_id; uint64_t address; diff --git a/src/alloy/tracing/event_type.h b/src/alloy/tracing/event_type.h index e51353708..33e2614fb 100644 --- a/src/alloy/tracing/event_type.h +++ b/src/alloy/tracing/event_type.h @@ -33,10 +33,10 @@ public: USER = (1 << 31), }; - typedef struct { + typedef struct TraceInit_s { static const uint32_t event_type = ALLOY_TRACE_INIT; } TraceInit; - typedef struct { + typedef struct TraceEOF_s { static const uint32_t event_type = ALLOY_TRACE_EOF; } TraceEOF; }; diff --git a/src/alloy/tracing/tracing.h b/src/alloy/tracing/tracing.h index b4eb2c865..ced2081de 100644 --- a/src/alloy/tracing/tracing.h +++ b/src/alloy/tracing/tracing.h @@ -30,7 +30,7 @@ Tracer* GetThreadTracer(); void WriteEvent(uint32_t event_type, size_t size = 0, const void* data = 0); -template <typename T> void WriteEvent(T& ev) { +template <typename T> void WriteEvent(const T& ev) { if (sizeof(T) > 1) { alloy::tracing::WriteEvent(T::event_type, sizeof(T), &ev); } else { diff --git a/src/xenia/apu/audio_system.cc b/src/xenia/apu/audio_system.cc index 1793fc92d..68b9ccbee 100644 --- a/src/xenia/apu/audio_system.cc +++ b/src/xenia/apu/audio_system.cc @@ -42,12 +42,13 @@ X_STATUS AudioSystem::Setup() { processor_ = emulator_->processor(); // Let the processor know we want register access callbacks.
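The hunk below swaps the linked-list of register-access callbacks for a single MMIO range registration; matching is a plain base/mask compare, per the xenon_memory.cc changes later in this diff. A hedged worked example of the arithmetic, with a hypothetical register address:

    uint64_t addr = 0x7FEA0010;                      // hypothetical audio register
    bool hits = (addr & 0xFFFF0000) == 0x7FEA0000;   // true -> ReadRegister/WriteRegister fire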
- RegisterAccessCallbacks callbacks; - callbacks.context = this; - callbacks.handles = (RegisterHandlesCallback)HandlesRegisterThunk; - callbacks.read = (RegisterReadCallback)ReadRegisterThunk; - callbacks.write = (RegisterWriteCallback)WriteRegisterThunk; - emulator_->processor()->AddRegisterAccessCallbacks(callbacks); + emulator_->memory()->AddMappedRange( + 0x7FEA0000, + 0xFFFF0000, + 0x0000FFFF, + this, + reinterpret_cast<MMIOReadCallback>(MMIOReadRegisterThunk), + reinterpret_cast<MMIOWriteCallback>(MMIOWriteRegisterThunk)); // Setup worker thread state. This lets us make calls into guest code. thread_state_ = new XenonThreadState( @@ -82,7 +83,9 @@ void AudioSystem::ThreadStart() { if (result == WAIT_FAILED) { DWORD err = GetLastError(); XEASSERTALWAYS(); + break; } + size_t pumped = 0; if (result >= WAIT_OBJECT_0 && result <= WAIT_OBJECT_0 + (maximum_client_count_ - 1)) { size_t index = result - WAIT_OBJECT_0; @@ -92,7 +95,8 @@ void AudioSystem::ThreadStart() { uint32_t client_callback_arg = clients_[index].wrapped_callback_arg; xe_mutex_unlock(lock_); if (client_callback) { - processor->Execute(thread_state_, client_callback, client_callback_arg, 0); + uint64_t args[] = { client_callback_arg }; + processor->Execute(thread_state_, client_callback, args, XECOUNT(args)); } pumped++; index++; @@ -104,6 +108,7 @@ void AudioSystem::ThreadStart() { } if (!pumped) { + SCOPE_profile_cpu_i("apu", "Sleep"); Sleep(500); } } @@ -157,6 +162,8 @@ X_STATUS AudioSystem::RegisterClient( } void AudioSystem::SubmitFrame(size_t index, uint32_t samples_ptr) { + SCOPE_profile_cpu_f("apu"); + xe_mutex_lock(lock_); XEASSERTTRUE(index < maximum_client_count_); XEASSERTTRUE(clients_[index].driver != NULL); @@ -166,6 +173,8 @@ void AudioSystem::SubmitFrame(size_t index, uint32_t samples_ptr) { } void AudioSystem::UnregisterClient(size_t index) { + SCOPE_profile_cpu_f("apu"); + xe_mutex_lock(lock_); XEASSERTTRUE(index < maximum_client_count_); DestroyDriver(clients_[index].driver); @@ -174,10 +183,6 @@ void AudioSystem::UnregisterClient(size_t index) { xe_mutex_unlock(lock_); } -bool AudioSystem::HandlesRegister(uint64_t addr) { - return (addr & 0xFFFF0000) == 0x7FEA0000; -} - // free60 may be useful here, however it looks like it's using a different // piece of hardware: // https://github.com/Free60Project/libxenon/blob/master/libxenon/drivers/xenon_sound/sound.c diff --git a/src/xenia/apu/audio_system.h b/src/xenia/apu/audio_system.h index 25d0b5829..964e331cf 100644 --- a/src/xenia/apu/audio_system.h +++ b/src/xenia/apu/audio_system.h @@ -42,7 +42,6 @@ public: virtual X_STATUS CreateDriver(size_t index, HANDLE wait_handle, AudioDriver** out_driver) = 0; - bool HandlesRegister(uint64_t addr); virtual uint64_t ReadRegister(uint64_t addr); virtual void WriteRegister(uint64_t addr, uint64_t value); @@ -55,14 +54,11 @@ private: } void ThreadStart(); - static bool HandlesRegisterThunk(AudioSystem* as, uint64_t addr) { - return as->HandlesRegister(addr); - } - static uint64_t ReadRegisterThunk(AudioSystem* as, uint64_t addr) { + static uint64_t MMIOReadRegisterThunk(AudioSystem* as, uint64_t addr) { return as->ReadRegister(addr); } - static void WriteRegisterThunk(AudioSystem* as, uint64_t addr, - uint64_t value) { + static void MMIOWriteRegisterThunk(AudioSystem* as, uint64_t addr, + uint64_t value) { as->WriteRegister(addr, value); } diff --git a/src/xenia/common.h b/src/xenia/common.h index ff16b03c3..68d9d2eb7 100644 --- a/src/xenia/common.h +++ b/src/xenia/common.h @@ -18,6 +18,7 @@ #include
#include #include +#include #include #include diff --git a/src/xenia/config.h b/src/xenia/config.h index b83aa8715..0804277bf 100644 --- a/src/xenia/config.h +++ b/src/xenia/config.h @@ -27,6 +27,8 @@ #define XE_OPTION_LOG_KERNEL 1 #define XE_OPTION_LOG_FS 1 +// Enable profiling. +#define XE_OPTION_PROFILING 1 // TODO(benvanik): make this a runtime option #define XE_OPTION_OPTIMIZED 0 diff --git a/src/xenia/core/pal_win.cc b/src/xenia/core/pal_win.cc index febf87935..3ef63b141 100644 --- a/src/xenia/core/pal_win.cc +++ b/src/xenia/core/pal_win.cc @@ -77,7 +77,7 @@ int xe_pal_get_system_info(xe_system_info* out_info) { kernel32 = GetModuleHandle(TEXT("kernel32")); XEEXPECTNOTNULL(kernel32); - glpi = (LPFN_GLPI)GetProcAddress(kernel32, "GetLogicalProcessorInfomration"); + glpi = (LPFN_GLPI)GetProcAddress(kernel32, "GetLogicalProcessorInformation"); XEEXPECTNOTNULL(glpi); // Call GLPI once to get the buffer size, allocate it, then call again. diff --git a/src/xenia/core/thread.cc b/src/xenia/core/thread.cc index 8a48d8267..8aace9ee4 100644 --- a/src/xenia/core/thread.cc +++ b/src/xenia/core/thread.cc @@ -79,7 +79,9 @@ static uint32_t __stdcall xe_thread_callback_win32(void* param) { } } + xe::Profiler::ThreadEnter(thread->name); thread->callback(thread->callback_param); + xe::Profiler::ThreadExit(); return 0; } #pragma warning(default : 6320; default : 6322) @@ -118,7 +120,9 @@ static void* xe_thread_callback_pthreads(void* param) { #else pthread_setname_np(pthread_self(), thread->name); #endif // OSX + xe::Profiler::ThreadEnter(thread->name); thread->callback(thread->callback_param); + xe::Profiler::ThreadExit(); return 0; } diff --git a/src/xenia/cpu/cpu-private.h b/src/xenia/cpu/cpu-private.h index 1b49862f6..3272d2ece 100644 --- a/src/xenia/cpu/cpu-private.h +++ b/src/xenia/cpu/cpu-private.h @@ -20,9 +20,6 @@ DECLARE_bool(trace_user_calls); DECLARE_bool(trace_kernel_calls); DECLARE_uint64(trace_thread_mask); -DECLARE_uint64(break_on_instruction); -DECLARE_uint64(break_on_memory); - DECLARE_string(load_module_map); DECLARE_string(dump_path); diff --git a/src/xenia/cpu/cpu.cc b/src/xenia/cpu/cpu.cc index 12c8118f6..389aeee18 100644 --- a/src/xenia/cpu/cpu.cc +++ b/src/xenia/cpu/cpu.cc @@ -25,13 +25,6 @@ DEFINE_uint64(trace_thread_mask, -1, "Trace threads with IDs in the mask, or -1 for all."); -// Breakpoints: -DEFINE_uint64(break_on_instruction, 0, - "int3 before the given guest address is executed."); -DEFINE_uint64(break_on_memory, 0, - "int3 on read/write to the given memory address."); - - // Debugging: DEFINE_string(load_module_map, "", "Loads a .map for symbol names and to diff with the generated symbol " diff --git a/src/xenia/cpu/processor.cc b/src/xenia/cpu/processor.cc index e03890a92..969907d96 100644 --- a/src/xenia/cpu/processor.cc +++ b/src/xenia/cpu/processor.cc @@ -141,12 +141,9 @@ int Processor::Setup() { return 0; } -void Processor::AddRegisterAccessCallbacks( - xe::cpu::RegisterAccessCallbacks callbacks) { - runtime_->AddRegisterAccessCallbacks(callbacks); -} - int Processor::Execute(XenonThreadState* thread_state, uint64_t address) { + SCOPE_profile_cpu_f("cpu"); + // Attempt to get the function. Function* fn; if (runtime_->ResolveFunction(address, &fn)) { @@ -165,26 +162,20 @@ int Processor::Execute(XenonThreadState* thread_state, uint64_t address) { context->lr = lr; // Execute the function. 
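The hunk that follows folds the old one- and two-argument Execute overloads into a single array form that packs arguments into guest registers r3 through r7 (hence the arg_count <= 5 assert). A hedged call-site sketch, using the XECOUNT element-count macro seen elsewhere in the tree and hypothetical names:

    uint64_t args[] = { callback_arg, 0 };  // hypothetical guest arguments
    uint64_t r3 = processor->Execute(thread_state, fn_address, args, XECOUNT(args));
    // r3 holds the guest return value, or 0xDEADBABE if execution failed.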
- fn->Call(thread_state); + fn->Call(thread_state, lr); return 0; } uint64_t Processor::Execute( - XenonThreadState* thread_state, uint64_t address, uint64_t arg0) { - PPCContext* context = thread_state->context(); - context->r[3] = arg0; - if (Execute(thread_state, address)) { - return 0xDEADBABE; - } - return context->r[3]; -} + XenonThreadState* thread_state, uint64_t address, uint64_t args[], + size_t arg_count) { + SCOPE_profile_cpu_f("cpu"); -uint64_t Processor::Execute( - XenonThreadState* thread_state, uint64_t address, uint64_t arg0, - uint64_t arg1) { PPCContext* context = thread_state->context(); - context->r[3] = arg0; - context->r[4] = arg1; + XEASSERT(arg_count <= 5); + for (size_t i = 0; i < arg_count; ++i) { + context->r[3 + i] = args[i]; + } if (Execute(thread_state, address)) { return 0xDEADBABE; } @@ -192,7 +183,9 @@ uint64_t Processor::Execute( } uint64_t Processor::ExecuteInterrupt( - uint32_t cpu, uint64_t address, uint64_t arg0, uint64_t arg1) { + uint32_t cpu, uint64_t address, uint64_t args[], size_t arg_count) { + SCOPE_profile_cpu_f("cpu"); + // Acquire lock on interrupt thread (we can only dispatch one at a time). xe_mutex_lock(interrupt_thread_lock_); @@ -201,7 +194,7 @@ uint64_t Processor::ExecuteInterrupt( XESETUINT8BE(p + interrupt_thread_block_ + 0x10C, cpu); // Execute interrupt. - uint64_t result = Execute(interrupt_thread_state_, address, arg0, arg1); + uint64_t result = Execute(interrupt_thread_state_, address, args, arg_count); xe_mutex_unlock(interrupt_thread_lock_); return result; @@ -648,7 +641,6 @@ json_t* Processor::DumpModule(XexModule* module, bool& succeeded) { json_object_set_new(import_library_json, "imports", imports_json); json_array_append_new(library_imports_json, import_library_json); - xe_free(import_infos); } json_object_set_new(module_json, "libraryImports", library_imports_json); diff --git a/src/xenia/cpu/processor.h b/src/xenia/cpu/processor.h index d08912c88..3ad8217db 100644 --- a/src/xenia/cpu/processor.h +++ b/src/xenia/cpu/processor.h @@ -10,7 +10,6 @@ #ifndef XENIA_CPU_PROCESSOR_H_ #define XENIA_CPU_PROCESSOR_H_ -#include #include #include @@ -28,11 +27,6 @@ XEDECLARECLASS2(xe, cpu, XexModule); namespace xe { namespace cpu { -using RegisterAccessCallbacks = alloy::runtime::RegisterAccessCallbacks; -using RegisterHandlesCallback = alloy::runtime::RegisterHandlesCallback; -using RegisterReadCallback = alloy::runtime::RegisterReadCallback; -using RegisterWriteCallback = alloy::runtime::RegisterWriteCallback; - class Processor : public debug::DebugTarget { public: @@ -45,18 +39,14 @@ public: int Setup(); - void AddRegisterAccessCallbacks(RegisterAccessCallbacks callbacks); - int Execute( XenonThreadState* thread_state, uint64_t address); uint64_t Execute( - XenonThreadState* thread_state, uint64_t address, uint64_t arg0); - uint64_t Execute( - XenonThreadState* thread_state, uint64_t address, uint64_t arg0, - uint64_t arg1); + XenonThreadState* thread_state, uint64_t address, uint64_t args[], + size_t arg_count); uint64_t ExecuteInterrupt( - uint32_t cpu, uint64_t address, uint64_t arg0, uint64_t arg1); + uint32_t cpu, uint64_t address, uint64_t args[], size_t arg_count); virtual void OnDebugClientConnected(uint32_t client_id); virtual void OnDebugClientDisconnected(uint32_t client_id); diff --git a/src/xenia/cpu/xenon_memory.cc b/src/xenia/cpu/xenon_memory.cc index f730f99a4..8e3bf6dc5 100644 --- a/src/xenia/cpu/xenon_memory.cc +++ b/src/xenia/cpu/xenon_memory.cc @@ -119,15 +119,136 @@ private: }; uint32_t 
XenonMemoryHeap::next_heap_id_ = 1; +namespace { -XenonMemory::XenonMemory() : - mapping_(0), mapping_base_(0), - Memory() { +namespace BE { +#include <beaengine/BeaEngine.h> +} + +struct MMIORange { + uint64_t address; + uint64_t mask; + uint64_t size; + void* context; + MMIOReadCallback read; + MMIOWriteCallback write; +}; +MMIORange g_mapped_ranges_[16] = { 0 }; +int g_mapped_range_count_ = 0; + +uint64_t* GetContextRegPtr(BE::Int32 arg_type, PCONTEXT context) { + DWORD index = 0; + _BitScanForward(&index, arg_type); + return &context->Rax + index; +} + +// Handles potential accesses to MMIO. We look for access violations to +// addresses in our range and call into the registered handlers, if any. +// If there are none, we continue. +LONG CALLBACK CheckMMIOHandler(PEXCEPTION_POINTERS ex_info) { + // http://msdn.microsoft.com/en-us/library/ms679331(v=vs.85).aspx + // http://msdn.microsoft.com/en-us/library/aa363082(v=vs.85).aspx + auto code = ex_info->ExceptionRecord->ExceptionCode; + if (code == STATUS_ACCESS_VIOLATION) { + // Access violations are pretty rare, so we can do a linear search here. + auto address = ex_info->ExceptionRecord->ExceptionInformation[1]; + for (int i = 0; i < g_mapped_range_count_; ++i) { + const auto& range = g_mapped_ranges_[i]; + if ((address & range.mask) == range.address) { + // Within our range. + + // TODO(benvanik): replace with simple check of mov (that's all + // we care about). + BE::DISASM disasm = { 0 }; + disasm.Archi = 64; + disasm.Options = BE::MasmSyntax + BE::PrefixedNumeral; + disasm.EIP = (BE::UIntPtr)ex_info->ExceptionRecord->ExceptionAddress; + BE::UIntPtr eip_end = disasm.EIP + 20; + size_t len = BE::Disasm(&disasm); + if (len == BE::UNKNOWN_OPCODE) { + break; + } + + auto action = ex_info->ExceptionRecord->ExceptionInformation[0]; + if (action == 0) { + uint64_t value = range.read(range.context, address & 0xFFFFFFFF); + XEASSERT((disasm.Argument1.ArgType & BE::REGISTER_TYPE) == + BE::REGISTER_TYPE); + uint64_t* reg_ptr = GetContextRegPtr(disasm.Argument1.ArgType, + ex_info->ContextRecord); + switch (disasm.Argument1.ArgSize) { + case 8: + *reg_ptr = static_cast<uint8_t>(value); + break; + case 16: + *reg_ptr = XESWAP16(static_cast<uint16_t>(value)); + break; + case 32: + *reg_ptr = XESWAP32(static_cast<uint32_t>(value)); + break; + case 64: + *reg_ptr = XESWAP64(static_cast<uint64_t>(value)); + break; + } + ex_info->ContextRecord->Rip += len; + return EXCEPTION_CONTINUE_EXECUTION; + } else if (action == 1) { + uint64_t value; + if ((disasm.Argument2.ArgType & BE::REGISTER_TYPE) == BE::REGISTER_TYPE) { + uint64_t* reg_ptr = GetContextRegPtr(disasm.Argument2.ArgType, + ex_info->ContextRecord); + value = *reg_ptr; + } else if ((disasm.Argument2.ArgType & BE::CONSTANT_TYPE) == BE::CONSTANT_TYPE) { + value = disasm.Instruction.Immediat; + } else { + XEASSERTALWAYS(); + } + switch (disasm.Argument2.ArgSize) { + case 8: + value = static_cast<uint8_t>(value); + break; + case 16: + value = XESWAP16(static_cast<uint16_t>(value)); + break; + case 32: + value = XESWAP32(static_cast<uint32_t>(value)); + break; + case 64: + value = XESWAP64(static_cast<uint64_t>(value)); + break; + } + range.write(range.context, address & 0xFFFFFFFF, value); + ex_info->ContextRecord->Rip += len; + return EXCEPTION_CONTINUE_EXECUTION; + } + } + } + } + return EXCEPTION_CONTINUE_SEARCH; +} + +} // namespace + + +XenonMemory::XenonMemory() + : Memory(), + mapping_(0), mapping_base_(0), page_table_(0) { virtual_heap_ = new XenonMemoryHeap(this, false); physical_heap_ = new XenonMemoryHeap(this, true); } XenonMemory::~XenonMemory() { + // Remove exception handlers.
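A worked example of the handler above, hedged because the BeaEngine flag values are an assumption here: GetContextRegPtr relies on the disassembler reporting registers as one-hot flags (REG0 = 1 << 0 for rax, ..., REG8 = 1 << 8 for r8) in the same order that the Win64 CONTEXT lays out Rax..R15 contiguously:

    DWORD index = 0;
    _BitScanForward(&index, 1 << 8);             // e.g. a faulting "mov r8d, [addr]"
    uint64_t* reg_ptr = &context->Rax + index;   // -> &context->R8
    // For a trapped 32-bit read the handler then stores
    // XESWAP32(static_cast<uint32_t>(range.read(...))) into *reg_ptr and
    // advances Rip past the mov so guest execution resumes.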
+ RemoveVectoredExceptionHandler(CheckMMIOHandler); + RemoveVectoredContinueHandler(CheckMMIOHandler); + + // Free mapped ranges. + for (int i = 0; i < g_mapped_range_count_; ++i) { + const auto& range = g_mapped_ranges_[i]; + VirtualFree(reinterpret_cast<void*>(range.address), range.size, + MEM_DECOMMIT); + } + if (mapping_base_) { // GPU writeback. VirtualFree( @@ -195,7 +316,7 @@ int XenonMemory::Initialize() { virtual_heap_->Initialize( XENON_MEMORY_VIRTUAL_HEAP_LOW, XENON_MEMORY_VIRTUAL_HEAP_HIGH); physical_heap_->Initialize( - XENON_MEMORY_PHYSICAL_HEAP_LOW, XENON_MEMORY_PHYSICAL_HEAP_HIGH); + XENON_MEMORY_PHYSICAL_HEAP_LOW, XENON_MEMORY_PHYSICAL_HEAP_HIGH - 0x1000); // GPU writeback. // 0xC... is physical, 0x7F... is virtual. We may need to overlay these. @@ -204,6 +325,22 @@ 0x00100000, MEM_COMMIT, PAGE_READWRITE); + // Add handlers for MMIO. + // If there is a debugger attached the normal exception handler will not + // fire and we must instead add the continue handler. + AddVectoredExceptionHandler(1, CheckMMIOHandler); + if (IsDebuggerPresent()) { + // TODO(benvanik): is this really required? + //AddVectoredContinueHandler(1, CheckMMIOHandler); + } + + // Allocate dirty page table. + // This must live within our low heap. Ideally we'd hardcode the address but + // this is more flexible. + page_table_ = physical_heap_->Alloc( + 0, (512 * 1024 * 1024) / (16 * 1024), + X_MEM_COMMIT, 16 * 1024); + return 0; XECLEANUP: @@ -248,6 +385,112 @@ void XenonMemory::UnmapViews() { } } +bool XenonMemory::AddMappedRange(uint64_t address, uint64_t mask, + uint64_t size, void* context, + MMIOReadCallback read_callback, + MMIOWriteCallback write_callback) { + DWORD protect = 0; + if (read_callback && write_callback) { + protect = PAGE_NOACCESS; + } else if (write_callback) { + protect = PAGE_READONLY; + } else { + // Write-only memory is not supported.
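Spelling out the protection choice above: PAGE_NOACCESS traps both loads and stores, which is what a range with read and write handlers needs, while PAGE_READONLY lets guest loads read the committed page directly and only stores fault into the write handler. A read handler alone would require a write-only page protection, which x86/Windows cannot express - hence the assert that follows.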
+ XEASSERTALWAYS(); + } + if (!VirtualAlloc(Translate(address), + size, + MEM_COMMIT, protect)) { + return false; + } + XEASSERT(g_mapped_range_count_ + 1 < XECOUNT(g_mapped_ranges_)); + g_mapped_ranges_[g_mapped_range_count_++] = { + reinterpret_cast<uint64_t>(mapping_base_) | address, + 0xFFFFFFFF00000000 | mask, + size, context, + read_callback, write_callback, + }; + return true; +} + +bool XenonMemory::CheckMMIOLoad(uint64_t address, uint64_t* out_value) { + for (int i = 0; i < g_mapped_range_count_; ++i) { + const auto& range = g_mapped_ranges_[i]; + if (((address | (uint64_t)mapping_base_) & range.mask) == range.address) { + *out_value = static_cast<uint64_t>(range.read(range.context, address)); + return true; + } + } + return false; +} + +uint8_t XenonMemory::LoadI8(uint64_t address) { + uint64_t value; + if (!CheckMMIOLoad(address, &value)) { + value = *reinterpret_cast<uint8_t*>(Translate(address)); + } + return static_cast<uint8_t>(value); +} + +uint16_t XenonMemory::LoadI16(uint64_t address) { + uint64_t value; + if (!CheckMMIOLoad(address, &value)) { + value = *reinterpret_cast<uint16_t*>(Translate(address)); + } + return static_cast<uint16_t>(value); +} + +uint32_t XenonMemory::LoadI32(uint64_t address) { + uint64_t value; + if (!CheckMMIOLoad(address, &value)) { + value = *reinterpret_cast<uint32_t*>(Translate(address)); + } + return static_cast<uint32_t>(value); +} + +uint64_t XenonMemory::LoadI64(uint64_t address) { + uint64_t value; + if (!CheckMMIOLoad(address, &value)) { + value = *reinterpret_cast<uint64_t*>(Translate(address)); + } + return static_cast<uint64_t>(value); +} + +bool XenonMemory::CheckMMIOStore(uint64_t address, uint64_t value) { + for (int i = 0; i < g_mapped_range_count_; ++i) { + const auto& range = g_mapped_ranges_[i]; + if (((address | (uint64_t)mapping_base_) & range.mask) == range.address) { + range.write(range.context, address, value); + return true; + } + } + return false; +} + +void XenonMemory::StoreI8(uint64_t address, uint8_t value) { + if (!CheckMMIOStore(address, value)) { + *reinterpret_cast<uint8_t*>(Translate(address)) = value; + } +} + +void XenonMemory::StoreI16(uint64_t address, uint16_t value) { + if (!CheckMMIOStore(address, value)) { + *reinterpret_cast<uint16_t*>(Translate(address)) = value; + } +} + +void XenonMemory::StoreI32(uint64_t address, uint32_t value) { + if (!CheckMMIOStore(address, value)) { + *reinterpret_cast<uint32_t*>(Translate(address)) = value; + } +} + +void XenonMemory::StoreI64(uint64_t address, uint64_t value) { + if (!CheckMMIOStore(address, value)) { + *reinterpret_cast<uint64_t*>(Translate(address)) = value; + } +} + uint64_t XenonMemory::HeapAlloc( uint64_t base_address, size_t size, uint32_t flags, uint32_t alignment) { diff --git a/src/xenia/cpu/xenon_memory.h b/src/xenia/cpu/xenon_memory.h index 96ba352fa..05872d12e 100644 --- a/src/xenia/cpu/xenon_memory.h +++ b/src/xenia/cpu/xenon_memory.h @@ -15,33 +15,58 @@ #include +typedef struct xe_ppc_state xe_ppc_state_t; + namespace xe { namespace cpu { class XenonMemoryHeap; +typedef uint64_t (*MMIOReadCallback)(void* context, uint64_t addr); +typedef void (*MMIOWriteCallback)(void* context, uint64_t addr, + uint64_t value); class XenonMemory : public alloy::Memory { public: XenonMemory(); virtual ~XenonMemory(); - virtual int Initialize(); + int Initialize() override; - virtual uint64_t HeapAlloc( + uint64_t page_table() const override { return page_table_; } + + bool AddMappedRange(uint64_t address, uint64_t mask, + uint64_t size, + void* context, + MMIOReadCallback read_callback = nullptr, + MMIOWriteCallback write_callback = nullptr); + + uint8_t LoadI8(uint64_t address) override; + uint16_t
LoadI16(uint64_t address) override; + uint32_t LoadI32(uint64_t address) override; + uint64_t LoadI64(uint64_t address) override; + void StoreI8(uint64_t address, uint8_t value) override; + void StoreI16(uint64_t address, uint16_t value) override; + void StoreI32(uint64_t address, uint32_t value) override; + void StoreI64(uint64_t address, uint64_t value) override; + + uint64_t HeapAlloc( uint64_t base_address, size_t size, uint32_t flags, - uint32_t alignment = 0x20); - virtual int HeapFree(uint64_t address, size_t size); + uint32_t alignment = 0x20) override; + int HeapFree(uint64_t address, size_t size) override; - virtual size_t QuerySize(uint64_t base_address); + size_t QuerySize(uint64_t base_address) override; - virtual int Protect(uint64_t address, size_t size, uint32_t access); - virtual uint32_t QueryProtect(uint64_t address); + int Protect(uint64_t address, size_t size, uint32_t access) override; + uint32_t QueryProtect(uint64_t address) override; private: int MapViews(uint8_t* mapping_base); void UnmapViews(); + bool CheckMMIOLoad(uint64_t address, uint64_t* out_value); + bool CheckMMIOStore(uint64_t address, uint64_t value); + private: HANDLE mapping_; uint8_t* mapping_base_; @@ -60,6 +85,8 @@ private: XenonMemoryHeap* virtual_heap_; XenonMemoryHeap* physical_heap_; + uint64_t page_table_; + friend class XenonMemoryHeap; }; diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc index 9291724d2..2ff90c16a 100644 --- a/src/xenia/cpu/xex_module.cc +++ b/src/xenia/cpu/xex_module.cc @@ -135,14 +135,12 @@ int XexModule::SetupLibraryImports(const xe_xex2_import_library_t* library) { if (kernel_export->type == KernelExport::Function) { // Not exactly sure what this should be... if (info->thunk_address) { - // slot = XESWAP32BE(info->thunk_address); - // Setting this breaks other emu code that relies on it not being - // modified. Not sure what to do. + *slot = XESWAP32BE(info->thunk_address); } else { // TODO(benvanik): find out what import variables are. XELOGW("kernel import variable not defined %.8X %s", info->value_address, kernel_export->name); - //*slot = XESWAP32BE(0xF00DF00D); + *slot = XESWAP32BE(0xF00DF00D); } } else { if (kernel_export->is_implemented) { @@ -165,39 +163,45 @@ int XexModule::SetupLibraryImports(const xe_xex2_import_library_t* library) { info->ordinal); } - FunctionInfo* fn_info; - DeclareFunction(info->thunk_address, &fn_info); - fn_info->set_end_address(info->thunk_address + 16 - 4); - //fn->type = FunctionSymbol::Kernel; - //fn->kernel_export = kernel_export; - fn_info->set_name(name); - fn_info->set_status(SymbolInfo::STATUS_DECLARED); + // On load we have something like this in memory: + // li r3, 0 + // li r4, 0x1F5 + // mtspr CTR, r11 + // bctr + // Real consoles rewrite this with some code that sets r11. + // If we did that we'd still have to put a thunk somewhere and do the + // dynamic lookup. Instead, we rewrite it to use syscalls, as they + // aren't used on the 360. Alloy backends can either take the syscall + // or do something smarter. 
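A hedged decode of the original thunk above: 0x1F5 = 501 would be the import's function ordinal in r4, with the library ordinal in r3 - presumably what the new ExportResolver::GetLibraryOrdinal and the (library_ordinal, ordinal) overload of GetExportByOrdinal added later in this diff let a syscall handler resolve back to a shim. The replacement sequence: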
+ // sc + // blr + // nop + // nop + uint8_t* p = memory()->Translate(info->thunk_address); + XESETUINT32BE(p + 0x0, 0x44000002); // sc + XESETUINT32BE(p + 0x4, 0x4E800020); // blr + XESETUINT32BE(p + 0x8, 0x60000000); // nop + XESETUINT32BE(p + 0xC, 0x60000000); // nop - ExternFunction::Handler handler = 0; + FunctionInfo::ExternHandler handler = 0; void* handler_data = 0; if (kernel_export) { - handler = (ExternFunction::Handler)kernel_export->function_data.shim; + handler = (FunctionInfo::ExternHandler)kernel_export->function_data.shim; handler_data = kernel_export->function_data.shim_data; } else { - handler = (ExternFunction::Handler)UndefinedImport; + handler = (FunctionInfo::ExternHandler)UndefinedImport; handler_data = this; } - DefineFunction(fn_info); - auto fn = new ExternFunction( - info->thunk_address, - handler, - handler_data, - NULL); - if (kernel_export) { - fn->set_name(kernel_export->name); - } - fn_info->set_function(fn); - fn_info->set_status(SymbolInfo::STATUS_DEFINED); + FunctionInfo* fn_info; + DeclareFunction(info->thunk_address, &fn_info); + fn_info->set_end_address(info->thunk_address + 16 - 4); + fn_info->set_name(name); + fn_info->SetupExtern(handler, handler_data, NULL); + fn_info->set_status(SymbolInfo::STATUS_DECLARED); } } - xe_free(import_infos); return 0; } diff --git a/src/xenia/emulator.h b/src/xenia/emulator.h index c94fa3771..82ede0ec6 100644 --- a/src/xenia/emulator.h +++ b/src/xenia/emulator.h @@ -13,6 +13,7 @@ #include #include #include +#include <xenia/cpu/xenon_memory.h> XEDECLARECLASS1(xe, ExportResolver); @@ -41,7 +42,7 @@ public: ui::Window* main_window() const { return main_window_; } void set_main_window(ui::Window* window); - Memory* memory() const { return memory_; } + cpu::XenonMemory* memory() const { return memory_; } debug::DebugServer* debug_server() const { return debug_server_; } @@ -68,7 +69,7 @@ private: ui::Window* main_window_; - Memory* memory_; + cpu::XenonMemory* memory_; debug::DebugServer* debug_server_; diff --git a/src/xenia/export_resolver.cc b/src/xenia/export_resolver.cc index f630a3c48..9d09b63ec 100644 --- a/src/xenia/export_resolver.cc +++ b/src/xenia/export_resolver.cc @@ -36,6 +36,28 @@ void ExportResolver::RegisterTable( } } +uint16_t ExportResolver::GetLibraryOrdinal(const char* library_name) { + uint16_t n = 0; + for (auto it = tables_.begin(); it != tables_.end(); ++it, n++) { + if (!xestrcmpa(library_name, it->name)) { + return n; + } + } + return -1; +} + +KernelExport* ExportResolver::GetExportByOrdinal( + const uint16_t library_ordinal, const uint32_t ordinal) { + auto& table = tables_[library_ordinal]; + // TODO(benvanik): binary search?
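On the TODO above: if table.exports were kept sorted by ordinal, something like std::lower_bound over the array would do; for the table sizes involved, the linear scan below is a reasonable trade.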
+ for (size_t n = 0; n < table.count; n++) { + if (table.exports[n].ordinal == ordinal) { + return &table.exports[n]; + } + } + return NULL; +} + KernelExport* ExportResolver::GetExportByOrdinal(const char* library_name, const uint32_t ordinal) { for (std::vector::iterator it = tables_.begin(); diff --git a/src/xenia/export_resolver.h b/src/xenia/export_resolver.h index fcc9a6d87..487e8bd9c 100644 --- a/src/xenia/export_resolver.h +++ b/src/xenia/export_resolver.h @@ -68,6 +68,10 @@ public: void RegisterTable(const char* library_name, KernelExport* exports, const size_t count); + uint16_t GetLibraryOrdinal(const char* library_name); + + KernelExport* GetExportByOrdinal(const uint16_t library_ordinal, + const uint32_t ordinal); KernelExport* GetExportByOrdinal(const char* library_name, const uint32_t ordinal); KernelExport* GetExportByName(const char* library_name, const char* name); diff --git a/src/xenia/gpu/buffer_resource.cc b/src/xenia/gpu/buffer_resource.cc new file mode 100644 index 000000000..9f9accb9b --- /dev/null +++ b/src/xenia/gpu/buffer_resource.cc @@ -0,0 +1,56 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + + +BufferResource::BufferResource(const MemoryRange& memory_range) + : PagedResource(memory_range) { +} + +BufferResource::~BufferResource() = default; + +int BufferResource::Prepare() { + if (!handle()) { + if (CreateHandle()) { + XELOGE("Unable to create buffer handle"); + return 1; + } + } + + if (!dirtied_) { + return 0; + } + dirtied_ = false; + + // pass dirty regions? + return InvalidateRegion(memory_range_); +} + +IndexBufferResource::IndexBufferResource(const MemoryRange& memory_range, + const Info& info) + : BufferResource(memory_range), + info_(info) { +} + +IndexBufferResource::~IndexBufferResource() = default; + +VertexBufferResource::VertexBufferResource(const MemoryRange& memory_range, + const Info& info) + : BufferResource(memory_range), + info_(info) { +} + +VertexBufferResource::~VertexBufferResource() = default; diff --git a/src/xenia/gpu/buffer_resource.h b/src/xenia/gpu/buffer_resource.h new file mode 100644 index 000000000..a88d1ae06 --- /dev/null +++ b/src/xenia/gpu/buffer_resource.h @@ -0,0 +1,99 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_BUFFER_RESOURCE_H_ +#define XENIA_GPU_BUFFER_RESOURCE_H_ + +#include +#include +#include + + +namespace xe { +namespace gpu { + + +class BufferResource : public PagedResource { +public: + BufferResource(const MemoryRange& memory_range); + ~BufferResource() override; + + virtual int Prepare(); + +protected: + virtual int CreateHandle() = 0; + virtual int InvalidateRegion(const MemoryRange& memory_range) = 0; +}; + + +enum IndexFormat { + INDEX_FORMAT_16BIT = 0, + INDEX_FORMAT_32BIT = 1, +}; + +class IndexBufferResource : public BufferResource { +public: + struct Info { + IndexFormat format; + xenos::XE_GPU_ENDIAN endianness; + }; + + IndexBufferResource(const MemoryRange& memory_range, + const Info& info); + ~IndexBufferResource() override; + + const Info& info() const { return info_; } + + bool Equals(const void* info_ptr, size_t info_length) override { + return info_length == sizeof(Info) && + memcmp(info_ptr, &info_, info_length) == 0; + } + +protected: + Info info_; +}; + + +class VertexBufferResource : public BufferResource { +public: + struct DeclElement { + xenos::instr_fetch_vtx_t vtx_fetch; + uint32_t format; + uint32_t offset_words; + uint32_t size_words; + bool is_signed; + bool is_normalized; + }; + struct Info { + uint32_t stride_words; + uint32_t element_count; + DeclElement elements[16]; + }; + + VertexBufferResource(const MemoryRange& memory_range, + const Info& info); + ~VertexBufferResource() override; + + const Info& info() const { return info_; } + + bool Equals(const void* info_ptr, size_t info_length) override { + return info_length == sizeof(Info) && + memcmp(info_ptr, &info_, info_length) == 0; + } + +protected: + Info info_; +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_BUFFER_RESOURCE_H_ diff --git a/src/xenia/gpu/command_buffer.h b/src/xenia/gpu/command_buffer.h deleted file mode 100644 index b601505f5..000000000 --- a/src/xenia/gpu/command_buffer.h +++ /dev/null @@ -1,49 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_COMMAND_BUFFER_H_ -#define XENIA_GPU_COMMAND_BUFFER_H_ - -#include - - -namespace xe { -namespace gpu { - - -// TODO(benvanik): command packet types. - - -class CommandBuffer { -public: - CommandBuffer(xe_memory_ref memory) { - memory_ = xe_memory_retain(memory); - } - - virtual ~CommandBuffer() { - xe_memory_release(memory_); - } - - xe_memory_ref memory() { - return memory_; - } - - // TODO(benvanik): command methods. 
- virtual void Foo() = 0; - -protected: - xe_memory_ref memory_; -}; - - -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_COMMAND_BUFFER_H_ diff --git a/src/xenia/gpu/ring_buffer_worker.cc b/src/xenia/gpu/command_processor.cc similarity index 73% rename from src/xenia/gpu/ring_buffer_worker.cc rename to src/xenia/gpu/command_processor.cc index 3792c0b61..23c27c5a9 100644 --- a/src/xenia/gpu/ring_buffer_worker.cc +++ b/src/xenia/gpu/command_processor.cc @@ -1,739 +1,804 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#include - -#include -#include -#include -#include -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::xenos; - - -#define XETRACERB(fmt, ...) if (FLAGS_trace_ring_buffer) XELOGGPU(fmt, ##__VA_ARGS__) - - -RingBufferWorker::RingBufferWorker( - GraphicsSystem* graphics_system, Memory* memory) : - graphics_system_(graphics_system), memory_(memory), driver_(0) { - write_ptr_index_event_ = CreateEvent( - NULL, FALSE, FALSE, NULL); - - primary_buffer_ptr_ = 0; - primary_buffer_size_ = 0; - read_ptr_index_ = 0; - read_ptr_update_freq_ = 0; - read_ptr_writeback_ptr_ = 0; - write_ptr_index_ = 0; - write_ptr_max_index_ = 0; - - LARGE_INTEGER perf_counter; - QueryPerformanceCounter(&perf_counter); - time_base_ = perf_counter.QuadPart; - counter_ = 0; -} - -RingBufferWorker::~RingBufferWorker() { - SetEvent(write_ptr_index_event_); - CloseHandle(write_ptr_index_event_); -} - -uint64_t RingBufferWorker::QueryTime() { - LARGE_INTEGER perf_counter; - QueryPerformanceCounter(&perf_counter); - return perf_counter.QuadPart - time_base_; -} - -void RingBufferWorker::Initialize(GraphicsDriver* driver, - uint32_t ptr, uint32_t page_count) { - driver_ = driver; - primary_buffer_ptr_ = ptr; - // Not sure this is correct, but it's a way to take the page_count back to - // the number of bytes allocated by the physical alloc. - uint32_t original_size = 1 << (0x1C - page_count - 1); - primary_buffer_size_ = original_size; - read_ptr_index_ = 0; - - // Tell the driver what to use for translation. - driver_->set_address_translation(primary_buffer_ptr_ & ~0x1FFFFFFF); -} - -void RingBufferWorker::EnableReadPointerWriteBack(uint32_t ptr, - uint32_t block_size) { - // CP_RB_RPTR_ADDR Ring Buffer Read Pointer Address 0x70C - // ptr = RB_RPTR_ADDR, pointer to write back the address to. - read_ptr_writeback_ptr_ = (primary_buffer_ptr_ & ~0x1FFFFFFF) + ptr; - // CP_RB_CNTL Ring Buffer Control 0x704 - // block_size = RB_BLKSZ, number of quadwords read between updates of the - // read pointer. - read_ptr_update_freq_ = (uint32_t)pow(2.0, (double)block_size) / 4; -} - -void RingBufferWorker::UpdateWritePointer(uint32_t value) { - write_ptr_max_index_ = MAX(write_ptr_max_index_, value); - write_ptr_index_ = value; - SetEvent(write_ptr_index_event_); -} - -void RingBufferWorker::Pump() { - uint8_t* p = memory_->membase(); - - if (write_ptr_index_ == 0xBAADF00D || - read_ptr_index_ == write_ptr_index_) { - // Check if the pointer has moved. - // We wait a short bit here to yield time. Since we are also running the - // main window display we don't want to pause too long, though. 
- const int wait_time_ms = 1; - if (WaitForSingleObject(write_ptr_index_event_, - wait_time_ms) == WAIT_TIMEOUT) { - return; - } - } - - // Bring local so we don't have to worry about them changing out from under - // us. - uint32_t write_ptr_index = write_ptr_index_; - uint32_t write_ptr_max_index = write_ptr_max_index_; - if (read_ptr_index_ == write_ptr_index) { - return; - } - - // Process the new commands. - XETRACERB("Ring buffer thread work"); - - // Execute. Note that we handle wraparound transparently. - ExecutePrimaryBuffer(read_ptr_index_, write_ptr_index); - read_ptr_index_ = write_ptr_index; - - // TODO(benvanik): use read_ptr_update_freq_ and only issue after moving - // that many indices. - if (read_ptr_writeback_ptr_) { - XESETUINT32BE(p + read_ptr_writeback_ptr_, read_ptr_index_); - } -} - -void RingBufferWorker::ExecutePrimaryBuffer( - uint32_t start_index, uint32_t end_index) { - // Adjust pointer base. - uint32_t ptr = primary_buffer_ptr_ + start_index * 4; - ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (ptr & 0x1FFFFFFF); - uint32_t end_ptr = primary_buffer_ptr_ + end_index * 4; - end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF); - - XETRACERB("[%.8X] ExecutePrimaryBuffer(%dw -> %dw)", - ptr, start_index, end_index); - - // Execute commands! - PacketArgs args; - args.ptr = ptr; - args.base_ptr = primary_buffer_ptr_; - args.max_address = primary_buffer_ptr_ + primary_buffer_size_ * 4; - args.ptr_mask = (primary_buffer_size_ / 4) - 1; - uint32_t n = 0; - while (args.ptr != end_ptr) { - n += ExecutePacket(args); - } - if (end_index > start_index) { - XEASSERT(n == (end_index - start_index)); - } - - XETRACERB(" ExecutePrimaryBuffer End"); -} - -void RingBufferWorker::ExecuteIndirectBuffer(uint32_t ptr, uint32_t length) { - XETRACERB("[%.8X] ExecuteIndirectBuffer(%dw)", ptr, length); - - // Execute commands! - PacketArgs args; - args.ptr = ptr; - args.base_ptr = ptr; - args.max_address = ptr + length * 4; - args.ptr_mask = 0; - for (uint32_t n = 0; n < length;) { - n += ExecutePacket(args); - XEASSERT(n <= length); - } - - XETRACERB(" ExecuteIndirectBuffer End"); -} - -#define LOG_DATA(count) \ - for (uint32_t __m = 0; __m < count; __m++) { \ - XETRACERB("[%.8X] %.8X", \ - packet_ptr + (1 + __m) * 4, \ - XEGETUINT32BE(packet_base + 1 * 4 + __m * 4)); \ - } - -void RingBufferWorker::AdvancePtr(PacketArgs& args, uint32_t n) { - args.ptr = args.ptr + n * 4; - if (args.ptr_mask) { - args.ptr = - args.base_ptr + (((args.ptr - args.base_ptr) / 4) & args.ptr_mask) * 4; - } -} -#define ADVANCE_PTR(n) AdvancePtr(args, n) -#define PEEK_PTR() \ - XEGETUINT32BE(p + args.ptr) -#define READ_PTR() \ - XEGETUINT32BE(p + args.ptr); ADVANCE_PTR(1); - -uint32_t RingBufferWorker::ExecutePacket(PacketArgs& args) { - uint8_t* p = memory_->membase(); - RegisterFile* regs = driver_->register_file(); - - uint32_t packet_ptr = args.ptr; - const uint8_t* packet_base = p + packet_ptr; - const uint32_t packet = PEEK_PTR(); - ADVANCE_PTR(1); - const uint32_t packet_type = packet >> 30; - if (packet == 0) { - XETRACERB("[%.8X] Packet(%.8X): 0?", - packet_ptr, packet); - return 1; - } - - switch (packet_type) { - case 0x00: - { - // Type-0 packet. - // Write count registers in sequence to the registers starting at - // (base_index << 2). 
- XETRACERB("[%.8X] Packet(%.8X): set registers:", - packet_ptr, packet); - uint32_t count = ((packet >> 16) & 0x3FFF) + 1; - uint32_t base_index = (packet & 0x7FFF); - uint32_t write_one_reg = (packet >> 15) & 0x1; - for (uint32_t m = 0; m < count; m++) { - uint32_t reg_data = PEEK_PTR(); - uint32_t target_index = write_one_reg ? base_index : base_index + m; - const char* reg_name = xenos::GetRegisterName(target_index); - XETRACERB("[%.8X] %.8X -> %.4X %s", - args.ptr, - reg_data, target_index, reg_name ? reg_name : ""); - ADVANCE_PTR(1); - WriteRegister(packet_ptr, target_index, reg_data); - } - return 1 + count; - } - break; - case 0x01: - { - // Type-1 packet. - // Contains two registers of data. Type-0 should be more common. - XETRACERB("[%.8X] Packet(%.8X): set registers:", - packet_ptr, packet); - uint32_t reg_index_1 = packet & 0x7FF; - uint32_t reg_index_2 = (packet >> 11) & 0x7FF; - uint32_t reg_ptr_1 = args.ptr; - uint32_t reg_data_1 = READ_PTR(); - uint32_t reg_ptr_2 = args.ptr; - uint32_t reg_data_2 = READ_PTR(); - const char* reg_name_1 = xenos::GetRegisterName(reg_index_1); - const char* reg_name_2 = xenos::GetRegisterName(reg_index_2); - XETRACERB("[%.8X] %.8X -> %.4X %s", - reg_ptr_1, - reg_data_1, reg_index_1, reg_name_1 ? reg_name_1 : ""); - XETRACERB("[%.8X] %.8X -> %.4X %s", - reg_ptr_2, - reg_data_2, reg_index_2, reg_name_2 ? reg_name_2 : ""); - WriteRegister(packet_ptr, reg_index_1, reg_data_1); - WriteRegister(packet_ptr, reg_index_2, reg_data_2); - return 1 + 2; - } - break; - case 0x02: - // Type-2 packet. - // No-op. Do nothing. - XETRACERB("[%.8X] Packet(%.8X): padding", - packet_ptr, packet); - return 1; - case 0x03: - { - // Type-3 packet. - uint32_t count = ((packet >> 16) & 0x3FFF) + 1; - uint32_t opcode = (packet >> 8) & 0x7F; - // & 1 == predicate, maybe? - - switch (opcode) { - case PM4_ME_INIT: - // initialize CP's micro-engine - XETRACERB("[%.8X] Packet(%.8X): PM4_ME_INIT", - packet_ptr, packet); - LOG_DATA(count); - ADVANCE_PTR(count); - break; - - case PM4_NOP: - // skip N 32-bit words to get to the next packet - // No-op, ignore some data. - XETRACERB("[%.8X] Packet(%.8X): PM4_NOP", - packet_ptr, packet); - LOG_DATA(count); - ADVANCE_PTR(count); - break; - - case PM4_INTERRUPT: - // generate interrupt from the command stream - { - XETRACERB("[%.8X] Packet(%.8X): PM4_INTERRUPT", - packet_ptr, packet); - LOG_DATA(count); - uint32_t cpu_mask = READ_PTR(); - for (int n = 0; n < 6; n++) { - if (cpu_mask & (1 << n)) { - graphics_system_->DispatchInterruptCallback(1, n); - } - } - } - break; - - case PM4_INDIRECT_BUFFER: - // indirect buffer dispatch - { - uint32_t list_ptr = READ_PTR(); - uint32_t list_length = READ_PTR(); - XETRACERB("[%.8X] Packet(%.8X): PM4_INDIRECT_BUFFER %.8X (%dw)", - packet_ptr, packet, list_ptr, list_length); - ExecuteIndirectBuffer(GpuToCpu(list_ptr), list_length); - } - break; - - case PM4_WAIT_REG_MEM: - // wait until a register or memory location is a specific value - { - XETRACERB("[%.8X] Packet(%.8X): PM4_WAIT_REG_MEM", - packet_ptr, packet); - LOG_DATA(count); - uint32_t wait_info = READ_PTR(); - uint32_t poll_reg_addr = READ_PTR(); - uint32_t ref = READ_PTR(); - uint32_t mask = READ_PTR(); - uint32_t wait = READ_PTR(); - bool matched = false; - do { - uint32_t value; - if (wait_info & 0x10) { - // Memory. - XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(poll_reg_addr & 0x3); - poll_reg_addr &= ~0x3; - value = XEGETUINT32LE(p + GpuToCpu(packet_ptr, poll_reg_addr)); - value = GpuSwap(value, endianness); - } else { - // Register. 
- XEASSERT(poll_reg_addr < kXEGpuRegisterCount); - value = regs->values[poll_reg_addr].u32; - } - switch (wait_info & 0x7) { - case 0x0: // Never. - matched = false; - break; - case 0x1: // Less than reference. - matched = (value & mask) < ref; - break; - case 0x2: // Less than or equal to reference. - matched = (value & mask) <= ref; - break; - case 0x3: // Equal to reference. - matched = (value & mask) == ref; - break; - case 0x4: // Not equal to reference. - matched = (value & mask) != ref; - break; - case 0x5: // Greater than or equal to reference. - matched = (value & mask) >= ref; - break; - case 0x6: // Greater than reference. - matched = (value & mask) > ref; - break; - case 0x7: // Always - matched = true; - break; - } - if (!matched) { - // Wait. - if (wait >= 0x100) { - Sleep(wait / 0x100); - } else { - SwitchToThread(); - } - } - } while (!matched); - } - break; - - case PM4_REG_RMW: - // register read/modify/write - // ? (used during shader upload and edram setup) - { - XETRACERB("[%.8X] Packet(%.8X): PM4_REG_RMW", - packet_ptr, packet); - LOG_DATA(count); - uint32_t rmw_info = READ_PTR(); - uint32_t and_mask = READ_PTR(); - uint32_t or_mask = READ_PTR(); - uint32_t value = regs->values[rmw_info & 0x1FFF].u32; - if ((rmw_info >> 30) & 0x1) { - // | reg - value |= regs->values[or_mask & 0x1FFF].u32; - } else { - // | imm - value |= or_mask; - } - if ((rmw_info >> 31) & 0x1) { - // & reg - value &= regs->values[and_mask & 0x1FFF].u32; - } else { - // & imm - value &= and_mask; - } - WriteRegister(packet_ptr, rmw_info & 0x1FFF, value); - } - break; - - case PM4_COND_WRITE: - // conditional write to memory or register - { - XETRACERB("[%.8X] Packet(%.8X): PM4_COND_WRITE", - packet_ptr, packet); - LOG_DATA(count); - uint32_t wait_info = READ_PTR(); - uint32_t poll_reg_addr = READ_PTR(); - uint32_t ref = READ_PTR(); - uint32_t mask = READ_PTR(); - uint32_t write_reg_addr = READ_PTR(); - uint32_t write_data = READ_PTR(); - uint32_t value; - if (wait_info & 0x10) { - // Memory. - XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(poll_reg_addr & 0x3); - poll_reg_addr &= ~0x3; - value = XEGETUINT32LE(p + GpuToCpu(packet_ptr, poll_reg_addr)); - value = GpuSwap(value, endianness); - } else { - // Register. - XEASSERT(poll_reg_addr < kXEGpuRegisterCount); - value = regs->values[poll_reg_addr].u32; - } - bool matched = false; - switch (wait_info & 0x7) { - case 0x0: // Never. - matched = false; - break; - case 0x1: // Less than reference. - matched = (value & mask) < ref; - break; - case 0x2: // Less than or equal to reference. - matched = (value & mask) <= ref; - break; - case 0x3: // Equal to reference. - matched = (value & mask) == ref; - break; - case 0x4: // Not equal to reference. - matched = (value & mask) != ref; - break; - case 0x5: // Greater than or equal to reference. - matched = (value & mask) >= ref; - break; - case 0x6: // Greater than reference. - matched = (value & mask) > ref; - break; - case 0x7: // Always - matched = true; - break; - } - if (matched) { - // Write. - if (wait_info & 0x100) { - // Memory. - XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(write_reg_addr & 0x3); - write_reg_addr &= ~0x3; - write_data = GpuSwap(write_data, endianness); - XESETUINT32LE(p + GpuToCpu(packet_ptr, write_reg_addr), - write_data); - } else { - // Register. 
- WriteRegister(packet_ptr, write_reg_addr, write_data); - } - } - } - break; - - case PM4_EVENT_WRITE: - // generate an event that creates a write to memory when completed - { - XETRACERB("[%.8X] Packet(%.8X): PM4_EVENT_WRITE (unimplemented!)", - packet_ptr, packet); - LOG_DATA(count); - uint32_t initiator = READ_PTR(); - if (count == 1) { - // Just an event flag? Where does this write? - } else { - // Write to an address. - XEASSERTALWAYS(); - ADVANCE_PTR(count - 1); - } - } - break; - case PM4_EVENT_WRITE_SHD: - // generate a VS|PS_done event - { - XETRACERB("[%.8X] Packet(%.8X): PM4_EVENT_WRITE_SHD", - packet_ptr, packet); - LOG_DATA(count); - uint32_t initiator = READ_PTR(); - uint32_t address = READ_PTR(); - uint32_t value = READ_PTR(); - // Writeback initiator. - WriteRegister(packet_ptr, XE_GPU_REG_VGT_EVENT_INITIATOR, - initiator & 0x1F); - uint32_t data_value; - if ((initiator >> 31) & 0x1) { - // Write counter (GPU vblank counter?). - data_value = counter_; - } else { - // Write value. - data_value = value; - } - XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(address & 0x3); - address &= ~0x3; - data_value = GpuSwap(data_value, endianness); - XESETUINT32LE(p + GpuToCpu(address), data_value); - } - break; - - case PM4_DRAW_INDX: - // initiate fetch of index buffer and draw - { - XETRACERB("[%.8X] Packet(%.8X): PM4_DRAW_INDX", - packet_ptr, packet); - LOG_DATA(count); - // d0 = viz query info - uint32_t d0 = READ_PTR(); - uint32_t d1 = READ_PTR(); - uint32_t index_count = d1 >> 16; - uint32_t prim_type = d1 & 0x3F; - uint32_t src_sel = (d1 >> 6) & 0x3; - if (src_sel == 0x0) { - uint32_t index_base = READ_PTR(); - uint32_t index_size = READ_PTR(); - uint32_t endianness = index_size >> 29; - index_size &= 0x00FFFFFF; - bool index_32bit = (d1 >> 11) & 0x1; - index_size *= index_32bit ? 4 : 2; - driver_->DrawIndexBuffer( - (XE_GPU_PRIMITIVE_TYPE)prim_type, - index_32bit, index_count, index_base, index_size, endianness); - } else if (src_sel == 0x2) { - driver_->DrawIndexAuto( - (XE_GPU_PRIMITIVE_TYPE)prim_type, - index_count); - } else { - // Unknown source select. - XEASSERTALWAYS(); - } - } - break; - case PM4_DRAW_INDX_2: - // draw using supplied indices in packet - { - XETRACERB("[%.8X] Packet(%.8X): PM4_DRAW_INDX_2", - packet_ptr, packet); - LOG_DATA(count); - uint32_t d0 = READ_PTR(); - uint32_t index_count = d0 >> 16; - uint32_t prim_type = d0 & 0x3F; - uint32_t src_sel = (d0 >> 6) & 0x3; - XEASSERT(src_sel == 0x2); // 'SrcSel=AutoIndex' - driver_->DrawIndexAuto( - (XE_GPU_PRIMITIVE_TYPE)prim_type, - index_count); - } - break; - - case PM4_SET_CONSTANT: - // load constant into chip and to memory - { - XETRACERB("[%.8X] Packet(%.8X): PM4_SET_CONSTANT", - packet_ptr, packet); - // PM4_REG(reg) ((0x4 << 16) | (GSL_HAL_SUBBLOCK_OFFSET(reg))) - // reg - 0x2000 - uint32_t offset_type = READ_PTR(); - uint32_t index = offset_type & 0x7FF; - uint32_t type = (offset_type >> 16) & 0xFF; - switch (type) { - case 0x4: // REGISTER - index += 0x2000; // registers - for (uint32_t n = 0; n < count - 1; n++, index++) { - uint32_t data = READ_PTR(); - const char* reg_name = xenos::GetRegisterName(index); - XETRACERB("[%.8X] %.8X -> %.4X %s", - packet_ptr + (1 + n) * 4, - data, index, reg_name ? 
reg_name : ""); - WriteRegister(packet_ptr, index, data); - } - break; - default: - XEASSERTALWAYS(); - break; - } - } - break; - case PM4_LOAD_ALU_CONSTANT: - // load constants from memory - { - XETRACERB("[%.8X] Packet(%.8X): PM4_LOAD_ALU_CONSTANT", - packet_ptr, packet); - uint32_t address = READ_PTR(); - address &= 0x3FFFFFFF; - uint32_t offset_type = READ_PTR(); - uint32_t index = offset_type & 0x7FF; - uint32_t size = READ_PTR(); - size &= 0xFFF; - index += 0x4000; // alu constants - for (uint32_t n = 0; n < size; n++, index++) { - uint32_t data = XEGETUINT32BE( - p + GpuToCpu(packet_ptr, address + n * 4)); - const char* reg_name = xenos::GetRegisterName(index); - XETRACERB("[%.8X] %.8X -> %.4X %s", - packet_ptr, - data, index, reg_name ? reg_name : ""); - WriteRegister(packet_ptr, index, data); - } - } - break; - - case PM4_IM_LOAD: - // load sequencer instruction memory (pointer-based) - { - XETRACERB("[%.8X] Packet(%.8X): PM4_IM_LOAD", - packet_ptr, packet); - LOG_DATA(count); - uint32_t addr_type = READ_PTR(); - uint32_t type = addr_type & 0x3; - uint32_t addr = addr_type & ~0x3; - uint32_t start_size = READ_PTR(); - uint32_t start = start_size >> 16; - uint32_t size = start_size & 0xFFFF; // dwords - XEASSERT(start == 0); - driver_->SetShader( - (XE_GPU_SHADER_TYPE)type, - GpuToCpu(packet_ptr, addr), - start, - size * 4); - } - break; - case PM4_IM_LOAD_IMMEDIATE: - // load sequencer instruction memory (code embedded in packet) - { - XETRACERB("[%.8X] Packet(%.8X): PM4_IM_LOAD_IMMEDIATE", - packet_ptr, packet); - LOG_DATA(count); - uint32_t type = READ_PTR(); - uint32_t start_size = READ_PTR(); - uint32_t start = start_size >> 16; - uint32_t size = start_size & 0xFFFF; // dwords - XEASSERT(start == 0); - // TODO(benvanik): figure out if this could wrap. - XEASSERT(args.ptr + size * 4 < args.max_address); - driver_->SetShader( - (XE_GPU_SHADER_TYPE)type, - args.ptr, - start, - size * 4); - ADVANCE_PTR(size); - } - break; - - case PM4_INVALIDATE_STATE: - // selective invalidation of state pointers - { - XETRACERB("[%.8X] Packet(%.8X): PM4_INVALIDATE_STATE", - packet_ptr, packet); - LOG_DATA(count); - uint32_t mask = READ_PTR(); - driver_->InvalidateState(mask); - } - break; - - case PM4_SET_BIN_MASK_LO: - { - uint32_t value = READ_PTR(); - XETRACERB("[%.8X] Packet(%.8X): PM4_SET_BIN_MASK_LO = %.8X", - packet_ptr, packet, value); - } - break; - case PM4_SET_BIN_MASK_HI: - { - uint32_t value = READ_PTR(); - XETRACERB("[%.8X] Packet(%.8X): PM4_SET_BIN_MASK_HI = %.8X", - packet_ptr, packet, value); - } - break; - case PM4_SET_BIN_SELECT_LO: - { - uint32_t value = READ_PTR(); - XETRACERB("[%.8X] Packet(%.8X): PM4_SET_BIN_SELECT_LO = %.8X", - packet_ptr, packet, value); - } - break; - case PM4_SET_BIN_SELECT_HI: - { - uint32_t value = READ_PTR(); - XETRACERB("[%.8X] Packet(%.8X): PM4_SET_BIN_SELECT_HI = %.8X", - packet_ptr, packet, value); - } - break; - - // Ignored packets - useful if breaking on the default handler below. 
- case 0x50: // 0xC0015000 usually 2 words, 0xFFFFFFFF / 0x00000000 - XETRACERB("[%.8X] Packet(%.8X): unknown!", - packet_ptr, packet); - LOG_DATA(count); - ADVANCE_PTR(count); - break; - - default: - XETRACERB("[%.8X] Packet(%.8X): unknown!", - packet_ptr, packet); - LOG_DATA(count); - ADVANCE_PTR(count); - break; - } - - return 1 + count; - } - break; - } - - return 0; -} - -void RingBufferWorker::WriteRegister( - uint32_t packet_ptr, uint32_t index, uint32_t value) { - RegisterFile* regs = driver_->register_file(); - XEASSERT(index < kXEGpuRegisterCount); - regs->values[index].u32 = value; - - // Scratch register writeback. - if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) { - uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0; - if ((1 << scratch_reg) & regs->values[XE_GPU_REG_SCRATCH_UMSK].u32) { - // Enabled - write to address. - uint8_t* p = memory_->membase(); - uint32_t scratch_addr = regs->values[XE_GPU_REG_SCRATCH_ADDR].u32; - uint32_t mem_addr = scratch_addr + (scratch_reg * 4); - XESETUINT32BE(p + GpuToCpu(primary_buffer_ptr_, mem_addr), value); - } - } -} +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + + +#define XETRACECP(fmt, ...) if (FLAGS_trace_ring_buffer) XELOGGPU(fmt, ##__VA_ARGS__) + + +CommandProcessor::CommandProcessor( + GraphicsSystem* graphics_system, Memory* memory) : + graphics_system_(graphics_system), memory_(memory), driver_(0) { + write_ptr_index_event_ = CreateEvent(NULL, FALSE, FALSE, NULL); + + primary_buffer_ptr_ = 0; + primary_buffer_size_ = 0; + read_ptr_index_ = 0; + read_ptr_update_freq_ = 0; + read_ptr_writeback_ptr_ = 0; + write_ptr_index_ = 0; + write_ptr_max_index_ = 0; + + LARGE_INTEGER perf_counter; + QueryPerformanceCounter(&perf_counter); + time_base_ = perf_counter.QuadPart; + counter_ = 0; +} + +CommandProcessor::~CommandProcessor() { + SetEvent(write_ptr_index_event_); + CloseHandle(write_ptr_index_event_); +} + +uint64_t CommandProcessor::QueryTime() { + LARGE_INTEGER perf_counter; + QueryPerformanceCounter(&perf_counter); + return perf_counter.QuadPart - time_base_; +} + +void CommandProcessor::Initialize(GraphicsDriver* driver, + uint32_t ptr, uint32_t page_count) { + driver_ = driver; + primary_buffer_ptr_ = ptr; + // Not sure this is correct, but it's a way to take the page_count back to + // the number of bytes allocated by the physical alloc. + uint32_t original_size = 1 << (0x1C - page_count - 1); + primary_buffer_size_ = original_size; + read_ptr_index_ = 0; + + // Tell the driver what to use for translation. + driver_->set_address_translation(primary_buffer_ptr_ & ~0x1FFFFFFF); +} + +void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr, + uint32_t block_size) { + // CP_RB_RPTR_ADDR Ring Buffer Read Pointer Address 0x70C + // ptr = RB_RPTR_ADDR, pointer to write back the address to. 
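A worked example of the frequency math below: RB_BLKSZ is a log2 field, so block_size = 5 gives pow(2.0, 5) / 4 = 8, i.e. the read pointer is written back roughly every 8 words consumed. (1u << block_size) / 4 computes the same value without the float round-trip; the divide-by-4 scaling is inferred from this code rather than documented, so treat it as an assumption.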
+  read_ptr_writeback_ptr_ = (primary_buffer_ptr_ & ~0x1FFFFFFF) + ptr;
+  // CP_RB_CNTL Ring Buffer Control 0x704
+  // block_size = RB_BLKSZ, number of quadwords read between updates of the
+  // read pointer.
+  read_ptr_update_freq_ = (uint32_t)pow(2.0, (double)block_size) / 4;
+}
+
+void CommandProcessor::UpdateWritePointer(uint32_t value) {
+  write_ptr_max_index_ = MAX(write_ptr_max_index_, value);
+  write_ptr_index_ = value;
+  SetEvent(write_ptr_index_event_);
+}
+
+void CommandProcessor::Pump() {
+  uint8_t* p = memory_->membase();
+
+  while (write_ptr_index_ == 0xBAADF00D ||
+         read_ptr_index_ == write_ptr_index_) {
+    // Check if the pointer has moved.
+    // We wait a short bit here to yield time. Since we are also running the
+    // main window display we don't want to pause too long, though.
+    // YieldProcessor();
+    const int wait_time_ms = 1;
+    if (WaitForSingleObject(write_ptr_index_event_,
+                            wait_time_ms) == WAIT_TIMEOUT) {
+      return;
+    }
+  }
+
+  // Copy to locals so we don't have to worry about them changing out from
+  // under us.
+  uint32_t write_ptr_index = write_ptr_index_;
+  uint32_t write_ptr_max_index = write_ptr_max_index_;
+  if (read_ptr_index_ == write_ptr_index) {
+    return;
+  }
+
+  // Process the new commands.
+  XETRACECP("Command processor thread work");
+
+  // Execute. Note that we handle wraparound transparently.
+  ExecutePrimaryBuffer(read_ptr_index_, write_ptr_index);
+  read_ptr_index_ = write_ptr_index;
+
+  // TODO(benvanik): use read_ptr_update_freq_ and only issue after moving
+  // that many indices.
+  if (read_ptr_writeback_ptr_) {
+    XESETUINT32BE(p + read_ptr_writeback_ptr_, read_ptr_index_);
+  }
+}
+
+void CommandProcessor::ExecutePrimaryBuffer(
+    uint32_t start_index, uint32_t end_index) {
+  SCOPE_profile_cpu_f("gpu");
+
+  // Adjust pointer base.
+  uint32_t ptr = primary_buffer_ptr_ + start_index * 4;
+  ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (ptr & 0x1FFFFFFF);
+  uint32_t end_ptr = primary_buffer_ptr_ + end_index * 4;
+  end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF);
+
+  XETRACECP("[%.8X] ExecutePrimaryBuffer(%dw -> %dw)",
+            ptr, start_index, end_index);
+
+  // Execute commands!
+  PacketArgs args;
+  args.ptr = ptr;
+  args.base_ptr = primary_buffer_ptr_;
+  args.max_address = primary_buffer_ptr_ + primary_buffer_size_;
+  args.ptr_mask = (primary_buffer_size_ / 4) - 1;
+  uint32_t n = 0;
+  while (args.ptr != end_ptr) {
+    n += ExecutePacket(args);
+    XEASSERT(args.ptr < args.max_address);
+  }
+  if (end_index > start_index) {
+    XEASSERT(n == (end_index - start_index));
+  }
+
+  XETRACECP("  ExecutePrimaryBuffer End");
+}
+
+void CommandProcessor::ExecuteIndirectBuffer(uint32_t ptr, uint32_t length) {
+  XETRACECP("[%.8X] ExecuteIndirectBuffer(%dw)", ptr, length);
+
+  // Execute commands!
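An aside on EnableReadPointerWriteBack() above: since RB_BLKSZ is a log2 quadword count, the floating-point pow() can be written as an integer shift with the same result for any block_size below 32. A possible simplification, nothing more:

```cpp
#include <cstdint>

// Integer form of (uint32_t)pow(2.0, (double)block_size) / 4.
inline uint32_t ReadPtrUpdateFreq(uint32_t block_size) {
  return (1u << block_size) / 4;
}
```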
+ PacketArgs args; + args.ptr = ptr; + args.base_ptr = ptr; + args.max_address = ptr + length * 4; + args.ptr_mask = 0; + for (uint32_t n = 0; n < length;) { + n += ExecutePacket(args); + XEASSERT(n <= length); + } + + XETRACECP(" ExecuteIndirectBuffer End"); +} + +#define LOG_DATA(count) \ + for (uint32_t __m = 0; __m < count; __m++) { \ + XETRACECP("[%.8X] %.8X", \ + packet_ptr + (1 + __m) * 4, \ + XEGETUINT32BE(packet_base + 1 * 4 + __m * 4)); \ + } + +void CommandProcessor::AdvancePtr(PacketArgs& args, uint32_t n) { + args.ptr = args.ptr + n * 4; + if (args.ptr_mask) { + args.ptr = + args.base_ptr + (((args.ptr - args.base_ptr) / 4) & args.ptr_mask) * 4; + } +} +#define ADVANCE_PTR(n) AdvancePtr(args, n) +#define PEEK_PTR() \ + XEGETUINT32BE(p + args.ptr) +#define READ_PTR() \ + XEGETUINT32BE(p + args.ptr); ADVANCE_PTR(1); + +uint32_t CommandProcessor::ExecutePacket(PacketArgs& args) { + uint8_t* p = memory_->membase(); + RegisterFile* regs = driver_->register_file(); + + uint32_t packet_ptr = args.ptr; + const uint8_t* packet_base = p + packet_ptr; + const uint32_t packet = PEEK_PTR(); + ADVANCE_PTR(1); + const uint32_t packet_type = packet >> 30; + if (packet == 0) { + XETRACECP("[%.8X] Packet(%.8X): 0?", + packet_ptr, packet); + return 1; + } + + switch (packet_type) { + case 0x00: + { + // Type-0 packet. + // Write count registers in sequence to the registers starting at + // (base_index << 2). + XETRACECP("[%.8X] Packet(%.8X): set registers:", + packet_ptr, packet); + uint32_t count = ((packet >> 16) & 0x3FFF) + 1; + uint32_t base_index = (packet & 0x7FFF); + uint32_t write_one_reg = (packet >> 15) & 0x1; + for (uint32_t m = 0; m < count; m++) { + uint32_t reg_data = PEEK_PTR(); + uint32_t target_index = write_one_reg ? base_index : base_index + m; + const char* reg_name = regs->GetRegisterName(target_index); + XETRACECP("[%.8X] %.8X -> %.4X %s", + args.ptr, + reg_data, target_index, reg_name ? reg_name : ""); + ADVANCE_PTR(1); + WriteRegister(packet_ptr, target_index, reg_data); + } + return 1 + count; + } + break; + case 0x01: + { + // Type-1 packet. + // Contains two registers of data. Type-0 should be more common. + XETRACECP("[%.8X] Packet(%.8X): set registers:", + packet_ptr, packet); + uint32_t reg_index_1 = packet & 0x7FF; + uint32_t reg_index_2 = (packet >> 11) & 0x7FF; + uint32_t reg_ptr_1 = args.ptr; + uint32_t reg_data_1 = READ_PTR(); + uint32_t reg_ptr_2 = args.ptr; + uint32_t reg_data_2 = READ_PTR(); + const char* reg_name_1 = regs->GetRegisterName(reg_index_1); + const char* reg_name_2 = regs->GetRegisterName(reg_index_2); + XETRACECP("[%.8X] %.8X -> %.4X %s", + reg_ptr_1, + reg_data_1, reg_index_1, reg_name_1 ? reg_name_1 : ""); + XETRACECP("[%.8X] %.8X -> %.4X %s", + reg_ptr_2, + reg_data_2, reg_index_2, reg_name_2 ? reg_name_2 : ""); + WriteRegister(packet_ptr, reg_index_1, reg_data_1); + WriteRegister(packet_ptr, reg_index_2, reg_data_2); + return 1 + 2; + } + break; + case 0x02: + // Type-2 packet. + // No-op. Do nothing. + XETRACECP("[%.8X] Packet(%.8X): padding", + packet_ptr, packet); + return 1; + case 0x03: + { + // Type-3 packet. + uint32_t count = ((packet >> 16) & 0x3FFF) + 1; + uint32_t opcode = (packet >> 8) & 0x7F; + // & 1 == predicate, maybe? + + switch (opcode) { + case PM4_ME_INIT: + // initialize CP's micro-engine + XETRACECP("[%.8X] Packet(%.8X): PM4_ME_INIT", + packet_ptr, packet); + LOG_DATA(count); + ADVANCE_PTR(count); + break; + + case PM4_NOP: + // skip N 32-bit words to get to the next packet + // No-op, ignore some data. 
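One caution on the READ_PTR() macro defined above: it expands to two statements, so it is only safe as a standalone initializer (`uint32_t x = READ_PTR();`); used inside a condition or an argument list, the trailing ADVANCE_PTR(1) would escape the expression. A sketch of an expression-safe alternative, with simplified types and a hypothetical name (XEGETUINT32BE is the project's big-endian load):

```cpp
#include <cstdint>

struct PacketCursor {  // mirrors the fields of PacketArgs
  uint32_t ptr, base_ptr, ptr_mask;
};

// Reads the next big-endian dword and advances in a single expression,
// wrapping within the ring when ptr_mask is set (same math as AdvancePtr).
inline uint32_t ReadAndAdvance(const uint8_t* membase, PacketCursor& c) {
  uint32_t value = XEGETUINT32BE(membase + c.ptr);
  c.ptr += 4;
  if (c.ptr_mask) {
    c.ptr = c.base_ptr + (((c.ptr - c.base_ptr) / 4) & c.ptr_mask) * 4;
  }
  return value;
}
```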
+ XETRACECP("[%.8X] Packet(%.8X): PM4_NOP", + packet_ptr, packet); + LOG_DATA(count); + ADVANCE_PTR(count); + break; + + case PM4_INTERRUPT: + // generate interrupt from the command stream + { + XETRACECP("[%.8X] Packet(%.8X): PM4_INTERRUPT", + packet_ptr, packet); + LOG_DATA(count); + uint32_t cpu_mask = READ_PTR(); + for (int n = 0; n < 6; n++) { + if (cpu_mask & (1 << n)) { + graphics_system_->DispatchInterruptCallback(1, n); + } + } + } + break; + + case PM4_XE_SWAP: + // Xenia-specific VdSwap hook. + // VdSwap will post this to tell us we need to swap the screen/fire an interrupt. + XETRACECP("[%.8X] Packet(%.8X): PM4_XE_SWAP", + packet_ptr, packet); + LOG_DATA(count); + ADVANCE_PTR(count); + graphics_system_->Swap(); + break; + + case PM4_INDIRECT_BUFFER: + // indirect buffer dispatch + { + uint32_t list_ptr = READ_PTR(); + uint32_t list_length = READ_PTR(); + XETRACECP("[%.8X] Packet(%.8X): PM4_INDIRECT_BUFFER %.8X (%dw)", + packet_ptr, packet, list_ptr, list_length); + ExecuteIndirectBuffer(GpuToCpu(list_ptr), list_length); + } + break; + + case PM4_WAIT_REG_MEM: + // wait until a register or memory location is a specific value + { + XETRACECP("[%.8X] Packet(%.8X): PM4_WAIT_REG_MEM", + packet_ptr, packet); + LOG_DATA(count); + uint32_t wait_info = READ_PTR(); + uint32_t poll_reg_addr = READ_PTR(); + uint32_t ref = READ_PTR(); + uint32_t mask = READ_PTR(); + uint32_t wait = READ_PTR(); + bool matched = false; + do { + uint32_t value; + if (wait_info & 0x10) { + // Memory. + XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(poll_reg_addr & 0x3); + poll_reg_addr &= ~0x3; + value = XEGETUINT32LE(p + GpuToCpu(packet_ptr, poll_reg_addr)); + value = GpuSwap(value, endianness); + } else { + // Register. + XEASSERT(poll_reg_addr < RegisterFile::kRegisterCount); + value = regs->values[poll_reg_addr].u32; + if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) { + MakeCoherent(); + value = regs->values[poll_reg_addr].u32; + } + } + switch (wait_info & 0x7) { + case 0x0: // Never. + matched = false; + break; + case 0x1: // Less than reference. + matched = (value & mask) < ref; + break; + case 0x2: // Less than or equal to reference. + matched = (value & mask) <= ref; + break; + case 0x3: // Equal to reference. + matched = (value & mask) == ref; + break; + case 0x4: // Not equal to reference. + matched = (value & mask) != ref; + break; + case 0x5: // Greater than or equal to reference. + matched = (value & mask) >= ref; + break; + case 0x6: // Greater than reference. + matched = (value & mask) > ref; + break; + case 0x7: // Always + matched = true; + break; + } + if (!matched) { + // Wait. + if (wait >= 0x100) { + Sleep(wait / 0x100); + } else { + SwitchToThread(); + } + } + } while (!matched); + } + break; + + case PM4_REG_RMW: + // register read/modify/write + // ? 
(used during shader upload and edram setup) + { + XETRACECP("[%.8X] Packet(%.8X): PM4_REG_RMW", + packet_ptr, packet); + LOG_DATA(count); + uint32_t rmw_info = READ_PTR(); + uint32_t and_mask = READ_PTR(); + uint32_t or_mask = READ_PTR(); + uint32_t value = regs->values[rmw_info & 0x1FFF].u32; + if ((rmw_info >> 30) & 0x1) { + // | reg + value |= regs->values[or_mask & 0x1FFF].u32; + } else { + // | imm + value |= or_mask; + } + if ((rmw_info >> 31) & 0x1) { + // & reg + value &= regs->values[and_mask & 0x1FFF].u32; + } else { + // & imm + value &= and_mask; + } + WriteRegister(packet_ptr, rmw_info & 0x1FFF, value); + } + break; + + case PM4_COND_WRITE: + // conditional write to memory or register + { + XETRACECP("[%.8X] Packet(%.8X): PM4_COND_WRITE", + packet_ptr, packet); + LOG_DATA(count); + uint32_t wait_info = READ_PTR(); + uint32_t poll_reg_addr = READ_PTR(); + uint32_t ref = READ_PTR(); + uint32_t mask = READ_PTR(); + uint32_t write_reg_addr = READ_PTR(); + uint32_t write_data = READ_PTR(); + uint32_t value; + if (wait_info & 0x10) { + // Memory. + XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(poll_reg_addr & 0x3); + poll_reg_addr &= ~0x3; + value = XEGETUINT32LE(p + GpuToCpu(packet_ptr, poll_reg_addr)); + value = GpuSwap(value, endianness); + } else { + // Register. + XEASSERT(poll_reg_addr < RegisterFile::kRegisterCount); + value = regs->values[poll_reg_addr].u32; + } + bool matched = false; + switch (wait_info & 0x7) { + case 0x0: // Never. + matched = false; + break; + case 0x1: // Less than reference. + matched = (value & mask) < ref; + break; + case 0x2: // Less than or equal to reference. + matched = (value & mask) <= ref; + break; + case 0x3: // Equal to reference. + matched = (value & mask) == ref; + break; + case 0x4: // Not equal to reference. + matched = (value & mask) != ref; + break; + case 0x5: // Greater than or equal to reference. + matched = (value & mask) >= ref; + break; + case 0x6: // Greater than reference. + matched = (value & mask) > ref; + break; + case 0x7: // Always + matched = true; + break; + } + if (matched) { + // Write. + if (wait_info & 0x100) { + // Memory. + XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(write_reg_addr & 0x3); + write_reg_addr &= ~0x3; + write_data = GpuSwap(write_data, endianness); + XESETUINT32LE(p + GpuToCpu(packet_ptr, write_reg_addr), + write_data); + } else { + // Register. + WriteRegister(packet_ptr, write_reg_addr, write_data); + } + } + } + break; + + case PM4_EVENT_WRITE: + // generate an event that creates a write to memory when completed + { + XETRACECP("[%.8X] Packet(%.8X): PM4_EVENT_WRITE (unimplemented!)", + packet_ptr, packet); + LOG_DATA(count); + uint32_t initiator = READ_PTR(); + if (count == 1) { + // Just an event flag? Where does this write? + } else { + // Write to an address. + XEASSERTALWAYS(); + ADVANCE_PTR(count - 1); + } + } + break; + case PM4_EVENT_WRITE_SHD: + // generate a VS|PS_done event + { + XETRACECP("[%.8X] Packet(%.8X): PM4_EVENT_WRITE_SHD", + packet_ptr, packet); + LOG_DATA(count); + uint32_t initiator = READ_PTR(); + uint32_t address = READ_PTR(); + uint32_t value = READ_PTR(); + // Writeback initiator. + WriteRegister(packet_ptr, XE_GPU_REG_VGT_EVENT_INITIATOR, + initiator & 0x1F); + uint32_t data_value; + if ((initiator >> 31) & 0x1) { + // Write counter (GPU vblank counter?). + data_value = counter_; + } else { + // Write value. 
+ data_value = value; + } + XE_GPU_ENDIAN endianness = (XE_GPU_ENDIAN)(address & 0x3); + address &= ~0x3; + data_value = GpuSwap(data_value, endianness); + XESETUINT32LE(p + GpuToCpu(address), data_value); + } + break; + + case PM4_DRAW_INDX: + // initiate fetch of index buffer and draw + { + XETRACECP("[%.8X] Packet(%.8X): PM4_DRAW_INDX", + packet_ptr, packet); + LOG_DATA(count); + // d0 = viz query info + uint32_t d0 = READ_PTR(); + uint32_t d1 = READ_PTR(); + uint32_t index_count = d1 >> 16; + uint32_t prim_type = d1 & 0x3F; + uint32_t src_sel = (d1 >> 6) & 0x3; + if (!driver_->PrepareDraw(draw_command_)) { + draw_command_.prim_type = (XE_GPU_PRIMITIVE_TYPE)prim_type; + draw_command_.start_index = 0; + draw_command_.index_count = index_count; + draw_command_.base_vertex = 0; + if (src_sel == 0x0) { + // Indexed draw. + // TODO(benvanik): detect subregions of larger index buffers! + uint32_t index_base = READ_PTR(); + uint32_t index_size = READ_PTR(); + uint32_t endianness = index_size >> 29; + index_size &= 0x00FFFFFF; + bool index_32bit = (d1 >> 11) & 0x1; + index_size *= index_32bit ? 4 : 2; + driver_->PrepareDrawIndexBuffer( + draw_command_, + index_base, index_size, + (XE_GPU_ENDIAN)endianness, + index_32bit ? INDEX_FORMAT_32BIT : INDEX_FORMAT_16BIT); + } else if (src_sel == 0x2) { + // Auto draw. + draw_command_.index_buffer = nullptr; + } else { + // Unknown source select. + XEASSERTALWAYS(); + } + driver_->Draw(draw_command_); + } else { + if (src_sel == 0x0) { + ADVANCE_PTR(2); // skip + } + } + } + break; + case PM4_DRAW_INDX_2: + // draw using supplied indices in packet + { + XETRACECP("[%.8X] Packet(%.8X): PM4_DRAW_INDX_2", + packet_ptr, packet); + LOG_DATA(count); + uint32_t d0 = READ_PTR(); + uint32_t index_count = d0 >> 16; + uint32_t prim_type = d0 & 0x3F; + uint32_t src_sel = (d0 >> 6) & 0x3; + XEASSERT(src_sel == 0x2); // 'SrcSel=AutoIndex' + if (!driver_->PrepareDraw(draw_command_)) { + draw_command_.prim_type = (XE_GPU_PRIMITIVE_TYPE)prim_type; + draw_command_.start_index = 0; + draw_command_.index_count = index_count; + draw_command_.base_vertex = 0; + draw_command_.index_buffer = nullptr; + driver_->Draw(draw_command_); + } + } + break; + + case PM4_SET_CONSTANT: + // load constant into chip and to memory + { + XETRACECP("[%.8X] Packet(%.8X): PM4_SET_CONSTANT", + packet_ptr, packet); + // PM4_REG(reg) ((0x4 << 16) | (GSL_HAL_SUBBLOCK_OFFSET(reg))) + // reg - 0x2000 + uint32_t offset_type = READ_PTR(); + uint32_t index = offset_type & 0x7FF; + uint32_t type = (offset_type >> 16) & 0xFF; + switch (type) { + case 0x4: // REGISTER + index += 0x2000; // registers + for (uint32_t n = 0; n < count - 1; n++, index++) { + uint32_t data = READ_PTR(); + const char* reg_name = regs->GetRegisterName(index); + XETRACECP("[%.8X] %.8X -> %.4X %s", + packet_ptr + (1 + n) * 4, + data, index, reg_name ? 
reg_name : ""); + WriteRegister(packet_ptr, index, data); + } + break; + default: + XEASSERTALWAYS(); + break; + } + } + break; + case PM4_LOAD_ALU_CONSTANT: + // load constants from memory + { + XETRACECP("[%.8X] Packet(%.8X): PM4_LOAD_ALU_CONSTANT", + packet_ptr, packet); + uint32_t address = READ_PTR(); + address &= 0x3FFFFFFF; + uint32_t offset_type = READ_PTR(); + uint32_t index = offset_type & 0x7FF; + uint32_t size = READ_PTR(); + size &= 0xFFF; + index += 0x4000; // alu constants + for (uint32_t n = 0; n < size; n++, index++) { + uint32_t data = XEGETUINT32BE( + p + GpuToCpu(packet_ptr, address + n * 4)); + const char* reg_name = regs->GetRegisterName(index); + XETRACECP("[%.8X] %.8X -> %.4X %s", + packet_ptr, + data, index, reg_name ? reg_name : ""); + WriteRegister(packet_ptr, index, data); + } + } + break; + + case PM4_IM_LOAD: + // load sequencer instruction memory (pointer-based) + { + XETRACECP("[%.8X] Packet(%.8X): PM4_IM_LOAD", + packet_ptr, packet); + LOG_DATA(count); + uint32_t addr_type = READ_PTR(); + uint32_t type = addr_type & 0x3; + uint32_t addr = addr_type & ~0x3; + uint32_t start_size = READ_PTR(); + uint32_t start = start_size >> 16; + uint32_t size = start_size & 0xFFFF; // dwords + XEASSERT(start == 0); + driver_->LoadShader((XE_GPU_SHADER_TYPE)type, + GpuToCpu(packet_ptr, addr), size * 4, start); + } + break; + case PM4_IM_LOAD_IMMEDIATE: + // load sequencer instruction memory (code embedded in packet) + { + XETRACECP("[%.8X] Packet(%.8X): PM4_IM_LOAD_IMMEDIATE", + packet_ptr, packet); + LOG_DATA(count); + uint32_t type = READ_PTR(); + uint32_t start_size = READ_PTR(); + uint32_t start = start_size >> 16; + uint32_t size = start_size & 0xFFFF; // dwords + XEASSERT(start == 0); + // TODO(benvanik): figure out if this could wrap. + XEASSERT(args.ptr + size * 4 < args.max_address); + driver_->LoadShader((XE_GPU_SHADER_TYPE)type, + args.ptr, size * 4, start); + ADVANCE_PTR(size); + } + break; + + case PM4_INVALIDATE_STATE: + // selective invalidation of state pointers + { + XETRACECP("[%.8X] Packet(%.8X): PM4_INVALIDATE_STATE", + packet_ptr, packet); + LOG_DATA(count); + uint32_t mask = READ_PTR(); + //driver_->InvalidateState(mask); + } + break; + + case PM4_SET_BIN_MASK_LO: + { + uint32_t value = READ_PTR(); + XETRACECP("[%.8X] Packet(%.8X): PM4_SET_BIN_MASK_LO = %.8X", + packet_ptr, packet, value); + } + break; + case PM4_SET_BIN_MASK_HI: + { + uint32_t value = READ_PTR(); + XETRACECP("[%.8X] Packet(%.8X): PM4_SET_BIN_MASK_HI = %.8X", + packet_ptr, packet, value); + } + break; + case PM4_SET_BIN_SELECT_LO: + { + uint32_t value = READ_PTR(); + XETRACECP("[%.8X] Packet(%.8X): PM4_SET_BIN_SELECT_LO = %.8X", + packet_ptr, packet, value); + } + break; + case PM4_SET_BIN_SELECT_HI: + { + uint32_t value = READ_PTR(); + XETRACECP("[%.8X] Packet(%.8X): PM4_SET_BIN_SELECT_HI = %.8X", + packet_ptr, packet, value); + } + break; + + // Ignored packets - useful if breaking on the default handler below. 
+ case 0x50: // 0xC0015000 usually 2 words, 0xFFFFFFFF / 0x00000000 + XETRACECP("[%.8X] Packet(%.8X): unknown!", + packet_ptr, packet); + LOG_DATA(count); + ADVANCE_PTR(count); + break; + + default: + XETRACECP("[%.8X] Packet(%.8X): unknown!", + packet_ptr, packet); + LOG_DATA(count); + ADVANCE_PTR(count); + break; + } + + return 1 + count; + } + break; + } + + return 0; +} + +void CommandProcessor::WriteRegister( + uint32_t packet_ptr, uint32_t index, uint32_t value) { + RegisterFile* regs = driver_->register_file(); + XEASSERT(index < RegisterFile::kRegisterCount); + regs->values[index].u32 = value; + + // If this is a COHER register, set the dirty flag. + // This will block the command processor the next time it WAIT_MEM_REGs and + // allow us to synchronize the memory. + if (index == XE_GPU_REG_COHER_STATUS_HOST) { + regs->values[index].u32 |= 0x80000000ul; + } + + // Scratch register writeback. + if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) { + uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0; + if ((1 << scratch_reg) & regs->values[XE_GPU_REG_SCRATCH_UMSK].u32) { + // Enabled - write to address. + uint8_t* p = memory_->membase(); + uint32_t scratch_addr = regs->values[XE_GPU_REG_SCRATCH_ADDR].u32; + uint32_t mem_addr = scratch_addr + (scratch_reg * 4); + XESETUINT32BE(p + GpuToCpu(primary_buffer_ptr_, mem_addr), value); + } + } +} + +void CommandProcessor::MakeCoherent() { + // Status host often has 0x01000000 or 0x03000000. + // This is likely toggling VC (vertex cache) or TC (texture cache). + // Or, it also has a direction in here maybe - there is probably + // some way to check for dest coherency (what all the COHER_DEST_BASE_* + // registers are for). + // Best docs I've found on this are here: + // http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/10/R6xx_R7xx_3D.pdf + // http://cgit.freedesktop.org/xorg/driver/xf86-video-radeonhd/tree/src/r6xx_accel.c?id=3f8b6eccd9dba116cc4801e7f80ce21a879c67d2#n454 + + RegisterFile* regs = driver_->register_file(); + auto status_host = regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32; + auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32; + auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32; + + if (!(status_host & 0x80000000ul)) { + return; + } + + // TODO(benvanik): notify resource cache of base->size and type. + XETRACECP("Make %.8X -> %.8X (%db) coherent", + base_host, base_host + size_host, size_host); + driver_->resource_cache()->SyncRange(base_host, size_host); + + // Mark coherent. + status_host &= ~0x80000000ul; + regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32 = status_host; +} diff --git a/src/xenia/gpu/ring_buffer_worker.h b/src/xenia/gpu/command_processor.h similarity index 76% rename from src/xenia/gpu/ring_buffer_worker.h rename to src/xenia/gpu/command_processor.h index 889625d68..ba081aefb 100644 --- a/src/xenia/gpu/ring_buffer_worker.h +++ b/src/xenia/gpu/command_processor.h @@ -1,81 +1,85 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#ifndef XENIA_GPU_RING_BUFFER_WORKER_H_ -#define XENIA_GPU_RING_BUFFER_WORKER_H_ - -#include - -#include - - -namespace xe { -namespace gpu { - -class GraphicsDriver; -class GraphicsSystem; - -class RingBufferWorker { -public: - RingBufferWorker(GraphicsSystem* graphics_system, Memory* memory); - virtual ~RingBufferWorker(); - - Memory* memory() const { return memory_; } - - uint64_t QueryTime(); - uint32_t counter() const { return counter_; } - void increment_counter() { counter_++; } - - void Initialize(GraphicsDriver* driver, - uint32_t ptr, uint32_t page_count); - void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size); - - void UpdateWritePointer(uint32_t value); - - void Pump(); - -private: - typedef struct { - uint32_t ptr; - uint32_t base_ptr; - uint32_t max_address; - uint32_t ptr_mask; - } PacketArgs; - void AdvancePtr(PacketArgs& args, uint32_t n); - void ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index); - void ExecuteIndirectBuffer(uint32_t ptr, uint32_t length); - uint32_t ExecutePacket(PacketArgs& args); - void WriteRegister(uint32_t packet_ptr, uint32_t index, uint32_t value); - -protected: - Memory* memory_; - GraphicsSystem* graphics_system_; - GraphicsDriver* driver_; - - uint64_t time_base_; - uint32_t counter_; - - uint32_t primary_buffer_ptr_; - uint32_t primary_buffer_size_; - - uint32_t read_ptr_index_; - uint32_t read_ptr_update_freq_; - uint32_t read_ptr_writeback_ptr_; - - HANDLE write_ptr_index_event_; - volatile uint32_t write_ptr_index_; - volatile uint32_t write_ptr_max_index_; -}; - - -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_RING_BUFFER_WORKER_H_ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_COMMAND_PROCESSOR_H_ +#define XENIA_GPU_COMMAND_PROCESSOR_H_ + +#include +#include +#include +#include + + +namespace xe { +namespace gpu { + +class GraphicsDriver; +class GraphicsSystem; + + +class CommandProcessor { +public: + CommandProcessor(GraphicsSystem* graphics_system, Memory* memory); + virtual ~CommandProcessor(); + + Memory* memory() const { return memory_; } + + uint64_t QueryTime(); + uint32_t counter() const { return counter_; } + void increment_counter() { counter_++; } + + void Initialize(GraphicsDriver* driver, uint32_t ptr, uint32_t page_count); + void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size); + + void UpdateWritePointer(uint32_t value); + + void Pump(); + +private: + typedef struct { + uint32_t ptr; + uint32_t base_ptr; + uint32_t max_address; + uint32_t ptr_mask; + } PacketArgs; + + void AdvancePtr(PacketArgs& args, uint32_t n); + void ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index); + void ExecuteIndirectBuffer(uint32_t ptr, uint32_t length); + uint32_t ExecutePacket(PacketArgs& args); + void WriteRegister(uint32_t packet_ptr, uint32_t index, uint32_t value); + void MakeCoherent(); + + Memory* memory_; + GraphicsSystem* graphics_system_; + GraphicsDriver* driver_; + + uint64_t time_base_; + uint32_t counter_; + + uint32_t primary_buffer_ptr_; + uint32_t primary_buffer_size_; + + uint32_t read_ptr_index_; + uint32_t read_ptr_update_freq_; + uint32_t read_ptr_writeback_ptr_; + + HANDLE write_ptr_index_event_; + volatile uint32_t write_ptr_index_; + volatile uint32_t write_ptr_max_index_; + + DrawCommand draw_command_; +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_COMMAND_PROCESSOR_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_buffer_resource.cc b/src/xenia/gpu/d3d11/d3d11_buffer_resource.cc new file mode 100644 index 000000000..8f03cfe58 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_buffer_resource.cc @@ -0,0 +1,149 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +#include +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; +using namespace xe::gpu::xenos; + + +D3D11IndexBufferResource::D3D11IndexBufferResource( + D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info) + : IndexBufferResource(memory_range, info), + resource_cache_(resource_cache), + handle_(nullptr) { +} + +D3D11IndexBufferResource::~D3D11IndexBufferResource() { + XESAFERELEASE(handle_); +} + +int D3D11IndexBufferResource::CreateHandle() { + D3D11_BUFFER_DESC buffer_desc; + xe_zero_struct(&buffer_desc, sizeof(buffer_desc)); + buffer_desc.ByteWidth = static_cast(memory_range_.length); + buffer_desc.Usage = D3D11_USAGE_DYNAMIC; + buffer_desc.BindFlags = D3D11_BIND_INDEX_BUFFER; + buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + HRESULT hr = resource_cache_->device()->CreateBuffer( + &buffer_desc, nullptr, &handle_); + if (FAILED(hr)) { + XELOGW("D3D11: failed to create index buffer"); + return 1; + } + return 0; +} + +int D3D11IndexBufferResource::InvalidateRegion( + const MemoryRange& memory_range) { + SCOPE_profile_cpu_f("gpu"); + + // All that's done so far: + XEASSERT(info_.endianness == 0x2); + + D3D11_MAPPED_SUBRESOURCE res; + HRESULT hr = resource_cache_->context()->Map( + handle_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res); + if (FAILED(hr)) { + XELOGE("D3D11: unable to map index buffer"); + return 1; + } + + if (info_.format == INDEX_FORMAT_32BIT) { + uint32_t index_count = memory_range_.length / 4; + const uint32_t* src = reinterpret_cast( + memory_range_.host_base); + uint32_t* dest = reinterpret_cast(res.pData); + for (uint32_t n = 0; n < index_count; n++) { + dest[n] = XESWAP32(src[n]); + } + } else { + uint32_t index_count = memory_range_.length / 2; + const uint16_t* src = reinterpret_cast( + memory_range_.host_base); + uint16_t* dest = reinterpret_cast(res.pData); + for (uint32_t n = 0; n < index_count; n++) { + dest[n] = XESWAP16(src[n]); + } + } + resource_cache_->context()->Unmap(handle_, 0); + + return 0; +} + +D3D11VertexBufferResource::D3D11VertexBufferResource( + D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info) + : VertexBufferResource(memory_range, info), + resource_cache_(resource_cache), + handle_(nullptr) { +} + +D3D11VertexBufferResource::~D3D11VertexBufferResource() { + XESAFERELEASE(handle_); +} + +int D3D11VertexBufferResource::CreateHandle() { + D3D11_BUFFER_DESC buffer_desc; + xe_zero_struct(&buffer_desc, sizeof(buffer_desc)); + buffer_desc.ByteWidth = static_cast(memory_range_.length); + buffer_desc.Usage = D3D11_USAGE_DYNAMIC; + buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER; + buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + HRESULT hr = resource_cache_->device()->CreateBuffer( + &buffer_desc, nullptr, &handle_); + if (FAILED(hr)) { + XELOGW("D3D11: failed to create vertex buffer"); + return 1; + } + return 0; +} + +int D3D11VertexBufferResource::InvalidateRegion( + const MemoryRange& memory_range) { + SCOPE_profile_cpu_f("gpu"); + + D3D11_MAPPED_SUBRESOURCE res; + HRESULT hr = resource_cache_->context()->Map( + handle_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res); + if (FAILED(hr)) { + XELOGE("D3D11: unable to map vertex buffer"); + return 1; + } + uint8_t* dest = reinterpret_cast(res.pData); + + // TODO(benvanik): rewrite to be faster/special case common/etc + uint32_t stride = info_.stride_words; + size_t count = 
(memory_range_.length / 4) / stride; + for (size_t n = 0; n < info_.element_count; n++) { + const auto& el = info_.elements[n]; + const uint32_t* src_ptr = (const uint32_t*)( + memory_range_.host_base + el.offset_words * 4); + uint32_t* dest_ptr = (uint32_t*)(dest + el.offset_words * 4); + uint32_t o = 0; + for (uint32_t i = 0; i < count; i++) { + for (uint32_t j = 0; j < el.size_words; j++) { + dest_ptr[o + j] = XESWAP32(src_ptr[o + j]); + } + o += stride; + } + } + + resource_cache_->context()->Unmap(handle_, 0); + return 0; +} diff --git a/src/xenia/gpu/d3d11/d3d11_buffer_resource.h b/src/xenia/gpu/d3d11/d3d11_buffer_resource.h new file mode 100644 index 000000000..2e8071ae1 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_buffer_resource.h @@ -0,0 +1,69 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_BUFFER_RESOURCE_H_ +#define XENIA_GPU_D3D11_D3D11_BUFFER_RESOURCE_H_ + +#include +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + +class D3D11ResourceCache; + + +class D3D11IndexBufferResource : public IndexBufferResource { +public: + D3D11IndexBufferResource(D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info); + ~D3D11IndexBufferResource() override; + + void* handle() const override { return handle_; } + +protected: + int CreateHandle() override; + int InvalidateRegion(const MemoryRange& memory_range) override; + +private: + D3D11ResourceCache* resource_cache_; + ID3D11Buffer* handle_; +}; + + +class D3D11VertexBufferResource : public VertexBufferResource { +public: + D3D11VertexBufferResource(D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info); + ~D3D11VertexBufferResource() override; + + void* handle() const override { return handle_; } + +protected: + int CreateHandle() override; + int InvalidateRegion(const MemoryRange& memory_range) override; + +private: + D3D11ResourceCache* resource_cache_; + ID3D11Buffer* handle_; +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_BUFFER_RESOURCE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc b/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc index 5984631fe..d8660cbfe 100644 --- a/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc +++ b/src/xenia/gpu/d3d11/d3d11_geometry_shader.cc @@ -10,7 +10,8 @@ #include #include -#include +#include +#include #include #include @@ -22,8 +23,8 @@ using namespace xe::gpu::d3d11; using namespace xe::gpu::xenos; -D3D11GeometryShader::D3D11GeometryShader(ID3D11Device* device, uint64_t hash) : - hash_(hash), handle_(NULL) { +D3D11GeometryShader::D3D11GeometryShader(ID3D11Device* device) + : handle_(nullptr) { device_ = device; device_->AddRef(); } @@ -33,7 +34,9 @@ D3D11GeometryShader::~D3D11GeometryShader() { XESAFERELEASE(device_); } -int D3D11GeometryShader::Prepare(D3D11VertexShader* vertex_shader) { +int D3D11GeometryShader::Prepare(D3D11VertexShaderResource* vertex_shader) { + SCOPE_profile_cpu_f("gpu"); + if (handle_) { return 0; } @@ -74,6 +77,8 @@ int D3D11GeometryShader::Prepare(D3D11VertexShader* vertex_shader) { } ID3D10Blob* 
D3D11GeometryShader::Compile(const char* shader_source) { + SCOPE_profile_cpu_f("gpu"); + // TODO(benvanik): pick shared runtime mode defines. D3D10_SHADER_MACRO defines[] = { "TEST_DEFINE", "1", @@ -90,11 +95,12 @@ ID3D10Blob* D3D11GeometryShader::Compile(const char* shader_source) { if (FLAGS_dump_shaders.size()) { base_path = FLAGS_dump_shaders.c_str(); } + uint64_t hash = xe_hash64(shader_source, xestrlena(shader_source)); // ? char file_name[XE_MAX_PATH]; xesnprintfa(file_name, XECOUNT(file_name), "%s/gen_%.16llX.gs", base_path, - hash_); + hash); if (FLAGS_dump_shaders.size()) { FILE* f = fopen(file_name, "w"); @@ -124,7 +130,7 @@ ID3D10Blob* D3D11GeometryShader::Compile(const char* shader_source) { return shader_blob; } -int D3D11GeometryShader::Generate(D3D11VertexShader* vertex_shader, +int D3D11GeometryShader::Generate(D3D11VertexShaderResource* vertex_shader, alloy::StringBuffer* output) { output->Append( "struct VERTEX {\n" @@ -134,7 +140,7 @@ int D3D11GeometryShader::Generate(D3D11VertexShader* vertex_shader, // TODO(benvanik): only add used ones? output->Append( " float4 o[%d] : XE_O;\n", - D3D11Shader::MAX_INTERPOLATORS); + D3D11ShaderTranslator::kMaxInterpolators); } if (alloc_counts.point_size) { output->Append( @@ -152,15 +158,16 @@ int D3D11GeometryShader::Generate(D3D11VertexShader* vertex_shader, D3D11PointSpriteGeometryShader::D3D11PointSpriteGeometryShader( - ID3D11Device* device, uint64_t hash) : - D3D11GeometryShader(device, hash) { + ID3D11Device* device) : D3D11GeometryShader(device) { } D3D11PointSpriteGeometryShader::~D3D11PointSpriteGeometryShader() { } -int D3D11PointSpriteGeometryShader::Generate(D3D11VertexShader* vertex_shader, - alloy::StringBuffer* output) { +int D3D11PointSpriteGeometryShader::Generate( + D3D11VertexShaderResource* vertex_shader, + alloy::StringBuffer* output) { + SCOPE_profile_cpu_f("gpu"); if (D3D11GeometryShader::Generate(vertex_shader, output)) { return 1; } @@ -206,15 +213,16 @@ int D3D11PointSpriteGeometryShader::Generate(D3D11VertexShader* vertex_shader, D3D11RectListGeometryShader::D3D11RectListGeometryShader( - ID3D11Device* device, uint64_t hash) : - D3D11GeometryShader(device, hash) { + ID3D11Device* device) : D3D11GeometryShader(device) { } D3D11RectListGeometryShader::~D3D11RectListGeometryShader() { } -int D3D11RectListGeometryShader::Generate(D3D11VertexShader* vertex_shader, - alloy::StringBuffer* output) { +int D3D11RectListGeometryShader::Generate( + D3D11VertexShaderResource* vertex_shader, + alloy::StringBuffer* output) { + SCOPE_profile_cpu_f("gpu"); if (D3D11GeometryShader::Generate(vertex_shader, output)) { return 1; } @@ -250,15 +258,16 @@ int D3D11RectListGeometryShader::Generate(D3D11VertexShader* vertex_shader, D3D11QuadListGeometryShader::D3D11QuadListGeometryShader( - ID3D11Device* device, uint64_t hash) : - D3D11GeometryShader(device, hash) { + ID3D11Device* device) : D3D11GeometryShader(device) { } D3D11QuadListGeometryShader::~D3D11QuadListGeometryShader() { } -int D3D11QuadListGeometryShader::Generate(D3D11VertexShader* vertex_shader, - alloy::StringBuffer* output) { +int D3D11QuadListGeometryShader::Generate( + D3D11VertexShaderResource* vertex_shader, + alloy::StringBuffer* output) { + SCOPE_profile_cpu_f("gpu"); if (D3D11GeometryShader::Generate(vertex_shader, output)) { return 1; } diff --git a/src/xenia/gpu/d3d11/d3d11_geometry_shader.h b/src/xenia/gpu/d3d11/d3d11_geometry_shader.h index cdfebad5f..89529b2a4 100644 --- a/src/xenia/gpu/d3d11/d3d11_geometry_shader.h +++ 
b/src/xenia/gpu/d3d11/d3d11_geometry_shader.h @@ -21,7 +21,7 @@ namespace xe { namespace gpu { namespace d3d11 { -class D3D11VertexShader; +class D3D11VertexShaderResource; class D3D11GeometryShader { @@ -30,53 +30,52 @@ public: ID3D11GeometryShader* handle() const { return handle_; } - int Prepare(D3D11VertexShader* vertex_shader); + int Prepare(D3D11VertexShaderResource* vertex_shader); protected: - D3D11GeometryShader(ID3D11Device* device, uint64_t hash); + D3D11GeometryShader(ID3D11Device* device); ID3D10Blob* Compile(const char* shader_source); - virtual int Generate(D3D11VertexShader* vertex_shader, + virtual int Generate(D3D11VertexShaderResource* vertex_shader, alloy::StringBuffer* output); protected: ID3D11Device* device_; - uint64_t hash_; ID3D11GeometryShader* handle_; }; class D3D11PointSpriteGeometryShader : public D3D11GeometryShader { public: - D3D11PointSpriteGeometryShader(ID3D11Device* device, uint64_t hash); - virtual ~D3D11PointSpriteGeometryShader(); + D3D11PointSpriteGeometryShader(ID3D11Device* device); + ~D3D11PointSpriteGeometryShader() override; protected: - virtual int Generate(D3D11VertexShader* vertex_shader, - alloy::StringBuffer* output); + int Generate(D3D11VertexShaderResource* vertex_shader, + alloy::StringBuffer* output) override; }; class D3D11RectListGeometryShader : public D3D11GeometryShader { public: - D3D11RectListGeometryShader(ID3D11Device* device, uint64_t hash); - virtual ~D3D11RectListGeometryShader(); + D3D11RectListGeometryShader(ID3D11Device* device); + ~D3D11RectListGeometryShader() override; protected: - virtual int Generate(D3D11VertexShader* vertex_shader, - alloy::StringBuffer* output); + int Generate(D3D11VertexShaderResource* vertex_shader, + alloy::StringBuffer* output) override; }; class D3D11QuadListGeometryShader : public D3D11GeometryShader { public: - D3D11QuadListGeometryShader(ID3D11Device* device, uint64_t hash); - virtual ~D3D11QuadListGeometryShader(); + D3D11QuadListGeometryShader(ID3D11Device* device); + ~D3D11QuadListGeometryShader() override; protected: - virtual int Generate(D3D11VertexShader* vertex_shader, - alloy::StringBuffer* output); + int Generate(D3D11VertexShaderResource* vertex_shader, + alloy::StringBuffer* output) override; }; diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc index ddc1d8d1e..4ea0b6210 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.cc @@ -10,9 +10,12 @@ #include #include +#include +#include +#include #include -#include -#include +#include + using namespace xe; using namespace xe::gpu; @@ -20,6 +23,9 @@ using namespace xe::gpu::d3d11; using namespace xe::gpu::xenos; +#define XETRACED3D(fmt, ...) 
if (FLAGS_trace_ring_buffer) XELOGGPU(fmt, ##__VA_ARGS__) + + D3D11GraphicsDriver::D3D11GraphicsDriver( Memory* memory, IDXGISwapChain* swap_chain, ID3D11Device* device) : GraphicsDriver(memory) { @@ -28,7 +34,8 @@ D3D11GraphicsDriver::D3D11GraphicsDriver( device_ = device; device_->AddRef(); device_->GetImmediateContext(&context_); - shader_cache_ = new D3D11ShaderCache(device_); + + resource_cache_ = new D3D11ResourceCache(memory, device_, context_); xe_zero_struct(&state_, sizeof(state_)); @@ -55,7 +62,41 @@ D3D11GraphicsDriver::D3D11GraphicsDriver( buffer_desc.ByteWidth = (32) * sizeof(int); hr = device_->CreateBuffer( &buffer_desc, NULL, &state_.constant_buffers.gs_consts); +} +D3D11GraphicsDriver::~D3D11GraphicsDriver() { + RebuildRenderTargets(0, 0); + XESAFERELEASE(state_.constant_buffers.float_constants); + XESAFERELEASE(state_.constant_buffers.bool_constants); + XESAFERELEASE(state_.constant_buffers.loop_constants); + XESAFERELEASE(state_.constant_buffers.vs_consts); + XESAFERELEASE(state_.constant_buffers.gs_consts); + for (auto it = rasterizer_state_cache_.begin(); + it != rasterizer_state_cache_.end(); ++it) { + XESAFERELEASE(it->second); + } + for (auto it = blend_state_cache_.begin(); + it != blend_state_cache_.end(); ++it) { + XESAFERELEASE(it->second); + } + for (auto it = depth_stencil_state_cache_.begin(); + it != depth_stencil_state_cache_.end(); ++it) { + XESAFERELEASE(it->second); + } + XESAFERELEASE(invalid_texture_view_); + XESAFERELEASE(invalid_texture_sampler_state_); + delete resource_cache_; + XESAFERELEASE(context_); + XESAFERELEASE(device_); + XESAFERELEASE(swap_chain_); +} + +int D3D11GraphicsDriver::Initialize() { + InitializeInvalidTexture(); + return 0; +} + +void D3D11GraphicsDriver::InitializeInvalidTexture() { // TODO(benvanik): pattern? 
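On the pattern TODO above: the diff leaves the fallback texture's contents as-is. One common debugging choice (a suggestion, not what this code does) is a loud magenta/black checkerboard so any draw that falls back to the invalid texture is obvious on screen:

```cpp
#include <cstdint>

// Hypothetical fill for the invalid-texture TODO: a 2x2-checkered
// magenta/black pattern in 32-bit BGRA. Dimensions are assumed; match
// them to texture_desc below.
void FillInvalidTexturePattern(uint32_t* texture_data,
                               uint32_t width, uint32_t height) {
  const uint32_t kMagenta = 0xFFFF00FF;
  const uint32_t kBlack   = 0xFF000000;
  for (uint32_t y = 0; y < height; ++y) {
    for (uint32_t x = 0; x < width; ++x) {
      texture_data[y * width + x] =
          (((x / 2) + (y / 2)) & 1) ? kMagenta : kBlack;
    }
  }
}
```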
D3D11_TEXTURE2D_DESC texture_desc; xe_zero_struct(&texture_desc, sizeof(texture_desc)); @@ -81,7 +122,7 @@ D3D11GraphicsDriver::D3D11GraphicsDriver( initial_data.SysMemSlicePitch = 0; initial_data.pSysMem = texture_data; ID3D11Texture2D* texture = NULL; - hr = device_->CreateTexture2D( + HRESULT hr = device_->CreateTexture2D( &texture_desc, &initial_data, (ID3D11Texture2D**)&texture); if (FAILED(hr)) { XEFATAL("D3D11: unable to create invalid texture"); @@ -121,97 +162,528 @@ D3D11GraphicsDriver::D3D11GraphicsDriver( } } -D3D11GraphicsDriver::~D3D11GraphicsDriver() { - RebuildRenderTargets(0, 0); - for (size_t n = 0; n < XECOUNT(state_.texture_fetchers); n++) { - XESAFERELEASE(state_.texture_fetchers[n].view); - } - XESAFERELEASE(state_.constant_buffers.float_constants); - XESAFERELEASE(state_.constant_buffers.bool_constants); - XESAFERELEASE(state_.constant_buffers.loop_constants); - XESAFERELEASE(state_.constant_buffers.vs_consts); - XESAFERELEASE(state_.constant_buffers.gs_consts); - XESAFERELEASE(invalid_texture_view_); - XESAFERELEASE(invalid_texture_sampler_state_); - delete shader_cache_; - XESAFERELEASE(context_); - XESAFERELEASE(device_); - XESAFERELEASE(swap_chain_); -} - -void D3D11GraphicsDriver::Initialize() { -} - -void D3D11GraphicsDriver::InvalidateState( - uint32_t mask) { - if (mask == XE_GPU_INVALIDATE_MASK_ALL) { - XELOGGPU("D3D11: (invalidate all)"); - } - if (mask & XE_GPU_INVALIDATE_MASK_VERTEX_SHADER) { - XELOGGPU("D3D11: invalidate vertex shader"); - } - if (mask & XE_GPU_INVALIDATE_MASK_PIXEL_SHADER) { - XELOGGPU("D3D11: invalidate pixel shader"); - } -} - -void D3D11GraphicsDriver::SetShader( - XE_GPU_SHADER_TYPE type, - uint32_t address, - uint32_t start, - uint32_t length) { - // Find or create shader in the cache. - uint8_t* p = memory_->Translate(address); - Shader* shader = shader_cache_->FindOrCreate( - type, p, length); - - // Disassemble. - const char* source = shader->disasm_src(); - if (!source) { - source = ""; - } - XELOGGPU("D3D11: set shader %d at %0.8X (%db):\n%s", - type, address, length, source); - - // Stash for later. - switch (type) { - case XE_GPU_SHADER_TYPE_VERTEX: - state_.vertex_shader = (D3D11VertexShader*)shader; - break; - case XE_GPU_SHADER_TYPE_PIXEL: - state_.pixel_shader = (D3D11PixelShader*)shader; - break; - } -} - -int D3D11GraphicsDriver::SetupDraw(XE_GPU_PRIMITIVE_TYPE prim_type) { - RegisterFile& rf = register_file_; - - // Ignore copies. - uint32_t enable_mode = rf.values[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7; - if (enable_mode != 4) { - XELOGW("D3D11: ignoring draw with enable mode %d", enable_mode); - return 1; - } - - uint32_t state_overrides = 0; - if (prim_type == XE_GPU_PRIMITIVE_TYPE_RECTANGLE_LIST) { - // Rect lists aren't culled. There may be other things they skip too. - state_overrides |= STATE_OVERRIDE_DISABLE_CULLING; - } +int D3D11GraphicsDriver::Draw(const DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); // Misc state. - if (UpdateState(state_overrides)) { + if (UpdateState(command)) { return 1; } // Build constant buffers. - if (UpdateConstantBuffers()) { + if (SetupConstantBuffers(command)) { return 1; } // Bind shaders. - if (BindShaders()) { + if (SetupShaders(command)) { + return 1; + } + + // Bind vertex buffers/index buffer. + if (SetupInputAssembly(command)) { + return 1; + } + + // Bind texture fetchers. + if (SetupSamplers(command)) { + return 1; + } + + if (command.index_buffer) { + // Have an actual index buffer. 
+ XETRACED3D("D3D11: draw indexed %d (indicies [%d,%d] (%d))", + command.prim_type, command.start_index, + command.start_index + command.index_count, command.index_count); + context_->DrawIndexed(command.index_count, command.start_index, + command.base_vertex); + } else { + // Auto draw. + XETRACED3D("D3D11: draw indexed auto %d (indicies [%d,%d] (%d))", + command.prim_type, command.start_index, + command.start_index + command.index_count, command.index_count); + context_->Draw(command.index_count, 0); + } + + return 0; +} + +int D3D11GraphicsDriver::UpdateState(const DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); + + // Most information comes from here: + // https://chromium.googlesource.com/chromiumos/third_party/mesa/+/6173cc19c45d92ef0b7bc6aa008aa89bb29abbda/src/gallium/drivers/freedreno/freedreno_zsa.c + // http://cgit.freedesktop.org/mesa/mesa/diff/?id=aac7f06ad843eaa696363e8e9c7781ca30cb4914 + // The only differences so far are extra packets for multiple render targets + // and a few modes being switched around. + + RegisterFile& rf = register_file_; + + uint32_t window_scissor_tl = register_file_[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; + uint32_t window_scissor_br = register_file_[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; + //uint32_t window_width = + // (window_scissor_br & 0x7FFF) - (window_scissor_tl & 0x7FFF); + //uint32_t window_height = + // ((window_scissor_br >> 16) & 0x7FFF) - ((window_scissor_tl >> 16) & 0x7FFF); + uint32_t window_width = 1280; + uint32_t window_height = 720; + if (RebuildRenderTargets(window_width, window_height)) { + XELOGE("Unable to rebuild render targets to %d x %d", + window_width, window_height); + return 1; + } + + // RB_SURFACE_INFO ? + + // Enable buffers. + uint32_t enable_mode = register_file_[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7; + // 4 = color + depth + // 6 = copy ? + + // color_info[0-3] has format 8888 + uint32_t color_info[4] = { + register_file_[XE_GPU_REG_RB_COLOR_INFO].u32, + register_file_[XE_GPU_REG_RB_COLOR1_INFO].u32, + register_file_[XE_GPU_REG_RB_COLOR2_INFO].u32, + register_file_[XE_GPU_REG_RB_COLOR3_INFO].u32, + }; + ID3D11RenderTargetView* render_target_views[4] = { 0 }; + for (int n = 0; n < XECOUNT(color_info); n++) { + auto cb = render_targets_.color_buffers[n]; + uint32_t color_format = (color_info[n] >> 16) & 0xF; + switch (color_format) { + case 0: // D3DFMT_A8R8G8B8 (or ABGR?) + case 1: + render_target_views[n] = cb.color_view_8888; + break; + default: + // Unknown. + XELOGGPU("Unsupported render target format %d", color_format); + break; + } + } + + // depth_info has format 24_8 + uint32_t depth_info = register_file_[XE_GPU_REG_RB_DEPTH_INFO].u32; + uint32_t depth_format = (depth_info >> 16) & 0x1; + ID3D11DepthStencilView* depth_stencil_view = 0; + switch (depth_format) { + case 0: // D3DFMT_D24S8 + depth_stencil_view = render_targets_.depth_view_d28s8; + break; + default: + case 1: // D3DFMT_D24FS8 + //depth_stencil_view = render_targets_.depth_view_d28fs8; + XELOGGPU("Unsupported depth/stencil format %d", depth_format); + break; + } + // TODO(benvanik): when a game switches does it expect to keep the same + // depth buffer contents? + + // TODO(benvanik): only enable the number of valid render targets. + context_->OMSetRenderTargets(4, render_target_views, depth_stencil_view); + + // Viewport. + // If we have resized the window we will want to change this. + uint32_t window_offset = register_file_[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32; + // signed? 
+ uint32_t window_offset_x = window_offset & 0x7FFF; + uint32_t window_offset_y = (window_offset >> 16) & 0x7FFF; + + // ? + // TODO(benvanik): figure out how to emulate viewports in D3D11. Could use + // viewport above to scale, though that doesn't support negatives/etc. + uint32_t vte_control = register_file_[XE_GPU_REG_PA_CL_VTE_CNTL].u32; + bool vport_xscale_enable = (vte_control & (1 << 0)) > 0; + float vport_xscale = register_file_[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; // 640 + bool vport_xoffset_enable = (vte_control & (1 << 1)) > 0; + float vport_xoffset = register_file_[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32; // 640 + bool vport_yscale_enable = (vte_control & (1 << 2)) > 0; + float vport_yscale = register_file_[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; // -360 + bool vport_yoffset_enable = (vte_control & (1 << 3)) > 0; + float vport_yoffset = register_file_[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; // 360 + bool vport_zscale_enable = (vte_control & (1 << 4)) > 0; + float vport_zscale = register_file_[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32; // 1 + bool vport_zoffset_enable = (vte_control & (1 << 5)) > 0; + float vport_zoffset = register_file_[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32; // 0 + + // TODO(benvanik): compute viewport values. + D3D11_VIEWPORT viewport; + if (vport_xscale_enable) { + // Viewport enabled. + viewport.MinDepth = 0.0f; + viewport.MaxDepth = 1.0f; + viewport.TopLeftX = 0; + viewport.TopLeftY = 0; + viewport.Width = 1280; + viewport.Height = 720; + } else { + // Viewport disabled. Geometry shaders will compensate for this. + viewport.MinDepth = 0.0f; + viewport.MaxDepth = 1.0f; + viewport.TopLeftX = 0; + viewport.TopLeftY = 0; + viewport.Width = 1280; + viewport.Height = 720; + } + context_->RSSetViewports(1, &viewport); + + // Viewport constants from D3D11VertexShader. + //"cbuffer vs_consts {\n" + //" float4 window;\n" // x,y,w,h + //" float4 viewport_z_enable;\n" // min,(max - min),?,enabled + //" float4 viewport_size;\n" // x,y,w,h + //"};" + // TODO(benvanik): only when viewport changes. + D3D11_MAPPED_SUBRESOURCE res; + context_->Map( + state_.constant_buffers.vs_consts, 0, + D3D11_MAP_WRITE_DISCARD, 0, &res); + float* vsc_buffer = (float*)res.pData; + vsc_buffer[0] = (float)window_offset_x; + vsc_buffer[1] = (float)window_offset_y; + vsc_buffer[2] = (float)window_width; + vsc_buffer[3] = (float)window_height; + vsc_buffer[4] = viewport.MinDepth; + vsc_buffer[5] = viewport.MaxDepth - viewport.MinDepth; + vsc_buffer[6] = 0; // unused + vsc_buffer[7] = vport_xscale_enable ? 1.0f : 0.0f; + vsc_buffer[8] = viewport.TopLeftX; + vsc_buffer[9] = viewport.TopLeftY; + vsc_buffer[10] = viewport.Width; + vsc_buffer[11] = viewport.Height; + context_->Unmap(state_.constant_buffers.vs_consts, 0); + + // Scissoring. + // TODO(benvanik): pull from scissor registers. + // ScissorEnable must be set in raster state above. 
+ uint32_t screen_scissor_tl = register_file_[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL].u32; + uint32_t screen_scissor_br = register_file_[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR].u32; + if (screen_scissor_tl != 0 && screen_scissor_br != 0x20002000) { + D3D11_RECT scissor_rect; + scissor_rect.top = (screen_scissor_tl >> 16) & 0x7FFF; + scissor_rect.left = screen_scissor_tl & 0x7FFF; + scissor_rect.bottom = (screen_scissor_br >> 16) & 0x7FFF; + scissor_rect.right = screen_scissor_br & 0x7FFF; + context_->RSSetScissorRects(1, &scissor_rect); + } else { + context_->RSSetScissorRects(0, NULL); + } + + if (SetupRasterizerState(command)) { + XELOGE("Unable to setup rasterizer state"); + return 1; + } + if (SetupBlendState(command)) { + XELOGE("Unable to setup blend state"); + return 1; + } + if (SetupDepthStencilState(command)) { + XELOGE("Unable to setup depth/stencil state"); + return 1; + } + + return 0; +} + +int D3D11GraphicsDriver::SetupRasterizerState(const DrawCommand& command) { + uint32_t mode_control = register_file_[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32; + + // Check cache. + uint64_t key = hash_combine(mode_control); + ID3D11RasterizerState* rasterizer_state = nullptr; + auto it = rasterizer_state_cache_.find(key); + if (it == rasterizer_state_cache_.end()) { + D3D11_RASTERIZER_DESC rasterizer_desc; + xe_zero_struct(&rasterizer_desc, sizeof(rasterizer_desc)); + rasterizer_desc.FillMode = D3D11_FILL_SOLID; // D3D11_FILL_WIREFRAME; + switch (mode_control & 0x3) { + case 0: + rasterizer_desc.CullMode = D3D11_CULL_NONE; + break; + case 1: + rasterizer_desc.CullMode = D3D11_CULL_FRONT; + break; + case 2: + rasterizer_desc.CullMode = D3D11_CULL_BACK; + break; + } + if (command.prim_type == XE_GPU_PRIMITIVE_TYPE_RECTANGLE_LIST) { + // Rect lists aren't culled. There may be other things they skip too. + rasterizer_desc.CullMode = D3D11_CULL_NONE; + } + rasterizer_desc.FrontCounterClockwise = (mode_control & 0x4) == 0; + rasterizer_desc.DepthBias = 0; + rasterizer_desc.DepthBiasClamp = 0; + rasterizer_desc.SlopeScaledDepthBias = 0; + rasterizer_desc.DepthClipEnable = false; // ? + rasterizer_desc.ScissorEnable = false; + rasterizer_desc.MultisampleEnable = false; + rasterizer_desc.AntialiasedLineEnable = false; + device_->CreateRasterizerState(&rasterizer_desc, &rasterizer_state); + rasterizer_state_cache_.insert({ key, rasterizer_state }); + } else { + rasterizer_state = it->second; + } + + context_->RSSetState(rasterizer_state); + return 0; +} + +int D3D11GraphicsDriver::SetupBlendState(const DrawCommand& command) { + static const D3D11_BLEND blend_map[] = { + /* 0 */ D3D11_BLEND_ZERO, + /* 1 */ D3D11_BLEND_ONE, + /* 2 */ D3D11_BLEND_ZERO, // ? + /* 3 */ D3D11_BLEND_ZERO, // ? + /* 4 */ D3D11_BLEND_SRC_COLOR, + /* 5 */ D3D11_BLEND_INV_SRC_COLOR, + /* 6 */ D3D11_BLEND_SRC_ALPHA, + /* 7 */ D3D11_BLEND_INV_SRC_ALPHA, + /* 8 */ D3D11_BLEND_DEST_COLOR, + /* 9 */ D3D11_BLEND_INV_DEST_COLOR, + /* 10 */ D3D11_BLEND_DEST_ALPHA, + /* 11 */ D3D11_BLEND_INV_DEST_ALPHA, + /* 12 */ D3D11_BLEND_BLEND_FACTOR, + /* 13 */ D3D11_BLEND_INV_BLEND_FACTOR, + /* 14 */ D3D11_BLEND_SRC1_ALPHA, // ? + /* 15 */ D3D11_BLEND_INV_SRC1_ALPHA, // ? + /* 16 */ D3D11_BLEND_SRC_ALPHA_SAT, + }; + static const D3D11_BLEND_OP blend_op_map[] = { + /* 0 */ D3D11_BLEND_OP_ADD, + /* 1 */ D3D11_BLEND_OP_SUBTRACT, + /* 2 */ D3D11_BLEND_OP_MIN, + /* 3 */ D3D11_BLEND_OP_MAX, + /* 4 */ D3D11_BLEND_OP_REV_SUBTRACT, + }; + + // alpha testing -- ALPHAREF, ALPHAFUNC, ALPHATESTENABLE + // Not in D3D11! 
+ // http://msdn.microsoft.com/en-us/library/windows/desktop/bb205120(v=vs.85).aspx + uint32_t color_control = register_file_[XE_GPU_REG_RB_COLORCONTROL].u32; + + uint32_t color_mask = register_file_[XE_GPU_REG_RB_COLOR_MASK].u32; + uint32_t blend_control[4] = { + register_file_[XE_GPU_REG_RB_BLENDCONTROL_0].u32, + register_file_[XE_GPU_REG_RB_BLENDCONTROL_1].u32, + register_file_[XE_GPU_REG_RB_BLENDCONTROL_2].u32, + register_file_[XE_GPU_REG_RB_BLENDCONTROL_3].u32, + }; + + // Check cache. + uint64_t key = hash_combine(color_mask, + blend_control[0], blend_control[1], + blend_control[2], blend_control[3]); + ID3D11BlendState* blend_state = nullptr; + auto it = blend_state_cache_.find(key); + if (it == blend_state_cache_.end()) { + D3D11_BLEND_DESC blend_desc; + xe_zero_struct(&blend_desc, sizeof(blend_desc)); + //blend_desc.AlphaToCoverageEnable = false; + // ? + blend_desc.IndependentBlendEnable = true; + for (int n = 0; n < XECOUNT(blend_control); n++) { + // A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND + blend_desc.RenderTarget[n].SrcBlend = blend_map[(blend_control[n] & 0x0000001F) >> 0]; + // A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND + blend_desc.RenderTarget[n].DestBlend = blend_map[(blend_control[n] & 0x00001F00) >> 8]; + // A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN + blend_desc.RenderTarget[n].BlendOp = blend_op_map[(blend_control[n] & 0x000000E0) >> 5]; + // A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND + blend_desc.RenderTarget[n].SrcBlendAlpha = blend_map[(blend_control[n] & 0x001F0000) >> 16]; + // A2XX_RB_BLEND_CONTROL_ALPHA_DESTBLEND + blend_desc.RenderTarget[n].DestBlendAlpha = blend_map[(blend_control[n] & 0x1F000000) >> 24]; + // A2XX_RB_BLEND_CONTROL_ALPHA_COMB_FCN + blend_desc.RenderTarget[n].BlendOpAlpha = blend_op_map[(blend_control[n] & 0x00E00000) >> 21]; + // A2XX_RB_COLOR_MASK_WRITE_* + blend_desc.RenderTarget[n].RenderTargetWriteMask = (color_mask >> (n * 4)) & 0xF; + // A2XX_RB_COLORCONTROL_BLEND_DISABLE ?? Can't find this! + // Just guess based on actions. + blend_desc.RenderTarget[n].BlendEnable = !( + (blend_desc.RenderTarget[n].SrcBlend == D3D11_BLEND_ONE) && + (blend_desc.RenderTarget[n].DestBlend == D3D11_BLEND_ZERO) && + (blend_desc.RenderTarget[n].BlendOp == D3D11_BLEND_OP_ADD) && + (blend_desc.RenderTarget[n].SrcBlendAlpha == D3D11_BLEND_ONE) && + (blend_desc.RenderTarget[n].DestBlendAlpha == D3D11_BLEND_ZERO) && + (blend_desc.RenderTarget[n].BlendOpAlpha == D3D11_BLEND_OP_ADD)); + } + device_->CreateBlendState(&blend_desc, &blend_state); + blend_state_cache_.insert({ key, blend_state }); + } else { + blend_state = it->second; + } + + float blend_factor[4] = { + register_file_[XE_GPU_REG_RB_BLEND_RED].f32, + register_file_[XE_GPU_REG_RB_BLEND_GREEN].f32, + register_file_[XE_GPU_REG_RB_BLEND_BLUE].f32, + register_file_[XE_GPU_REG_RB_BLEND_ALPHA].f32, + }; + uint32_t sample_mask = 0xFFFFFFFF; // ? 
+ context_->OMSetBlendState(blend_state, blend_factor, sample_mask); + return 0; +} + +int D3D11GraphicsDriver::SetupDepthStencilState(const DrawCommand& command) { + static const D3D11_COMPARISON_FUNC compare_func_map[] = { + /* 0 */ D3D11_COMPARISON_NEVER, + /* 1 */ D3D11_COMPARISON_LESS, + /* 2 */ D3D11_COMPARISON_EQUAL, + /* 3 */ D3D11_COMPARISON_LESS_EQUAL, + /* 4 */ D3D11_COMPARISON_GREATER, + /* 5 */ D3D11_COMPARISON_NOT_EQUAL, + /* 6 */ D3D11_COMPARISON_GREATER_EQUAL, + /* 7 */ D3D11_COMPARISON_ALWAYS, + }; + static const D3D11_STENCIL_OP stencil_op_map[] = { + /* 0 */ D3D11_STENCIL_OP_KEEP, + /* 1 */ D3D11_STENCIL_OP_ZERO, + /* 2 */ D3D11_STENCIL_OP_REPLACE, + /* 3 */ D3D11_STENCIL_OP_INCR_SAT, + /* 4 */ D3D11_STENCIL_OP_DECR_SAT, + /* 5 */ D3D11_STENCIL_OP_INVERT, + /* 6 */ D3D11_STENCIL_OP_INCR, + /* 7 */ D3D11_STENCIL_OP_DECR, + }; + + uint32_t depth_control = register_file_[XE_GPU_REG_RB_DEPTHCONTROL].u32; + uint32_t stencil_ref_mask = register_file_[XE_GPU_REG_RB_STENCILREFMASK].u32; + + // Check cache. + uint64_t key = (uint64_t(depth_control) << 32) | stencil_ref_mask; + ID3D11DepthStencilState* depth_stencil_state = nullptr; + auto it = depth_stencil_state_cache_.find(key); + if (it == depth_stencil_state_cache_.end()) { + D3D11_DEPTH_STENCIL_DESC depth_stencil_desc; + xe_zero_struct(&depth_stencil_desc, sizeof(depth_stencil_desc)); + // A2XX_RB_DEPTHCONTROL_BACKFACE_ENABLE + // ? + // A2XX_RB_DEPTHCONTROL_Z_ENABLE + depth_stencil_desc.DepthEnable = (depth_control & 0x00000002) != 0; + // A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE + depth_stencil_desc.DepthWriteMask = (depth_control & 0x00000004) ? D3D11_DEPTH_WRITE_MASK_ALL : D3D11_DEPTH_WRITE_MASK_ZERO; + // A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE + // ? + // A2XX_RB_DEPTHCONTROL_ZFUNC + depth_stencil_desc.DepthFunc = compare_func_map[(depth_control & 0x00000070) >> 4]; + // A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE + depth_stencil_desc.StencilEnable = (depth_control & 0x00000001) != 0; + // RB_STENCILREFMASK_STENCILMASK + depth_stencil_desc.StencilReadMask = (stencil_ref_mask & 0x0000FF00) >> 8; + // RB_STENCILREFMASK_STENCILWRITEMASK + depth_stencil_desc.StencilWriteMask = (stencil_ref_mask & 0x00FF0000) >> 16; + // A2XX_RB_DEPTHCONTROL_STENCILFUNC + depth_stencil_desc.FrontFace.StencilFunc = compare_func_map[(depth_control & 0x00000700) >> 8]; + // A2XX_RB_DEPTHCONTROL_STENCILFAIL + depth_stencil_desc.FrontFace.StencilFailOp = stencil_op_map[(depth_control & 0x00003800) >> 11]; + // A2XX_RB_DEPTHCONTROL_STENCILZPASS + depth_stencil_desc.FrontFace.StencilPassOp = stencil_op_map[(depth_control & 0x0001C000) >> 14]; + // A2XX_RB_DEPTHCONTROL_STENCILZFAIL + depth_stencil_desc.FrontFace.StencilDepthFailOp = stencil_op_map[(depth_control & 0x000E0000) >> 17]; + // A2XX_RB_DEPTHCONTROL_STENCILFUNC_BF + depth_stencil_desc.BackFace.StencilFunc = compare_func_map[(depth_control & 0x00700000) >> 20]; + // A2XX_RB_DEPTHCONTROL_STENCILFAIL_BF + depth_stencil_desc.BackFace.StencilFailOp = stencil_op_map[(depth_control & 0x03800000) >> 23]; + // A2XX_RB_DEPTHCONTROL_STENCILZPASS_BF + depth_stencil_desc.BackFace.StencilPassOp = stencil_op_map[(depth_control & 0x1C000000) >> 26]; + // A2XX_RB_DEPTHCONTROL_STENCILZFAIL_BF + depth_stencil_desc.BackFace.StencilDepthFailOp = stencil_op_map[(depth_control & 0xE0000000) >> 29]; + device_->CreateDepthStencilState(&depth_stencil_desc, &depth_stencil_state); + depth_stencil_state_cache_.insert({ key, depth_stencil_state }); + } else { + depth_stencil_state = it->second; + } + + // RB_STENCILREFMASK_STENCILREF 
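// RB_DEPTHCONTROL field map used above, pulled out for reference: bit 0 =
// stencil enable, bit 1 = depth enable, bit 2 = depth write, bits [6:4] =
// depth func; the stencil func/op fields then repeat in 3-bit groups for the
// front face ([19:8]) and the back face ([31:20]). One worked value as a
// sanity check (helper names illustrative):
inline bool DepthTestOn(uint32_t dc)        { return (dc & 0x00000002) != 0; }
inline bool DepthWriteOn(uint32_t dc)       { return (dc & 0x00000004) != 0; }
inline uint32_t DepthFuncIndex(uint32_t dc) { return (dc >> 4) & 0x7; }
// 0x00000036 -> depth test on, depth writes on, func index 3 (LESS_EQUAL in
// compare_func_map above), stencil disabled.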
+ uint32_t stencil_ref = (stencil_ref_mask & 0x000000FF); + context_->OMSetDepthStencilState(depth_stencil_state, stencil_ref); + return 0; +} + +int D3D11GraphicsDriver::SetupConstantBuffers(const DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); + + D3D11_MAPPED_SUBRESOURCE res; + context_->Map( + state_.constant_buffers.float_constants, 0, + D3D11_MAP_WRITE_DISCARD, 0, &res); + memcpy(res.pData, + command.float4_constants.values, + command.float4_constants.count * 4 * sizeof(float)); + context_->Unmap(state_.constant_buffers.float_constants, 0); + + context_->Map( + state_.constant_buffers.loop_constants, 0, + D3D11_MAP_WRITE_DISCARD, 0, &res); + memcpy(res.pData, + command.loop_constants.values, + command.loop_constants.count * sizeof(int)); + context_->Unmap(state_.constant_buffers.loop_constants, 0); + + context_->Map( + state_.constant_buffers.bool_constants, 0, + D3D11_MAP_WRITE_DISCARD, 0, &res); + memcpy(res.pData, + command.bool_constants.values, + command.bool_constants.count * sizeof(int)); + context_->Unmap(state_.constant_buffers.bool_constants, 0); + + return 0; +} + +int D3D11GraphicsDriver::SetupShaders(const DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); + + if (command.vertex_shader) { + context_->VSSetShader( + command.vertex_shader->handle_as(), nullptr, 0); + + // Set constant buffers. + ID3D11Buffer* vs_constant_buffers[] = { + state_.constant_buffers.float_constants, + state_.constant_buffers.bool_constants, + state_.constant_buffers.loop_constants, + state_.constant_buffers.vs_consts, + }; + context_->VSSetConstantBuffers(0, XECOUNT(vs_constant_buffers), + vs_constant_buffers); + + // Setup input layout (as encoded in vertex shader). + auto vs = static_cast(command.vertex_shader); + context_->IASetInputLayout(vs->input_layout()); + } else { + context_->VSSetShader(nullptr, nullptr, 0); + context_->IASetInputLayout(nullptr); + return 1; + } + + // Pixel shader setup. + if (command.pixel_shader) { + context_->PSSetShader( + command.pixel_shader->handle_as(), nullptr, 0); + + // Set constant buffers. + ID3D11Buffer* vs_constant_buffers[] = { + state_.constant_buffers.float_constants, + state_.constant_buffers.bool_constants, + state_.constant_buffers.loop_constants, + }; + context_->PSSetConstantBuffers(0, XECOUNT(vs_constant_buffers), + vs_constant_buffers); + } else { + context_->PSSetShader(nullptr, nullptr, 0); + return 1; + } + + return 0; +} + +int D3D11GraphicsDriver::SetupInputAssembly(const DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); + + auto vs = static_cast(command.vertex_shader); + if (!vs) { return 1; } @@ -219,14 +691,12 @@ int D3D11GraphicsDriver::SetupDraw(XE_GPU_PRIMITIVE_TYPE prim_type) { // Some are unsupported on D3D11 and must be emulated. 
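// SetupConstantBuffers above repeats one idiom three times: map a dynamic
// buffer with WRITE_DISCARD, memcpy the CPU-side values, unmap. The same
// idiom as a checked helper (a sketch; the name is illustrative):
int UploadConstants(ID3D11DeviceContext* context, ID3D11Buffer* buffer,
                    const void* values, size_t length) {
  D3D11_MAPPED_SUBRESOURCE res;
  // WRITE_DISCARD hands back a fresh backing region, so the GPU can keep
  // reading the previous contents without stalling this thread.
  if (FAILED(context->Map(buffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &res))) {
    return 1;
  }
  memcpy(res.pData, values, length);
  context->Unmap(buffer, 0);
  return 0;
}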
D3D11_PRIMITIVE_TOPOLOGY primitive_topology; D3D11GeometryShader* geometry_shader = NULL; - switch (prim_type) { + switch (command.prim_type) { case XE_GPU_PRIMITIVE_TYPE_POINT_LIST: primitive_topology = D3D_PRIMITIVE_TOPOLOGY_POINTLIST; - if (state_.vertex_shader) { - if (state_.vertex_shader->DemandGeometryShader( - D3D11VertexShader::POINT_SPRITE_SHADER, &geometry_shader)) { - return 1; - } + if (vs->DemandGeometryShader( + D3D11VertexShaderResource::POINT_SPRITE_SHADER, &geometry_shader)) { + return 1; } break; case XE_GPU_PRIMITIVE_TYPE_LINE_LIST: @@ -243,20 +713,16 @@ int D3D11GraphicsDriver::SetupDraw(XE_GPU_PRIMITIVE_TYPE prim_type) { break; case XE_GPU_PRIMITIVE_TYPE_RECTANGLE_LIST: primitive_topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP; - if (state_.vertex_shader) { - if (state_.vertex_shader->DemandGeometryShader( - D3D11VertexShader::RECT_LIST_SHADER, &geometry_shader)) { - return 1; - } + if (vs->DemandGeometryShader( + D3D11VertexShaderResource::RECT_LIST_SHADER, &geometry_shader)) { + return 1; } break; case XE_GPU_PRIMITIVE_TYPE_QUAD_LIST: primitive_topology = D3D_PRIMITIVE_TOPOLOGY_LINELIST_ADJ; - if (state_.vertex_shader) { - if (state_.vertex_shader->DemandGeometryShader( - D3D11VertexShader::QUAD_LIST_SHADER, &geometry_shader)) { - return 1; - } + if (vs->DemandGeometryShader( + D3D11VertexShaderResource::QUAD_LIST_SHADER, &geometry_shader)) { + return 1; } break; default: @@ -264,83 +730,98 @@ int D3D11GraphicsDriver::SetupDraw(XE_GPU_PRIMITIVE_TYPE prim_type) { case XE_GPU_PRIMITIVE_TYPE_UNKNOWN_07: case XE_GPU_PRIMITIVE_TYPE_LINE_LOOP: primitive_topology = D3D_PRIMITIVE_TOPOLOGY_POINTLIST; - XELOGE("D3D11: unsupported primitive type %d", prim_type); + XELOGE("D3D11: unsupported primitive type %d", command.prim_type); break; } context_->IASetPrimitiveTopology(primitive_topology); + // Set the geometry shader, if we are emulating a primitive type. if (geometry_shader) { context_->GSSetShader(geometry_shader->handle(), NULL, NULL); - context_->GSSetConstantBuffers( - 0, 1, &state_.constant_buffers.gs_consts); + context_->GSSetConstantBuffers(0, 1, &state_.constant_buffers.gs_consts); } else { context_->GSSetShader(NULL, NULL, NULL); } - // Setup all fetchers (vertices/textures). - if (PrepareFetchers()) { - return 1; + // Index buffer, if any. May be auto draw. + if (command.index_buffer) { + DXGI_FORMAT format; + switch (command.index_buffer->info().format) { + case INDEX_FORMAT_16BIT: + format = DXGI_FORMAT_R16_UINT; + break; + case INDEX_FORMAT_32BIT: + format = DXGI_FORMAT_R32_UINT; + break; + } + context_->IASetIndexBuffer( + command.index_buffer->handle_as(), + format, 0); + } else { + context_->IASetIndexBuffer(nullptr, DXGI_FORMAT_UNKNOWN, 0); } - // All ready to draw (except index buffer)! + // All vertex buffers. + for (auto i = 0; i < command.vertex_buffer_count; ++i) { + const auto& vb = command.vertex_buffers[i]; + auto buffer = vb.buffer->handle_as(); + auto stride = vb.stride; + auto offset = vb.offset; + context_->IASetVertexBuffers(vb.input_index, 1, &buffer, + &stride, &offset); + } return 0; } -void D3D11GraphicsDriver::DrawIndexBuffer( - XE_GPU_PRIMITIVE_TYPE prim_type, - bool index_32bit, uint32_t index_count, - uint32_t index_base, uint32_t index_size, uint32_t endianness) { - RegisterFile& rf = register_file_; +int D3D11GraphicsDriver::SetupSamplers(const DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); - XELOGGPU("D3D11: draw indexed %d (%d indicies) from %.8X", - prim_type, index_count, index_base); - - // Setup shaders/etc. 
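// Why LINELIST_ADJ for quad lists in the switch above: D3D11 has no quad
// topology, but a line-with-adjacency primitive also consumes exactly four
// vertices, so each guest quad reaches the demanded geometry shader intact
// and is re-emitted as two triangles (rect lists similarly ride
// TRIANGLESTRIP with a GS that derives the missing corner). The arithmetic,
// as a sketch:
inline uint32_t QuadListTriangleCount(uint32_t vertex_count) {
  // N quad vertices -> N / 4 lineadj primitives -> 2 triangles each.
  return (vertex_count / 4) * 2;
}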
- if (SetupDraw(prim_type)) { - return; + for (auto i = 0; i < command.vertex_shader_sampler_count; ++i) { + const auto& input = command.vertex_shader_samplers[i]; + if (input.texture) { + auto texture = input.texture->handle_as(); + context_->VSSetShaderResources(input.input_index, 1, &texture); + } else { + context_->VSSetShaderResources(input.input_index, 1, &invalid_texture_view_); + } + if (input.sampler_state) { + auto sampler_state = input.sampler_state->handle_as(); + context_->VSSetSamplers(input.input_index, 1, &sampler_state); + } else { + context_->VSSetSamplers(input.input_index, 1, &invalid_texture_sampler_state_); + } } - // Setup index buffer. - if (PrepareIndexBuffer( - index_32bit, index_count, index_base, index_size, endianness)) { - return; + for (auto i = 0; i < command.pixel_shader_sampler_count; ++i) { + const auto& input = command.pixel_shader_samplers[i]; + if (input.texture) { + auto texture = input.texture->handle_as(); + context_->PSSetShaderResources(input.input_index, 1, &texture); + } else { + context_->PSSetShaderResources(input.input_index, 1, &invalid_texture_view_); + } + if (input.sampler_state) { + auto sampler_state = input.sampler_state->handle_as(); + context_->PSSetSamplers(input.input_index, 1, &sampler_state); + } else { + context_->PSSetSamplers(input.input_index, 1, &invalid_texture_sampler_state_); + } } - // Issue draw. - uint32_t start_index = rf.values[XE_GPU_REG_VGT_INDX_OFFSET].u32; - uint32_t base_vertex = 0; - context_->DrawIndexed(index_count, start_index, base_vertex); + return 0; } -void D3D11GraphicsDriver::DrawIndexAuto( - XE_GPU_PRIMITIVE_TYPE prim_type, - uint32_t index_count) { - RegisterFile& rf = register_file_; - - XELOGGPU("D3D11: draw indexed %d (%d indicies)", - prim_type, index_count); - - // Setup shaders/etc. - if (SetupDraw(prim_type)) { - return; - } - - // Issue draw. - uint32_t start_index = rf.values[XE_GPU_REG_VGT_INDX_OFFSET].u32; - uint32_t base_vertex = 0; - //context_->DrawIndexed(index_count, start_index, base_vertex); - context_->Draw(index_count, 0); -} - -int D3D11GraphicsDriver::RebuildRenderTargets( - uint32_t width, uint32_t height) { +int D3D11GraphicsDriver::RebuildRenderTargets(uint32_t width, + uint32_t height) { if (width == render_targets_.width && height == render_targets_.height) { // Cached copies are good. return 0; } + SCOPE_profile_cpu_f("gpu"); + // Remove old versions. for (int n = 0; n < XECOUNT(render_targets_.color_buffers); n++) { auto& cb = render_targets_.color_buffers[n]; @@ -400,8 +881,7 @@ int D3D11GraphicsDriver::RebuildRenderTargets( depth_stencil_desc.SampleDesc.Count = 1; depth_stencil_desc.SampleDesc.Quality = 0; depth_stencil_desc.Usage = D3D11_USAGE_DEFAULT; - depth_stencil_desc.BindFlags = - D3D11_BIND_DEPTH_STENCIL; + depth_stencil_desc.BindFlags = D3D11_BIND_DEPTH_STENCIL; depth_stencil_desc.CPUAccessFlags = 0; depth_stencil_desc.MiscFlags = 0; device_->CreateTexture2D( @@ -420,1215 +900,9 @@ int D3D11GraphicsDriver::RebuildRenderTargets( return 0; } -int D3D11GraphicsDriver::UpdateState(uint32_t state_overrides) { - // Most information comes from here: - // https://chromium.googlesource.com/chromiumos/third_party/mesa/+/6173cc19c45d92ef0b7bc6aa008aa89bb29abbda/src/gallium/drivers/freedreno/freedreno_zsa.c - // http://cgit.freedesktop.org/mesa/mesa/diff/?id=aac7f06ad843eaa696363e8e9c7781ca30cb4914 - // The only differences so far are extra packets for multiple render targets - // and a few modes being switched around. 
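// Both sampler loops above follow a bind-or-fallback rule: when a fetch slot
// has no resolved texture or sampler, bind the placeholder "invalid"
// resources rather than leaving the slot stale from a previous draw. The
// rule for one VS slot (PS is symmetric); a sketch, name illustrative:
void BindVertexTexture(ID3D11DeviceContext* context, uint32_t slot,
                       ID3D11ShaderResourceView* view,
                       ID3D11SamplerState* sampler,
                       ID3D11ShaderResourceView* fallback_view,
                       ID3D11SamplerState* fallback_sampler) {
  ID3D11ShaderResourceView* v = view ? view : fallback_view;
  ID3D11SamplerState* s = sampler ? sampler : fallback_sampler;
  context->VSSetShaderResources(slot, 1, &v);
  context->VSSetSamplers(slot, 1, &s);
}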
- - RegisterFile& rf = register_file_; - - uint32_t window_scissor_tl = rf.values[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; - uint32_t window_scissor_br = rf.values[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; - //uint32_t window_width = - // (window_scissor_br & 0x7FFF) - (window_scissor_tl & 0x7FFF); - //uint32_t window_height = - // ((window_scissor_br >> 16) & 0x7FFF) - ((window_scissor_tl >> 16) & 0x7FFF); - uint32_t window_width = 1280; - uint32_t window_height = 720; - if (RebuildRenderTargets(window_width, window_height)) { - XELOGE("Unable to rebuild render targets to %d x %d", - window_width, window_height); - return 1; - } - - // RB_SURFACE_INFO ? - - // Enable buffers. - uint32_t enable_mode = rf.values[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7; - // 4 = color + depth - // 6 = copy ? - - // color_info[0-3] has format 8888 - uint32_t color_info[4] = { - rf.values[XE_GPU_REG_RB_COLOR_INFO].u32, - rf.values[XE_GPU_REG_RB_COLOR1_INFO].u32, - rf.values[XE_GPU_REG_RB_COLOR2_INFO].u32, - rf.values[XE_GPU_REG_RB_COLOR3_INFO].u32, - }; - ID3D11RenderTargetView* render_target_views[4] = { 0 }; - for (int n = 0; n < XECOUNT(color_info); n++) { - auto cb = render_targets_.color_buffers[n]; - uint32_t color_format = (color_info[n] >> 16) & 0xF; - switch (color_format) { - case 0: // D3DFMT_A8R8G8B8 (or ABGR?) - case 1: - render_target_views[n] = cb.color_view_8888; - break; - default: - // Unknown. - XELOGGPU("Unsupported render target format %d", color_format); - break; - } - } - - // depth_info has format 24_8 - uint32_t depth_info = rf.values[XE_GPU_REG_RB_DEPTH_INFO].u32; - uint32_t depth_format = (depth_info >> 16) & 0x1; - ID3D11DepthStencilView* depth_stencil_view = 0; - switch (depth_format) { - case 0: // D3DFMT_D24S8 - depth_stencil_view = render_targets_.depth_view_d28s8; - break; - default: - case 1: // D3DFMT_D24FS8 - //depth_stencil_view = render_targets_.depth_view_d28fs8; - XELOGGPU("Unsupported depth/stencil format %d", depth_format); - break; - } - // TODO(benvanik): when a game switches does it expect to keep the same - // depth buffer contents? - - // TODO(benvanik): only enable the number of valid render targets. - context_->OMSetRenderTargets(4, render_target_views, depth_stencil_view); - - // General rasterizer state. - uint32_t mode_control = rf.values[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32; - D3D11_RASTERIZER_DESC rasterizer_desc; - xe_zero_struct(&rasterizer_desc, sizeof(rasterizer_desc)); - rasterizer_desc.FillMode = D3D11_FILL_SOLID; // D3D11_FILL_WIREFRAME; - switch (mode_control & 0x3) { - case 0: - rasterizer_desc.CullMode = D3D11_CULL_NONE; - break; - case 1: - rasterizer_desc.CullMode = D3D11_CULL_FRONT; - break; - case 2: - rasterizer_desc.CullMode = D3D11_CULL_BACK; - break; - } - if (state_overrides & STATE_OVERRIDE_DISABLE_CULLING) { - rasterizer_desc.CullMode = D3D11_CULL_NONE; - } - rasterizer_desc.FrontCounterClockwise = (mode_control & 0x4) == 0; - rasterizer_desc.DepthBias = 0; - rasterizer_desc.DepthBiasClamp = 0; - rasterizer_desc.SlopeScaledDepthBias = 0; - rasterizer_desc.DepthClipEnable = false; // ? - rasterizer_desc.ScissorEnable = false; - rasterizer_desc.MultisampleEnable = false; - rasterizer_desc.AntialiasedLineEnable = false; - ID3D11RasterizerState* rasterizer_state = 0; - device_->CreateRasterizerState(&rasterizer_desc, &rasterizer_state); - context_->RSSetState(rasterizer_state); - XESAFERELEASE(rasterizer_state); - - // Viewport. - // If we have resized the window we will want to change this. 
- uint32_t window_offset = rf.values[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32; - // signed? - uint32_t window_offset_x = window_offset & 0x7FFF; - uint32_t window_offset_y = (window_offset >> 16) & 0x7FFF; - - // ? - // TODO(benvanik): figure out how to emulate viewports in D3D11. Could use - // viewport above to scale, though that doesn't support negatives/etc. - uint32_t vte_control = rf.values[XE_GPU_REG_PA_CL_VTE_CNTL].u32; - bool vport_xscale_enable = (vte_control & (1 << 0)) > 0; - float vport_xscale = rf.values[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; // 640 - bool vport_xoffset_enable = (vte_control & (1 << 1)) > 0; - float vport_xoffset = rf.values[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32; // 640 - bool vport_yscale_enable = (vte_control & (1 << 2)) > 0; - float vport_yscale = rf.values[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; // -360 - bool vport_yoffset_enable = (vte_control & (1 << 3)) > 0; - float vport_yoffset = rf.values[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; // 360 - bool vport_zscale_enable = (vte_control & (1 << 4)) > 0; - float vport_zscale = rf.values[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32; // 1 - bool vport_zoffset_enable = (vte_control & (1 << 5)) > 0; - float vport_zoffset = rf.values[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32; // 0 - - // TODO(benvanik): compute viewport values. - D3D11_VIEWPORT viewport; - if (vport_xscale_enable) { - // Viewport enabled. - viewport.MinDepth = 0.0f; - viewport.MaxDepth = 1.0f; - viewport.TopLeftX = 0; - viewport.TopLeftY = 0; - viewport.Width = 1280; - viewport.Height = 720; - } else { - // Viewport disabled. Geometry shaders will compensate for this. - viewport.MinDepth = 0.0f; - viewport.MaxDepth = 1.0f; - viewport.TopLeftX = 0; - viewport.TopLeftY = 0; - viewport.Width = 1280; - viewport.Height = 720; - } - context_->RSSetViewports(1, &viewport); - - // Viewport constants from D3D11VertexShader. - //"cbuffer vs_consts {\n" - //" float4 window;\n" // x,y,w,h - //" float4 viewport_z_enable;\n" // min,(max - min),?,enabled - //" float4 viewport_size;\n" // x,y,w,h - //"};" - // TODO(benvanik): only when viewport changes. - D3D11_MAPPED_SUBRESOURCE res; - context_->Map( - state_.constant_buffers.vs_consts, 0, - D3D11_MAP_WRITE_DISCARD, 0, &res); - float* vsc_buffer = (float*)res.pData; - vsc_buffer[0] = (float)window_offset_x; - vsc_buffer[1] = (float)window_offset_y; - vsc_buffer[2] = (float)window_width; - vsc_buffer[3] = (float)window_height; - vsc_buffer[4] = viewport.MinDepth; - vsc_buffer[5] = viewport.MaxDepth - viewport.MinDepth; - vsc_buffer[6] = 0; // unused - vsc_buffer[7] = vport_xscale_enable ? 1.0f : 0.0f; - vsc_buffer[8] = viewport.TopLeftX; - vsc_buffer[9] = viewport.TopLeftY; - vsc_buffer[10] = viewport.Width; - vsc_buffer[11] = viewport.Height; - context_->Unmap(state_.constant_buffers.vs_consts, 0); - - // Scissoring. - // TODO(benvanik): pull from scissor registers. - // ScissorEnable must be set in raster state above. 
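// The "compute viewport values" TODO above has a plausible closed form (an
// assumption, not from the tree): with the VTE enabled, the guest maps NDC x
// to x * vport_xscale + vport_xoffset, so an equivalent D3D11 viewport falls
// out directly. With the sample register values noted above (640/640,
// -360/360, 1/0), this recovers exactly the hardcoded 1280x720 fallback:
D3D11_VIEWPORT ViewportFromVte(float xs, float xo, float ys, float yo,
                               float zs, float zo) {
  D3D11_VIEWPORT vp;
  vp.TopLeftX = xo - fabsf(xs);    // 640 - |640|  = 0
  vp.TopLeftY = yo - fabsf(ys);    // 360 - |-360| = 0
  vp.Width    = 2.0f * fabsf(xs);  // 2 * 640 = 1280
  vp.Height   = 2.0f * fabsf(ys);  // 2 * 360 = 720
  vp.MinDepth = zo;                // 0
  vp.MaxDepth = zo + zs;           // 1
  return vp;
}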
- uint32_t screen_scissor_tl = rf.values[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL].u32; - uint32_t screen_scissor_br = rf.values[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR].u32; - if (screen_scissor_tl != 0 && screen_scissor_br != 0x20002000) { - D3D11_RECT scissor_rect; - scissor_rect.top = (screen_scissor_tl >> 16) & 0x7FFF; - scissor_rect.left = screen_scissor_tl & 0x7FFF; - scissor_rect.bottom = (screen_scissor_br >> 16) & 0x7FFF; - scissor_rect.right = screen_scissor_br & 0x7FFF; - context_->RSSetScissorRects(1, &scissor_rect); - } else { - context_->RSSetScissorRects(0, NULL); - } - - static const D3D11_COMPARISON_FUNC compare_func_map[] = { - /* 0 */ D3D11_COMPARISON_NEVER, - /* 1 */ D3D11_COMPARISON_LESS, - /* 2 */ D3D11_COMPARISON_EQUAL, - /* 3 */ D3D11_COMPARISON_LESS_EQUAL, - /* 4 */ D3D11_COMPARISON_GREATER, - /* 5 */ D3D11_COMPARISON_NOT_EQUAL, - /* 6 */ D3D11_COMPARISON_GREATER_EQUAL, - /* 7 */ D3D11_COMPARISON_ALWAYS, - }; - static const D3D11_STENCIL_OP stencil_op_map[] = { - /* 0 */ D3D11_STENCIL_OP_KEEP, - /* 1 */ D3D11_STENCIL_OP_ZERO, - /* 2 */ D3D11_STENCIL_OP_REPLACE, - /* 3 */ D3D11_STENCIL_OP_INCR_SAT, - /* 4 */ D3D11_STENCIL_OP_DECR_SAT, - /* 5 */ D3D11_STENCIL_OP_INVERT, - /* 6 */ D3D11_STENCIL_OP_INCR, - /* 7 */ D3D11_STENCIL_OP_DECR, - }; - - // Depth-stencil state. - uint32_t depth_control = rf.values[XE_GPU_REG_RB_DEPTHCONTROL].u32; - uint32_t stencil_ref_mask = rf.values[XE_GPU_REG_RB_STENCILREFMASK].u32; - D3D11_DEPTH_STENCIL_DESC depth_stencil_desc; - xe_zero_struct(&depth_stencil_desc, sizeof(depth_stencil_desc)); - // A2XX_RB_DEPTHCONTROL_BACKFACE_ENABLE - // ? - // A2XX_RB_DEPTHCONTROL_Z_ENABLE - depth_stencil_desc.DepthEnable = (depth_control & 0x00000002) != 0; - // A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE - depth_stencil_desc.DepthWriteMask = (depth_control & 0x00000004) ? D3D11_DEPTH_WRITE_MASK_ALL : D3D11_DEPTH_WRITE_MASK_ZERO; - // A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE - // ? 
- // A2XX_RB_DEPTHCONTROL_ZFUNC - depth_stencil_desc.DepthFunc = compare_func_map[(depth_control & 0x00000070) >> 4]; - // A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE - depth_stencil_desc.StencilEnable = (depth_control & 0x00000001) != 0; - // RB_STENCILREFMASK_STENCILMASK - depth_stencil_desc.StencilReadMask = (stencil_ref_mask & 0x0000FF00) >> 8; - // RB_STENCILREFMASK_STENCILWRITEMASK - depth_stencil_desc.StencilWriteMask = (stencil_ref_mask & 0x00FF0000) >> 16; - // A2XX_RB_DEPTHCONTROL_STENCILFUNC - depth_stencil_desc.FrontFace.StencilFunc = compare_func_map[(depth_control & 0x00000700) >> 8]; - // A2XX_RB_DEPTHCONTROL_STENCILFAIL - depth_stencil_desc.FrontFace.StencilFailOp = stencil_op_map[(depth_control & 0x00003800) >> 11]; - // A2XX_RB_DEPTHCONTROL_STENCILZPASS - depth_stencil_desc.FrontFace.StencilPassOp = stencil_op_map[(depth_control & 0x0001C000) >> 14]; - // A2XX_RB_DEPTHCONTROL_STENCILZFAIL - depth_stencil_desc.FrontFace.StencilDepthFailOp = stencil_op_map[(depth_control & 0x000E0000) >> 17]; - // A2XX_RB_DEPTHCONTROL_STENCILFUNC_BF - depth_stencil_desc.BackFace.StencilFunc = compare_func_map[(depth_control & 0x00700000) >> 20]; - // A2XX_RB_DEPTHCONTROL_STENCILFAIL_BF - depth_stencil_desc.BackFace.StencilFailOp = stencil_op_map[(depth_control & 0x03800000) >> 23]; - // A2XX_RB_DEPTHCONTROL_STENCILZPASS_BF - depth_stencil_desc.BackFace.StencilPassOp = stencil_op_map[(depth_control & 0x1C000000) >> 26]; - // A2XX_RB_DEPTHCONTROL_STENCILZFAIL_BF - depth_stencil_desc.BackFace.StencilDepthFailOp = stencil_op_map[(depth_control & 0xE0000000) >> 29]; - // RB_STENCILREFMASK_STENCILREF - uint32_t stencil_ref = (stencil_ref_mask & 0x000000FF); - ID3D11DepthStencilState* depth_stencil_state = 0; - device_->CreateDepthStencilState(&depth_stencil_desc, &depth_stencil_state); - context_->OMSetDepthStencilState(depth_stencil_state, stencil_ref); - XESAFERELEASE(depth_stencil_state); - - static const D3D11_BLEND blend_map[] = { - /* 0 */ D3D11_BLEND_ZERO, - /* 1 */ D3D11_BLEND_ONE, - /* 2 */ D3D11_BLEND_ZERO, // ? - /* 3 */ D3D11_BLEND_ZERO, // ? - /* 4 */ D3D11_BLEND_SRC_COLOR, - /* 5 */ D3D11_BLEND_INV_SRC_COLOR, - /* 6 */ D3D11_BLEND_SRC_ALPHA, - /* 7 */ D3D11_BLEND_INV_SRC_ALPHA, - /* 8 */ D3D11_BLEND_DEST_COLOR, - /* 9 */ D3D11_BLEND_INV_DEST_COLOR, - /* 10 */ D3D11_BLEND_DEST_ALPHA, - /* 11 */ D3D11_BLEND_INV_DEST_ALPHA, - /* 12 */ D3D11_BLEND_BLEND_FACTOR, - /* 13 */ D3D11_BLEND_INV_BLEND_FACTOR, - /* 14 */ D3D11_BLEND_SRC1_ALPHA, // ? - /* 15 */ D3D11_BLEND_INV_SRC1_ALPHA, // ? - /* 16 */ D3D11_BLEND_SRC_ALPHA_SAT, - }; - static const D3D11_BLEND_OP blend_op_map[] = { - /* 0 */ D3D11_BLEND_OP_ADD, - /* 1 */ D3D11_BLEND_OP_SUBTRACT, - /* 2 */ D3D11_BLEND_OP_MIN, - /* 3 */ D3D11_BLEND_OP_MAX, - /* 4 */ D3D11_BLEND_OP_REV_SUBTRACT, - }; - - // alpha testing -- ALPHAREF, ALPHAFUNC, ALPHATESTENABLE - // Not in D3D11! - // http://msdn.microsoft.com/en-us/library/windows/desktop/bb205120(v=vs.85).aspx - uint32_t color_control = rf.values[XE_GPU_REG_RB_COLORCONTROL].u32; - - // Blend state. - uint32_t color_mask = rf.values[XE_GPU_REG_RB_COLOR_MASK].u32; - uint32_t sample_mask = 0xFFFFFFFF; // ? 
- float blend_factor[4] = { - rf.values[XE_GPU_REG_RB_BLEND_RED].f32, - rf.values[XE_GPU_REG_RB_BLEND_GREEN].f32, - rf.values[XE_GPU_REG_RB_BLEND_BLUE].f32, - rf.values[XE_GPU_REG_RB_BLEND_ALPHA].f32, - }; - uint32_t blend_control[4] = { - rf.values[XE_GPU_REG_RB_BLENDCONTROL_0].u32, - rf.values[XE_GPU_REG_RB_BLENDCONTROL_1].u32, - rf.values[XE_GPU_REG_RB_BLENDCONTROL_2].u32, - rf.values[XE_GPU_REG_RB_BLENDCONTROL_3].u32, - }; - D3D11_BLEND_DESC blend_desc; - xe_zero_struct(&blend_desc, sizeof(blend_desc)); - //blend_desc.AlphaToCoverageEnable = false; - // ? - blend_desc.IndependentBlendEnable = true; - for (int n = 0; n < XECOUNT(blend_control); n++) { - // A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND - blend_desc.RenderTarget[n].SrcBlend = blend_map[(blend_control[n] & 0x0000001F) >> 0]; - // A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND - blend_desc.RenderTarget[n].DestBlend = blend_map[(blend_control[n] & 0x00001F00) >> 8]; - // A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN - blend_desc.RenderTarget[n].BlendOp = blend_op_map[(blend_control[n] & 0x000000E0) >> 5]; - // A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND - blend_desc.RenderTarget[n].SrcBlendAlpha = blend_map[(blend_control[n] & 0x001F0000) >> 16]; - // A2XX_RB_BLEND_CONTROL_ALPHA_DESTBLEND - blend_desc.RenderTarget[n].DestBlendAlpha = blend_map[(blend_control[n] & 0x1F000000) >> 24]; - // A2XX_RB_BLEND_CONTROL_ALPHA_COMB_FCN - blend_desc.RenderTarget[n].BlendOpAlpha = blend_op_map[(blend_control[n] & 0x00E00000) >> 21]; - // A2XX_RB_COLOR_MASK_WRITE_* - blend_desc.RenderTarget[n].RenderTargetWriteMask = (color_mask >> (n * 4)) & 0xF; - // A2XX_RB_COLORCONTROL_BLEND_DISABLE ?? Can't find this! - // Just guess based on actions. - blend_desc.RenderTarget[n].BlendEnable = !( - (blend_desc.RenderTarget[n].SrcBlend == D3D11_BLEND_ONE) && - (blend_desc.RenderTarget[n].DestBlend == D3D11_BLEND_ZERO) && - (blend_desc.RenderTarget[n].BlendOp == D3D11_BLEND_OP_ADD) && - (blend_desc.RenderTarget[n].SrcBlendAlpha == D3D11_BLEND_ONE) && - (blend_desc.RenderTarget[n].DestBlendAlpha == D3D11_BLEND_ZERO) && - (blend_desc.RenderTarget[n].BlendOpAlpha == D3D11_BLEND_OP_ADD)); - } - ID3D11BlendState* blend_state = 0; - device_->CreateBlendState(&blend_desc, &blend_state); - context_->OMSetBlendState(blend_state, blend_factor, sample_mask); - XESAFERELEASE(blend_state); - - return 0; -} - -int D3D11GraphicsDriver::UpdateConstantBuffers() { - RegisterFile& rf = register_file_; - - D3D11_MAPPED_SUBRESOURCE res; - context_->Map( - state_.constant_buffers.float_constants, 0, - D3D11_MAP_WRITE_DISCARD, 0, &res); - memcpy(res.pData, - &rf.values[XE_GPU_REG_SHADER_CONSTANT_000_X], - (512 * 4) * sizeof(float)); - context_->Unmap(state_.constant_buffers.float_constants, 0); - - context_->Map( - state_.constant_buffers.loop_constants, 0, - D3D11_MAP_WRITE_DISCARD, 0, &res); - memcpy(res.pData, - &rf.values[XE_GPU_REG_SHADER_CONSTANT_LOOP_00], - (32) * sizeof(int)); - context_->Unmap(state_.constant_buffers.loop_constants, 0); - - context_->Map( - state_.constant_buffers.bool_constants, 0, - D3D11_MAP_WRITE_DISCARD, 0, &res); - memcpy(res.pData, - &rf.values[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031], - (8) * sizeof(int)); - context_->Unmap(state_.constant_buffers.bool_constants, 0); - - return 0; -} - -int D3D11GraphicsDriver::BindShaders() { - RegisterFile& rf = register_file_; - xe_gpu_program_cntl_t program_cntl; - program_cntl.dword_0 = rf.values[XE_GPU_REG_SQ_PROGRAM_CNTL].u32; - - // Vertex shader setup. 
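// For scale, the register-file copies above move the entire guest constant
// space on every draw, whether the shaders read it or not:
enum : size_t {
  kFloatConstantBytes = 512 * 4 * sizeof(float),  // 8 KiB of float4 registers.
  kLoopConstantBytes  = 32 * sizeof(int),         // 128 bytes.
  kBoolConstantBytes  = 8 * sizeof(int),          // 8 dwords = 256 1-bit flags.
};
// The DrawCommand path that replaces this (SetupConstantBuffers) uploads only
// command.*_constants.count entries instead of the whole block.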
- D3D11VertexShader* vs = state_.vertex_shader; - if (vs) { - if (!vs->is_prepared()) { - // Prepare for use. - if (vs->Prepare(&program_cntl)) { - XELOGGPU("D3D11: failed to prepare vertex shader"); - state_.vertex_shader = NULL; - return 1; - } - } - - // Bind. - context_->VSSetShader(vs->handle(), NULL, 0); - - // Set constant buffers. - ID3D11Buffer* vs_constant_buffers[] = { - state_.constant_buffers.float_constants, - state_.constant_buffers.bool_constants, - state_.constant_buffers.loop_constants, - state_.constant_buffers.vs_consts, - }; - context_->VSSetConstantBuffers( - 0, XECOUNT(vs_constant_buffers), vs_constant_buffers); - - // Setup input layout (as encoded in vertex shader). - context_->IASetInputLayout(vs->input_layout()); - - //context_->VSSetSamplers - //context_->VSSetShaderResources - } else { - context_->VSSetShader(NULL, NULL, 0); - context_->IASetInputLayout(NULL); - return 1; - } - - // Pixel shader setup. - D3D11PixelShader* ps = state_.pixel_shader; - if (ps) { - if (!ps->is_prepared()) { - // Prepare for use. - if (ps->Prepare(&program_cntl, vs)) { - XELOGGPU("D3D11: failed to prepare pixel shader"); - state_.pixel_shader = NULL; - return 1; - } - } - - // Bind. - context_->PSSetShader(ps->handle(), NULL, 0); - - // Set constant buffers. - ID3D11Buffer* vs_constant_buffers[] = { - state_.constant_buffers.float_constants, - state_.constant_buffers.bool_constants, - state_.constant_buffers.loop_constants, - }; - context_->PSSetConstantBuffers( - 0, XECOUNT(vs_constant_buffers), vs_constant_buffers); - - // TODO(benvanik): set samplers for all inputs. - D3D11_SAMPLER_DESC sampler_desc; - xe_zero_struct(&sampler_desc, sizeof(sampler_desc)); - //sampler_desc.Filter = ? - sampler_desc.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP; - sampler_desc.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP; - sampler_desc.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP; - sampler_desc.MipLODBias = 0; - sampler_desc.MaxAnisotropy = 1; - sampler_desc.ComparisonFunc = D3D11_COMPARISON_ALWAYS; - //sampler_desc.BorderColor = ...; - sampler_desc.MinLOD = 0; - sampler_desc.MaxLOD = 0; - ID3D11SamplerState* sampler_state = NULL; - device_->CreateSamplerState(&sampler_desc, &sampler_state); - ID3D11SamplerState* sampler_states[] = { sampler_state }; - context_->PSSetSamplers(0, XECOUNT(sampler_states), sampler_states); - sampler_state->Release(); - - //context_->PSSetShaderResources - } else { - context_->PSSetShader(NULL, NULL, 0); - return 1; - } - - return 0; -} - -int D3D11GraphicsDriver::PrepareFetchers() { - // Input assembly. - XEASSERTNOTNULL(state_.vertex_shader); - auto vtx_inputs = state_.vertex_shader->GetVertexBufferInputs(); - for (size_t n = 0; n < vtx_inputs->count; n++) { - auto input = vtx_inputs->descs[n]; - if (PrepareVertexBuffer(input)) { - XELOGE("D3D11: unable to prepare vertex buffer"); - return 1; - } - } - - // All texture inputs. - if (PrepareTextureFetchers()) { - XELOGE("D3D11: unable to prepare texture fetchers"); - return 1; - } - - // Vertex texture samplers. - auto tex_inputs = state_.vertex_shader->GetTextureBufferInputs(); - for (size_t n = 0; n < tex_inputs->count; n++) { - auto input = tex_inputs->descs[n]; - if (PrepareTextureSampler(XE_GPU_SHADER_TYPE_VERTEX, input)) { - XELOGE("D3D11: unable to prepare texture buffer"); - return 1; - } - } - - // Pixel shader texture sampler. 
- XEASSERTNOTNULL(state_.pixel_shader); - tex_inputs = state_.pixel_shader->GetTextureBufferInputs(); - for (size_t n = 0; n < tex_inputs->count; n++) { - auto input = tex_inputs->descs[n]; - if (PrepareTextureSampler(XE_GPU_SHADER_TYPE_PIXEL, input)) { - XELOGE("D3D11: unable to prepare texture buffer"); - return 1; - } - } - - return 0; -} - -int D3D11GraphicsDriver::PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc) { - RegisterFile& rf = register_file_; - int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (desc.fetch_slot / 3) * 6; - xe_gpu_fetch_group_t* group = (xe_gpu_fetch_group_t*)&rf.values[r]; - xe_gpu_vertex_fetch_t* fetch = NULL; - switch (desc.fetch_slot % 3) { - case 0: - fetch = &group->vertex_fetch_0; - break; - case 1: - fetch = &group->vertex_fetch_1; - break; - case 2: - fetch = &group->vertex_fetch_2; - break; - } - XEASSERTNOTNULL(fetch); - // If this assert doesn't hold, maybe we just abort? - XEASSERT(fetch->type == 0x3); - XEASSERTNOTZERO(fetch->size); - - ID3D11Buffer* buffer = 0; - D3D11_BUFFER_DESC buffer_desc; - xe_zero_struct(&buffer_desc, sizeof(buffer_desc)); - buffer_desc.ByteWidth = fetch->size * 4; - buffer_desc.Usage = D3D11_USAGE_DYNAMIC; - buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER; - buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - HRESULT hr = device_->CreateBuffer(&buffer_desc, NULL, &buffer); - if (FAILED(hr)) { - XELOGE("D3D11: unable to create vertex fetch buffer"); - return 1; - } - D3D11_MAPPED_SUBRESOURCE res; - hr = context_->Map(buffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &res); - if (FAILED(hr)) { - XELOGE("D3D11: unable to map vertex fetch buffer"); - XESAFERELEASE(buffer); - return 1; - } - uint32_t address = (fetch->address << 2) + address_translation_; - uint8_t* src = (uint8_t*)memory_->Translate(address); - uint8_t* dest = (uint8_t*)res.pData; - // TODO(benvanik): rewrite to be faster/special case common/etc - for (size_t n = 0; n < desc.element_count; n++) { - auto& el = desc.elements[n]; - uint32_t stride = desc.stride_words; - uint32_t count = fetch->size / stride; - uint32_t* src_ptr = (uint32_t*)(src + el.offset_words * 4); - uint32_t* dest_ptr = (uint32_t*)(dest + el.offset_words * 4); - uint32_t o = 0; - for (uint32_t i = 0; i < count; i++) { - for (uint32_t j = 0; j < el.size_words; j++) { - dest_ptr[o + j] = XESWAP32(src_ptr[o + j]); - } - o += stride; - } - } - context_->Unmap(buffer, 0); - - D3D11VertexShader* vs = state_.vertex_shader; - if (!vs) { - return 1; - } - // TODO(benvanik): always dword aligned? - uint32_t stride = desc.stride_words * 4; - uint32_t offset = 0; - int vb_slot = desc.input_index; - context_->IASetVertexBuffers(vb_slot, 1, &buffer, &stride, &offset); - - buffer->Release(); - - return 0; -} - -int D3D11GraphicsDriver::PrepareTextureFetchers() { - RegisterFile& rf = register_file_; - - for (int n = 0; n < XECOUNT(state_.texture_fetchers); n++) { - auto& fetcher = state_.texture_fetchers[n]; - - // TODO(benvanik): caching. - fetcher.enabled = false; - XESAFERELEASE(fetcher.view); - fetcher.view = NULL; - - int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + n * 6; - xe_gpu_fetch_group_t* group = (xe_gpu_fetch_group_t*)&rf.values[r]; - auto& fetch = group->texture_fetch; - if (fetch.type != 0x2) { - continue; - } - - // Stash a copy of the fetch register. 
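// Fetch-constant addressing used above, spelled out: fetch constants live in
// 6-dword groups, and one group holds either a single texture fetch or three
// 2-dword vertex fetches. Hence fetch_slot / 3 selects the group register
// and fetch_slot % 3 selects the vertex fetch within it:
inline uint32_t VertexFetchGroupRegister(uint32_t fetch_slot) {
  return XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (fetch_slot / 3) * 6;
}
// e.g. fetch_slot 7 -> group base +12 dwords, field vertex_fetch_1.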
- fetcher.fetch = fetch; - - fetcher.info = GetTextureInfo(fetch); - if (fetcher.info.format == DXGI_FORMAT_UNKNOWN) { - XELOGW("D3D11: unknown texture format %d", fetch.format); - continue; - } - - D3D11_SHADER_RESOURCE_VIEW_DESC texture_view_desc; - xe_zero_struct(&texture_view_desc, sizeof(texture_view_desc)); - // TODO(benvanik): this may need to be typed on the fetch instruction (float/int/etc?) - texture_view_desc.Format = fetcher.info.format; - - ID3D11Resource* texture = NULL; - D3D_SRV_DIMENSION dimension = D3D11_SRV_DIMENSION_UNKNOWN; - switch (fetch.dimension) { - case DIMENSION_1D: - texture_view_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1D; - texture_view_desc.Texture1D.MipLevels = 1; - texture_view_desc.Texture1D.MostDetailedMip = 0; - if (FetchTexture1D(fetch, fetcher.info, &texture)) { - XELOGE("D3D11: failed to fetch Texture1D"); - return 1; - } - break; - case DIMENSION_2D: - texture_view_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; - texture_view_desc.Texture2D.MipLevels = 1; - texture_view_desc.Texture2D.MostDetailedMip = 0; - if (FetchTexture2D(fetch, fetcher.info, &texture)) { - XELOGE("D3D11: failed to fetch Texture2D"); - return 1; - } - break; - case DIMENSION_3D: - texture_view_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D; - texture_view_desc.Texture3D.MipLevels = 1; - texture_view_desc.Texture3D.MostDetailedMip = 0; - if (FetchTexture3D(fetch, fetcher.info, &texture)) { - XELOGE("D3D11: failed to fetch Texture3D"); - return 1; - } - break; - case DIMENSION_CUBE: - texture_view_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBE; - texture_view_desc.TextureCube.MipLevels = 1; - texture_view_desc.TextureCube.MostDetailedMip = 0; - if (FetchTextureCube(fetch, fetcher.info, &texture)) { - XELOGE("D3D11: failed to fetch TextureCube"); - return 1; - } - break; - } - - XEASSERTNOTNULL(texture); - - ID3D11ShaderResourceView* texture_view = NULL; - HRESULT hr = device_->CreateShaderResourceView( - texture, &texture_view_desc, &texture_view); - if (FAILED(hr)) { - XELOGE("D3D11: unable to create texture resource view"); - texture->Release(); - return 1; - } - texture->Release(); - - fetcher.enabled = true; - fetcher.view = texture_view; - } - - return 0; -} - -// http://msdn.microsoft.com/en-us/library/windows/desktop/cc308051(v=vs.85).aspx -D3D11GraphicsDriver::TextureInfo D3D11GraphicsDriver::GetTextureInfo( - xe_gpu_texture_fetch_t& fetch) { - // a2xx_sq_surfaceformat - TextureInfo info; - info.format = DXGI_FORMAT_UNKNOWN; - info.block_size = 0; - info.texel_pitch = 0; - info.is_compressed = false; - switch (fetch.format) { - case FMT_8: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_RRR1: - info.format = DXGI_FORMAT_R8_UNORM; - break; - case XE_GPU_SWIZZLE_000R: - info.format = DXGI_FORMAT_A8_UNORM; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_8"); - info.format = DXGI_FORMAT_A8_UNORM; - break; - } - info.block_size = 1; - info.texel_pitch = 1; - break; - case FMT_1_5_5_5: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_BGRA: - info.format = DXGI_FORMAT_B5G5R5A1_UNORM; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_1_5_5_5"); - info.format = DXGI_FORMAT_B5G5R5A1_UNORM; - break; - } - info.block_size = 1; - info.texel_pitch = 2; - break; - case FMT_8_8_8_8: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_RGBA: - info.format = DXGI_FORMAT_R8G8B8A8_UNORM; - break; - case XE_GPU_SWIZZLE_BGRA: - info.format = DXGI_FORMAT_B8G8R8A8_UNORM; - break; - case XE_GPU_SWIZZLE_RGB1: - info.format = DXGI_FORMAT_R8G8B8A8_UNORM; 
// ? - break; - case XE_GPU_SWIZZLE_BGR1: - info.format = DXGI_FORMAT_B8G8R8X8_UNORM; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_8_8_8_8"); - info.format = DXGI_FORMAT_R8G8B8A8_UNORM; - break; - } - info.block_size = 1; - info.texel_pitch = 4; - break; - case FMT_4_4_4_4: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_BGRA: - info.format = DXGI_FORMAT_B4G4R4A4_UNORM; // only supported on Windows 8+ - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_4_4_4_4"); - info.format = DXGI_FORMAT_B4G4R4A4_UNORM; // only supported on Windows 8+ - break; - } - info.block_size = 1; - info.texel_pitch = 2; - break; - case FMT_16_16_16_16_FLOAT: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_RGBA: - info.format = DXGI_FORMAT_R16G16B16A16_FLOAT; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_16_16_16_16_FLOAT"); - info.format = DXGI_FORMAT_R16G16B16A16_FLOAT; - break; - } - info.block_size = 1; - info.texel_pitch = 8; - break; - case FMT_32_FLOAT: - switch (fetch.swizzle) { - case XE_GPU_SWIZZLE_R111: - info.format = DXGI_FORMAT_R32_FLOAT; - break; - default: - XELOGW("D3D11: unhandled swizzle for FMT_32_FLOAT"); - info.format = DXGI_FORMAT_R32_FLOAT; - break; - } - info.block_size = 1; - info.texel_pitch = 4; - break; - case FMT_DXT1: - info.format = DXGI_FORMAT_BC1_UNORM; - info.block_size = 4; - info.texel_pitch = 8; - info.is_compressed = true; - break; - case FMT_DXT2_3: - case FMT_DXT4_5: - info.format = (fetch.format == FMT_DXT4_5 ? DXGI_FORMAT_BC3_UNORM : DXGI_FORMAT_BC2_UNORM); - info.block_size = 4; - info.texel_pitch = 16; - info.is_compressed = true; - break; - case FMT_1_REVERSE: - case FMT_1: - case FMT_5_6_5: - case FMT_6_5_5: - case FMT_2_10_10_10: - case FMT_8_A: - case FMT_8_B: - case FMT_8_8: - case FMT_Cr_Y1_Cb_Y0: - case FMT_Y1_Cr_Y0_Cb: - case FMT_5_5_5_1: - case FMT_8_8_8_8_A: - case FMT_10_11_11: - case FMT_11_11_10: - case FMT_24_8: - case FMT_24_8_FLOAT: - case FMT_16: - case FMT_16_16: - case FMT_16_16_16_16: - case FMT_16_EXPAND: - case FMT_16_16_EXPAND: - case FMT_16_16_16_16_EXPAND: - case FMT_16_FLOAT: - case FMT_16_16_FLOAT: - case FMT_32: - case FMT_32_32: - case FMT_32_32_32_32: - case FMT_32_32_FLOAT: - case FMT_32_32_32_32_FLOAT: - case FMT_32_AS_8: - case FMT_32_AS_8_8: - case FMT_16_MPEG: - case FMT_16_16_MPEG: - case FMT_8_INTERLACED: - case FMT_32_AS_8_INTERLACED: - case FMT_32_AS_8_8_INTERLACED: - case FMT_16_INTERLACED: - case FMT_16_MPEG_INTERLACED: - case FMT_16_16_MPEG_INTERLACED: - case FMT_DXN: - case FMT_8_8_8_8_AS_16_16_16_16: - case FMT_DXT1_AS_16_16_16_16: - case FMT_DXT2_3_AS_16_16_16_16: - case FMT_DXT4_5_AS_16_16_16_16: - case FMT_2_10_10_10_AS_16_16_16_16: - case FMT_10_11_11_AS_16_16_16_16: - case FMT_11_11_10_AS_16_16_16_16: - case FMT_32_32_32_FLOAT: - case FMT_DXT3A: - case FMT_DXT5A: - case FMT_CTX1: - case FMT_DXT3A_AS_1_1_1_1: - info.format = DXGI_FORMAT_UNKNOWN; - break; - } - return info; -} - -int D3D11GraphicsDriver::FetchTexture1D( - xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture) { - uint32_t address = (fetch.address << 12) + address_translation_; - - uint32_t width = 1 + fetch.size_1d.width; - - D3D11_TEXTURE1D_DESC texture_desc; - xe_zero_struct(&texture_desc, sizeof(texture_desc)); - texture_desc.Width = width; - texture_desc.MipLevels = 1; - texture_desc.ArraySize = 1; - texture_desc.Format = info.format; - texture_desc.Usage = D3D11_USAGE_DYNAMIC; - texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; - texture_desc.CPUAccessFlags = 
D3D11_CPU_ACCESS_WRITE; - texture_desc.MiscFlags = 0; // D3D11_RESOURCE_MISC_GENERATE_MIPS? - HRESULT hr = device_->CreateTexture1D( - &texture_desc, NULL, (ID3D11Texture1D**)out_texture); - if (FAILED(hr)) { - return 1; - } - - return 0; -} - -XEFORCEINLINE void TextureSwap(uint8_t* dest, const uint8_t* src, uint32_t pitch, XE_GPU_ENDIAN endianness) { - switch (endianness) { - case XE_GPU_ENDIAN_8IN16: - for (uint32_t i = 0; i < pitch; i += 2, src += 2, dest += 2) { - *(uint16_t*)dest = XESWAP16(*(uint16_t*)src); - } - break; - case XE_GPU_ENDIAN_8IN32: // Swap bytes. - for (uint32_t i = 0; i < pitch; i += 4, src += 4, dest += 4) { - *(uint32_t*)dest = XESWAP32(*(uint32_t*)src); - } - break; - case XE_GPU_ENDIAN_16IN32: // Swap half words. - for (uint32_t i = 0; i < pitch; i += 4, src += 4, dest += 4) { - uint32_t value = *(uint32_t*)src; - *(uint32_t*)dest = ((value >> 16) & 0xFFFF) | (value << 16); - } - break; - default: - case XE_GPU_ENDIAN_NONE: - memcpy(dest, src, pitch); - break; - } -} - -// https://code.google.com/p/crunch/source/browse/trunk/inc/crn_decomp.h#4104 -XEFORCEINLINE uint32_t TiledOffset2DOuter(uint32_t y, uint32_t width, uint32_t log_bpp) -{ - uint32_t macro = ((y >> 5) * (width >> 5)) << (log_bpp + 7); - uint32_t micro = ((y & 6) << 2) << log_bpp; - return macro + ((micro & ~15) << 1) + (micro & 15) + ((y & 8) << (3 + log_bpp)) + ((y & 1) << 4); -} - -XEFORCEINLINE uint32_t TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp, uint32_t base_offset) -{ - uint32_t macro = (x >> 5) << (bpp + 7); - uint32_t micro = (x & 7) << bpp; - uint32_t offset = base_offset + (macro + ((micro & ~15) << 1) + (micro & 15)); - return ((offset & ~511) << 3) + ((offset & 448) << 2) + (offset & 63) + - ((y & 16) << 7) + (((((y & 8) >> 2) + (x >> 3)) & 3) << 6); -} - -int D3D11GraphicsDriver::FetchTexture2D( - xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture) { - XEASSERTTRUE(fetch.dimension == 1); - - uint32_t address = (fetch.address << 12) + address_translation_; - - uint32_t logical_width = 1 + fetch.size_2d.width; - uint32_t logical_height = 1 + fetch.size_2d.height; - - uint32_t block_width = logical_width / info.block_size; - uint32_t block_height = logical_height / info.block_size; - - uint32_t input_width, input_height; - uint32_t output_width, output_height; - - if (!info.is_compressed) { - // must be 32x32, but also must have a pitch that is a multiple of 256 bytes - uint32_t bytes_per_block = info.block_size * info.block_size * info.texel_pitch; - uint32_t width_multiple = 32; - if (bytes_per_block) { - uint32_t minimum_multiple = 256 / bytes_per_block; - if (width_multiple < minimum_multiple) { - width_multiple = minimum_multiple; - } - } - - input_width = XEROUNDUP(logical_width, width_multiple); - input_height = XEROUNDUP(logical_height, 32); - output_width = logical_width; - output_height = logical_height; - } - else { - // must be 128x128 - input_width = XEROUNDUP(logical_width, 128); - input_height = XEROUNDUP(logical_height, 128); - output_width = XENEXTPOW2(logical_width); - output_height = XENEXTPOW2(logical_height); - } - - D3D11_TEXTURE2D_DESC texture_desc; - xe_zero_struct(&texture_desc, sizeof(texture_desc)); - texture_desc.Width = output_width; - texture_desc.Height = output_height; - texture_desc.MipLevels = 1; - texture_desc.ArraySize = 1; - texture_desc.Format = info.format; - texture_desc.SampleDesc.Count = 1; - texture_desc.SampleDesc.Quality = 0; - texture_desc.Usage = D3D11_USAGE_DYNAMIC; - texture_desc.BindFlags = 
D3D11_BIND_SHADER_RESOURCE; - texture_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - texture_desc.MiscFlags = 0; // D3D11_RESOURCE_MISC_GENERATE_MIPS? - HRESULT hr = device_->CreateTexture2D( - &texture_desc, NULL, (ID3D11Texture2D**)out_texture); - if (FAILED(hr)) { - return 1; - } - - // TODO(benvanik): all mip levels. - D3D11_MAPPED_SUBRESOURCE res; - hr = context_->Map(*out_texture, 0, - D3D11_MAP_WRITE_DISCARD, 0, &res); - if (FAILED(hr)) { - XELOGE("D3D11: failed to map texture"); - return 1; - } - - auto logical_pitch = (logical_width / info.block_size) * info.texel_pitch; - auto input_pitch = (input_width / info.block_size) * info.texel_pitch; - auto output_pitch = res.RowPitch; // (output_width / info.block_size) * info.texel_pitch; - - const uint8_t* src = memory_->Translate(address); - uint8_t* dest = (uint8_t*)res.pData; - - memset(dest, 0, output_pitch * (output_height / info.block_size)); // TODO(gibbed): remove me later - - if (!fetch.tiled) { - dest = (uint8_t*)res.pData; - for (uint32_t y = 0; y < block_height; y++) { - for (uint32_t x = 0; x < logical_pitch; x += info.texel_pitch) { - TextureSwap(dest + x, src + x, info.texel_pitch, (XE_GPU_ENDIAN)fetch.endianness); - } - src += input_pitch; - dest += output_pitch; - } - } - else { - auto bpp = (info.texel_pitch >> 2) + ((info.texel_pitch >> 1) >> (info.texel_pitch >> 2)); - for (uint32_t y = 0, output_base_offset = 0; y < block_height; y++, output_base_offset += output_pitch) { - auto input_base_offset = TiledOffset2DOuter(y, (input_width / info.block_size), bpp); - for (uint32_t x = 0, output_offset = output_base_offset; x < block_width; x++, output_offset += info.texel_pitch) { - auto input_offset = TiledOffset2DInner(x, y, bpp, input_base_offset) >> bpp; - TextureSwap(dest + output_offset, - src + input_offset * info.texel_pitch, - info.texel_pitch, (XE_GPU_ENDIAN)fetch.endianness); - } - } - } - context_->Unmap(*out_texture, 0); - return 0; -} - -int D3D11GraphicsDriver::FetchTexture3D( - xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture) { - XELOGE("D3D11: FetchTexture2D not yet implemented"); - XEASSERTALWAYS(); - return 1; - //D3D11_TEXTURE3D_DESC texture_desc; - //xe_zero_struct(&texture_desc, sizeof(texture_desc)); - //texture_desc.Width; - //texture_desc.Height; - //texture_desc.Depth; - //texture_desc.MipLevels; - //texture_desc.Format; - //texture_desc.Usage; - //texture_desc.BindFlags; - //texture_desc.CPUAccessFlags; - //texture_desc.MiscFlags; - //hr = device_->CreateTexture3D( - // &texture_desc, &initial_data, (ID3D11Texture3D**)&texture); -} - -int D3D11GraphicsDriver::FetchTextureCube( - xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture) { - XELOGE("D3D11: FetchTextureCube not yet implemented"); - XEASSERTALWAYS(); - return 1; -} - -int D3D11GraphicsDriver::PrepareTextureSampler( - xenos::XE_GPU_SHADER_TYPE shader_type, Shader::tex_buffer_desc_t& desc) { - - auto& fetcher = state_.texture_fetchers[desc.fetch_slot]; - auto& info = fetcher.info; - if (!fetcher.enabled || - info.format == DXGI_FORMAT_UNKNOWN) { - XELOGW("D3D11: ignoring texture fetch: disabled or an unknown format"); - if (shader_type == XE_GPU_SHADER_TYPE_VERTEX) { - context_->VSSetShaderResources(desc.input_index, - 1, &invalid_texture_view_); - context_->VSSetSamplers(desc.input_index, - 1, &invalid_texture_sampler_state_); - } else { - context_->PSSetShaderResources(desc.input_index, - 1, &invalid_texture_view_); - context_->PSSetSamplers(desc.input_index, - 1, 
&invalid_texture_sampler_state_); - } - return 0; - } - - HRESULT hr; - - if (shader_type == XE_GPU_SHADER_TYPE_VERTEX) { - context_->VSSetShaderResources(desc.input_index, 1, &fetcher.view); - } else { - context_->PSSetShaderResources(desc.input_index, 1, &fetcher.view); - } - - D3D11_SAMPLER_DESC sampler_desc; - xe_zero_struct(&sampler_desc, sizeof(sampler_desc)); - uint32_t min_filter = desc.tex_fetch.min_filter == 3 ? - fetcher.fetch.min_filter : desc.tex_fetch.min_filter; - uint32_t mag_filter = desc.tex_fetch.mag_filter == 3 ? - fetcher.fetch.mag_filter : desc.tex_fetch.mag_filter; - uint32_t mip_filter = desc.tex_fetch.mip_filter == 3 ? - fetcher.fetch.mip_filter : desc.tex_fetch.mip_filter; - // MIN, MAG, MIP - static const D3D11_FILTER filter_matrix[2][2][3] = { - { - // min = POINT - { - // mag = POINT - D3D11_FILTER_MIN_MAG_MIP_POINT, - D3D11_FILTER_MIN_MAG_POINT_MIP_LINEAR, - D3D11_FILTER_MIN_MAG_POINT_MIP_LINEAR, // basemap? - }, - { - // mag = LINEAR - D3D11_FILTER_MIN_POINT_MAG_LINEAR_MIP_POINT, - D3D11_FILTER_MIN_POINT_MAG_MIP_LINEAR, - D3D11_FILTER_MIN_POINT_MAG_MIP_LINEAR, // basemap? - }, - }, - { - // min = LINEAR - { - // mag = POINT - D3D11_FILTER_MIN_LINEAR_MAG_MIP_POINT, - D3D11_FILTER_MIN_LINEAR_MAG_POINT_MIP_LINEAR, - D3D11_FILTER_MIN_LINEAR_MAG_POINT_MIP_LINEAR, // basemap? - }, - { - // mag = LINEAR - D3D11_FILTER_MIN_MAG_LINEAR_MIP_POINT, - D3D11_FILTER_MIN_MAG_MIP_LINEAR, - D3D11_FILTER_MIN_MAG_MIP_LINEAR, // basemap? - }, - }, - }; - sampler_desc.Filter = filter_matrix[min_filter][mag_filter][mip_filter]; - static const D3D11_TEXTURE_ADDRESS_MODE mode_map[] = { - D3D11_TEXTURE_ADDRESS_WRAP, - D3D11_TEXTURE_ADDRESS_MIRROR, - D3D11_TEXTURE_ADDRESS_CLAMP, // ? - D3D11_TEXTURE_ADDRESS_MIRROR_ONCE, // ? - D3D11_TEXTURE_ADDRESS_CLAMP, // ? - D3D11_TEXTURE_ADDRESS_MIRROR_ONCE, // ? - D3D11_TEXTURE_ADDRESS_BORDER, // ? - D3D11_TEXTURE_ADDRESS_MIRROR, // ? 
- }; - sampler_desc.AddressU = mode_map[fetcher.fetch.clamp_x]; - sampler_desc.AddressV = mode_map[fetcher.fetch.clamp_y]; - sampler_desc.AddressW = mode_map[fetcher.fetch.clamp_z]; - sampler_desc.MipLODBias; - sampler_desc.MaxAnisotropy = 1; - sampler_desc.ComparisonFunc = D3D11_COMPARISON_ALWAYS; - sampler_desc.BorderColor[0]; - sampler_desc.BorderColor[1]; - sampler_desc.BorderColor[2]; - sampler_desc.BorderColor[3]; - sampler_desc.MinLOD; - sampler_desc.MaxLOD; - ID3D11SamplerState* sampler_state = NULL; - hr = device_->CreateSamplerState(&sampler_desc, &sampler_state); - if (FAILED(hr)) { - XELOGE("D3D11: unable to create sampler state"); - return 1; - } - if (shader_type == XE_GPU_SHADER_TYPE_VERTEX) { - context_->VSSetSamplers(desc.input_index, 1, &sampler_state); - } else { - context_->PSSetSamplers(desc.input_index, 1, &sampler_state); - } - sampler_state->Release(); - - return 0; -} - -int D3D11GraphicsDriver::PrepareIndexBuffer( - bool index_32bit, uint32_t index_count, - uint32_t index_base, uint32_t index_size, uint32_t endianness) { - RegisterFile& rf = register_file_; - - uint32_t address = index_base + address_translation_; - - // All that's done so far: - XEASSERT(endianness == 0x2); - - ID3D11Buffer* buffer = 0; - D3D11_BUFFER_DESC buffer_desc; - xe_zero_struct(&buffer_desc, sizeof(buffer_desc)); - buffer_desc.ByteWidth = index_size; - buffer_desc.Usage = D3D11_USAGE_DYNAMIC; - buffer_desc.BindFlags = D3D11_BIND_INDEX_BUFFER; - buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - device_->CreateBuffer(&buffer_desc, NULL, &buffer); - D3D11_MAPPED_SUBRESOURCE res; - context_->Map(buffer, 0, D3D11_MAP_WRITE_DISCARD, 0, &res); - if (index_32bit) { - uint32_t* src = (uint32_t*)memory_->Translate(address); - uint32_t* dest = (uint32_t*)res.pData; - for (uint32_t n = 0; n < index_count; n++) { - uint32_t d = { XESWAP32(src[n]) }; - //XELOGGPU("i%.4d %0.8X", n, d); - dest[n] = d; - } - } else { - uint16_t* src = (uint16_t*)memory_->Translate(address); - uint16_t* dest = (uint16_t*)res.pData; - for (uint32_t n = 0; n < index_count; n++) { - uint16_t d = XESWAP16(src[n]); - //XELOGGPU("i%.4d, %.4X", n, d); - dest[n] = d; - } - } - context_->Unmap(buffer, 0); - - DXGI_FORMAT format; - format = index_32bit ? DXGI_FORMAT_R32_UINT : DXGI_FORMAT_R16_UINT; - context_->IASetIndexBuffer(buffer, format, 0); - - buffer->Release(); - - return 0; -} - int D3D11GraphicsDriver::Resolve() { + SCOPE_profile_cpu_f("gpu"); + // No clue how this is supposed to work yet. 
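// A note on FetchTexture2D above: the cryptic
//   bpp = (texel_pitch >> 2) + ((texel_pitch >> 1) >> (texel_pitch >> 2))
// is a branch-free log2 for the power-of-two texel pitches this code uses,
// which the tiled-offset helpers need as a shift amount. Tabulating it:
inline uint32_t TiledLog2Bpp(uint32_t texel_pitch) {
  return (texel_pitch >> 2) + ((texel_pitch >> 1) >> (texel_pitch >> 2));
}
// texel_pitch:   1  2  4  8  16
// TiledLog2Bpp:  0  1  2  3  4    (== log2(texel_pitch))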
ID3D11Texture2D* back_buffer = 0; swap_chain_->GetBuffer(0, __uuidof(ID3D11Texture2D), diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_driver.h b/src/xenia/gpu/d3d11/d3d11_graphics_driver.h index 94ccfe748..2d23b142f 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_driver.h +++ b/src/xenia/gpu/d3d11/d3d11_graphics_driver.h @@ -13,8 +13,8 @@ #include #include -#include #include +#include #include #include @@ -24,10 +24,6 @@ namespace xe { namespace gpu { namespace d3d11 { -class D3D11PixelShader; -class D3D11ShaderCache; -class D3D11VertexShader; - class D3D11GraphicsDriver : public GraphicsDriver { public: @@ -35,69 +31,43 @@ public: Memory* memory, IDXGISwapChain* swap_chain, ID3D11Device* device); virtual ~D3D11GraphicsDriver(); - virtual void Initialize(); + ResourceCache* resource_cache() const override { return resource_cache_; } - virtual void InvalidateState( - uint32_t mask); - virtual void SetShader( - xenos::XE_GPU_SHADER_TYPE type, - uint32_t address, - uint32_t start, - uint32_t length); - virtual void DrawIndexBuffer( - xenos::XE_GPU_PRIMITIVE_TYPE prim_type, - bool index_32bit, uint32_t index_count, - uint32_t index_base, uint32_t index_size, uint32_t endianness); - virtual void DrawIndexAuto( - xenos::XE_GPU_PRIMITIVE_TYPE prim_type, - uint32_t index_count); + int Initialize() override; + + int Draw(const DrawCommand& command) override; // TODO(benvanik): figure this out. - virtual int Resolve(); + int Resolve() override; private: - int SetupDraw(xenos::XE_GPU_PRIMITIVE_TYPE prim_type); + void InitializeInvalidTexture(); + + int UpdateState(const DrawCommand& command); + int SetupRasterizerState(const DrawCommand& command); + int SetupBlendState(const DrawCommand& command); + int SetupDepthStencilState(const DrawCommand& command); + int SetupConstantBuffers(const DrawCommand& command); + int SetupShaders(const DrawCommand& command); + int SetupInputAssembly(const DrawCommand& command); + int SetupSamplers(const DrawCommand& command); + int RebuildRenderTargets(uint32_t width, uint32_t height); - int UpdateState(uint32_t state_overrides = 0); - int UpdateConstantBuffers(); - int BindShaders(); - int PrepareFetchers(); - int PrepareVertexBuffer(Shader::vtx_buffer_desc_t& desc); - int PrepareTextureFetchers(); - int PrepareTextureSampler(xenos::XE_GPU_SHADER_TYPE shader_type, - Shader::tex_buffer_desc_t& desc); - typedef struct { - DXGI_FORMAT format; - uint32_t block_size; - uint32_t texel_pitch; - bool is_compressed; - } TextureInfo; - TextureInfo GetTextureInfo(xenos::xe_gpu_texture_fetch_t& fetch); - int FetchTexture1D(xenos::xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture); - int FetchTexture2D(xenos::xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture); - int FetchTexture3D(xenos::xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture); - int FetchTextureCube(xenos::xe_gpu_texture_fetch_t& fetch, - TextureInfo& info, - ID3D11Resource** out_texture); - int PrepareIndexBuffer( - bool index_32bit, uint32_t index_count, - uint32_t index_base, uint32_t index_size, uint32_t endianness); private: IDXGISwapChain* swap_chain_; ID3D11Device* device_; ID3D11DeviceContext* context_; - D3D11ShaderCache* shader_cache_; + + D3D11ResourceCache* resource_cache_; ID3D11ShaderResourceView* invalid_texture_view_; ID3D11SamplerState* invalid_texture_sampler_state_; + std::unordered_map rasterizer_state_cache_; + std::unordered_map blend_state_cache_; + std::unordered_map depth_stencil_state_cache_; + 
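// Note on the three cache maps above: the old UpdateState path created and
// released rasterizer/blend/depth-stencil objects on every draw; keyed by a
// hash of the guest register words, a steady-state frame now makes zero
// Create*State calls. Teardown is assumed to release each entry in the
// destructor (not shown in this diff), along the lines of:
//   for (auto& it : rasterizer_state_cache_) { XESAFERELEASE(it.second); }
//   for (auto& it : blend_state_cache_) { XESAFERELEASE(it.second); }
//   for (auto& it : depth_stencil_state_cache_) { XESAFERELEASE(it.second); }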
struct { uint32_t width; uint32_t height; @@ -111,9 +81,6 @@ private: } render_targets_; struct { - D3D11VertexShader* vertex_shader; - D3D11PixelShader* pixel_shader; - struct { ID3D11Buffer* float_constants; ID3D11Buffer* bool_constants; @@ -121,18 +88,7 @@ private: ID3D11Buffer* vs_consts; ID3D11Buffer* gs_consts; } constant_buffers; - - struct { - bool enabled; - xenos::xe_gpu_texture_fetch_t fetch; - TextureInfo info; - ID3D11ShaderResourceView* view; - } texture_fetchers[32]; } state_; - - enum StateOverrides { - STATE_OVERRIDE_DISABLE_CULLING = (1 << 0), - }; }; diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc index f1432dc50..8e6fc5a7e 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc @@ -20,21 +20,11 @@ using namespace xe::gpu; using namespace xe::gpu::d3d11; -namespace { - -void __stdcall D3D11GraphicsSystemVsyncCallback( - D3D11GraphicsSystem* gs, BOOLEAN) { - gs->MarkVblank(); - gs->DispatchInterruptCallback(0); -} - -} - - -D3D11GraphicsSystem::D3D11GraphicsSystem(Emulator* emulator) : - window_(0), dxgi_factory_(0), device_(0), - timer_queue_(NULL), vsync_timer_(NULL), - GraphicsSystem(emulator) { +D3D11GraphicsSystem::D3D11GraphicsSystem(Emulator* emulator) + : GraphicsSystem(emulator), + window_(nullptr), dxgi_factory_(nullptr), device_(nullptr), + timer_queue_(nullptr), vsync_timer_(nullptr), + last_swap_time_(0.0) { } D3D11GraphicsSystem::~D3D11GraphicsSystem() { @@ -50,10 +40,10 @@ void D3D11GraphicsSystem::Initialize() { CreateTimerQueueTimer( &vsync_timer_, timer_queue_, - (WAITORTIMERCALLBACK)D3D11GraphicsSystemVsyncCallback, + (WAITORTIMERCALLBACK)VsyncCallback, this, 16, - 100, + 16, WT_EXECUTEINTIMERTHREAD); // Create DXGI factory so we can get a swap chain/etc. @@ -139,31 +129,55 @@ void D3D11GraphicsSystem::Initialize() { XEASSERTNULL(driver_); driver_ = new D3D11GraphicsDriver( memory_, window_->swap_chain(), device_); + if (driver_->Initialize()) { + XELOGE("Unable to initialize D3D11 driver"); + return; + } // Initial vsync kick. DispatchInterruptCallback(0); } void D3D11GraphicsSystem::Pump() { - if (swap_pending_) { - swap_pending_ = false; + SCOPE_profile_cpu_f("gpu"); - // TODO(benvanik): remove this when commands are understood. - driver_->Resolve(); - - // Swap window. - // If we are set to vsync this will block. - window_->Swap(); - - DispatchInterruptCallback(0); - } else { - // If we have gone too long without an interrupt, fire one. - if (xe_pal_now() - last_interrupt_time_ > 500 / 1000.0) { - DispatchInterruptCallback(0); + double time_since_last_swap = xe_pal_now() - last_swap_time_; + if (time_since_last_swap > 1.0) { + // Force a swap when profiling. + if (Profiler::is_enabled()) { + window_->Swap(); } } } +void D3D11GraphicsSystem::Swap() { + // TODO(benvanik): remove this when commands are understood. + driver_->Resolve(); + + // Swap window. + // If we are set to vsync this will block. + window_->Swap(); + + last_swap_time_ = xe_pal_now(); +} + +void __stdcall D3D11GraphicsSystem::VsyncCallback(D3D11GraphicsSystem* gs, + BOOLEAN) { + static bool thread_name_set = false; + if (!thread_name_set) { + thread_name_set = true; + Profiler::ThreadEnter("VsyncTimer"); + } + SCOPE_profile_cpu_f("gpu"); + + gs->MarkVblank(); + + // TODO(benvanik): we shouldn't need to do the dispatch here, but there's + // something wrong and the CP will block waiting for code that + // needs to be run in the interrupt. 
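+  // (The 16ms timer registered in Initialize approximates a 60Hz vblank;
+  // dispatching here keeps the CP from stalling until the real interrupt
+  // path is sorted out.)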
+ gs->DispatchInterruptCallback(0); +} + void D3D11GraphicsSystem::Shutdown() { GraphicsSystem::Shutdown(); diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.h b/src/xenia/gpu/d3d11/d3d11_graphics_system.h index 0414d1bb5..7bd641667 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.h +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.h @@ -35,17 +35,23 @@ public: virtual void Shutdown(); + void Swap() override; + protected: virtual void Initialize(); virtual void Pump(); private: + static void __stdcall VsyncCallback(D3D11GraphicsSystem* gs, BOOLEAN); + IDXGIFactory1* dxgi_factory_; ID3D11Device* device_; D3D11Window* window_; HANDLE timer_queue_; HANDLE vsync_timer_; + + double last_swap_time_; }; diff --git a/src/xenia/gpu/d3d11/d3d11_profiler_display.cc b/src/xenia/gpu/d3d11/d3d11_profiler_display.cc new file mode 100644 index 000000000..276b73c40 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_profiler_display.cc @@ -0,0 +1,647 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include + +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; + + +namespace { +const uint8_t profiler_font[] = { + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x10,0x78,0x38,0x78,0x7c,0x7c,0x3c,0x44,0x38,0x04,0x44,0x40,0x44,0x44,0x38,0x78, + 0x38,0x78,0x38,0x7c,0x44,0x44,0x44,0x44,0x44,0x7c,0x00,0x00,0x40,0x00,0x04,0x00, + 0x18,0x00,0x40,0x10,0x08,0x40,0x30,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x38,0x10,0x38,0x7c,0x08,0x7c,0x1c,0x7c,0x38,0x38, + 0x10,0x28,0x28,0x10,0x00,0x20,0x10,0x08,0x10,0x10,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x04,0x00,0x20,0x38,0x38,0x70,0x00,0x1c,0x10,0x00,0x1c,0x10,0x70,0x30,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x28,0x44,0x44,0x44,0x40,0x40,0x40,0x44,0x10,0x04,0x48,0x40,0x6c,0x44,0x44,0x44, + 0x44,0x44,0x44,0x10,0x44,0x44,0x44,0x44,0x44,0x04,0x00,0x00,0x40,0x00,0x04,0x00, + 0x24,0x00,0x40,0x00,0x00,0x40,0x10,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x10,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x44,0x30,0x44,0x04,0x18,0x40,0x20,0x04,0x44,0x44, + 0x10,0x28,0x28,0x3c,0x44,0x50,0x10,0x10,0x08,0x54,0x10,0x00,0x00,0x00,0x04,0x00, + 0x00,0x08,0x00,0x10,0x44,0x44,0x40,0x40,0x04,0x28,0x00,0x30,0x10,0x18,0x58,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x44,0x44,0x40,0x44,0x40,0x40,0x40,0x44,0x10,0x04,0x50,0x40,0x54,0x64,0x44,0x44, + 0x44,0x44,0x40,0x10,0x44,0x44,0x44,0x28,0x28,0x08,0x00,0x38,0x78,0x3c,0x3c,0x38, + 0x20,0x38,0x78,0x30,0x18,0x44,0x10,0x6c,0x78,0x38,0x78,0x3c,0x5c,0x3c,0x3c,0x44, + 0x44,0x44,0x44,0x44,0x7c,0x00,0x4c,0x10,0x04,0x08,0x28,0x78,0x40,0x08,0x44,0x44, + 0x10,0x00,0x7c,0x50,0x08,0x50,0x00,0x20,0x04,0x38,0x10,0x00,0x00,0x00,0x08,0x10, + 0x10,0x10,0x7c,0x08,0x08,0x54,0x40,0x20,0x04,0x44,0x00,0x30,0x10,0x18,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x44,0x78,0x40,0x44,0x78,0x78,0x40,0x7c,0x10,0x04,0x60,0x40,0x54,0x54,0x44,0x78, + 0x44,0x78,0x38,0x10,0x44,0x44,0x54,0x10,0x10,0x10,0x00,0x04,0x44,0x40,0x44,0x44, + 0x78,0x44,0x44,0x10,0x08,0x48,0x10,0x54,0x44,0x44,0x44,0x44,0x60,0x40,0x10,0x44, + 0x44,0x44,0x28,0x44,0x08,0x00,0x54,0x10,0x18,0x18,0x48,0x04,0x78,0x10,0x38,0x3c, + 0x10,0x00,0x28,0x38,0x10,0x20,0x00,0x20,0x04,0x10,0x7c,0x00,0x7c,0x00,0x10,0x00, + 0x00,0x20,0x00,0x04,0x10,0x5c,0x40,0x10,0x04,0x00,0x00,0x60,0x10,0x0c,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x7c,0x44,0x40,0x44,0x40,0x40,0x4c,0x44,0x10,0x04,0x50,0x40,0x44,0x4c,0x44,0x40, + 0x54,0x50,0x04,0x10,0x44,0x44,0x54,0x28,0x10,0x20,0x00,0x3c,0x44,0x40,0x44,0x7c, + 0x20,0x44,0x44,0x10,0x08,0x70,0x10,0x54,0x44,0x44,0x44,0x44,0x40,0x38,0x10,0x44, + 0x44,0x54,0x10,0x44,0x10,0x00,0x64,0x10,0x20,0x04,0x7c,0x04,0x44,0x20,0x44,0x04, + 0x10,0x00,0x7c,0x14,0x20,0x54,0x00,0x20,0x04,0x38,0x10,0x10,0x00,0x00,0x20,0x10, + 0x10,0x10,0x7c,0x08,0x10,0x58,0x40,0x08,0x04,0x00,0x00,0x30,0x10,0x18,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x44,0x44,0x44,0x44,0x40,0x40,0x44,0x44,0x10,0x44,0x48,0x40,0x44,0x44,0x44,0x40, + 0x48,0x48,0x44,0x10,0x44,0x28,0x6c,0x44,0x10,0x40,0x00,0x44,0x44,0x40,0x44,0x40, + 0x20,0x3c,0x44,0x10,0x08,0x48,0x10,0x54,0x44,0x44,0x44,0x44,0x40,0x04,0x12,0x4c, + 0x28,0x54,0x28,0x3c,0x20,0x00,0x44,0x10,0x40,0x44,0x08,0x44,0x44,0x20,0x44,0x08, + 0x00,0x00,0x28,0x78,0x44,0x48,0x00,0x10,0x08,0x54,0x10,0x10,0x00,0x00,0x40,0x00, + 0x10,0x08,0x00,0x10,0x00,0x40,0x40,0x04,0x04,0x00,0x00,0x30,0x10,0x18,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x44,0x78,0x38,0x78,0x7c,0x40,0x3c,0x44,0x38,0x38,0x44,0x7c,0x44,0x44,0x38,0x40, + 0x34,0x44,0x38,0x10,0x38,0x10,0x44,0x44,0x10,0x7c,0x00,0x3c,0x78,0x3c,0x3c,0x3c, + 0x20,0x04,0x44,0x38,0x48,0x44,0x38,0x44,0x44,0x38,0x78,0x3c,0x40,0x78,0x0c,0x34, + 0x10,0x6c,0x44,0x04,0x7c,0x00,0x38,0x38,0x7c,0x38,0x08,0x38,0x38,0x20,0x38,0x70, + 0x10,0x00,0x28,0x10,0x00,0x34,0x00,0x08,0x10,0x10,0x00,0x20,0x00,0x10,0x00,0x00, + 0x20,0x04,0x00,0x20,0x10,0x3c,0x70,0x00,0x1c,0x00,0x7c,0x1c,0x10,0x70,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x38,0x00,0x00,0x30,0x00,0x00,0x00,0x00,0x00,0x40,0x04,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +}; + +const char* shader_code = " \ +cbuffer MatrixBuffer {\n \ + float4x4 projection_matrix;\n \ +};\n \ +Texture2D texture0;\n \ +SamplerState sampler0;\n \ +struct Vertex {\n \ + float2 position : POSITION0;\n \ + float2 tex : TEXCOORD0;\n \ + float4 color : COLOR0;\n \ +};\n \ +struct Pixel {\n \ + float4 position : SV_POSITION;\n \ + float2 tex : TEXCOORD0;\n \ + float4 color : COLOR0;\n \ +};\n \ +Pixel vs(Vertex v) {\n \ + Pixel p;\n \ + p.position = float4(mul(float4(v.position, 0.0f, 1.0f), projection_matrix).xy - float2(1.0f, -1.0f), 0.0f, 1.0f);\n \ + p.tex = v.tex;\n \ + p.color = v.color;\n \ + return p;\n \ +}\n \ +float4 ps(Pixel p) : SV_TARGET {\n \ + if (p.tex.x > 1.0f) {\n \ + return float4(p.color.rgb, 0.5f);\n \ + } else {\n \ + float4 sample = texture0.Sample(sampler0, p.tex);\n \ + if(sample.w < 0.5f) {\n \ + discard;\n \ + }\n \ + return p.color * sample;\n \ + }\n \ +}\n"; + +} // namespace + + +D3D11ProfilerDisplay::D3D11ProfilerDisplay(D3D11Window* window) : window_(window) { + draw_state_ = { 0 }; + if (!SetupState() || + !SetupShaders() || + !SetupFont()) { + // Hrm. + XEASSERTALWAYS(); + } + + // Pass through mouse events. + window->mouse_down.AddListener([](xe::ui::MouseEvent& e) { + Profiler::OnMouseDown( + e.button() == xe::ui::MouseEvent::MOUSE_BUTTON_LEFT, + e.button() == xe::ui::MouseEvent::MOUSE_BUTTON_RIGHT); + }); + window->mouse_up.AddListener([](xe::ui::MouseEvent& e) { + Profiler::OnMouseUp(); + }); + window->mouse_move.AddListener([](xe::ui::MouseEvent& e) { + Profiler::OnMouseMove(e.x(), e.y()); + }); + window->mouse_wheel.AddListener([](xe::ui::MouseEvent& e) { + Profiler::OnMouseWheel(e.x(), e.y(), -e.dy()); + }); + + // Watch for toggle/mode keys and such. 
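+  // (Presumably these map to microprofile's own mode/toggle hotkeys via
+  // Profiler::OnKeyDown/OnKeyUp.)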
+ window->key_down.AddListener([](xe::ui::KeyEvent& e) { + Profiler::OnKeyDown(e.key_code()); + }); + window->key_up.AddListener([](xe::ui::KeyEvent& e) { + Profiler::OnKeyUp(e.key_code()); + }); +} + +bool D3D11ProfilerDisplay::SetupState() { + HRESULT hr; + auto device = window_->device(); + + D3D11_BLEND_DESC blend_desc; + xe_zero_struct(&blend_desc, sizeof(blend_desc)); + blend_desc.RenderTarget[0].BlendEnable = true; + blend_desc.RenderTarget[0].BlendOp = D3D11_BLEND_OP_ADD; + blend_desc.RenderTarget[0].BlendOpAlpha = D3D11_BLEND_OP_ADD; + blend_desc.RenderTarget[0].SrcBlend = D3D11_BLEND_SRC_ALPHA; + blend_desc.RenderTarget[0].SrcBlendAlpha = D3D11_BLEND_ZERO; + blend_desc.RenderTarget[0].DestBlend = D3D11_BLEND_INV_SRC_ALPHA; + blend_desc.RenderTarget[0].DestBlendAlpha = D3D11_BLEND_ZERO; + blend_desc.RenderTarget[0].RenderTargetWriteMask = 0x0F; + hr = device->CreateBlendState(&blend_desc, &blend_state_); + XEASSERT(SUCCEEDED(hr)); + + D3D11_DEPTH_STENCIL_DESC depth_stencil_desc; + xe_zero_struct(&depth_stencil_desc, sizeof(depth_stencil_desc)); + depth_stencil_desc.DepthEnable = false; + depth_stencil_desc.StencilEnable = false; + depth_stencil_desc.DepthWriteMask = D3D11_DEPTH_WRITE_MASK_ZERO; + hr = device->CreateDepthStencilState(&depth_stencil_desc, &depth_stencil_state_); + XEASSERT(SUCCEEDED(hr)); + + return true; +} + +bool D3D11ProfilerDisplay::SetupShaders() { + HRESULT hr; + auto device = window_->device(); + + ID3DBlob* vs_code_blob = nullptr; + ID3DBlob* vs_errors = nullptr; + hr = D3DCompile( + shader_code, xestrlena(shader_code), + "D3D11ProfilerDisplay.vs", + nullptr, + nullptr, + "vs", + "vs_5_0", + D3DCOMPILE_ENABLE_STRICTNESS, + 0, + &vs_code_blob, + &vs_errors); + if (FAILED(hr)) { + XELOGE("Failed to compile profiler vs: %s", + reinterpret_cast(vs_errors->GetBufferPointer())); + return false; + } + hr = device->CreateVertexShader(vs_code_blob->GetBufferPointer(), + vs_code_blob->GetBufferSize(), + nullptr, + &vertex_shader_); + if (FAILED(hr)) { + XELOGE("Failed to create profiler vs"); + return false; + } + ID3DBlob* ps_code_blob = nullptr; + ID3DBlob* ps_errors = nullptr; + hr = D3DCompile( + shader_code, xestrlena(shader_code), + "D3D11ProfilerDisplay.ps", + nullptr, + nullptr, + "ps", + "ps_5_0", + D3DCOMPILE_ENABLE_STRICTNESS, + 0, + &ps_code_blob, + &ps_errors); + if (FAILED(hr)) { + XELOGE("Failed to compile profiler ps: %s", + reinterpret_cast(ps_errors->GetBufferPointer())); + return false; + } + hr = device->CreatePixelShader(ps_code_blob->GetBufferPointer(), + ps_code_blob->GetBufferSize(), + nullptr, + &pixel_shader_); + if (FAILED(hr)) { + XELOGE("Failed to create profiler ps"); + return false; + } + + D3D11_BUFFER_DESC buffer_desc = { 0 }; + buffer_desc.Usage = D3D11_USAGE_DYNAMIC; + buffer_desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER; + buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + buffer_desc.ByteWidth = sizeof(float) * 16; + hr = device->CreateBuffer(&buffer_desc, nullptr, &shader_constants_); + if (FAILED(hr)) { + XELOGE("Failed to create profiler constant buffer"); + return false; + } + + D3D11_INPUT_ELEMENT_DESC element_descs[] = { + { "POSITION", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0, }, + { "TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_VERTEX_DATA, 0, }, + { "COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, D3D11_APPEND_ALIGNED_ELEMENT, D3D11_INPUT_PER_VERTEX_DATA, 0, }, + }; + hr = device->CreateInputLayout(element_descs, (UINT)XECOUNT(element_descs), + 
vs_code_blob->GetBufferPointer(),
+                               vs_code_blob->GetBufferSize(),
+                               &shader_layout_);
+  if (FAILED(hr)) {
+    XELOGE("Failed to create profiler input layout");
+    return false;
+  }
+
+  buffer_desc.Usage = D3D11_USAGE_DYNAMIC;
+  buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
+  buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
+  buffer_desc.ByteWidth = sizeof(draw_state_.vertex_buffer);
+  hr = device->CreateBuffer(&buffer_desc, nullptr, &vertex_buffer_);
+  if (FAILED(hr)) {
+    XELOGE("Failed to create profiler vertex buffer");
+    return false;
+  }
+
+  return true;
+}
+
+bool D3D11ProfilerDisplay::SetupFont() {
+  HRESULT hr;
+  auto device = window_->device();
+
+  // Setup font lookup table: every character starts at the fallback glyph
+  // offset (206), then the printable ranges are pointed at their glyphs in
+  // the 8px-wide strip.
+  for (uint32_t i = 0; i < XECOUNT(font_description_.char_offsets); ++i) {
+    font_description_.char_offsets[i] = 206;
+  }
+  for (uint32_t i = 'A'; i <= 'Z'; ++i) {
+    font_description_.char_offsets[i] = (i-'A')*8+1;
+  }
+  for (uint32_t i = 'a'; i <= 'z'; ++i) {
+    font_description_.char_offsets[i] = (i-'a')*8+217;
+  }
+  for (uint32_t i = '0'; i <= '9'; ++i) {
+    font_description_.char_offsets[i] = (i-'0')*8+433;
+  }
+  for (uint32_t i = '!'; i <= '/'; ++i) {
+    font_description_.char_offsets[i] = (i-'!')*8+513;
+  }
+  for (uint32_t i = ':'; i <= '@'; ++i) {
+    font_description_.char_offsets[i] = (i-':')*8+625+8;
+  }
+  for (uint32_t i = '['; i <= '_'; ++i) {
+    font_description_.char_offsets[i] = (i-'[')*8+681+8;
+  }
+  for (uint32_t i = '{'; i <= '~'; ++i) {
+    font_description_.char_offsets[i] = (i-'{')*8+721+8;
+  }
+
+  // Unpack the 1-bit font bitmap into an RGBA texture, one uint32_t texel
+  // per source bit.
+  const int FONT_TEX_X = 1024;
+  const int FONT_TEX_Y = 9;
+  const int UNPACKED_COUNT = FONT_TEX_X * FONT_TEX_Y;
+  uint32_t unpacked[UNPACKED_COUNT];
+  int idx = 0;
+  int end = FONT_TEX_X * FONT_TEX_Y / 8;
+  for (int i = 0; i < end; i++) {
+    uint8_t b = profiler_font[i];
+    for (int j = 0; j < 8; ++j) {
+      unpacked[idx++] = b & 0x80 ?
0xFFFFFFFFu : 0;
+      b <<= 1;
+    }
+  }
+
+  D3D11_TEXTURE2D_DESC texture_desc = { 0 };
+  texture_desc.Width = FONT_TEX_X;
+  texture_desc.Height = FONT_TEX_Y;
+  texture_desc.MipLevels = 1;
+  texture_desc.ArraySize = 1;
+  texture_desc.Format = DXGI_FORMAT_R8G8B8A8_UNORM;
+  texture_desc.SampleDesc.Count = 1;
+  texture_desc.SampleDesc.Quality = 0;
+  texture_desc.Usage = D3D11_USAGE_IMMUTABLE;
+  texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;
+  texture_desc.CPUAccessFlags = 0;
+  texture_desc.MiscFlags = 0;
+  D3D11_SUBRESOURCE_DATA initial_data = { 0 };
+  initial_data.pSysMem = unpacked;
+  initial_data.SysMemPitch = FONT_TEX_X * 4;
+  initial_data.SysMemSlicePitch = 0;
+  ID3D11Texture2D* font_texture = nullptr;
+  hr = device->CreateTexture2D(&texture_desc, &initial_data, &font_texture);
+  if (FAILED(hr)) {
+    XELOGE("Unable to create profiler font texture");
+    return false;
+  }
+
+  D3D11_SHADER_RESOURCE_VIEW_DESC texture_view_desc;
+  xe_zero_struct(&texture_view_desc, sizeof(texture_view_desc));
+  texture_view_desc.Format = texture_desc.Format;
+  texture_view_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D;
+  texture_view_desc.Texture2D.MipLevels = 1;
+  texture_view_desc.Texture2D.MostDetailedMip = 0;
+  hr = device->CreateShaderResourceView(
+      font_texture, &texture_view_desc, &font_texture_view_);
+  XESAFERELEASE(font_texture);
+  if (FAILED(hr)) {
+    XELOGE("Unable to create profiler font texture view");
+    return false;
+  }
+
+  D3D11_SAMPLER_DESC sampler_desc;
+  xe_zero_struct(&sampler_desc, sizeof(sampler_desc));
+  sampler_desc.Filter = D3D11_ENCODE_BASIC_FILTER(
+      D3D11_FILTER_TYPE_POINT, D3D11_FILTER_TYPE_POINT,
+      D3D11_FILTER_TYPE_POINT, false);
+  sampler_desc.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
+  sampler_desc.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
+  sampler_desc.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
+  // MipLODBias, BorderColor, MinLOD, and MaxLOD stay at the zeroed defaults
+  // from xe_zero_struct above.
+  sampler_desc.MaxAnisotropy = 1;
+  sampler_desc.ComparisonFunc = D3D11_COMPARISON_ALWAYS;
+  hr = device->CreateSamplerState(
+      &sampler_desc, &font_sampler_state_);
+  if (FAILED(hr)) {
+    XELOGE("Unable to create profiler font sampler state");
+    return false;
+  }
+
+  return true;
+}
+
+D3D11ProfilerDisplay::~D3D11ProfilerDisplay() {
+  XESAFERELEASE(blend_state_);
+  XESAFERELEASE(depth_stencil_state_);
+  XESAFERELEASE(vertex_shader_);
+  XESAFERELEASE(pixel_shader_);
+  XESAFERELEASE(shader_constants_);
+  XESAFERELEASE(shader_layout_);
+  XESAFERELEASE(font_texture_view_);
+  XESAFERELEASE(font_sampler_state_);
+  XESAFERELEASE(vertex_buffer_);
+}
+
+uint32_t D3D11ProfilerDisplay::width() const {
+  return window_->width();
+}
+
+uint32_t D3D11ProfilerDisplay::height() const {
+  return window_->height();
+}
+
+void D3D11ProfilerDisplay::Begin() {
+  auto context = window_->context();
+
+  D3D11_VIEWPORT viewport;
+  viewport.TopLeftX = 0.0f;
+  viewport.TopLeftY = 0.0f;
+  viewport.Width = static_cast<float>(width());
+  viewport.Height = static_cast<float>(height());
+  viewport.MinDepth = 0.0f;
+  viewport.MaxDepth = 1.0f;
+  context->RSSetViewports(1, &viewport);
+
+  // Setup projection matrix.
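+  // A standard column-major orthographic projection over the window rect:
+  //   x' = 2x/(r-l) - (r+l)/(r-l)
+  //   y' = 2y/(t-b) - (t+b)/(t-b)
+  //   z' = -2z/(f-n) - (f+n)/(f-n)
+  // Scale lives in elements 0/5/10 and translation in 12-14, mapping pixel
+  // coordinates into clip space.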
+  float left = viewport.TopLeftX;
+  float right = viewport.TopLeftX + viewport.Width;
+  float bottom = viewport.TopLeftY + viewport.Height;
+  float top = viewport.TopLeftY;
+  float z_near = viewport.MinDepth;
+  float z_far = viewport.MaxDepth;
+  float projection[16] = { 0 };
+  projection[0] = 2.0f / (right - left);
+  projection[5] = 2.0f / (top - bottom);
+  projection[10] = -2.0f / (z_far - z_near);
+  projection[12] = -(right + left) / (right - left);
+  projection[13] = -(top + bottom) / (top - bottom);
+  projection[14] = -(z_far + z_near) / (z_far - z_near);
+  projection[15] = 1.0f;
+  D3D11_MAPPED_SUBRESOURCE res;
+  context->Map(shader_constants_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res);
+  memcpy(res.pData, projection, sizeof(projection));
+  context->Unmap(shader_constants_, 0);
+
+  // Setup state.
+  const float blend_factor[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+  context->OMSetBlendState(blend_state_, blend_factor, 0xFFFFFFFF);
+  context->OMSetDepthStencilState(depth_stencil_state_, 0);
+
+  // Bind shaders.
+  context->GSSetShader(nullptr, nullptr, 0);
+  context->VSSetShader(vertex_shader_, nullptr, 0);
+  context->VSSetConstantBuffers(0, 1, &shader_constants_);
+  context->PSSetShader(pixel_shader_, nullptr, 0);
+  context->PSSetConstantBuffers(0, 1, &shader_constants_);
+  ID3D11SamplerState* ps_samplers[D3D11_COMMONSHADER_SAMPLER_SLOT_COUNT] = {
+    font_sampler_state_,
+    nullptr,
+  };
+  context->PSSetSamplers(0, XECOUNT(ps_samplers), ps_samplers);
+  ID3D11ShaderResourceView* ps_resources[D3D11_COMMONSHADER_INPUT_RESOURCE_SLOT_COUNT] = {
+    font_texture_view_,
+    nullptr,
+  };
+  context->PSSetShaderResources(0, XECOUNT(ps_resources), ps_resources);
+  context->IASetInputLayout(shader_layout_);
+}
+
+void D3D11ProfilerDisplay::End() {
+  Flush();
+}
+
+D3D11ProfilerDisplay::Vertex* D3D11ProfilerDisplay::AllocateVertices(
+    D3D_PRIMITIVE_TOPOLOGY primitive, size_t count) {
+  if (draw_state_.vertex_index + count > XECOUNT(draw_state_.vertex_buffer)) {
+    Flush();
+  }
+  XEASSERT(draw_state_.vertex_index + count <= XECOUNT(draw_state_.vertex_buffer));
+
+  size_t head = draw_state_.vertex_index;
+  draw_state_.vertex_index += count;
+
+  // Batch with the previous command when the topology matches so the whole
+  // run goes out in a single Draw call.
+  if (draw_state_.command_index &&
+      draw_state_.commands[draw_state_.command_index - 1].primitive == primitive) {
+    draw_state_.commands[draw_state_.command_index - 1].vertex_count += count;
+  } else {
+    XEASSERT(draw_state_.command_index < XECOUNT(draw_state_.commands));
+    draw_state_.commands[draw_state_.command_index].primitive = primitive;
+    draw_state_.commands[draw_state_.command_index].vertex_count = count;
+    ++draw_state_.command_index;
+  }
+  return &draw_state_.vertex_buffer[head];
+}
+
+void D3D11ProfilerDisplay::Flush() {
+  auto context = window_->context();
+  if (!draw_state_.vertex_index) {
+    return;
+  }
+
+  D3D11_MAPPED_SUBRESOURCE res;
+  context->Map(vertex_buffer_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res);
+  memcpy(res.pData, draw_state_.vertex_buffer, sizeof(Vertex) * draw_state_.vertex_index);
+  context->Unmap(vertex_buffer_, 0);
+
+  uint32_t stride = sizeof(Vertex);  // 20 bytes: x, y, u, v, color.
+  uint32_t offset = 0;
+  context->IASetVertexBuffers(0, 1, &vertex_buffer_, &stride, &offset);
+
+  size_t vertex_index = 0;
+  for (size_t i = 0; i < draw_state_.command_index; ++i) {
+    size_t count = draw_state_.commands[i].vertex_count;
+    context->IASetPrimitiveTopology(draw_state_.commands[i].primitive);
+    context->Draw((UINT)count, (UINT)vertex_index);
+    vertex_index += count;
+  }
+
+  draw_state_.vertex_index = 0;
+  draw_state_.command_index = 0;
+}
+
+// Expand a quad into two triangles: Q0..Q3 are the four corners, written to
+// vertices (0,1,2) and (3,4,5) with Q1 and Q3 shared between the triangles.
+#define Q0(d, member, v) d[0].member = v
+#define Q1(d, member, v) d[1].member = v; d[3].member = v
+#define Q2(d, member, v)
d[4].member = v +#define Q3(d, member, v) d[2].member = v; d[5].member = v + +void D3D11ProfilerDisplay::DrawBox( + int x, int y, int x1, int y1, uint32_t color, BoxType type) { + Vertex* v = AllocateVertices(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST, 6); + uint32_t color0; + uint32_t color1; + if (type == BOX_TYPE_FLAT) { + color0 = 0xFF000000 | + ((color & 0xFF) << 16) | + (color & 0xFF00FF00) | + ((color >> 16) & 0xFF); + color1 = color0; + } else { + uint32_t r = 0xFF & (color >> 16); + uint32_t g = 0xFF & (color >> 8); + uint32_t b = 0xFF & color; + uint32_t max_c = MAX(MAX(MAX(r, g), b), 30u); + uint32_t min_c = MIN(MIN(MIN(r, g), b), 180u); + uint32_t r0 = 0xFF & ((r + max_c)/2); + uint32_t g0 = 0xFF & ((g + max_c)/2); + uint32_t b0 = 0xFF & ((b + max_c)/2); + uint32_t r1 = 0xFF & ((r + min_c) / 2); + uint32_t g1 = 0xFF & ((g + min_c) / 2); + uint32_t b1 = 0xFF & ((b + min_c) / 2); + color0 = r0 | (g0 << 8) | (b0 << 16) | (0xFF000000 & color); + color1 = r1 | (g1 << 8) | (b1 << 16) | (0xFF000000 & color); + } + Q0(v, x, (float)x); + Q0(v, y, (float)y); + Q0(v, color, color0); + Q0(v, u, 2.0f); + Q0(v, v, 2.0f); + Q1(v, x, (float)x1); + Q1(v, y, (float)y); + Q1(v, color, color0); + Q1(v, u, 3.0f); + Q1(v, v, 2.0f); + Q2(v, x, (float)x1); + Q2(v, y, (float)y1); + Q2(v, color, color1); + Q2(v, u, 3.0f); + Q2(v, v, 3.0f); + Q3(v, x, (float)x); + Q3(v, y, (float)y1); + Q3(v, color, color1); + Q3(v, u, 2.0f); + Q3(v, v, 3.0f); +} + +void D3D11ProfilerDisplay::DrawLine2D( + uint32_t count, float* vertices, uint32_t color) { + if (!count || !vertices) { + return; + } + color = 0xFF000000 | + ((color & 0xFF) << 16) | + (color & 0xFF00FF00) | + ((color >> 16) & 0xFF); + Vertex* v = AllocateVertices(D3D11_PRIMITIVE_TOPOLOGY_LINELIST, 2 * (count - 1)); + for (uint32_t i = 0; i < count - 1; ++i) { + v[0].x = vertices[i * 2]; + v[0].y = vertices[i * 2 + 1]; + v[0].color = color; + v[0].u = 2.0f; + v[0].v = 2.0f; + v[1].x = vertices[(i + 1) * 2]; + v[1].y = vertices[(i + 1) * 2 + 1] ; + v[1].color = color; + v[1].u = 2.0f; + v[1].v = 2.0f; + v += 2; + } +} + +void D3D11ProfilerDisplay::DrawText( + int x, int y, uint32_t color, const char* text, size_t text_length) { + const float offset_u = 5.0f / 1024.0f; + float fx = (float)x; + float fy = (float)y; + float fy2 = fy + (MICROPROFILE_TEXT_HEIGHT + 1); + color = 0xFF000000 | + ((color & 0xFF) << 16) | + (color & 0xFF00FF00) | + ((color >> 16) & 0xFF); + Vertex* v = AllocateVertices(D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST, 6 * text_length); + const char* s = text; + for (uint32_t j = 0; j < text_length; ++j) { + int16_t nOffset = font_description_.char_offsets[(int)*s++]; + float fOffset = nOffset / 1024.f; + Q0(v, x, fx); + Q0(v, y, fy); + Q0(v, color, color); + Q0(v, u, fOffset); + Q0(v, v, 0.0f); + Q1(v, x, fx + MICROPROFILE_TEXT_WIDTH); + Q1(v, y, fy); + Q1(v, color, color); + Q1(v, u, fOffset + offset_u); + Q1(v, v, 0.0f); + Q2(v, x, fx + MICROPROFILE_TEXT_WIDTH); + Q2(v, y, fy2); + Q2(v, color, color); + Q2(v, u, fOffset + offset_u); + Q2(v, v, 1.0f); + Q3(v, x, fx); + Q3(v, y, fy2); + Q3(v, color, color); + Q3(v, u, fOffset); + Q3(v, v, 1.0f); + fx += MICROPROFILE_TEXT_WIDTH + 1; + v += 6; + } +} diff --git a/src/xenia/gpu/d3d11/d3d11_profiler_display.h b/src/xenia/gpu/d3d11/d3d11_profiler_display.h new file mode 100644 index 000000000..fd9f970f9 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_profiler_display.h @@ -0,0 +1,86 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator 
Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_PROFILER_DISPLAY_H_ +#define XENIA_GPU_D3D11_D3D11_PROFILER_DISPLAY_H_ + +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + +class D3D11Window; + + +class D3D11ProfilerDisplay : public ProfilerDisplay { +public: + D3D11ProfilerDisplay(D3D11Window* window); + virtual ~D3D11ProfilerDisplay(); + + uint32_t width() const override; + uint32_t height() const override; + + // TODO(benvanik): GPU timestamping. + + void Begin() override; + void End() override; + void DrawBox(int x, int y, int x1, int y1, uint32_t color, BoxType type) override; + void DrawLine2D(uint32_t count, float* vertices, uint32_t color) override; + void DrawText(int x, int y, uint32_t color, const char* text, size_t text_length) override; + +private: + bool SetupState(); + bool SetupShaders(); + bool SetupFont(); + + struct Vertex { + float x, y; + float u, v; + uint32_t color; + }; + struct { + size_t vertex_index; + Vertex vertex_buffer[16 << 10]; + struct { + D3D11_PRIMITIVE_TOPOLOGY primitive; + size_t vertex_count; + } commands[32]; + size_t command_index; + } draw_state_; + Vertex* AllocateVertices(D3D_PRIMITIVE_TOPOLOGY primitive, size_t count); + void Flush(); + + D3D11Window* window_; + ID3D11BlendState* blend_state_; + ID3D11DepthStencilState* depth_stencil_state_; + ID3D11VertexShader* vertex_shader_; + ID3D11PixelShader* pixel_shader_; + ID3D11Buffer* shader_constants_; + ID3D11InputLayout* shader_layout_; + ID3D11ShaderResourceView* font_texture_view_; + ID3D11SamplerState* font_sampler_state_; + ID3D11Buffer* vertex_buffer_; + + struct { + uint16_t char_offsets[256]; + } font_description_; +}; + + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_PROFILER_DISPLAY_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_resource_cache.cc b/src/xenia/gpu/d3d11/d3d11_resource_cache.cc new file mode 100644 index 000000000..145e3d395 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_resource_cache.cc @@ -0,0 +1,71 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +#include +#include +#include +#include +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; + + +D3D11ResourceCache::D3D11ResourceCache(Memory* memory, + ID3D11Device* device, + ID3D11DeviceContext* context) + : ResourceCache(memory), + device_(device), context_(context) { + device_->AddRef(); + context_->AddRef(); +} + +D3D11ResourceCache::~D3D11ResourceCache() { + XESAFERELEASE(device_); + XESAFERELEASE(context_); +} + +VertexShaderResource* D3D11ResourceCache::CreateVertexShader( + const MemoryRange& memory_range, + const VertexShaderResource::Info& info) { + return new D3D11VertexShaderResource(this, memory_range, info); +} + +PixelShaderResource* D3D11ResourceCache::CreatePixelShader( + const MemoryRange& memory_range, + const PixelShaderResource::Info& info) { + return new D3D11PixelShaderResource(this, memory_range, info); +} + +TextureResource* D3D11ResourceCache::CreateTexture( + const MemoryRange& memory_range, + const TextureResource::Info& info) { + return new D3D11TextureResource(this, memory_range, info); +} + +SamplerStateResource* D3D11ResourceCache::CreateSamplerState( + const SamplerStateResource::Info& info) { + return new D3D11SamplerStateResource(this, info); +} + +IndexBufferResource* D3D11ResourceCache::CreateIndexBuffer( + const MemoryRange& memory_range, + const IndexBufferResource::Info& info) { + return new D3D11IndexBufferResource(this, memory_range, info); +} + +VertexBufferResource* D3D11ResourceCache::CreateVertexBuffer( + const MemoryRange& memory_range, + const VertexBufferResource::Info& info) { + return new D3D11VertexBufferResource(this, memory_range, info); +} diff --git a/src/xenia/gpu/d3d11/d3d11_resource_cache.h b/src/xenia/gpu/d3d11/d3d11_resource_cache.h new file mode 100644 index 000000000..27248eb9c --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_resource_cache.h @@ -0,0 +1,64 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_RESOURCE_CACHE_H_ +#define XENIA_GPU_D3D11_D3D11_RESOURCE_CACHE_H_ + +#include + +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + + +class D3D11ResourceCache : public ResourceCache { +public: + D3D11ResourceCache(Memory* memory, + ID3D11Device* device, ID3D11DeviceContext* context); + virtual ~D3D11ResourceCache(); + + ID3D11Device* device() const { return device_; } + ID3D11DeviceContext* context() const { return context_; } + +protected: + VertexShaderResource* CreateVertexShader( + const MemoryRange& memory_range, + const VertexShaderResource::Info& info) override; + PixelShaderResource* CreatePixelShader( + const MemoryRange& memory_range, + const PixelShaderResource::Info& info) override; + TextureResource* CreateTexture( + const MemoryRange& memory_range, + const TextureResource::Info& info) override; + SamplerStateResource* CreateSamplerState( + const SamplerStateResource::Info& info) override; + IndexBufferResource* CreateIndexBuffer( + const MemoryRange& memory_range, + const IndexBufferResource::Info& info) override; + VertexBufferResource* CreateVertexBuffer( + const MemoryRange& memory_range, + const VertexBufferResource::Info& info) override; + +private: + ID3D11Device* device_; + ID3D11DeviceContext* context_; +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_RESOURCE_CACHE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_sampler_state_resource.cc b/src/xenia/gpu/d3d11/d3d11_sampler_state_resource.cc new file mode 100644 index 000000000..7fb09858a --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_sampler_state_resource.cc @@ -0,0 +1,106 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; +using namespace xe::gpu::xenos; + + +D3D11SamplerStateResource::D3D11SamplerStateResource( + D3D11ResourceCache* resource_cache, const Info& info) + : SamplerStateResource(info), + resource_cache_(resource_cache), + handle_(nullptr) { +} + +D3D11SamplerStateResource::~D3D11SamplerStateResource() { + XESAFERELEASE(handle_); +} + +int D3D11SamplerStateResource::Prepare() { + if (handle_) { + return 0; + } + + D3D11_SAMPLER_DESC sampler_desc; + xe_zero_struct(&sampler_desc, sizeof(sampler_desc)); + // MIN, MAG, MIP + static const D3D11_FILTER filter_matrix[2][2][3] = { + { + // min = POINT + { + // mag = POINT + D3D11_FILTER_MIN_MAG_MIP_POINT, + D3D11_FILTER_MIN_MAG_POINT_MIP_LINEAR, + D3D11_FILTER_MIN_MAG_POINT_MIP_LINEAR, // basemap? + }, + { + // mag = LINEAR + D3D11_FILTER_MIN_POINT_MAG_LINEAR_MIP_POINT, + D3D11_FILTER_MIN_POINT_MAG_MIP_LINEAR, + D3D11_FILTER_MIN_POINT_MAG_MIP_LINEAR, // basemap? + }, + }, + { + // min = LINEAR + { + // mag = POINT + D3D11_FILTER_MIN_LINEAR_MAG_MIP_POINT, + D3D11_FILTER_MIN_LINEAR_MAG_POINT_MIP_LINEAR, + D3D11_FILTER_MIN_LINEAR_MAG_POINT_MIP_LINEAR, // basemap? 
+ }, + { + // mag = LINEAR + D3D11_FILTER_MIN_MAG_LINEAR_MIP_POINT, + D3D11_FILTER_MIN_MAG_MIP_LINEAR, + D3D11_FILTER_MIN_MAG_MIP_LINEAR, // basemap? + }, + }, + }; + sampler_desc.Filter = + filter_matrix[info_.min_filter][info_.mag_filter][info_.mip_filter]; + static const D3D11_TEXTURE_ADDRESS_MODE mode_map[] = { + D3D11_TEXTURE_ADDRESS_WRAP, + D3D11_TEXTURE_ADDRESS_MIRROR, + D3D11_TEXTURE_ADDRESS_CLAMP, // ? + D3D11_TEXTURE_ADDRESS_MIRROR_ONCE, // ? + D3D11_TEXTURE_ADDRESS_CLAMP, // ? + D3D11_TEXTURE_ADDRESS_MIRROR_ONCE, // ? + D3D11_TEXTURE_ADDRESS_BORDER, // ? + D3D11_TEXTURE_ADDRESS_MIRROR, // ? + }; + sampler_desc.AddressU = mode_map[info_.clamp_u]; + sampler_desc.AddressV = mode_map[info_.clamp_v]; + sampler_desc.AddressW = mode_map[info_.clamp_w]; + sampler_desc.MipLODBias; + sampler_desc.MaxAnisotropy = 1; + sampler_desc.ComparisonFunc = D3D11_COMPARISON_ALWAYS; + sampler_desc.BorderColor[0]; + sampler_desc.BorderColor[1]; + sampler_desc.BorderColor[2]; + sampler_desc.BorderColor[3]; + sampler_desc.MinLOD; + sampler_desc.MaxLOD; + + HRESULT hr = resource_cache_->device()->CreateSamplerState( + &sampler_desc, &handle_); + if (FAILED(hr)) { + XELOGE("D3D11: unable to create sampler state"); + return 1; + } + + return 0; +} diff --git a/src/xenia/gpu/d3d11/d3d11_sampler_state_resource.h b/src/xenia/gpu/d3d11/d3d11_sampler_state_resource.h new file mode 100644 index 000000000..6097339b4 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_sampler_state_resource.h @@ -0,0 +1,48 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_SAMPLER_STATE_RESOURCE_H_ +#define XENIA_GPU_D3D11_D3D11_SAMPLER_STATE_RESOURCE_H_ + +#include +#include +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + +class D3D11ResourceCache; + + +class D3D11SamplerStateResource : public SamplerStateResource { +public: + D3D11SamplerStateResource(D3D11ResourceCache* resource_cache, + const Info& info); + ~D3D11SamplerStateResource() override; + + void* handle() const override { return handle_; } + + int Prepare() override; + +protected: + D3D11ResourceCache* resource_cache_; + ID3D11SamplerState* handle_; +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_SAMPLER_STATE_RESOURCE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_shader.cc b/src/xenia/gpu/d3d11/d3d11_shader.cc deleted file mode 100644 index a60a7bdf3..000000000 --- a/src/xenia/gpu/d3d11/d3d11_shader.cc +++ /dev/null @@ -1,2052 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#include - -#include -#include -#include - -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::d3d11; -using namespace xe::gpu::xenos; - - -namespace { - -const int OUTPUT_CAPACITY = 64 * 1024; - -int GetFormatComponentCount(uint32_t format) { - switch (format) { - case FMT_32: - case FMT_32_FLOAT: - return 1; - case FMT_16_16: - case FMT_16_16_FLOAT: - case FMT_32_32: - case FMT_32_32_FLOAT: - return 2; - case FMT_10_11_11: - case FMT_11_11_10: - case FMT_32_32_32_FLOAT: - return 3; - case FMT_8_8_8_8: - case FMT_2_10_10_10: - case FMT_16_16_16_16: - case FMT_16_16_16_16_FLOAT: - case FMT_32_32_32_32: - case FMT_32_32_32_32_FLOAT: - return 4; - default: - XELOGE("Unknown vertex format: %d", format); - XEASSERTALWAYS(); - return 4; - } -} - -const char* GetFormatTypeName( - uint32_t format, uint32_t format_comp_all, uint32_t num_format_all) { - switch (format) { - case FMT_32: - return format_comp_all ? "int" : "uint"; - case FMT_32_FLOAT: - return "float"; - case FMT_16_16: - case FMT_32_32: - if (!num_format_all) { - return format_comp_all ? "snorm float2" : "unorm float2"; - } else { - return format_comp_all ? "int2" : "uint2"; - } - case FMT_16_16_FLOAT: - case FMT_32_32_FLOAT: - return "float2"; - case FMT_10_11_11: - case FMT_11_11_10: - return "int3"; // ? - case FMT_32_32_32_FLOAT: - return "float3"; - case FMT_8_8_8_8: - case FMT_2_10_10_10: - case FMT_16_16_16_16: - case FMT_32_32_32_32: - if (!num_format_all) { - return format_comp_all ? "snorm float4" : "unorm float4"; - } else { - return format_comp_all ? "int4" : "uint4"; - } - case FMT_16_16_16_16_FLOAT: - case FMT_32_32_32_32_FLOAT: - return "float4"; - default: - XELOGE("Unknown vertex format: %d", format); - XEASSERTALWAYS(); - return "float4"; - } -} - -} // anonymous namespace - - -struct xe::gpu::d3d11::Output { - char buffer[OUTPUT_CAPACITY]; - size_t capacity; - size_t offset; - Output() : - capacity(OUTPUT_CAPACITY), - offset(0) { - buffer[0] = 0; - } - void append(const char* format, ...) { - va_list args; - va_start(args, format); - int len = xevsnprintfa( - buffer + offset, capacity - offset, format, args); - va_end(args); - offset += len; - buffer[offset] = 0; - } -}; - - -D3D11Shader::D3D11Shader( - ID3D11Device* device, - XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash) : - translated_src_(NULL), - Shader(type, src_ptr, length, hash) { - device_ = device; - device_->AddRef(); -} - -D3D11Shader::~D3D11Shader() { - if (translated_src_) { - xe_free(translated_src_); - } - XESAFERELEASE(device_); -} - -void D3D11Shader::set_translated_src(char* value) { - if (translated_src_) { - xe_free(translated_src_); - } - translated_src_ = xestrdupa(value); -} - -ID3D10Blob* D3D11Shader::Compile(const char* shader_source) { - // TODO(benvanik): pick shared runtime mode defines. - D3D10_SHADER_MACRO defines[] = { - "TEST_DEFINE", "1", - 0, 0, - }; - - uint32_t flags1 = 0; - flags1 |= D3D10_SHADER_DEBUG; - flags1 |= D3D10_SHADER_ENABLE_STRICTNESS; - uint32_t flags2 = 0; - - // Create a name. - const char* base_path = ""; - if (FLAGS_dump_shaders.size()) { - base_path = FLAGS_dump_shaders.c_str(); - } - char file_name[XE_MAX_PATH]; - xesnprintfa(file_name, XECOUNT(file_name), - "%s/gen_%.16llX.%s", - base_path, - hash_, - type_ == XE_GPU_SHADER_TYPE_VERTEX ? 
"vs" : "ps"); - - if (FLAGS_dump_shaders.size()) { - FILE* f = fopen(file_name, "w"); - fprintf(f, shader_source); - fprintf(f, "\n\n"); - fprintf(f, "/*\n"); - fprintf(f, disasm_src_); - fprintf(f, " */\n"); - fclose(f); - } - - // Compile shader to bytecode blob. - ID3D10Blob* shader_blob = 0; - ID3D10Blob* error_blob = 0; - HRESULT hr = D3DCompile( - shader_source, strlen(shader_source), - file_name, - defines, NULL, - "main", - type_ == XE_GPU_SHADER_TYPE_VERTEX ? - "vs_5_0" : "ps_5_0", - flags1, flags2, - &shader_blob, &error_blob); - if (error_blob) { - char* msg = (char*)error_blob->GetBufferPointer(); - XELOGE("D3D11: shader compile failed with %s", msg); - } - XESAFERELEASE(error_blob); - if (FAILED(hr)) { - return NULL; - } - return shader_blob; -} - -void D3D11Shader::AppendTextureHeader(Output* output) { - bool fetch_setup[32] = { false }; - - // 1 texture per constant slot, 1 sampler per fetch. - for (uint32_t n = 0; n < tex_buffer_inputs_.count; n++) { - auto& input = tex_buffer_inputs_.descs[n]; - auto& fetch = input.tex_fetch; - - // Add texture, if needed. - if (!fetch_setup[fetch.const_idx]) { - fetch_setup[fetch.const_idx] = true; - const char* texture_type = NULL; - switch (fetch.dimension) { - case DIMENSION_1D: - texture_type = "Texture1D"; - break; - default: - case DIMENSION_2D: - texture_type = "Texture2D"; - break; - case DIMENSION_3D: - texture_type = "Texture3D"; - break; - case DIMENSION_CUBE: - texture_type = "TextureCube"; - break; - } - output->append("%s x_texture_%d;\n", texture_type, fetch.const_idx); - } - - // Add sampler. - output->append("SamplerState x_sampler_%d;\n", n); - } -} - - -D3D11VertexShader::D3D11VertexShader( - ID3D11Device* device, - const uint8_t* src_ptr, size_t length, - uint64_t hash) : - handle_(0), input_layout_(0), - D3D11Shader(device, XE_GPU_SHADER_TYPE_VERTEX, - src_ptr, length, hash) { - xe_zero_struct(geometry_shaders_, sizeof(geometry_shaders_)); -} - -D3D11VertexShader::~D3D11VertexShader() { - for (size_t n = 0; n < XECOUNT(geometry_shaders_); n++) { - delete geometry_shaders_[n]; - } - XESAFERELEASE(input_layout_); - XESAFERELEASE(handle_); -} - -int D3D11VertexShader::Prepare(xe_gpu_program_cntl_t* program_cntl) { - if (handle_) { - return 0; - } - - // TODO(benvanik): look in file based on hash/etc. - void* byte_code = NULL; - size_t byte_code_length = 0; - - // Translate and compile source. - const char* shader_source = Translate(program_cntl); - if (!shader_source) { - return 1; - } - ID3D10Blob* shader_blob = Compile(shader_source); - if (!shader_blob) { - return 1; - } - byte_code_length = shader_blob->GetBufferSize(); - byte_code = xe_malloc(byte_code_length); - xe_copy_struct( - byte_code, shader_blob->GetBufferPointer(), byte_code_length); - XESAFERELEASE(shader_blob); - - // Create shader. - HRESULT hr = device_->CreateVertexShader( - byte_code, byte_code_length, - NULL, - &handle_); - if (FAILED(hr)) { - XELOGE("D3D11: failed to create vertex shader"); - xe_free(byte_code); - return 1; - } - - // Create input layout. 
- size_t element_count = 0; - for (uint32_t n = 0; n < vtx_buffer_inputs_.count; n++) { - element_count += vtx_buffer_inputs_.descs[n].element_count; - } - if (!element_count) { - XELOGW("D3D11: vertex shader with zero inputs -- retaining previous values?"); - input_layout_ = NULL; - return 0; - } - - D3D11_INPUT_ELEMENT_DESC* element_descs = - (D3D11_INPUT_ELEMENT_DESC*)xe_alloca( - sizeof(D3D11_INPUT_ELEMENT_DESC) * element_count); - uint32_t el_index = 0; - for (uint32_t n = 0; n < vtx_buffer_inputs_.count; n++) { - auto& input = vtx_buffer_inputs_.descs[n]; - for (uint32_t m = 0; m < input.element_count; m++) { - auto& el = input.elements[m]; - uint32_t vb_slot = input.input_index; - uint32_t num_format_all = el.vtx_fetch.num_format_all; - uint32_t format_comp_all = el.vtx_fetch.format_comp_all; - DXGI_FORMAT vtx_format; - switch (el.format) { - case FMT_8_8_8_8: - if (!num_format_all) { - vtx_format = format_comp_all ? - DXGI_FORMAT_R8G8B8A8_SNORM : DXGI_FORMAT_R8G8B8A8_UNORM; - } else { - vtx_format = format_comp_all ? - DXGI_FORMAT_R8G8B8A8_SINT : DXGI_FORMAT_R8G8B8A8_UINT; - } - break; - case FMT_2_10_10_10: - if (!num_format_all) { - vtx_format = DXGI_FORMAT_R10G10B10A2_UNORM; - } else { - vtx_format = DXGI_FORMAT_R10G10B10A2_UINT; - } - break; - // DXGI_FORMAT_R11G11B10_FLOAT? - case FMT_16_16: - if (!num_format_all) { - vtx_format = format_comp_all ? - DXGI_FORMAT_R16G16_SNORM : DXGI_FORMAT_R16G16_UNORM; - } else { - vtx_format = format_comp_all ? - DXGI_FORMAT_R16G16_SINT : DXGI_FORMAT_R16G16_UINT; - } - break; - case FMT_16_16_16_16: - if (!num_format_all) { - vtx_format = format_comp_all ? - DXGI_FORMAT_R16G16B16A16_SNORM : DXGI_FORMAT_R16G16B16A16_UNORM; - } else { - vtx_format = format_comp_all ? - DXGI_FORMAT_R16G16B16A16_SINT : DXGI_FORMAT_R16G16B16A16_UINT; - } - break; - case FMT_16_16_FLOAT: - vtx_format = DXGI_FORMAT_R16G16_FLOAT; - break; - case FMT_16_16_16_16_FLOAT: - vtx_format = DXGI_FORMAT_R16G16B16A16_FLOAT; - break; - case FMT_32: - vtx_format = format_comp_all ? - DXGI_FORMAT_R32_SINT : DXGI_FORMAT_R32_UINT; - break; - case FMT_32_32: - vtx_format = format_comp_all ? - DXGI_FORMAT_R32G32_SINT : DXGI_FORMAT_R32G32_UINT; - break; - case FMT_32_32_32_32: - vtx_format = format_comp_all ? 
- DXGI_FORMAT_R32G32B32A32_SINT : DXGI_FORMAT_R32G32B32A32_UINT; - break; - case FMT_32_FLOAT: - vtx_format = DXGI_FORMAT_R32_FLOAT; - break; - case FMT_32_32_FLOAT: - vtx_format = DXGI_FORMAT_R32G32_FLOAT; - break; - case FMT_32_32_32_FLOAT: - vtx_format = DXGI_FORMAT_R32G32B32_FLOAT; - break; - case FMT_32_32_32_32_FLOAT: - vtx_format = DXGI_FORMAT_R32G32B32A32_FLOAT; - break; - default: - XEASSERTALWAYS(); - break; - } - element_descs[el_index].SemanticName = "XE_VF"; - element_descs[el_index].SemanticIndex = el_index; - element_descs[el_index].Format = vtx_format; - element_descs[el_index].InputSlot = vb_slot; - element_descs[el_index].AlignedByteOffset = el.offset_words * 4; - element_descs[el_index].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA; - element_descs[el_index].InstanceDataStepRate = 0; - el_index++; - } - } - hr = device_->CreateInputLayout( - element_descs, - (UINT)element_count, - byte_code, byte_code_length, - &input_layout_); - if (FAILED(hr)) { - XELOGE("D3D11: failed to create vertex shader input layout"); - xe_free(byte_code); - return 1; - } - - xe_free(byte_code); - - is_prepared_ = true; - return 0; -} - -const char* D3D11VertexShader::Translate(xe_gpu_program_cntl_t* program_cntl) { - Output* output = new Output(); - xe_gpu_translate_ctx_t ctx; - ctx.output = output; - ctx.type = type_; - ctx.tex_fetch_index = 0; - - // Add constants buffers. - // We could optimize this by only including used buffers, but the compiler - // seems to do a good job of doing this for us. - // It also does read detection, so c[512] can end up c[4] in the asm - - // instead of doing this optimization ourselves we could maybe just query - // this from the compiler. - output->append( - "cbuffer float_consts : register(b0) {\n" - " float4 c[512];\n" - "};\n"); - // TODO(benvanik): add bool/loop constants. - - AppendTextureHeader(output); - - // Transform utilities. We adjust the output position in various ways - // as we can't do this via D3D11 APIs. - output->append( - "cbuffer vs_consts : register(b3) {\n" - " float4 window;\n" // x,y,w,h - " float4 viewport_z_enable;\n" // min,(max - min),?,enabled - " float4 viewport_size;\n" // x,y,w,h - "};" - "float4 applyViewport(float4 pos) {\n" - " if (viewport_z_enable.w) {\n" - //" pos.x = (pos.x + 1) * viewport_size.z * 0.5 + viewport_size.x;\n" - //" pos.y = (1 - pos.y) * viewport_size.w * 0.5 + viewport_size.y;\n" - //" pos.z = viewport_z_enable.x + pos.z * viewport_z_enable.y;\n" - // w? - " } else {\n" - " pos.xy = pos.xy / float2(window.z / 2.0, -window.w / 2.0) + float2(-1.0, 1.0);\n" - " pos.zw = float2(0.0, 1.0);\n" - " }\n" - " pos.xy += window.xy;\n" - " return pos;\n" - "}\n"); - - // Add vertex shader input. - output->append( - "struct VS_INPUT {\n"); - uint32_t el_index = 0; - for (uint32_t n = 0; n < vtx_buffer_inputs_.count; n++) { - auto& input = vtx_buffer_inputs_.descs[n]; - for (uint32_t m = 0; m < input.element_count; m++) { - auto& el = input.elements[m]; - auto& vtx = el.vtx_fetch; - const char* type_name = GetFormatTypeName( - el.format, el.vtx_fetch.format_comp_all, el.vtx_fetch.num_format_all); - uint32_t fetch_slot = vtx.const_index * 3 + vtx.const_index_sel; - output->append( - " %s vf%u_%d : XE_VF%u;\n", - type_name, fetch_slot, vtx.offset, el_index); - el_index++; - } - } - output->append( - "};\n"); - - // Add vertex shader output (pixel shader input). 
- output->append( - "struct VS_OUTPUT {\n"); - if (alloc_counts_.positions) { - XEASSERT(alloc_counts_.positions == 1); - output->append( - " float4 oPos : SV_POSITION;\n"); - } - if (alloc_counts_.params) { - output->append( - " float4 o[%d] : XE_O;\n", - MAX_INTERPOLATORS); - } - if (alloc_counts_.point_size) { - output->append( - " float4 oPointSize : PSIZE;\n"); - } - output->append( - "};\n"); - - // Vertex shader main() header. - output->append( - "VS_OUTPUT main(VS_INPUT i) {\n" - " VS_OUTPUT o;\n"); - - // Always write position, as some shaders seem to only write certain values. - output->append( - " o.oPos = float4(0.0, 0.0, 0.0, 0.0);\n"); - if (alloc_counts_.point_size) { - output->append( - " o.oPointSize = float4(1.0, 0.0, 0.0, 0.0);\n"); - } - - // TODO(benvanik): remove this, if possible (though the compiler may be smart - // enough to do it for us). - if (alloc_counts_.params) { - for (uint32_t n = 0; n < MAX_INTERPOLATORS; n++) { - output->append( - " o.o[%d] = float4(0.0, 0.0, 0.0, 0.0);\n", n); - } - } - - // Add temporaries for any registers we may use. - uint32_t temp_regs = program_cntl->vs_regs + program_cntl->ps_regs; - for (uint32_t n = 0; n <= temp_regs; n++) { - output->append( - " float4 r%d = c[%d];\n", n, n); - } - output->append(" float4 t;\n"); - - // Execute blocks. - for (std::vector::iterator it = execs_.begin(); - it != execs_.end(); ++it) { - instr_cf_exec_t& cf = *it; - // TODO(benvanik): figure out how sequences/jmps/loops/etc work. - if (TranslateExec(ctx, cf)) { - delete output; - return NULL; - } - } - - // main footer. - output->append( - " o.oPos = applyViewport(o.oPos);\n" - " return o;\n" - "};\n"); - - set_translated_src(output->buffer); - delete output; - return translated_src_; -} - -int D3D11VertexShader::DemandGeometryShader(GeometryShaderType type, - D3D11GeometryShader** out_shader) { - if (geometry_shaders_[type]) { - *out_shader = geometry_shaders_[type]; - return 0; - } - - // Demand generate. - D3D11GeometryShader* shader = NULL; - switch (type) { - case POINT_SPRITE_SHADER: - shader = new D3D11PointSpriteGeometryShader(device_, hash_); - break; - case RECT_LIST_SHADER: - shader = new D3D11RectListGeometryShader(device_, hash_); - break; - case QUAD_LIST_SHADER: - shader = new D3D11QuadListGeometryShader(device_, hash_); - break; - default: - XEASSERTALWAYS(); - return 1; - } - if (!shader) { - return 1; - } - - if (shader->Prepare(this)) { - delete shader; - return 1; - } - - geometry_shaders_[type] = shader; - *out_shader = geometry_shaders_[type]; - return 0; -} - - -D3D11PixelShader::D3D11PixelShader( - ID3D11Device* device, - const uint8_t* src_ptr, size_t length, - uint64_t hash) : - handle_(0), - D3D11Shader(device, XE_GPU_SHADER_TYPE_PIXEL, - src_ptr, length, hash) { -} - -D3D11PixelShader::~D3D11PixelShader() { - XESAFERELEASE(handle_); -} - -int D3D11PixelShader::Prepare(xe_gpu_program_cntl_t* program_cntl, - D3D11VertexShader* input_shader) { - if (handle_) { - return 0; - } - - // TODO(benvanik): look in file based on hash/etc. - void* byte_code = NULL; - size_t byte_code_length = 0; - - // Translate and compile source. 
- const char* shader_source = Translate(program_cntl, input_shader); - if (!shader_source) { - return 1; - } - ID3D10Blob* shader_blob = Compile(shader_source); - if (!shader_blob) { - return 1; - } - byte_code_length = shader_blob->GetBufferSize(); - byte_code = xe_malloc(byte_code_length); - xe_copy_struct( - byte_code, shader_blob->GetBufferPointer(), byte_code_length); - XESAFERELEASE(shader_blob); - - // Create shader. - HRESULT hr = device_->CreatePixelShader( - byte_code, byte_code_length, - NULL, - &handle_); - if (FAILED(hr)) { - XELOGE("D3D11: failed to create pixel shader"); - xe_free(byte_code); - return 1; - } - - xe_free(byte_code); - - is_prepared_ = true; - return 0; -} - -const char* D3D11PixelShader::Translate( - xe_gpu_program_cntl_t* program_cntl, D3D11VertexShader* input_shader) { - Output* output = new Output(); - xe_gpu_translate_ctx_t ctx; - ctx.output = output; - ctx.type = type_; - ctx.tex_fetch_index = 0; - - // We need an input VS to make decisions here. - // TODO(benvanik): do we need to pair VS/PS up and store the combination? - // If the same PS is used with different VS that output different amounts - // (and less than the number of required registers), things may die. - XEASSERTNOTNULL(input_shader); - const Shader::alloc_counts_t& input_alloc_counts = - input_shader->alloc_counts(); - - // Add constants buffers. - // We could optimize this by only including used buffers, but the compiler - // seems to do a good job of doing this for us. - // It also does read detection, so c[512] can end up c[4] in the asm - - // instead of doing this optimization ourselves we could maybe just query - // this from the compiler. - output->append( - "cbuffer float_consts : register(b0) {\n" - " float4 c[512];\n" - "};\n"); - // TODO(benvanik): add bool/loop constants. - - AppendTextureHeader(output); - - // Add vertex shader output (pixel shader input). - output->append( - "struct VS_OUTPUT {\n"); - if (input_alloc_counts.positions) { - XEASSERT(input_alloc_counts.positions == 1); - output->append( - " float4 oPos : SV_POSITION;\n"); - } - if (input_alloc_counts.params) { - output->append( - " float4 o[%d] : XE_O;\n", - MAX_INTERPOLATORS); - } - output->append( - "};\n"); - - // Add pixel shader output. - output->append( - "struct PS_OUTPUT {\n"); - for (uint32_t n = 0; n < alloc_counts_.params; n++) { - output->append( - " float4 oC%d : SV_TARGET%d;\n", n, n); - if (program_cntl->ps_export_depth) { - // Is this per render-target? - output->append( - " float oD%d : SV_DEPTH%d;\n", n, n); - } - } - output->append( - "};\n"); - - // Pixel shader main() header. - output->append( - "PS_OUTPUT main(VS_OUTPUT i) {\n" - " PS_OUTPUT o;\n"); - - // Add temporary registers. - uint32_t temp_regs = program_cntl->vs_regs + program_cntl->ps_regs; - for (uint32_t n = 0; n <= MAX(15, temp_regs); n++) { - output->append( - " float4 r%d = c[%d];\n", n, n); - } - output->append(" float4 t;\n"); - - // Bring registers local. - if (input_alloc_counts.params) { - for (uint32_t n = 0; n < MAX_INTERPOLATORS; n++) { - output->append( - " r%d = i.o[%d];\n", n, n); - } - } - - // Execute blocks. - for (std::vector::iterator it = execs_.begin(); - it != execs_.end(); ++it) { - instr_cf_exec_t& cf = *it; - // TODO(benvanik): figure out how sequences/jmps/loops/etc work. - if (TranslateExec(ctx, cf)) { - delete output; - return NULL; - } - } - - // main footer. 
- output->append( - " return o;\n" - "}\n"); - - set_translated_src(output->buffer); - delete output; - return translated_src_; -} - - -namespace { - -static const char chan_names[] = { - 'x', 'y', 'z', 'w', - // these only apply to FETCH dst's, and we shouldn't be using them: - '0', '1', '?', '_', -}; - -void AppendSrcReg( - xe_gpu_translate_ctx_t& ctx, - uint32_t num, uint32_t type, - uint32_t swiz, uint32_t negate, uint32_t abs) { - if (negate) { - ctx.output->append("-"); - } - if (abs) { - ctx.output->append("abs("); - } - if (type) { - // Register. - ctx.output->append("r%u", num); - } else { - // Constant. - ctx.output->append("c[%u]", num); - } - if (swiz) { - ctx.output->append("."); - for (int i = 0; i < 4; i++) { - ctx.output->append("%c", chan_names[(swiz + i) & 0x3]); - swiz >>= 2; - } - } - if (abs) { - ctx.output->append(")"); - } -} - -void AppendDestRegName( - xe_gpu_translate_ctx_t& ctx, - uint32_t num, uint32_t dst_exp) { - if (!dst_exp) { - // Register. - ctx.output->append("r%u", num); - } else { - // Export. - switch (ctx.type) { - case XE_GPU_SHADER_TYPE_VERTEX: - switch (num) { - case 62: - ctx.output->append("o.oPos"); - break; - case 63: - ctx.output->append("o.oPointSize"); - break; - default: - // Varying. - ctx.output->append("o.o[%u]", num);; - break; - } - break; - case XE_GPU_SHADER_TYPE_PIXEL: - switch (num) { - case 0: - ctx.output->append("o.oC0"); - break; - default: - // TODO(benvanik): other render targets? - // TODO(benvanik): depth? - XEASSERTALWAYS(); - break; - } - break; - } - } -} - -void AppendDestReg( - xe_gpu_translate_ctx_t& ctx, - uint32_t num, uint32_t mask, uint32_t dst_exp) { - if (mask != 0xF) { - // If masking, store to a temporary variable and clean it up later. - ctx.output->append("t"); - } else { - // Store directly to output. - AppendDestRegName(ctx, num, dst_exp); - } -} - -void AppendDestRegPost( - xe_gpu_translate_ctx_t& ctx, - uint32_t num, uint32_t mask, uint32_t dst_exp) { - if (mask != 0xF) { - // Masking. - ctx.output->append(" "); - AppendDestRegName(ctx, num, dst_exp); - ctx.output->append(" = float4("); - for (int i = 0; i < 4; i++) { - // TODO(benvanik): mask out values? mix in old value as temp? - // ctx.output->append("%c", (mask & 0x1) ? chan_names[i] : 'w'); - if (!(mask & 0x1)) { - AppendDestRegName(ctx, num, dst_exp); - } else { - ctx.output->append("t"); - } - ctx.output->append(".%c", chan_names[i]); - mask >>= 1; - if (i < 3) { - ctx.output->append(", "); - } - } - ctx.output->append(");\n"); - } -} - -void print_srcreg( - Output* output, - uint32_t num, uint32_t type, - uint32_t swiz, uint32_t negate, uint32_t abs) { - if (negate) { - output->append("-"); - } - if (abs) { - output->append("|"); - } - output->append("%c%u", type ? 'R' : 'C', num); - if (swiz) { - output->append("."); - for (int i = 0; i < 4; i++) { - output->append("%c", chan_names[(swiz + i) & 0x3]); - swiz >>= 2; - } - } - if (abs) { - output->append("|"); - } -} - -void print_dstreg( - Output* output, uint32_t num, uint32_t mask, uint32_t dst_exp) { - output->append("%s%u", dst_exp ? "export" : "R", num); - if (mask != 0xf) { - output->append("."); - for (int i = 0; i < 4; i++) { - output->append("%c", (mask & 0x1) ? 
chan_names[i] : '_'); - mask >>= 1; - } - } -} - -void print_export_comment( - Output* output, uint32_t num, XE_GPU_SHADER_TYPE type) { - const char *name = NULL; - switch (type) { - case XE_GPU_SHADER_TYPE_VERTEX: - switch (num) { - case 62: name = "gl_Position"; break; - case 63: name = "gl_PointSize"; break; - } - break; - case XE_GPU_SHADER_TYPE_PIXEL: - switch (num) { - case 0: name = "gl_FragColor"; break; - } - break; - } - /* if we had a symbol table here, we could look - * up the name of the varying.. - */ - if (name) { - output->append("\t; %s", name); - } -} - -int TranslateALU_ADDv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(" + "); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_MULv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(" * "); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_MAXv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - if (alu.src1_reg == alu.src2_reg && - alu.src1_sel == alu.src2_sel && - alu.src1_swiz == alu.src2_swiz && - alu.src1_reg_negate == alu.src2_reg_negate && - alu.src1_reg_abs == alu.src2_reg_abs) { - // This is a mov. 
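That identity check is the translator's mov peephole: Xenos microcode has no dedicated vector move, so `mov dst, src` is conventionally encoded as `MAXv(src, src)`, and since `max(a, a) == a` the operand can be emitted bare. The same test, extracted as a predicate over the ucode fields used inline above (sketch; `instr_alu_t` is the ALU word type from the surrounding code):

```cpp
// True when both MAXv operands name the same register with the same
// swizzle and modifiers, i.e. the instruction is really a mov.
static bool IsVectorMov(const instr_alu_t& alu) {
  return alu.src1_reg == alu.src2_reg &&
         alu.src1_sel == alu.src2_sel &&
         alu.src1_swiz == alu.src2_swiz &&
         alu.src1_reg_negate == alu.src2_reg_negate &&
         alu.src1_reg_abs == alu.src2_reg_abs;
}
```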
- AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - } else { - ctx.output->append("max("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(", "); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(")"); - } - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_MINv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("min("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(", "); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_SETXXv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu, const char* op) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("float4(("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").x %s (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").x ? 1.0 : 0.0, ("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").y %s (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").y ? 1.0 : 0.0, ("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").z %s (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").z ? 1.0 : 0.0, ("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").w %s (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").w ? 
1.0 : 0.0)"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} -int TranslateALU_SETEv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXv(ctx, alu, "=="); -} -int TranslateALU_SETGTv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXv(ctx, alu, ">"); -} -int TranslateALU_SETGTEv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXv(ctx, alu, ">="); -} -int TranslateALU_SETNEv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXv(ctx, alu, "!="); -} - -int TranslateALU_FRACv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("frac("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_TRUNCv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("trunc("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_FLOORv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("floor("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_MULADDv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("mad("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(", "); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(", "); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_CNDXXv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu, const char* op) { - AppendDestReg(ctx, alu.vector_dest, 
alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - // TODO(benvanik): check argument order - could be 3 as compare and 1 and 2 as values. - ctx.output->append("float4(("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").x %s 0.0 ? (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").x : ("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(").x, ("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").y %s 0.0 ? (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").y : ("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(").y, ("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").z %s 0.0 ? (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").z : ("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(").z, ("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").w %s 0.0 ? (", op); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").w : ("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(").w)"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} -int TranslateALU_CNDEv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_CNDXXv(ctx, alu, "=="); -} -int TranslateALU_CNDGTEv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_CNDXXv(ctx, alu, ">="); -} -int TranslateALU_CNDGTv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_CNDXXv(ctx, alu, ">"); -} - -int TranslateALU_DOT4v( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("dot("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(", "); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(")"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_DOT3v( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("dot(float4("); - AppendSrcReg(ctx, 
alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").xyz, float4("); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").xyz)"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_DOT2ADDv( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("dot(float4("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(").xy, float4("); - AppendSrcReg(ctx, alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); - ctx.output->append(").xy) + "); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".x"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -// CUBEv - -int TranslateALU_MAX4v( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.vector_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("max("); - ctx.output->append("max("); - ctx.output->append("max("); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(".x, "); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(".y), "); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(".z), "); - AppendSrcReg(ctx, alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); - ctx.output->append(".w)"); - if (alu.vector_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.vector_dest, alu.vector_write_mask, alu.export_data); - return 0; -} - -// ... - -int TranslateALU_MAXs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - if ((alu.src3_swiz & 0x3) == (((alu.src3_swiz >> 2) + 1) & 0x3)) { - // This is a mov. 
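The swizzle comparison above deserves a note: swizzle fields are stored relative to the destination channel, so field `n` selects source channel `(field_n + n) & 3` (this is also why `AppendSrcReg` prints `chan_names[(swiz + i) & 0x3]`). A scalar op reads channels 0 and 1 of src3; when both fields resolve to the same channel, `max(a.c, a.c)` collapses to a plain scalar move. A sketch of the decode behind the check:

```cpp
#include <cstdint>

// Decode one channel of the 8-bit relative swizzle: four 2-bit fields,
// where field n selects source channel (field_n + n) & 3.
static uint32_t SwizzleChannel(uint32_t swiz, uint32_t n) {
  uint32_t field = (swiz >> (2 * n)) & 0x3;
  return (field + n) & 0x3;
}

// Equivalent to the inline test above: MAXs reads src3 channels 0 and 1,
// and when they name the same channel the op is really a mov.
static bool IsScalarMov(uint32_t src3_swiz) {
  return SwizzleChannel(src3_swiz, 0) == SwizzleChannel(src3_swiz, 1);
}
```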
- AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - } else { - ctx.output->append("max("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".x, "); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".y).xxxx"); - } - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_MINs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("min("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".x, "); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".y).xxxx"); - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_SETXXs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu, const char* op) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("(("); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".x %s 0.0) ? 
1.0 : 0.0).xxxx", op); - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} -int TranslateALU_SETEs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXs(ctx, alu, "=="); -} -int TranslateALU_SETGTs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXs(ctx, alu, ">"); -} -int TranslateALU_SETGTEs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXs(ctx, alu, ">="); -} -int TranslateALU_SETNEs( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SETXXs(ctx, alu, "!="); -} - -int TranslateALU_RECIP_IEEE( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - ctx.output->append("(1.0 / "); - AppendSrcReg(ctx, alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(")"); - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} - -int TranslateALU_MUL_CONST_0( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - uint32_t src3_swiz = alu.src3_swiz & ~0x3C; - uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; - uint32_t swiz_b = (src3_swiz & 0x3); - uint32_t reg2 = (alu.scalar_opc & 1) | (alu.src3_swiz & 0x3C) | (alu.src3_sel << 1); - ctx.output->append("("); - AppendSrcReg(ctx, alu.src3_reg, 0, 0, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".%c * ", chan_names[swiz_a]); - AppendSrcReg(ctx, reg2, 1, 0, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".%c", chan_names[swiz_b]); - ctx.output->append(").xxxx"); - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} -int TranslateALU_MUL_CONST_1( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_MUL_CONST_0(ctx, alu); -} - -int TranslateALU_ADD_CONST_0( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - uint32_t src3_swiz = alu.src3_swiz & ~0x3C; - uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; - uint32_t swiz_b = (src3_swiz & 0x3); - uint32_t reg2 = (alu.scalar_opc & 1) | (alu.src3_swiz & 0x3C) | (alu.src3_sel << 1); - ctx.output->append("("); - AppendSrcReg(ctx, alu.src3_reg, 0, 0, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".%c + ", chan_names[swiz_a]); - AppendSrcReg(ctx, reg2, 1, 0, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".%c", chan_names[swiz_b]); - ctx.output->append(").xxxx"); - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} -int TranslateALU_ADD_CONST_1( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return 
TranslateALU_ADD_CONST_0(ctx, alu); -} - -int TranslateALU_SUB_CONST_0( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - AppendDestReg(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - ctx.output->append(" = "); - if (alu.scalar_clamp) { - ctx.output->append("saturate("); - } - uint32_t src3_swiz = alu.src3_swiz & ~0x3C; - uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; - uint32_t swiz_b = (src3_swiz & 0x3); - uint32_t reg2 = (alu.scalar_opc & 1) | (alu.src3_swiz & 0x3C) | (alu.src3_sel << 1); - ctx.output->append("("); - AppendSrcReg(ctx, alu.src3_reg, 0, 0, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".%c - ", chan_names[swiz_a]); - AppendSrcReg(ctx, reg2, 1, 0, alu.src3_reg_negate, alu.src3_reg_abs); - ctx.output->append(".%c", chan_names[swiz_b]); - ctx.output->append(").xxxx"); - if (alu.scalar_clamp) { - ctx.output->append(")"); - } - ctx.output->append(";\n"); - AppendDestRegPost(ctx, alu.scalar_dest, alu.scalar_write_mask, alu.export_data); - return 0; -} -int TranslateALU_SUB_CONST_1( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu) { - return TranslateALU_SUB_CONST_0(ctx, alu); -} - -typedef int (*xe_gpu_translate_alu_fn)( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t& alu); -typedef struct { - uint32_t num_srcs; - const char* name; - xe_gpu_translate_alu_fn fn; -} xe_gpu_translate_alu_info_t; -#define ALU_INSTR(opc, num_srcs) \ - { num_srcs, #opc, 0 } -#define ALU_INSTR_IMPL(opc, num_srcs) \ - { num_srcs, #opc, TranslateALU_##opc } -static xe_gpu_translate_alu_info_t vector_alu_instrs[0x20] = { - ALU_INSTR_IMPL(ADDv, 2), // 0 - ALU_INSTR_IMPL(MULv, 2), // 1 - ALU_INSTR_IMPL(MAXv, 2), // 2 - ALU_INSTR_IMPL(MINv, 2), // 3 - ALU_INSTR_IMPL(SETEv, 2), // 4 - ALU_INSTR_IMPL(SETGTv, 2), // 5 - ALU_INSTR_IMPL(SETGTEv, 2), // 6 - ALU_INSTR_IMPL(SETNEv, 2), // 7 - ALU_INSTR_IMPL(FRACv, 1), // 8 - ALU_INSTR_IMPL(TRUNCv, 1), // 9 - ALU_INSTR_IMPL(FLOORv, 1), // 10 - ALU_INSTR_IMPL(MULADDv, 3), // 11 - ALU_INSTR_IMPL(CNDEv, 3), // 12 - ALU_INSTR_IMPL(CNDGTEv, 3), // 13 - ALU_INSTR_IMPL(CNDGTv, 3), // 14 - ALU_INSTR_IMPL(DOT4v, 2), // 15 - ALU_INSTR_IMPL(DOT3v, 2), // 16 - ALU_INSTR_IMPL(DOT2ADDv, 3), // 17 -- ??? 
- ALU_INSTR(CUBEv, 2), // 18 - ALU_INSTR_IMPL(MAX4v, 1), // 19 - ALU_INSTR(PRED_SETE_PUSHv, 2), // 20 - ALU_INSTR(PRED_SETNE_PUSHv, 2), // 21 - ALU_INSTR(PRED_SETGT_PUSHv, 2), // 22 - ALU_INSTR(PRED_SETGTE_PUSHv, 2), // 23 - ALU_INSTR(KILLEv, 2), // 24 - ALU_INSTR(KILLGTv, 2), // 25 - ALU_INSTR(KILLGTEv, 2), // 26 - ALU_INSTR(KILLNEv, 2), // 27 - ALU_INSTR(DSTv, 2), // 28 - ALU_INSTR(MOVAv, 1), // 29 -}; -static xe_gpu_translate_alu_info_t scalar_alu_instrs[0x40] = { - ALU_INSTR(ADDs, 1), // 0 - ALU_INSTR(ADD_PREVs, 1), // 1 - ALU_INSTR(MULs, 1), // 2 - ALU_INSTR(MUL_PREVs, 1), // 3 - ALU_INSTR(MUL_PREV2s, 1), // 4 - ALU_INSTR_IMPL(MAXs, 1), // 5 - ALU_INSTR_IMPL(MINs, 1), // 6 - ALU_INSTR_IMPL(SETEs, 1), // 7 - ALU_INSTR_IMPL(SETGTs, 1), // 8 - ALU_INSTR_IMPL(SETGTEs, 1), // 9 - ALU_INSTR_IMPL(SETNEs, 1), // 10 - ALU_INSTR(FRACs, 1), // 11 - ALU_INSTR(TRUNCs, 1), // 12 - ALU_INSTR(FLOORs, 1), // 13 - ALU_INSTR(EXP_IEEE, 1), // 14 - ALU_INSTR(LOG_CLAMP, 1), // 15 - ALU_INSTR(LOG_IEEE, 1), // 16 - ALU_INSTR(RECIP_CLAMP, 1), // 17 - ALU_INSTR(RECIP_FF, 1), // 18 - ALU_INSTR_IMPL(RECIP_IEEE, 1), // 19 - ALU_INSTR(RECIPSQ_CLAMP, 1), // 20 - ALU_INSTR(RECIPSQ_FF, 1), // 21 - ALU_INSTR(RECIPSQ_IEEE, 1), // 22 - ALU_INSTR(MOVAs, 1), // 23 - ALU_INSTR(MOVA_FLOORs, 1), // 24 - ALU_INSTR(SUBs, 1), // 25 - ALU_INSTR(SUB_PREVs, 1), // 26 - ALU_INSTR(PRED_SETEs, 1), // 27 - ALU_INSTR(PRED_SETNEs, 1), // 28 - ALU_INSTR(PRED_SETGTs, 1), // 29 - ALU_INSTR(PRED_SETGTEs, 1), // 30 - ALU_INSTR(PRED_SET_INVs, 1), // 31 - ALU_INSTR(PRED_SET_POPs, 1), // 32 - ALU_INSTR(PRED_SET_CLRs, 1), // 33 - ALU_INSTR(PRED_SET_RESTOREs, 1), // 34 - ALU_INSTR(KILLEs, 1), // 35 - ALU_INSTR(KILLGTs, 1), // 36 - ALU_INSTR(KILLGTEs, 1), // 37 - ALU_INSTR(KILLNEs, 1), // 38 - ALU_INSTR(KILLONEs, 1), // 39 - ALU_INSTR(SQRT_IEEE, 1), // 40 - { 0, 0, false }, - ALU_INSTR_IMPL(MUL_CONST_0, 2), // 42 - ALU_INSTR_IMPL(MUL_CONST_1, 2), // 43 - ALU_INSTR_IMPL(ADD_CONST_0, 2), // 44 - ALU_INSTR_IMPL(ADD_CONST_1, 2), // 45 - ALU_INSTR_IMPL(SUB_CONST_0, 2), // 46 - ALU_INSTR_IMPL(SUB_CONST_1, 2), // 47 - ALU_INSTR(SIN, 1), // 48 - ALU_INSTR(COS, 1), // 49 - ALU_INSTR(RETAIN_PREV, 1), // 50 -}; -#undef ALU_INSTR - -int TranslateALU( - xe_gpu_translate_ctx_t& ctx, const instr_alu_t* alu, int sync) { - Output* output = ctx.output; - - if (!alu->scalar_write_mask && !alu->vector_write_mask) { - output->append(" // \n"); - return 0; - } - - if (alu->vector_write_mask) { - // Disassemble vector op. - xe_gpu_translate_alu_info_t& iv = vector_alu_instrs[alu->vector_opc]; - output->append(" // %sALU:\t", sync ? "(S)" : " "); - output->append("%s", iv.name); - if (alu->pred_select & 0x2) { - // seems to work similar to conditional execution in ARM instruction - // set, so let's use a similar syntax for now: - output->append((alu->pred_select & 0x1) ? 
"EQ" : "NE"); - } - output->append("\t"); - print_dstreg(output, - alu->vector_dest, alu->vector_write_mask, alu->export_data); - output->append(" = "); - if (iv.num_srcs == 3) { - print_srcreg(output, - alu->src3_reg, alu->src3_sel, alu->src3_swiz, - alu->src3_reg_negate, alu->src3_reg_abs); - output->append(", "); - } - print_srcreg(output, - alu->src1_reg, alu->src1_sel, alu->src1_swiz, - alu->src1_reg_negate, alu->src1_reg_abs); - if (iv.num_srcs > 1) { - output->append(", "); - print_srcreg(output, - alu->src2_reg, alu->src2_sel, alu->src2_swiz, - alu->src2_reg_negate, alu->src2_reg_abs); - } - if (alu->vector_clamp) { - output->append(" CLAMP"); - } - if (alu->export_data) { - print_export_comment(output, alu->vector_dest, ctx.type); - } - output->append("\n"); - - // Translate vector op. - if (iv.fn) { - output->append(" "); - if (iv.fn(ctx, *alu)) { - return 1; - } - } else { - output->append(" // \n"); - } - } - - if (alu->scalar_write_mask || !alu->vector_write_mask) { - // 2nd optional scalar op: - - // Disassemble scalar op. - xe_gpu_translate_alu_info_t& is = scalar_alu_instrs[alu->scalar_opc]; - output->append(" // "); - output->append("\t"); - if (is.name) { - output->append("\t \t%s\t", is.name); - } else { - output->append("\t \tOP(%u)\t", alu->scalar_opc); - } - print_dstreg(output, - alu->scalar_dest, alu->scalar_write_mask, alu->export_data); - output->append(" = "); - if (is.num_srcs == 2) { - // ADD_CONST_0 dest, [const], [reg] - uint32_t src3_swiz = alu->src3_swiz & ~0x3C; - uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; - uint32_t swiz_b = (src3_swiz & 0x3); - print_srcreg(output, - alu->src3_reg, 0, 0, - alu->src3_reg_negate, alu->src3_reg_abs); - output->append(".%c", chan_names[swiz_a]); - output->append(", "); - uint32_t reg2 = (alu->scalar_opc & 1) | (alu->src3_swiz & 0x3C) | (alu->src3_sel << 1); - print_srcreg(output, - reg2, 1, 0, - alu->src3_reg_negate, alu->src3_reg_abs); - output->append(".%c", chan_names[swiz_b]); - } else { - print_srcreg(output, - alu->src3_reg, alu->src3_sel, alu->src3_swiz, - alu->src3_reg_negate, alu->src3_reg_abs); - } - if (alu->scalar_clamp) { - output->append(" CLAMP"); - } - if (alu->export_data) { - print_export_comment(output, alu->scalar_dest, ctx.type); - } - output->append("\n"); - - // Translate scalar op. 
- if (is.fn) { - output->append(" "); - if (is.fn(ctx, *alu)) { - return 1; - } - } else { - output->append(" // \n"); - } - } - - return 0; -} - -struct { - const char *name; -} fetch_types[0xff] = { -#define TYPE(id) { #id } - TYPE(FMT_1_REVERSE), // 0 - {0}, - TYPE(FMT_8), // 2 - {0}, - {0}, - {0}, - TYPE(FMT_8_8_8_8), // 6 - TYPE(FMT_2_10_10_10), // 7 - {0}, - {0}, - TYPE(FMT_8_8), // 10 - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - TYPE(FMT_16), // 24 - TYPE(FMT_16_16), // 25 - TYPE(FMT_16_16_16_16), // 26 - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - TYPE(FMT_32), // 33 - TYPE(FMT_32_32), // 34 - TYPE(FMT_32_32_32_32), // 35 - TYPE(FMT_32_FLOAT), // 36 - TYPE(FMT_32_32_FLOAT), // 37 - TYPE(FMT_32_32_32_32_FLOAT), // 38 - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - {0}, - TYPE(FMT_32_32_32_FLOAT), // 57 -#undef TYPE -}; - -void print_fetch_dst(Output* output, uint32_t dst_reg, uint32_t dst_swiz) { - output->append("\tR%u.", dst_reg); - for (int i = 0; i < 4; i++) { - output->append("%c", chan_names[dst_swiz & 0x7]); - dst_swiz >>= 3; - } -} - -void AppendFetchDest(Output* output, uint32_t dst_reg, uint32_t dst_swiz) { - output->append("r%u.", dst_reg); - for (int i = 0; i < 4; i++) { - output->append("%c", chan_names[dst_swiz & 0x7]); - dst_swiz >>= 3; - } -} - -int TranslateVertexFetch( - xe_gpu_translate_ctx_t& ctx, const instr_fetch_vtx_t* vtx, int sync) { - Output* output = ctx.output; - - // Disassemble. - output->append(" // %sFETCH:\t", sync ? "(S)" : " "); - if (vtx->pred_select) { - output->append(vtx->pred_condition ? "EQ" : "NE"); - } - print_fetch_dst(output, vtx->dst_reg, vtx->dst_swiz); - output->append(" = R%u.", vtx->src_reg); - output->append("%c", chan_names[vtx->src_swiz & 0x3]); - if (fetch_types[vtx->format].name) { - output->append(" %s", fetch_types[vtx->format].name); - } else { - output->append(" TYPE(0x%x)", vtx->format); - } - output->append(" %s", vtx->format_comp_all ? "SIGNED" : "UNSIGNED"); - if (!vtx->num_format_all) { - output->append(" NORMALIZED"); - } - output->append(" STRIDE(%u)", vtx->stride); - if (vtx->offset) { - output->append(" OFFSET(%u)", vtx->offset); - } - output->append(" CONST(%u, %u)", vtx->const_index, vtx->const_index_sel); - if (1) { - // XXX - output->append(" src_reg_am=%u", vtx->src_reg_am); - output->append(" dst_reg_am=%u", vtx->dst_reg_am); - output->append(" num_format_all=%u", vtx->num_format_all); - output->append(" signed_rf_mode_all=%u", vtx->signed_rf_mode_all); - output->append(" exp_adjust_all=%u", vtx->exp_adjust_all); - } - output->append("\n"); - - // Translate. - output->append(" "); - output->append("r%u.xyzw", vtx->dst_reg); - output->append(" = float4("); - uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel; - // TODO(benvanik): detect xyzw = xyzw, etc. - // TODO(benvanik): detect and set as rN = float4(samp.xyz, 1.0); / etc - uint32_t component_count = GetFormatComponentCount(vtx->format); - uint32_t dst_swiz = vtx->dst_swiz; - for (int i = 0; i < 4; i++) { - if ((dst_swiz & 0x7) == 4) { - output->append("0.0"); - } else if ((dst_swiz & 0x7) == 5) { - output->append("1.0"); - } else if ((dst_swiz & 0x7) == 6) { - // ? 
- output->append("?"); - } else if ((dst_swiz & 0x7) == 7) { - output->append("r%u.%c", vtx->dst_reg, chan_names[i]); - } else { - output->append("i.vf%u_%d.%c", - fetch_slot, vtx->offset, - chan_names[dst_swiz & 0x3]); - } - if (i < 3) { - output->append(", "); - } - dst_swiz >>= 3; - } - output->append(");\n"); - return 0; -} - -int TranslateTextureFetch( - xe_gpu_translate_ctx_t& ctx, const instr_fetch_tex_t* tex, int sync) { - Output* output = ctx.output; - - // Disassemble. - static const char *filter[] = { - "POINT", // TEX_FILTER_POINT - "LINEAR", // TEX_FILTER_LINEAR - "BASEMAP", // TEX_FILTER_BASEMAP - }; - static const char *aniso_filter[] = { - "DISABLED", // ANISO_FILTER_DISABLED - "MAX_1_1", // ANISO_FILTER_MAX_1_1 - "MAX_2_1", // ANISO_FILTER_MAX_2_1 - "MAX_4_1", // ANISO_FILTER_MAX_4_1 - "MAX_8_1", // ANISO_FILTER_MAX_8_1 - "MAX_16_1", // ANISO_FILTER_MAX_16_1 - }; - static const char *arbitrary_filter[] = { - "2x4_SYM", // ARBITRARY_FILTER_2X4_SYM - "2x4_ASYM", // ARBITRARY_FILTER_2X4_ASYM - "4x2_SYM", // ARBITRARY_FILTER_4X2_SYM - "4x2_ASYM", // ARBITRARY_FILTER_4X2_ASYM - "4x4_SYM", // ARBITRARY_FILTER_4X4_SYM - "4x4_ASYM", // ARBITRARY_FILTER_4X4_ASYM - }; - static const char *sample_loc[] = { - "CENTROID", // SAMPLE_CENTROID - "CENTER", // SAMPLE_CENTER - }; - uint32_t src_swiz = tex->src_swiz; - output->append(" // %sFETCH:\t", sync ? "(S)" : " "); - if (tex->pred_select) { - output->append(tex->pred_condition ? "EQ" : "NE"); - } - print_fetch_dst(output, tex->dst_reg, tex->dst_swiz); - output->append(" = R%u.", tex->src_reg); - for (int i = 0; i < 3; i++) { - output->append("%c", chan_names[src_swiz & 0x3]); - src_swiz >>= 2; - } - output->append(" CONST(%u)", tex->const_idx); - if (tex->fetch_valid_only) { - output->append(" VALID_ONLY"); - } - if (tex->tx_coord_denorm) { - output->append(" DENORM"); - } - if (tex->mag_filter != TEX_FILTER_USE_FETCH_CONST) { - output->append(" MAG(%s)", filter[tex->mag_filter]); - } - if (tex->min_filter != TEX_FILTER_USE_FETCH_CONST) { - output->append(" MIN(%s)", filter[tex->min_filter]); - } - if (tex->mip_filter != TEX_FILTER_USE_FETCH_CONST) { - output->append(" MIP(%s)", filter[tex->mip_filter]); - } - if (tex->aniso_filter != ANISO_FILTER_USE_FETCH_CONST) { - output->append(" ANISO(%s)", aniso_filter[tex->aniso_filter]); - } - if (tex->arbitrary_filter != ARBITRARY_FILTER_USE_FETCH_CONST) { - output->append(" ARBITRARY(%s)", arbitrary_filter[tex->arbitrary_filter]); - } - if (tex->vol_mag_filter != TEX_FILTER_USE_FETCH_CONST) { - output->append(" VOL_MAG(%s)", filter[tex->vol_mag_filter]); - } - if (tex->vol_min_filter != TEX_FILTER_USE_FETCH_CONST) { - output->append(" VOL_MIN(%s)", filter[tex->vol_min_filter]); - } - if (!tex->use_comp_lod) { - output->append(" LOD(%u)", tex->use_comp_lod); - output->append(" LOD_BIAS(%u)", tex->lod_bias); - } - if (tex->use_reg_lod) { - output->append(" REG_LOD(%u)", tex->use_reg_lod); - } - if (tex->use_reg_gradients) { - output->append(" USE_REG_GRADIENTS"); - } - output->append(" LOCATION(%s)", sample_loc[tex->sample_location]); - if (tex->offset_x || tex->offset_y || tex->offset_z) { - output->append(" OFFSET(%u,%u,%u)", tex->offset_x, tex->offset_y, tex->offset_z); - } - output->append("\n"); - - int src_component_count = 0; - switch (tex->dimension) { - case DIMENSION_1D: - src_component_count = 1; - break; - default: - case DIMENSION_2D: - src_component_count = 2; - break; - case DIMENSION_3D: - src_component_count = 3; - break; - case DIMENSION_CUBE: - src_component_count = 3; - 
break; - } - - // Translate. - output->append(" "); - output->append("r%u.xyzw", tex->dst_reg); - output->append(" = "); - output->append( - "x_texture_%d.Sample(x_sampler_%d, r%u.", - tex->const_idx, - ctx.tex_fetch_index++, // hacky way to line up to tex buffers - tex->src_reg); - src_swiz = tex->src_swiz; - for (int i = 0; i < src_component_count; i++) { - output->append("%c", chan_names[src_swiz & 0x3]); - src_swiz >>= 2; - } - output->append(")."); - - // Pass one over dest does xyzw and fakes the special values. - // TODO(benvanik): detect and set as rN = float4(samp.xyz, 1.0); / etc - uint32_t dst_swiz = tex->dst_swiz; - for (int i = 0; i < 4; i++) { - output->append("%c", chan_names[dst_swiz & 0x3]); - dst_swiz >>= 3; - } - output->append(";\n"); - // Do another pass to set constant values. - dst_swiz = tex->dst_swiz; - for (int i = 0; i < 4; i++) { - if ((dst_swiz & 0x7) == 4) { - output->append(" r%u.%c = 0.0;\n", tex->dst_reg, chan_names[i]); - } else if ((dst_swiz & 0x7) == 5) { - output->append(" r%u.%c = 1.0;\n", tex->dst_reg, chan_names[i]); - } - dst_swiz >>= 3; - } - return 0; -} - -struct { - const char *name; -} cf_instructions[] = { -#define INSTR(opc, fxn) { #opc } - INSTR(NOP, print_cf_nop), - INSTR(EXEC, print_cf_exec), - INSTR(EXEC_END, print_cf_exec), - INSTR(COND_EXEC, print_cf_exec), - INSTR(COND_EXEC_END, print_cf_exec), - INSTR(COND_PRED_EXEC, print_cf_exec), - INSTR(COND_PRED_EXEC_END, print_cf_exec), - INSTR(LOOP_START, print_cf_loop), - INSTR(LOOP_END, print_cf_loop), - INSTR(COND_CALL, print_cf_jmp_call), - INSTR(RETURN, print_cf_jmp_call), - INSTR(COND_JMP, print_cf_jmp_call), - INSTR(ALLOC, print_cf_alloc), - INSTR(COND_EXEC_PRED_CLEAN, print_cf_exec), - INSTR(COND_EXEC_PRED_CLEAN_END, print_cf_exec), - INSTR(MARK_VS_FETCH_DONE, print_cf_nop), // ?? 
-#undef INSTR -}; - -} // anonymous namespace - - -int D3D11Shader::TranslateExec(xe_gpu_translate_ctx_t& ctx, const instr_cf_exec_t& cf) { - Output* output = ctx.output; - - output->append( - " // %s ADDR(0x%x) CNT(0x%x)", - cf_instructions[cf.opc].name, cf.address, cf.count); - if (cf.yeild) { - output->append(" YIELD"); - } - uint8_t vc = cf.vc_hi | (cf.vc_lo << 2); - if (vc) { - output->append(" VC(0x%x)", vc); - } - if (cf.bool_addr) { - output->append(" BOOL_ADDR(0x%x)", cf.bool_addr); - } - if (cf.address_mode == ABSOLUTE_ADDR) { - output->append(" ABSOLUTE_ADDR"); - } - if (cf.is_cond_exec()) { - output->append(" COND(%d)", cf.condition); - } - output->append("\n"); - - uint32_t sequence = cf.serialize; - for (uint32_t i = 0; i < cf.count; i++) { - uint32_t alu_off = (cf.address + i); - int sync = sequence & 0x2; - if (sequence & 0x1) { - const instr_fetch_t* fetch = - (const instr_fetch_t*)(dwords_ + alu_off * 3); - switch (fetch->opc) { - case VTX_FETCH: - if (TranslateVertexFetch(ctx, &fetch->vtx, sync)) { - return 1; - } - break; - case TEX_FETCH: - if (TranslateTextureFetch(ctx, &fetch->tex, sync)) { - return 1; - } - break; - case TEX_GET_BORDER_COLOR_FRAC: - case TEX_GET_COMP_TEX_LOD: - case TEX_GET_GRADIENTS: - case TEX_GET_WEIGHTS: - case TEX_SET_TEX_LOD: - case TEX_SET_GRADIENTS_H: - case TEX_SET_GRADIENTS_V: - default: - XEASSERTALWAYS(); - break; - } - } else { - const instr_alu_t* alu = - (const instr_alu_t*)(dwords_ + alu_off * 3); - if (TranslateALU(ctx, alu, sync)) { - return 1; - } - } - sequence >>= 2; - } - - return 0; -} diff --git a/src/xenia/gpu/d3d11/d3d11_shader.h b/src/xenia/gpu/d3d11/d3d11_shader.h deleted file mode 100644 index 0b0bb492c..000000000 --- a/src/xenia/gpu/d3d11/d3d11_shader.h +++ /dev/null @@ -1,125 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#ifndef XENIA_GPU_D3D11_D3D11_SHADER_H_ -#define XENIA_GPU_D3D11_D3D11_SHADER_H_ - -#include - -#include -#include - -#include - - -namespace xe { -namespace gpu { -namespace d3d11 { - -struct Output; - -typedef struct { - Output* output; - xenos::XE_GPU_SHADER_TYPE type; - uint32_t tex_fetch_index; -} xe_gpu_translate_ctx_t; - -class D3D11GeometryShader; - - -class D3D11Shader : public Shader { -public: - virtual ~D3D11Shader(); - - const static uint32_t MAX_INTERPOLATORS = 16; - -protected: - D3D11Shader( - ID3D11Device* device, - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash); - - const char* translated_src() const { return translated_src_; } - void set_translated_src(char* value); - - void AppendTextureHeader(Output* output); - int TranslateExec( - xe_gpu_translate_ctx_t& ctx, const xenos::instr_cf_exec_t& cf); - - ID3D10Blob* Compile(const char* shader_source); - -protected: - ID3D11Device* device_; - - char* translated_src_; -}; - - -class D3D11VertexShader : public D3D11Shader { -public: - D3D11VertexShader( - ID3D11Device* device, - const uint8_t* src_ptr, size_t length, - uint64_t hash); - virtual ~D3D11VertexShader(); - - ID3D11VertexShader* handle() const { return handle_; } - ID3D11InputLayout* input_layout() const { return input_layout_; } - - int Prepare(xenos::xe_gpu_program_cntl_t* program_cntl); - - enum GeometryShaderType { - POINT_SPRITE_SHADER, - RECT_LIST_SHADER, - QUAD_LIST_SHADER, - - MAX_GEOMETRY_SHADER_TYPE, - }; - int DemandGeometryShader(GeometryShaderType type, - D3D11GeometryShader** out_shader); - -private: - const char* Translate(xenos::xe_gpu_program_cntl_t* program_cntl); - -private: - ID3D11VertexShader* handle_; - ID3D11InputLayout* input_layout_; - D3D11GeometryShader* geometry_shaders_[MAX_GEOMETRY_SHADER_TYPE]; -}; - - -class D3D11PixelShader : public D3D11Shader { -public: - D3D11PixelShader( - ID3D11Device* device, - const uint8_t* src_ptr, size_t length, - uint64_t hash); - virtual ~D3D11PixelShader(); - - ID3D11PixelShader* handle() const { return handle_; } - - int Prepare(xenos::xe_gpu_program_cntl_t* program_cntl, - D3D11VertexShader* input_shader); - -private: - const char* Translate(xenos::xe_gpu_program_cntl_t* program_cntl, - D3D11VertexShader* input_shader); - -private: - ID3D11PixelShader* handle_; -}; - - -} // namespace d3d11 -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_D3D11_D3D11_SHADER_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_shader_cache.cc b/src/xenia/gpu/d3d11/d3d11_shader_cache.cc deleted file mode 100644 index 7f6a5a722..000000000 --- a/src/xenia/gpu/d3d11/d3d11_shader_cache.cc +++ /dev/null @@ -1,45 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#include - -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::d3d11; -using namespace xe::gpu::xenos; - - -D3D11ShaderCache::D3D11ShaderCache(ID3D11Device* device) { - device_ = device; - device_->AddRef(); -} - -D3D11ShaderCache::~D3D11ShaderCache() { - device_->Release(); -} - -Shader* D3D11ShaderCache::CreateCore( - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash) { - switch (type) { - case XE_GPU_SHADER_TYPE_VERTEX: - return new D3D11VertexShader( - device_, src_ptr, length, hash); - case XE_GPU_SHADER_TYPE_PIXEL: - return new D3D11PixelShader( - device_, src_ptr, length, hash); - default: - XEASSERTALWAYS(); - return NULL; - } -} \ No newline at end of file diff --git a/src/xenia/gpu/d3d11/d3d11_shader_cache.h b/src/xenia/gpu/d3d11/d3d11_shader_cache.h deleted file mode 100644 index 661fb38f8..000000000 --- a/src/xenia/gpu/d3d11/d3d11_shader_cache.h +++ /dev/null @@ -1,46 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_D3D11_D3D11_SHADER_CACHE_H_ -#define XENIA_GPU_D3D11_D3D11_SHADER_CACHE_H_ - -#include - -#include - -#include - - -namespace xe { -namespace gpu { -namespace d3d11 { - - -class D3D11ShaderCache : public ShaderCache { -public: - D3D11ShaderCache(ID3D11Device* device); - virtual ~D3D11ShaderCache(); - -protected: - virtual Shader* CreateCore( - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash); - -protected: - ID3D11Device* device_; -}; - - -} // namespace d3d11 -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_D3D11_D3D11_SHADER_CACHE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_shader_resource.cc b/src/xenia/gpu/d3d11/d3d11_shader_resource.cc new file mode 100644 index 000000000..e4be7e2cf --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_shader_resource.cc @@ -0,0 +1,381 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include +#include +#include + +#include + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; +using namespace xe::gpu::xenos; + + +namespace { + +ID3D10Blob* D3D11ShaderCompile(XE_GPU_SHADER_TYPE type, + const char* shader_source, + const char* disasm_source) { + SCOPE_profile_cpu_f("gpu"); + + // TODO(benvanik): pick shared runtime mode defines. + D3D10_SHADER_MACRO defines[] = { + "TEST_DEFINE", "1", + 0, 0, + }; + + uint32_t flags1 = 0; + flags1 |= D3D10_SHADER_DEBUG; + flags1 |= D3D10_SHADER_ENABLE_STRICTNESS; + uint32_t flags2 = 0; + + // Create a name. 
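The naming block that follows derives a stable dump filename from a hash of the disassembly text, so recompiling the same shader overwrites the same file instead of piling up duplicates. An illustrative stand-in for the scheme, using `std::hash` in place of `xe_hash64` (the names here are hypothetical, not from this codebase):

```cpp
#include <cstdio>
#include <functional>
#include <string>

// Build "<base>/gen_<hash>.vs" or ".ps" from the disassembly text; the
// hash keys the file so identical shaders map to identical names.
static std::string DumpFileName(const std::string& base_path,
                                const std::string& disasm_source,
                                bool is_vertex) {
  size_t hash = std::hash<std::string>{}(disasm_source);
  char buffer[260];
  std::snprintf(buffer, sizeof(buffer), "%s/gen_%.16zX.%s",
                base_path.c_str(), hash, is_vertex ? "vs" : "ps");
  return buffer;
}
```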
+  const char* base_path = "";
+  if (FLAGS_dump_shaders.size()) {
+    base_path = FLAGS_dump_shaders.c_str();
+  }
+  size_t hash = xe_hash64(disasm_source, xestrlena(disasm_source)); // ?
+  char file_name[XE_MAX_PATH];
+  xesnprintfa(file_name, XECOUNT(file_name),
+              "%s/gen_%.16llX.%s",
+              base_path,
+              hash,
+              type == XE_GPU_SHADER_TYPE_VERTEX ? "vs" : "ps");
+
+  if (FLAGS_dump_shaders.size()) {
+    FILE* f = fopen(file_name, "w");
+    // Write through "%s" so a '%' in the shader text is not misread as a
+    // format specifier.
+    fprintf(f, "%s", shader_source);
+    fprintf(f, "\n\n");
+    fprintf(f, "/*\n");
+    fprintf(f, "%s", disasm_source);
+    fprintf(f, " */\n");
+    fclose(f);
+  }
+
+  // Compile shader to bytecode blob.
+  ID3D10Blob* shader_blob = 0;
+  ID3D10Blob* error_blob = 0;
+  HRESULT hr = D3DCompile(
+      shader_source, strlen(shader_source),
+      file_name,
+      defines, nullptr,
+      "main",
+      type == XE_GPU_SHADER_TYPE_VERTEX ? "vs_5_0" : "ps_5_0",
+      flags1, flags2,
+      &shader_blob, &error_blob);
+  if (error_blob) {
+    char* msg = (char*)error_blob->GetBufferPointer();
+    XELOGE("D3D11: shader compile failed with %s", msg);
+  }
+  XESAFERELEASE(error_blob);
+  if (FAILED(hr)) {
+    return nullptr;
+  }
+  return shader_blob;
+}
+
+}  // namespace
+
+
+D3D11VertexShaderResource::D3D11VertexShaderResource(
+    D3D11ResourceCache* resource_cache,
+    const MemoryRange& memory_range,
+    const Info& info)
+    : VertexShaderResource(memory_range, info),
+      resource_cache_(resource_cache),
+      handle_(nullptr),
+      input_layout_(nullptr),
+      translated_src_(nullptr) {
+  xe_zero_struct(geometry_shaders_, sizeof(geometry_shaders_));
+}
+
+D3D11VertexShaderResource::~D3D11VertexShaderResource() {
+  XESAFERELEASE(handle_);
+  XESAFERELEASE(input_layout_);
+  for (int i = 0; i < XECOUNT(geometry_shaders_); ++i) {
+    delete geometry_shaders_[i];
+  }
+  xe_free(translated_src_);
+}
+
+int D3D11VertexShaderResource::Prepare(
+    const xe_gpu_program_cntl_t& program_cntl) {
+  SCOPE_profile_cpu_f("gpu");
+  if (is_prepared_ || handle_) {
+    return 0;
+  }
+
+  // TODO(benvanik): look in file based on hash/etc.
+  void* byte_code = NULL;
+  size_t byte_code_length = 0;
+
+  // Translate and compile source.
+  D3D11ShaderTranslator translator;
+  int ret = translator.TranslateVertexShader(this, program_cntl);
+  if (ret) {
+    XELOGE("D3D11: failed to translate vertex shader");
+    return ret;
+  }
+  translated_src_ = xestrdupa(translator.translated_src());
+
+  ID3D10Blob* shader_blob = D3D11ShaderCompile(
+      XE_GPU_SHADER_TYPE_VERTEX, translated_src_, disasm_src());
+  if (!shader_blob) {
+    return 1;
+  }
+  byte_code_length = shader_blob->GetBufferSize();
+  byte_code = xe_malloc(byte_code_length);
+  xe_copy_struct(
+      byte_code, shader_blob->GetBufferPointer(), byte_code_length);
+  XESAFERELEASE(shader_blob);
+
+  // Create shader.
+  HRESULT hr = resource_cache_->device()->CreateVertexShader(
+      byte_code, byte_code_length,
+      nullptr,
+      &handle_);
+  if (FAILED(hr)) {
+    XELOGE("D3D11: failed to create vertex shader");
+    xe_free(byte_code);
+    return 1;
+  }
+
+  // Create input layout.
+ ret = CreateInputLayout(byte_code, byte_code_length); + xe_free(byte_code); + if (ret) { + return 1; + } + is_prepared_ = true; + return 0; +} + +int D3D11VertexShaderResource::CreateInputLayout(const void* byte_code, + size_t byte_code_length) { + size_t element_count = 0; + const auto& inputs = buffer_inputs(); + for (uint32_t n = 0; n < inputs.count; n++) { + element_count += inputs.descs[n].info.element_count; + } + if (!element_count) { + XELOGW("D3D11: vertex shader with zero inputs -- retaining previous values?"); + input_layout_ = NULL; + return 0; + } + + D3D11_INPUT_ELEMENT_DESC* element_descs = + (D3D11_INPUT_ELEMENT_DESC*)xe_alloca( + sizeof(D3D11_INPUT_ELEMENT_DESC) * element_count); + uint32_t el_index = 0; + for (uint32_t n = 0; n < inputs.count; n++) { + const auto& input = inputs.descs[n]; + for (uint32_t m = 0; m < input.info.element_count; m++) { + const auto& el = input.info.elements[m]; + uint32_t vb_slot = input.input_index; + DXGI_FORMAT vtx_format; + switch (el.format) { + case FMT_8_8_8_8: + if (el.is_normalized) { + vtx_format = el.is_signed ? + DXGI_FORMAT_R8G8B8A8_SNORM : DXGI_FORMAT_R8G8B8A8_UNORM; + } else { + vtx_format = el.is_signed ? + DXGI_FORMAT_R8G8B8A8_SINT : DXGI_FORMAT_R8G8B8A8_UINT; + } + break; + case FMT_2_10_10_10: + if (el.is_normalized) { + vtx_format = DXGI_FORMAT_R10G10B10A2_UNORM; + } else { + vtx_format = DXGI_FORMAT_R10G10B10A2_UINT; + } + break; + // DXGI_FORMAT_R11G11B10_FLOAT? + case FMT_16_16: + if (el.is_normalized) { + vtx_format = el.is_signed ? + DXGI_FORMAT_R16G16_SNORM : DXGI_FORMAT_R16G16_UNORM; + } else { + vtx_format = el.is_signed ? + DXGI_FORMAT_R16G16_SINT : DXGI_FORMAT_R16G16_UINT; + } + break; + case FMT_16_16_16_16: + if (el.is_normalized) { + vtx_format = el.is_signed ? + DXGI_FORMAT_R16G16B16A16_SNORM : DXGI_FORMAT_R16G16B16A16_UNORM; + } else { + vtx_format = el.is_signed ? + DXGI_FORMAT_R16G16B16A16_SINT : DXGI_FORMAT_R16G16B16A16_UINT; + } + break; + case FMT_16_16_FLOAT: + vtx_format = DXGI_FORMAT_R16G16_FLOAT; + break; + case FMT_16_16_16_16_FLOAT: + vtx_format = DXGI_FORMAT_R16G16B16A16_FLOAT; + break; + case FMT_32: + vtx_format = el.is_signed ? + DXGI_FORMAT_R32_SINT : DXGI_FORMAT_R32_UINT; + break; + case FMT_32_32: + vtx_format = el.is_signed ? + DXGI_FORMAT_R32G32_SINT : DXGI_FORMAT_R32G32_UINT; + break; + case FMT_32_32_32_32: + vtx_format = el.is_signed ? 
+ DXGI_FORMAT_R32G32B32A32_SINT : DXGI_FORMAT_R32G32B32A32_UINT; + break; + case FMT_32_FLOAT: + vtx_format = DXGI_FORMAT_R32_FLOAT; + break; + case FMT_32_32_FLOAT: + vtx_format = DXGI_FORMAT_R32G32_FLOAT; + break; + case FMT_32_32_32_FLOAT: + vtx_format = DXGI_FORMAT_R32G32B32_FLOAT; + break; + case FMT_32_32_32_32_FLOAT: + vtx_format = DXGI_FORMAT_R32G32B32A32_FLOAT; + break; + default: + XEASSERTALWAYS(); + break; + } + element_descs[el_index].SemanticName = "XE_VF"; + element_descs[el_index].SemanticIndex = el_index; + element_descs[el_index].Format = vtx_format; + element_descs[el_index].InputSlot = vb_slot; + element_descs[el_index].AlignedByteOffset = el.offset_words * 4; + element_descs[el_index].InputSlotClass = D3D11_INPUT_PER_VERTEX_DATA; + element_descs[el_index].InstanceDataStepRate = 0; + el_index++; + } + } + HRESULT hr = resource_cache_->device()->CreateInputLayout( + element_descs, + (UINT)element_count, + byte_code, byte_code_length, + &input_layout_); + if (FAILED(hr)) { + XELOGE("D3D11: failed to create vertex shader input layout"); + return 1; + } + + return 0; +} + +int D3D11VertexShaderResource::DemandGeometryShader( + GeometryShaderType type, D3D11GeometryShader** out_shader) { + if (geometry_shaders_[type]) { + *out_shader = geometry_shaders_[type]; + return 0; + } + + // Demand generate. + auto device = resource_cache_->device(); + D3D11GeometryShader* shader = nullptr; + switch (type) { + case POINT_SPRITE_SHADER: + shader = new D3D11PointSpriteGeometryShader(device); + break; + case RECT_LIST_SHADER: + shader = new D3D11RectListGeometryShader(device); + break; + case QUAD_LIST_SHADER: + shader = new D3D11QuadListGeometryShader(device); + break; + default: + XEASSERTALWAYS(); + return 1; + } + if (!shader) { + return 1; + } + + if (shader->Prepare(this)) { + delete shader; + return 1; + } + + geometry_shaders_[type] = shader; + *out_shader = geometry_shaders_[type]; + return 0; +} + +D3D11PixelShaderResource::D3D11PixelShaderResource( + D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info) + : PixelShaderResource(memory_range, info), + resource_cache_(resource_cache), + handle_(nullptr), + translated_src_(nullptr) { +} + +D3D11PixelShaderResource::~D3D11PixelShaderResource() { + XESAFERELEASE(handle_); + xe_free(translated_src_); +} + +int D3D11PixelShaderResource::Prepare(const xe_gpu_program_cntl_t& program_cntl, + VertexShaderResource* input_shader) { + SCOPE_profile_cpu_f("gpu"); + if (is_prepared_ || handle_) { + return 0; + } + + // TODO(benvanik): look in file based on hash/etc. + void* byte_code = NULL; + size_t byte_code_length = 0; + + // Translate and compile source. + D3D11ShaderTranslator translator; + int ret = translator.TranslatePixelShader(this, + program_cntl, + input_shader->alloc_counts()); + if (ret) { + XELOGE("D3D11: failed to translate pixel shader"); + return ret; + } + translated_src_ = xestrdupa(translator.translated_src()); + + ID3D10Blob* shader_blob = D3D11ShaderCompile( + XE_GPU_SHADER_TYPE_PIXEL, translated_src_, disasm_src()); + if (!shader_blob) { + return 1; + } + byte_code_length = shader_blob->GetBufferSize(); + byte_code = xe_malloc(byte_code_length); + xe_copy_struct( + byte_code, shader_blob->GetBufferPointer(), byte_code_length); + XESAFERELEASE(shader_blob); + + // Create shader. 
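Before the device call below, note that both `Prepare()` paths shuttle bytecode around with raw pointers and repeat `XESAFERELEASE`/`xe_free` on every exit. One possible tidy-up, sketched here as a suggestion rather than anything the code above does, is a `unique_ptr` deleter for COM objects:

```cpp
#include <d3dcommon.h>  // ID3D10Blob, IUnknown
#include <memory>

// Scoped release for COM pointers; replaces manual XESAFERELEASE calls.
struct ComRelease {
  void operator()(IUnknown* obj) const {
    if (obj) obj->Release();
  }
};
using BlobPtr = std::unique_ptr<ID3D10Blob, ComRelease>;

// Hypothetical usage:
//   BlobPtr blob(D3D11ShaderCompile(type, src, disasm));
//   if (!blob) return 1;
//   device->CreatePixelShader(blob->GetBufferPointer(),
//                             blob->GetBufferSize(), nullptr, &handle);
```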
+ HRESULT hr = resource_cache_->device()->CreatePixelShader( + byte_code, byte_code_length, + nullptr, + &handle_); + if (FAILED(hr)) { + XELOGE("D3D11: failed to create pixel shader"); + xe_free(byte_code); + return 1; + } + + xe_free(byte_code); + is_prepared_ = true; + return 0; +} diff --git a/src/xenia/gpu/d3d11/d3d11_shader_resource.h b/src/xenia/gpu/d3d11/d3d11_shader_resource.h new file mode 100644 index 000000000..5c0da8242 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_shader_resource.h @@ -0,0 +1,91 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_SHADER_RESOURCE_H_ +#define XENIA_GPU_D3D11_D3D11_SHADER_RESOURCE_H_ + +#include +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + +class D3D11GeometryShader; +class D3D11ResourceCache; + +struct Output; +typedef struct { + Output* output; + xenos::XE_GPU_SHADER_TYPE type; + uint32_t tex_fetch_index; +} xe_gpu_translate_ctx_t; + +class D3D11VertexShaderResource : public VertexShaderResource { +public: + D3D11VertexShaderResource(D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info); + ~D3D11VertexShaderResource() override; + + void* handle() const override { return handle_; } + ID3D11InputLayout* input_layout() const { return input_layout_; } + const char* translated_src() const { return translated_src_; } + + int Prepare(const xenos::xe_gpu_program_cntl_t& program_cntl) override; + + enum GeometryShaderType { + POINT_SPRITE_SHADER, + RECT_LIST_SHADER, + QUAD_LIST_SHADER, + MAX_GEOMETRY_SHADER_TYPE, // keep at the end + }; + int DemandGeometryShader(GeometryShaderType type, + D3D11GeometryShader** out_shader); + +private: + int CreateInputLayout(const void* byte_code, size_t byte_code_length); + + D3D11ResourceCache* resource_cache_; + ID3D11VertexShader* handle_; + ID3D11InputLayout* input_layout_; + D3D11GeometryShader* geometry_shaders_[MAX_GEOMETRY_SHADER_TYPE]; + char* translated_src_; +}; + + +class D3D11PixelShaderResource : public PixelShaderResource { +public: + D3D11PixelShaderResource(D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info); + ~D3D11PixelShaderResource() override; + + void* handle() const override { return handle_; } + const char* translated_src() const { return translated_src_; } + + int Prepare(const xenos::xe_gpu_program_cntl_t& program_cntl, + VertexShaderResource* vertex_shader) override; + +private: + D3D11ResourceCache* resource_cache_; + ID3D11PixelShader* handle_; + char* translated_src_; +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_SHADER_RESOURCE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_shader_translator.cc b/src/xenia/gpu/d3d11/d3d11_shader_translator.cc new file mode 100644 index 000000000..5bb28c6e6 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_shader_translator.cc @@ -0,0 +1,1631 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben 
Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; +using namespace xe::gpu::xenos; + + +namespace { + +const char* GetFormatTypeName(const VertexBufferResource::DeclElement& el) { + switch (el.format) { + case FMT_32: + return el.is_signed ? "int" : "uint"; + case FMT_32_FLOAT: + return "float"; + case FMT_16_16: + case FMT_32_32: + if (el.is_normalized) { + return el.is_signed ? "snorm float2" : "unorm float2"; + } else { + return el.is_signed ? "int2" : "uint2"; + } + case FMT_16_16_FLOAT: + case FMT_32_32_FLOAT: + return "float2"; + case FMT_10_11_11: + case FMT_11_11_10: + return "int3"; // ? + case FMT_32_32_32_FLOAT: + return "float3"; + case FMT_8_8_8_8: + case FMT_2_10_10_10: + case FMT_16_16_16_16: + case FMT_32_32_32_32: + if (el.is_normalized) { + return el.is_signed ? "snorm float4" : "unorm float4"; + } else { + return el.is_signed ? "int4" : "uint4"; + } + case FMT_16_16_16_16_FLOAT: + case FMT_32_32_32_32_FLOAT: + return "float4"; + default: + XELOGE("Unknown vertex format: %d", el.format); + XEASSERTALWAYS(); + return "float4"; + } +} + +} // anonymous namespace + +D3D11ShaderTranslator::D3D11ShaderTranslator() + : capacity_(kCapacity), offset_(0) { + buffer_[0] = 0; +} + +int D3D11ShaderTranslator::TranslateVertexShader( + VertexShaderResource* vertex_shader, + const xe_gpu_program_cntl_t& program_cntl) { + SCOPE_profile_cpu_f("gpu"); + + type_ = XE_GPU_SHADER_TYPE_VERTEX; + tex_fetch_index_ = 0; + dwords_ = vertex_shader->dwords(); + + // Add constants buffers. + // We could optimize this by only including used buffers, but the compiler + // seems to do a good job of doing this for us. + // It also does read detection, so c[512] can end up c[4] in the asm - + // instead of doing this optimization ourselves we could maybe just query + // this from the compiler. + append( + "cbuffer float_consts : register(b0) {\n" + " float4 c[512];\n" + "};\n"); + // TODO(benvanik): add bool/loop constants. + + AppendTextureHeader(vertex_shader->sampler_inputs()); + + // Transform utilities. We adjust the output position in various ways + // as we can't do this via D3D11 APIs. + append( + "cbuffer vs_consts : register(b3) {\n" + " float4 window;\n" // x,y,w,h + " float4 viewport_z_enable;\n" // min,(max - min),?,enabled + " float4 viewport_size;\n" // x,y,w,h + "};" + "float4 applyViewport(float4 pos) {\n" + " if (viewport_z_enable.w) {\n" + //" pos.x = (pos.x + 1) * viewport_size.z * 0.5 + viewport_size.x;\n" + //" pos.y = (1 - pos.y) * viewport_size.w * 0.5 + viewport_size.y;\n" + //" pos.z = viewport_z_enable.x + pos.z * viewport_z_enable.y;\n" + // w? + " } else {\n" + " pos.xy = pos.xy / float2(window.z / 2.0, -window.w / 2.0) + float2(-1.0, 1.0);\n" + " pos.zw = float2(0.0, 1.0);\n" + " }\n" + " pos.xy += window.xy;\n" + " return pos;\n" + "}\n"); + + // Add vertex shader input. 
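+ // Rough sketch of what gets emitted here, assuming a shader that fetches
+ // a float4 from fetch constant 95 (sel 0) and a float2 from constant 96
+ // (hypothetical values):
+ //   struct VS_INPUT {
+ //     float4 vf285_0 : XE_VF0;  // fetch_slot = 95 * 3 + 0
+ //     float2 vf288_0 : XE_VF1;  // fetch_slot = 96 * 3 + 0
+ //   };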
+ append( + "struct VS_INPUT {\n"); + uint32_t el_index = 0; + const auto& buffer_inputs = vertex_shader->buffer_inputs(); + for (uint32_t n = 0; n < buffer_inputs.count; n++) { + const auto& input = buffer_inputs.descs[n]; + for (uint32_t m = 0; m < input.info.element_count; m++) { + const auto& el = input.info.elements[m]; + const char* type_name = GetFormatTypeName(el); + const auto& fetch = el.vtx_fetch; + uint32_t fetch_slot = fetch.const_index * 3 + fetch.const_index_sel; + append( + " %s vf%u_%d : XE_VF%u;\n", + type_name, fetch_slot, fetch.offset, el_index); + el_index++; + } + } + append( + "};\n"); + + // Add vertex shader output (pixel shader input). + const auto& alloc_counts = vertex_shader->alloc_counts(); + append( + "struct VS_OUTPUT {\n"); + if (alloc_counts.positions) { + XEASSERT(alloc_counts.positions == 1); + append( + " float4 oPos : SV_POSITION;\n"); + } + if (alloc_counts.params) { + append( + " float4 o[%d] : XE_O;\n", + kMaxInterpolators); + } + if (alloc_counts.point_size) { + append( + " float4 oPointSize : PSIZE;\n"); + } + append( + "};\n"); + + // Vertex shader main() header. + append( + "VS_OUTPUT main(VS_INPUT i) {\n" + " VS_OUTPUT o;\n"); + + // Always write position, as some shaders seem to only write certain values. + if (alloc_counts.positions) { + append( + " o.oPos = float4(0.0, 0.0, 0.0, 0.0);\n"); + } + if (alloc_counts.point_size) { + append( + " o.oPointSize = float4(1.0, 0.0, 0.0, 0.0);\n"); + } + + // TODO(benvanik): remove this, if possible (though the compiler may be smart + // enough to do it for us). + if (alloc_counts.params) { + for (uint32_t n = 0; n < kMaxInterpolators; n++) { + append( + " o.o[%d] = float4(0.0, 0.0, 0.0, 0.0);\n", n); + } + } + + // Add temporaries for any registers we may use. + uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs; + for (uint32_t n = 0; n <= temp_regs; n++) { + append( + " float4 r%d = c[%d];\n", n, n); + } + append(" float4 t;\n"); + + // Execute blocks. + const auto& execs = vertex_shader->execs(); + for (auto it = execs.begin(); it != execs.end(); ++it) { + const instr_cf_exec_t& cf = *it; + // TODO(benvanik): figure out how sequences/jmps/loops/etc work. + if (TranslateExec(cf)) { + return 1; + } + } + + // main footer. + if (alloc_counts.positions) { + append( + " o.oPos = applyViewport(o.oPos);\n"); + } + append( + " return o;\n" + "};\n"); + + return 0; +} + +int D3D11ShaderTranslator::TranslatePixelShader( + PixelShaderResource* pixel_shader, + const xe_gpu_program_cntl_t& program_cntl, + const VertexShaderResource::AllocCounts& alloc_counts) { + SCOPE_profile_cpu_f("gpu"); + + // We need an input VS to make decisions here. + // TODO(benvanik): do we need to pair VS/PS up and store the combination? + // If the same PS is used with different VS that output different amounts + // (and less than the number of required registers), things may die. + + type_ = XE_GPU_SHADER_TYPE_PIXEL; + tex_fetch_index_ = 0; + dwords_ = pixel_shader->dwords(); + + // Add constants buffers. + // We could optimize this by only including used buffers, but the compiler + // seems to do a good job of doing this for us. + // It also does read detection, so c[512] can end up c[4] in the asm - + // instead of doing this optimization ourselves we could maybe just query + // this from the compiler. + append( + "cbuffer float_consts : register(b0) {\n" + " float4 c[512];\n" + "};\n"); + // TODO(benvanik): add bool/loop constants. 
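+ // Note: pixel shader constant reads are remapped into the upper half of
+ // this array (AppendSrcReg emits c[N + 256]), so both stages share one
+ // 512-entry layout.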
+ + AppendTextureHeader(pixel_shader->sampler_inputs()); + + // Add vertex shader output (pixel shader input). + append( + "struct VS_OUTPUT {\n"); + if (alloc_counts.positions) { + XEASSERT(alloc_counts.positions == 1); + append( + " float4 oPos : SV_POSITION;\n"); + } + if (alloc_counts.params) { + append( + " float4 o[%d] : XE_O;\n", + kMaxInterpolators); + } + append( + "};\n"); + + // Add pixel shader output. + append( + "struct PS_OUTPUT {\n"); + for (uint32_t n = 0; n < alloc_counts.params; n++) { + append( + " float4 oC%d : SV_TARGET%d;\n", n, n); + if (program_cntl.ps_export_depth) { + // Is this per render-target? + append( + " float oD%d : SV_DEPTH%d;\n", n, n); + } + } + append( + "};\n"); + + // Pixel shader main() header. + append( + "PS_OUTPUT main(VS_OUTPUT i) {\n" + " PS_OUTPUT o;\n" + " o.oC0 = float4(1.0, 0.0, 0.0, 1.0);\n"); + + // Add temporary registers. + uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs; + for (uint32_t n = 0; n <= MAX(15, temp_regs); n++) { + append( + " float4 r%d = c[%d];\n", n, n + 256); + } + append(" float4 t;\n"); + + // Bring registers local. + if (alloc_counts.params) { + for (uint32_t n = 0; n < kMaxInterpolators; n++) { + append( + " r%d = i.o[%d];\n", n, n); + } + } + + // Execute blocks. + const auto& execs = pixel_shader->execs(); + for (auto it = execs.begin(); it != execs.end(); ++it) { + const instr_cf_exec_t& cf = *it; + // TODO(benvanik): figure out how sequences/jmps/loops/etc work. + if (TranslateExec(cf)) { + return 1; + } + } + + // main footer. + append( + " return o;\n" + "}\n"); + + return 0; +} + +void D3D11ShaderTranslator::AppendTextureHeader( + const ShaderResource::SamplerInputs& sampler_inputs) { + bool fetch_setup[32] = { false }; + + // 1 texture per constant slot, 1 sampler per fetch. + for (uint32_t n = 0; n < sampler_inputs.count; n++) { + const auto& input = sampler_inputs.descs[n]; + const auto& fetch = input.tex_fetch; + + // Add texture, if needed. + if (!fetch_setup[fetch.const_idx]) { + fetch_setup[fetch.const_idx] = true; + const char* texture_type = NULL; + switch (fetch.dimension) { + case DIMENSION_1D: + texture_type = "Texture1D"; + break; + default: + case DIMENSION_2D: + texture_type = "Texture2D"; + break; + case DIMENSION_3D: + texture_type = "Texture3D"; + break; + case DIMENSION_CUBE: + texture_type = "TextureCube"; + break; + } + append("%s x_texture_%d;\n", texture_type, fetch.const_idx); + } + + // Add sampler. + append("SamplerState x_sampler_%d;\n", n); + } +} + +namespace { + +static const char chan_names[] = { + 'x', 'y', 'z', 'w', + // these only apply to FETCH dst's, and we shouldn't be using them: + '0', '1', '?', '_', +}; + +} // namespace + +void D3D11ShaderTranslator::AppendSrcReg(uint32_t num, uint32_t type, + uint32_t swiz, uint32_t negate, + uint32_t abs) { + if (negate) { + append("-"); + } + if (abs) { + append("abs("); + } + if (type) { + // Register. + append("r%u", num); + } else { + // Constant. + append("c[%u]", type_ == XE_GPU_SHADER_TYPE_PIXEL ? num + 256 : num); + } + if (swiz) { + append("."); + for (int i = 0; i < 4; i++) { + append("%c", chan_names[(swiz + i) & 0x3]); + swiz >>= 2; + } + } + if (abs) { + append(")"); + } +} + +void D3D11ShaderTranslator::AppendDestRegName(uint32_t num, uint32_t dst_exp) { + if (!dst_exp) { + // Register. + append("r%u", num); + } else { + // Export. 
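+ // Export register numbers are fixed by the ucode: in a vertex shader
+ // 62 is the position and 63 the point size, anything else is an
+ // interpolator; in a pixel shader 0 is the first color target.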
+ switch (type_) {
+ case XE_GPU_SHADER_TYPE_VERTEX:
+ switch (num) {
+ case 62:
+ append("o.oPos");
+ break;
+ case 63:
+ append("o.oPointSize");
+ break;
+ default:
+ // Varying.
+ append("o.o[%u]", num);
+ break;
+ }
+ break;
+ case XE_GPU_SHADER_TYPE_PIXEL:
+ switch (num) {
+ case 0:
+ append("o.oC0");
+ break;
+ default:
+ // TODO(benvanik): other render targets?
+ // TODO(benvanik): depth?
+ XEASSERTALWAYS();
+ break;
+ }
+ break;
+ }
+ }
+}
+
+void D3D11ShaderTranslator::AppendDestReg(uint32_t num, uint32_t mask,
+ uint32_t dst_exp) {
+ if (mask != 0xF) {
+ // If masking, store to a temporary variable and clean it up later.
+ append("t");
+ } else {
+ // Store directly to output.
+ AppendDestRegName(num, dst_exp);
+ }
+}
+
+void D3D11ShaderTranslator::AppendDestRegPost(uint32_t num, uint32_t mask,
+ uint32_t dst_exp) {
+ if (mask != 0xF) {
+ // Masking.
+ append(" ");
+ AppendDestRegName(num, dst_exp);
+ append(" = float4(");
+ for (int i = 0; i < 4; i++) {
+ // TODO(benvanik): mask out values? mix in old value as temp?
+ // append("%c", (mask & 0x1) ? chan_names[i] : 'w');
+ if (!(mask & 0x1)) {
+ AppendDestRegName(num, dst_exp);
+ } else {
+ append("t");
+ }
+ append(".%c", chan_names[i]);
+ mask >>= 1;
+ if (i < 3) {
+ append(", ");
+ }
+ }
+ append(");\n");
+ }
+}
+
+void D3D11ShaderTranslator::PrintSrcReg(uint32_t num, uint32_t type,
+ uint32_t swiz, uint32_t negate,
+ uint32_t abs) {
+ if (negate) {
+ append("-");
+ }
+ if (abs) {
+ append("|");
+ }
+ append("%c%u", type ? 'R' : 'C', num);
+ if (swiz) {
+ append(".");
+ for (int i = 0; i < 4; i++) {
+ append("%c", chan_names[(swiz + i) & 0x3]);
+ swiz >>= 2;
+ }
+ }
+ if (abs) {
+ append("|");
+ }
+}
+
+void D3D11ShaderTranslator::PrintDstReg(uint32_t num, uint32_t mask,
+ uint32_t dst_exp) {
+ append("%s%u", dst_exp ? "export" : "R", num);
+ if (mask != 0xf) {
+ append(".");
+ for (int i = 0; i < 4; i++) {
+ append("%c", (mask & 0x1) ? chan_names[i] : '_');
+ mask >>= 1;
+ }
+ }
+}
+
+void D3D11ShaderTranslator::PrintExportComment(uint32_t num) {
+ const char *name = NULL;
+ switch (type_) {
+ case XE_GPU_SHADER_TYPE_VERTEX:
+ switch (num) {
+ case 62: name = "gl_Position"; break;
+ case 63: name = "gl_PointSize"; break;
+ }
+ break;
+ case XE_GPU_SHADER_TYPE_PIXEL:
+ switch (num) {
+ case 0: name = "gl_FragColor"; break;
+ }
+ break;
+ }
+ /* if we had a symbol table here, we could look
+ * up the name of the varying..
+ */ + if (name) { + append("\t; %s", name); + } +} + +int D3D11ShaderTranslator::TranslateALU_ADDv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(" + "); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_MULv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(" * "); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_MAXv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + if (alu.src1_reg == alu.src2_reg && + alu.src1_sel == alu.src2_sel && + alu.src1_swiz == alu.src2_swiz && + alu.src1_reg_negate == alu.src2_reg_negate && + alu.src1_reg_abs == alu.src2_reg_abs) { + // This is a mov. + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + } else { + append("max("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(", "); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(")"); + } + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_MINv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("min("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(", "); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_SETXXv(const instr_alu_t& alu, const char* op) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("float4(("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").x %s (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").x ? 
1.0 : 0.0, ("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").y %s (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").y ? 1.0 : 0.0, ("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").z %s (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").z ? 1.0 : 0.0, ("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").w %s (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").w ? 1.0 : 0.0)"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} +int D3D11ShaderTranslator::TranslateALU_SETEv(const instr_alu_t& alu) { + return TranslateALU_SETXXv(alu, "=="); +} +int D3D11ShaderTranslator::TranslateALU_SETGTv(const instr_alu_t& alu) { + return TranslateALU_SETXXv(alu, ">"); +} +int D3D11ShaderTranslator::TranslateALU_SETGTEv(const instr_alu_t& alu) { + return TranslateALU_SETXXv(alu, ">="); +} +int D3D11ShaderTranslator::TranslateALU_SETNEv(const instr_alu_t& alu) { + return TranslateALU_SETXXv(alu, "!="); +} + +int D3D11ShaderTranslator::TranslateALU_FRACv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("frac("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_TRUNCv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("trunc("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_FLOORv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("floor("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_MULADDv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("mad("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(", "); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(", "); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + 
append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_CNDXXv(const instr_alu_t& alu, const char* op) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + // TODO(benvanik): check argument order - could be 3 as compare and 1 and 2 as values. + append("float4(("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").x %s 0.0 ? (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").x : ("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(").x, ("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").y %s 0.0 ? (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").y : ("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(").y, ("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").z %s 0.0 ? (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").z : ("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(").z, ("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").w %s 0.0 ? (", op); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").w : ("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(").w)"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} +int D3D11ShaderTranslator::TranslateALU_CNDEv(const instr_alu_t& alu) { + return TranslateALU_CNDXXv(alu, "=="); +} +int D3D11ShaderTranslator::TranslateALU_CNDGTEv(const instr_alu_t& alu) { + return TranslateALU_CNDXXv(alu, ">="); +} +int D3D11ShaderTranslator::TranslateALU_CNDGTv(const instr_alu_t& alu) { + return TranslateALU_CNDXXv(alu, ">"); +} + +int D3D11ShaderTranslator::TranslateALU_DOT4v(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("dot("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(", "); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(")"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_DOT3v(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("dot(float4("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").xyz, float4("); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").xyz)"); + if 
(alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_DOT2ADDv(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("dot(float4("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(").xy, float4("); + AppendSrcReg(alu.src2_reg, alu.src2_sel, alu.src2_swiz, alu.src2_reg_negate, alu.src2_reg_abs); + append(").xy) + "); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(".x"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +// CUBEv + +int D3D11ShaderTranslator::TranslateALU_MAX4v(const instr_alu_t& alu) { + AppendDestReg(alu.vector_dest, alu.vector_write_mask, alu.export_data); + append(" = "); + if (alu.vector_clamp) { + append("saturate("); + } + append("max("); + append("max("); + append("max("); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(".x, "); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(".y), "); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(".z), "); + AppendSrcReg(alu.src1_reg, alu.src1_sel, alu.src1_swiz, alu.src1_reg_negate, alu.src1_reg_abs); + append(".w)"); + if (alu.vector_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.vector_dest, alu.vector_write_mask, alu.export_data); + return 0; +} + +// ... + +int D3D11ShaderTranslator::TranslateALU_MAXs(const instr_alu_t& alu) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + if ((alu.src3_swiz & 0x3) == (((alu.src3_swiz >> 2) + 1) & 0x3)) { + // This is a mov. 
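+ // (The swizzle check above fires when the .x and .y selections resolve
+ // to the same source channel, so max(a.c, a.c) degenerates to a copy --
+ // this seems to be how the ucode encodes a scalar MOV.)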
+ AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + } else { + append("max("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(".x, "); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(".y).xxxx"); + } + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_MINs(const instr_alu_t& alu) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + append("min("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(".x, "); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(".y).xxxx"); + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_SETXXs(const instr_alu_t& alu, const char* op) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + append("(("); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(".x %s 0.0) ? 1.0 : 0.0).xxxx", op); + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} +int D3D11ShaderTranslator::TranslateALU_SETEs(const instr_alu_t& alu) { + return TranslateALU_SETXXs(alu, "=="); +} +int D3D11ShaderTranslator::TranslateALU_SETGTs(const instr_alu_t& alu) { + return TranslateALU_SETXXs(alu, ">"); +} +int D3D11ShaderTranslator::TranslateALU_SETGTEs(const instr_alu_t& alu) { + return TranslateALU_SETXXs(alu, ">="); +} +int D3D11ShaderTranslator::TranslateALU_SETNEs(const instr_alu_t& alu) { + return TranslateALU_SETXXs(alu, "!="); +} + +int D3D11ShaderTranslator::TranslateALU_RECIP_IEEE(const instr_alu_t& alu) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + append("(1.0 / "); + AppendSrcReg(alu.src3_reg, alu.src3_sel, alu.src3_swiz, alu.src3_reg_negate, alu.src3_reg_abs); + append(")"); + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} + +int D3D11ShaderTranslator::TranslateALU_MUL_CONST_0(const instr_alu_t& alu) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + uint32_t src3_swiz = alu.src3_swiz & ~0x3C; + uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; + uint32_t swiz_b = (src3_swiz & 0x3); + uint32_t reg2 = (alu.scalar_opc & 1) | (alu.src3_swiz & 0x3C) | (alu.src3_sel << 1); + append("("); + AppendSrcReg(alu.src3_reg, 0, 0, alu.src3_reg_negate, alu.src3_reg_abs); + append(".%c * ", chan_names[swiz_a]); + AppendSrcReg(reg2, 1, 0, alu.src3_reg_negate, alu.src3_reg_abs); + append(".%c", chan_names[swiz_b]); + append(").xxxx"); + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} +int 
D3D11ShaderTranslator::TranslateALU_MUL_CONST_1(const instr_alu_t& alu) { + return TranslateALU_MUL_CONST_0(alu); +} + +int D3D11ShaderTranslator::TranslateALU_ADD_CONST_0(const instr_alu_t& alu) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + uint32_t src3_swiz = alu.src3_swiz & ~0x3C; + uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; + uint32_t swiz_b = (src3_swiz & 0x3); + uint32_t reg2 = (alu.scalar_opc & 1) | (alu.src3_swiz & 0x3C) | (alu.src3_sel << 1); + append("("); + AppendSrcReg(alu.src3_reg, 0, 0, alu.src3_reg_negate, alu.src3_reg_abs); + append(".%c + ", chan_names[swiz_a]); + AppendSrcReg(reg2, 1, 0, alu.src3_reg_negate, alu.src3_reg_abs); + append(".%c", chan_names[swiz_b]); + append(").xxxx"); + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} +int D3D11ShaderTranslator::TranslateALU_ADD_CONST_1(const instr_alu_t& alu) { + return TranslateALU_ADD_CONST_0(alu); +} + +int D3D11ShaderTranslator::TranslateALU_SUB_CONST_0(const instr_alu_t& alu) { + AppendDestReg(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + append(" = "); + if (alu.scalar_clamp) { + append("saturate("); + } + uint32_t src3_swiz = alu.src3_swiz & ~0x3C; + uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; + uint32_t swiz_b = (src3_swiz & 0x3); + uint32_t reg2 = (alu.scalar_opc & 1) | (alu.src3_swiz & 0x3C) | (alu.src3_sel << 1); + append("("); + AppendSrcReg(alu.src3_reg, 0, 0, alu.src3_reg_negate, alu.src3_reg_abs); + append(".%c - ", chan_names[swiz_a]); + AppendSrcReg(reg2, 1, 0, alu.src3_reg_negate, alu.src3_reg_abs); + append(".%c", chan_names[swiz_b]); + append(").xxxx"); + if (alu.scalar_clamp) { + append(")"); + } + append(";\n"); + AppendDestRegPost(alu.scalar_dest, alu.scalar_write_mask, alu.export_data); + return 0; +} +int D3D11ShaderTranslator::TranslateALU_SUB_CONST_1(const instr_alu_t& alu) { + return TranslateALU_SUB_CONST_0(alu); +} + +namespace { + +typedef int (D3D11ShaderTranslator::*TranslateFn)(const instr_alu_t& alu); +typedef struct { + uint32_t num_srcs; + const char* name; + TranslateFn fn; +} TranslateInfo; +#define ALU_INSTR(opc, num_srcs) \ + { num_srcs, #opc, nullptr } +#define ALU_INSTR_IMPL(opc, num_srcs) \ + { num_srcs, #opc, &D3D11ShaderTranslator::TranslateALU_##opc } + +} // namespace + +int D3D11ShaderTranslator::TranslateALU(const instr_alu_t* alu, int sync) { + static TranslateInfo vector_alu_instrs[0x20] = { + ALU_INSTR_IMPL(ADDv, 2), // 0 + ALU_INSTR_IMPL(MULv, 2), // 1 + ALU_INSTR_IMPL(MAXv, 2), // 2 + ALU_INSTR_IMPL(MINv, 2), // 3 + ALU_INSTR_IMPL(SETEv, 2), // 4 + ALU_INSTR_IMPL(SETGTv, 2), // 5 + ALU_INSTR_IMPL(SETGTEv, 2), // 6 + ALU_INSTR_IMPL(SETNEv, 2), // 7 + ALU_INSTR_IMPL(FRACv, 1), // 8 + ALU_INSTR_IMPL(TRUNCv, 1), // 9 + ALU_INSTR_IMPL(FLOORv, 1), // 10 + ALU_INSTR_IMPL(MULADDv, 3), // 11 + ALU_INSTR_IMPL(CNDEv, 3), // 12 + ALU_INSTR_IMPL(CNDGTEv, 3), // 13 + ALU_INSTR_IMPL(CNDGTv, 3), // 14 + ALU_INSTR_IMPL(DOT4v, 2), // 15 + ALU_INSTR_IMPL(DOT3v, 2), // 16 + ALU_INSTR_IMPL(DOT2ADDv, 3), // 17 -- ??? 
+ ALU_INSTR(CUBEv, 2), // 18
+ ALU_INSTR_IMPL(MAX4v, 1), // 19
+ ALU_INSTR(PRED_SETE_PUSHv, 2), // 20
+ ALU_INSTR(PRED_SETNE_PUSHv, 2), // 21
+ ALU_INSTR(PRED_SETGT_PUSHv, 2), // 22
+ ALU_INSTR(PRED_SETGTE_PUSHv, 2), // 23
+ ALU_INSTR(KILLEv, 2), // 24
+ ALU_INSTR(KILLGTv, 2), // 25
+ ALU_INSTR(KILLGTEv, 2), // 26
+ ALU_INSTR(KILLNEv, 2), // 27
+ ALU_INSTR(DSTv, 2), // 28
+ ALU_INSTR(MOVAv, 1), // 29
+ };
+ static TranslateInfo scalar_alu_instrs[0x40] = {
+ ALU_INSTR(ADDs, 1), // 0
+ ALU_INSTR(ADD_PREVs, 1), // 1
+ ALU_INSTR(MULs, 1), // 2
+ ALU_INSTR(MUL_PREVs, 1), // 3
+ ALU_INSTR(MUL_PREV2s, 1), // 4
+ ALU_INSTR_IMPL(MAXs, 1), // 5
+ ALU_INSTR_IMPL(MINs, 1), // 6
+ ALU_INSTR_IMPL(SETEs, 1), // 7
+ ALU_INSTR_IMPL(SETGTs, 1), // 8
+ ALU_INSTR_IMPL(SETGTEs, 1), // 9
+ ALU_INSTR_IMPL(SETNEs, 1), // 10
+ ALU_INSTR(FRACs, 1), // 11
+ ALU_INSTR(TRUNCs, 1), // 12
+ ALU_INSTR(FLOORs, 1), // 13
+ ALU_INSTR(EXP_IEEE, 1), // 14
+ ALU_INSTR(LOG_CLAMP, 1), // 15
+ ALU_INSTR(LOG_IEEE, 1), // 16
+ ALU_INSTR(RECIP_CLAMP, 1), // 17
+ ALU_INSTR(RECIP_FF, 1), // 18
+ ALU_INSTR_IMPL(RECIP_IEEE, 1), // 19
+ ALU_INSTR(RECIPSQ_CLAMP, 1), // 20
+ ALU_INSTR(RECIPSQ_FF, 1), // 21
+ ALU_INSTR(RECIPSQ_IEEE, 1), // 22
+ ALU_INSTR(MOVAs, 1), // 23
+ ALU_INSTR(MOVA_FLOORs, 1), // 24
+ ALU_INSTR(SUBs, 1), // 25
+ ALU_INSTR(SUB_PREVs, 1), // 26
+ ALU_INSTR(PRED_SETEs, 1), // 27
+ ALU_INSTR(PRED_SETNEs, 1), // 28
+ ALU_INSTR(PRED_SETGTs, 1), // 29
+ ALU_INSTR(PRED_SETGTEs, 1), // 30
+ ALU_INSTR(PRED_SET_INVs, 1), // 31
+ ALU_INSTR(PRED_SET_POPs, 1), // 32
+ ALU_INSTR(PRED_SET_CLRs, 1), // 33
+ ALU_INSTR(PRED_SET_RESTOREs, 1), // 34
+ ALU_INSTR(KILLEs, 1), // 35
+ ALU_INSTR(KILLGTs, 1), // 36
+ ALU_INSTR(KILLGTEs, 1), // 37
+ ALU_INSTR(KILLNEs, 1), // 38
+ ALU_INSTR(KILLONEs, 1), // 39
+ ALU_INSTR(SQRT_IEEE, 1), // 40
+ { 0, nullptr, nullptr }, // 41
+ ALU_INSTR_IMPL(MUL_CONST_0, 2), // 42
+ ALU_INSTR_IMPL(MUL_CONST_1, 2), // 43
+ ALU_INSTR_IMPL(ADD_CONST_0, 2), // 44
+ ALU_INSTR_IMPL(ADD_CONST_1, 2), // 45
+ ALU_INSTR_IMPL(SUB_CONST_0, 2), // 46
+ ALU_INSTR_IMPL(SUB_CONST_1, 2), // 47
+ ALU_INSTR(SIN, 1), // 48
+ ALU_INSTR(COS, 1), // 49
+ ALU_INSTR(RETAIN_PREV, 1), // 50
+ };
+#undef ALU_INSTR
+#undef ALU_INSTR_IMPL
+
+ if (!alu->scalar_write_mask && !alu->vector_write_mask) {
+ append(" // \n");
+ return 0;
+ }
+
+ if (alu->vector_write_mask) {
+ // Disassemble vector op.
+ const auto& iv = vector_alu_instrs[alu->vector_opc];
+ append(" // %sALU:\t", sync ? "(S)" : " ");
+ append("%s", iv.name);
+ if (alu->pred_select & 0x2) {
+ // seems to work similar to conditional execution in ARM instruction
+ // set, so let's use a similar syntax for now:
+ append((alu->pred_select & 0x1) ? "EQ" : "NE");
+ }
+ append("\t");
+ PrintDstReg(alu->vector_dest, alu->vector_write_mask, alu->export_data);
+ append(" = ");
+ if (iv.num_srcs == 3) {
+ PrintSrcReg(alu->src3_reg, alu->src3_sel, alu->src3_swiz,
+ alu->src3_reg_negate, alu->src3_reg_abs);
+ append(", ");
+ }
+ PrintSrcReg(alu->src1_reg, alu->src1_sel, alu->src1_swiz,
+ alu->src1_reg_negate, alu->src1_reg_abs);
+ if (iv.num_srcs > 1) {
+ append(", ");
+ PrintSrcReg(alu->src2_reg, alu->src2_sel, alu->src2_swiz,
+ alu->src2_reg_negate, alu->src2_reg_abs);
+ }
+ if (alu->vector_clamp) {
+ append(" CLAMP");
+ }
+ if (alu->export_data) {
+ PrintExportComment(alu->vector_dest);
+ }
+ append("\n");
+
+ // Translate vector op.
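+ // Ops without an ALU_INSTR_IMPL entry have a null fn and only emit the
+ // disassembly comment above plus a placeholder line below.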
+ if (iv.fn) { + append(" "); + if ((this->*iv.fn)(*alu)) { + return 1; + } + } else { + append(" // \n"); + } + } + + if (alu->scalar_write_mask || !alu->vector_write_mask) { + // 2nd optional scalar op: + + // Disassemble scalar op. + const auto& is = scalar_alu_instrs[alu->scalar_opc]; + append(" // "); + append("\t"); + if (is.name) { + append("\t \t%s\t", is.name); + } else { + append("\t \tOP(%u)\t", alu->scalar_opc); + } + PrintDstReg(alu->scalar_dest, alu->scalar_write_mask, alu->export_data); + append(" = "); + if (is.num_srcs == 2) { + // ADD_CONST_0 dest, [const], [reg] + uint32_t src3_swiz = alu->src3_swiz & ~0x3C; + uint32_t swiz_a = ((src3_swiz >> 6) - 1) & 0x3; + uint32_t swiz_b = (src3_swiz & 0x3); + PrintSrcReg(alu->src3_reg, 0, 0, + alu->src3_reg_negate, alu->src3_reg_abs); + append(".%c", chan_names[swiz_a]); + append(", "); + uint32_t reg2 = (alu->scalar_opc & 1) | (alu->src3_swiz & 0x3C) | (alu->src3_sel << 1); + PrintSrcReg(reg2, 1, 0, + alu->src3_reg_negate, alu->src3_reg_abs); + append(".%c", chan_names[swiz_b]); + } else { + PrintSrcReg(alu->src3_reg, alu->src3_sel, alu->src3_swiz, + alu->src3_reg_negate, alu->src3_reg_abs); + } + if (alu->scalar_clamp) { + append(" CLAMP"); + } + if (alu->export_data) { + PrintExportComment(alu->scalar_dest); + } + append("\n"); + + // Translate scalar op. + if (is.fn) { + append(" "); + if ((this->*is.fn)(*alu)) { + return 1; + } + } else { + append(" // \n"); + } + } + + return 0; +} + +void D3D11ShaderTranslator::PrintDestFecth(uint32_t dst_reg, + uint32_t dst_swiz) { + append("\tR%u.", dst_reg); + for (int i = 0; i < 4; i++) { + append("%c", chan_names[dst_swiz & 0x7]); + dst_swiz >>= 3; + } +} + +void D3D11ShaderTranslator::AppendFetchDest(uint32_t dst_reg, + uint32_t dst_swiz) { + append("r%u.", dst_reg); + for (int i = 0; i < 4; i++) { + append("%c", chan_names[dst_swiz & 0x7]); + dst_swiz >>= 3; + } +} + +int D3D11ShaderTranslator::GetFormatComponentCount(uint32_t format) { + switch (format) { + case FMT_32: + case FMT_32_FLOAT: + return 1; + case FMT_16_16: + case FMT_16_16_FLOAT: + case FMT_32_32: + case FMT_32_32_FLOAT: + return 2; + case FMT_10_11_11: + case FMT_11_11_10: + case FMT_32_32_32_FLOAT: + return 3; + case FMT_8_8_8_8: + case FMT_2_10_10_10: + case FMT_16_16_16_16: + case FMT_16_16_16_16_FLOAT: + case FMT_32_32_32_32: + case FMT_32_32_32_32_FLOAT: + return 4; + default: + XELOGE("Unknown vertex format: %d", format); + XEASSERTALWAYS(); + return 4; + } +} + +int D3D11ShaderTranslator::TranslateExec(const instr_cf_exec_t& cf) { + static const struct { + const char *name; + } cf_instructions[] = { + #define INSTR(opc, fxn) { #opc } + INSTR(NOP, print_cf_nop), + INSTR(EXEC, print_cf_exec), + INSTR(EXEC_END, print_cf_exec), + INSTR(COND_EXEC, print_cf_exec), + INSTR(COND_EXEC_END, print_cf_exec), + INSTR(COND_PRED_EXEC, print_cf_exec), + INSTR(COND_PRED_EXEC_END, print_cf_exec), + INSTR(LOOP_START, print_cf_loop), + INSTR(LOOP_END, print_cf_loop), + INSTR(COND_CALL, print_cf_jmp_call), + INSTR(RETURN, print_cf_jmp_call), + INSTR(COND_JMP, print_cf_jmp_call), + INSTR(ALLOC, print_cf_alloc), + INSTR(COND_EXEC_PRED_CLEAN, print_cf_exec), + INSTR(COND_EXEC_PRED_CLEAN_END, print_cf_exec), + INSTR(MARK_VS_FETCH_DONE, print_cf_nop), // ?? 
+ #undef INSTR + }; + + append( + " // %s ADDR(0x%x) CNT(0x%x)", + cf_instructions[cf.opc].name, cf.address, cf.count); + if (cf.yeild) { + append(" YIELD"); + } + uint8_t vc = cf.vc_hi | (cf.vc_lo << 2); + if (vc) { + append(" VC(0x%x)", vc); + } + if (cf.bool_addr) { + append(" BOOL_ADDR(0x%x)", cf.bool_addr); + } + if (cf.address_mode == ABSOLUTE_ADDR) { + append(" ABSOLUTE_ADDR"); + } + if (cf.is_cond_exec()) { + append(" COND(%d)", cf.condition); + } + append("\n"); + + uint32_t sequence = cf.serialize; + for (uint32_t i = 0; i < cf.count; i++) { + uint32_t alu_off = (cf.address + i); + int sync = sequence & 0x2; + if (sequence & 0x1) { + const instr_fetch_t* fetch = + (const instr_fetch_t*)(dwords_ + alu_off * 3); + switch (fetch->opc) { + case VTX_FETCH: + if (TranslateVertexFetch(&fetch->vtx, sync)) { + return 1; + } + break; + case TEX_FETCH: + if (TranslateTextureFetch(&fetch->tex, sync)) { + return 1; + } + break; + case TEX_GET_BORDER_COLOR_FRAC: + case TEX_GET_COMP_TEX_LOD: + case TEX_GET_GRADIENTS: + case TEX_GET_WEIGHTS: + case TEX_SET_TEX_LOD: + case TEX_SET_GRADIENTS_H: + case TEX_SET_GRADIENTS_V: + default: + XEASSERTALWAYS(); + break; + } + } else { + const instr_alu_t* alu = + (const instr_alu_t*)(dwords_ + alu_off * 3); + if (TranslateALU(alu, sync)) { + return 1; + } + } + sequence >>= 2; + } + + return 0; +} + +int D3D11ShaderTranslator::TranslateVertexFetch(const instr_fetch_vtx_t* vtx, + int sync) { + static const struct { + const char *name; + } fetch_types[0xff] = { + #define TYPE(id) { #id } + TYPE(FMT_1_REVERSE), // 0 + {0}, + TYPE(FMT_8), // 2 + {0}, + {0}, + {0}, + TYPE(FMT_8_8_8_8), // 6 + TYPE(FMT_2_10_10_10), // 7 + {0}, + {0}, + TYPE(FMT_8_8), // 10 + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + TYPE(FMT_16), // 24 + TYPE(FMT_16_16), // 25 + TYPE(FMT_16_16_16_16), // 26 + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + TYPE(FMT_32), // 33 + TYPE(FMT_32_32), // 34 + TYPE(FMT_32_32_32_32), // 35 + TYPE(FMT_32_FLOAT), // 36 + TYPE(FMT_32_32_FLOAT), // 37 + TYPE(FMT_32_32_32_32_FLOAT), // 38 + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + {0}, + TYPE(FMT_32_32_32_FLOAT), // 57 + #undef TYPE + }; + + // Disassemble. + append(" // %sFETCH:\t", sync ? "(S)" : " "); + if (vtx->pred_select) { + append(vtx->pred_condition ? "EQ" : "NE"); + } + PrintDestFecth(vtx->dst_reg, vtx->dst_swiz); + append(" = R%u.", vtx->src_reg); + append("%c", chan_names[vtx->src_swiz & 0x3]); + if (fetch_types[vtx->format].name) { + append(" %s", fetch_types[vtx->format].name); + } else { + append(" TYPE(0x%x)", vtx->format); + } + append(" %s", vtx->format_comp_all ? "SIGNED" : "UNSIGNED"); + if (!vtx->num_format_all) { + append(" NORMALIZED"); + } + append(" STRIDE(%u)", vtx->stride); + if (vtx->offset) { + append(" OFFSET(%u)", vtx->offset); + } + append(" CONST(%u, %u)", vtx->const_index, vtx->const_index_sel); + if (1) { + // XXX + append(" src_reg_am=%u", vtx->src_reg_am); + append(" dst_reg_am=%u", vtx->dst_reg_am); + append(" num_format_all=%u", vtx->num_format_all); + append(" signed_rf_mode_all=%u", vtx->signed_rf_mode_all); + append(" exp_adjust_all=%u", vtx->exp_adjust_all); + } + append("\n"); + + // Translate. + append(" "); + append("r%u.xyzw", vtx->dst_reg); + append(" = float4("); + uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel; + // TODO(benvanik): detect xyzw = xyzw, etc. 
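+ // Each 3-bit dst swizzle field picks a source channel (0-3) or a special
+ // value: 4 writes 0.0, 5 writes 1.0, 7 keeps the register's old value.
+ // A swizzle of x/y/0/1 would emit, roughly (hypothetical slot/offset):
+ //   r1.xyzw = float4(i.vf285_0.x, i.vf285_0.y, 0.0, 1.0);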
+ // TODO(benvanik): detect and set as rN = float4(samp.xyz, 1.0); / etc + uint32_t component_count = GetFormatComponentCount(vtx->format); + uint32_t dst_swiz = vtx->dst_swiz; + for (int i = 0; i < 4; i++) { + if ((dst_swiz & 0x7) == 4) { + append("0.0"); + } else if ((dst_swiz & 0x7) == 5) { + append("1.0"); + } else if ((dst_swiz & 0x7) == 6) { + // ? + append("?"); + } else if ((dst_swiz & 0x7) == 7) { + append("r%u.%c", vtx->dst_reg, chan_names[i]); + } else { + append("i.vf%u_%d.%c", + fetch_slot, vtx->offset, + chan_names[dst_swiz & 0x3]); + } + if (i < 3) { + append(", "); + } + dst_swiz >>= 3; + } + append(");\n"); + return 0; +} + +int D3D11ShaderTranslator::TranslateTextureFetch(const instr_fetch_tex_t* tex, + int sync) { + // Disassemble. + static const char *filter[] = { + "POINT", // TEX_FILTER_POINT + "LINEAR", // TEX_FILTER_LINEAR + "BASEMAP", // TEX_FILTER_BASEMAP + }; + static const char *aniso_filter[] = { + "DISABLED", // ANISO_FILTER_DISABLED + "MAX_1_1", // ANISO_FILTER_MAX_1_1 + "MAX_2_1", // ANISO_FILTER_MAX_2_1 + "MAX_4_1", // ANISO_FILTER_MAX_4_1 + "MAX_8_1", // ANISO_FILTER_MAX_8_1 + "MAX_16_1", // ANISO_FILTER_MAX_16_1 + }; + static const char *arbitrary_filter[] = { + "2x4_SYM", // ARBITRARY_FILTER_2X4_SYM + "2x4_ASYM", // ARBITRARY_FILTER_2X4_ASYM + "4x2_SYM", // ARBITRARY_FILTER_4X2_SYM + "4x2_ASYM", // ARBITRARY_FILTER_4X2_ASYM + "4x4_SYM", // ARBITRARY_FILTER_4X4_SYM + "4x4_ASYM", // ARBITRARY_FILTER_4X4_ASYM + }; + static const char *sample_loc[] = { + "CENTROID", // SAMPLE_CENTROID + "CENTER", // SAMPLE_CENTER + }; + uint32_t src_swiz = tex->src_swiz; + append(" // %sFETCH:\t", sync ? "(S)" : " "); + if (tex->pred_select) { + append(tex->pred_condition ? "EQ" : "NE"); + } + PrintDestFecth(tex->dst_reg, tex->dst_swiz); + append(" = R%u.", tex->src_reg); + for (int i = 0; i < 3; i++) { + append("%c", chan_names[src_swiz & 0x3]); + src_swiz >>= 2; + } + append(" CONST(%u)", tex->const_idx); + if (tex->fetch_valid_only) { + append(" VALID_ONLY"); + } + if (tex->tx_coord_denorm) { + append(" DENORM"); + } + if (tex->mag_filter != TEX_FILTER_USE_FETCH_CONST) { + append(" MAG(%s)", filter[tex->mag_filter]); + } + if (tex->min_filter != TEX_FILTER_USE_FETCH_CONST) { + append(" MIN(%s)", filter[tex->min_filter]); + } + if (tex->mip_filter != TEX_FILTER_USE_FETCH_CONST) { + append(" MIP(%s)", filter[tex->mip_filter]); + } + if (tex->aniso_filter != ANISO_FILTER_USE_FETCH_CONST) { + append(" ANISO(%s)", aniso_filter[tex->aniso_filter]); + } + if (tex->arbitrary_filter != ARBITRARY_FILTER_USE_FETCH_CONST) { + append(" ARBITRARY(%s)", arbitrary_filter[tex->arbitrary_filter]); + } + if (tex->vol_mag_filter != TEX_FILTER_USE_FETCH_CONST) { + append(" VOL_MAG(%s)", filter[tex->vol_mag_filter]); + } + if (tex->vol_min_filter != TEX_FILTER_USE_FETCH_CONST) { + append(" VOL_MIN(%s)", filter[tex->vol_min_filter]); + } + if (!tex->use_comp_lod) { + append(" LOD(%u)", tex->use_comp_lod); + append(" LOD_BIAS(%u)", tex->lod_bias); + } + if (tex->use_reg_lod) { + append(" REG_LOD(%u)", tex->use_reg_lod); + } + if (tex->use_reg_gradients) { + append(" USE_REG_GRADIENTS"); + } + append(" LOCATION(%s)", sample_loc[tex->sample_location]); + if (tex->offset_x || tex->offset_y || tex->offset_z) { + append(" OFFSET(%u,%u,%u)", tex->offset_x, tex->offset_y, tex->offset_z); + } + append("\n"); + + int src_component_count = 0; + switch (tex->dimension) { + case DIMENSION_1D: + src_component_count = 1; + break; + default: + case DIMENSION_2D: + src_component_count = 2; + break; + case 
DIMENSION_3D: + src_component_count = 3; + break; + case DIMENSION_CUBE: + src_component_count = 3; + break; + } + + // Translate. + append(" "); + append("r%u.xyzw", tex->dst_reg); + append(" = "); + append( + "x_texture_%d.Sample(x_sampler_%d, r%u.", + tex->const_idx, + tex_fetch_index_++, // hacky way to line up to tex buffers + tex->src_reg); + src_swiz = tex->src_swiz; + for (int i = 0; i < src_component_count; i++) { + append("%c", chan_names[src_swiz & 0x3]); + src_swiz >>= 2; + } + append(")."); + + // Pass one over dest does xyzw and fakes the special values. + // TODO(benvanik): detect and set as rN = float4(samp.xyz, 1.0); / etc + uint32_t dst_swiz = tex->dst_swiz; + for (int i = 0; i < 4; i++) { + append("%c", chan_names[dst_swiz & 0x3]); + dst_swiz >>= 3; + } + append(";\n"); + // Do another pass to set constant values. + dst_swiz = tex->dst_swiz; + for (int i = 0; i < 4; i++) { + if ((dst_swiz & 0x7) == 4) { + append(" r%u.%c = 0.0;\n", tex->dst_reg, chan_names[i]); + } else if ((dst_swiz & 0x7) == 5) { + append(" r%u.%c = 1.0;\n", tex->dst_reg, chan_names[i]); + } + dst_swiz >>= 3; + } + return 0; +} diff --git a/src/xenia/gpu/d3d11/d3d11_shader_translator.h b/src/xenia/gpu/d3d11/d3d11_shader_translator.h new file mode 100644 index 000000000..ad85c7775 --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_shader_translator.h @@ -0,0 +1,125 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_SHADER_TRANSLATOR_H_ +#define XENIA_GPU_D3D11_D3D11_SHADER_TRANSLATOR_H_ + +#include +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + + +class D3D11ShaderTranslator { +public: + const static uint32_t kMaxInterpolators = 16; + + D3D11ShaderTranslator(); + + int TranslateVertexShader(VertexShaderResource* vertex_shader, + const xenos::xe_gpu_program_cntl_t& program_cntl); + int TranslatePixelShader( + PixelShaderResource* pixel_shader, + const xenos::xe_gpu_program_cntl_t& program_cntl, + const VertexShaderResource::AllocCounts& alloc_counts); + + const char* translated_src() const { return buffer_; } + +private: + xenos::XE_GPU_SHADER_TYPE type_; + uint32_t tex_fetch_index_; + const uint32_t* dwords_; + + static const int kCapacity = 64 * 1024; + char buffer_[kCapacity]; + size_t capacity_; + size_t offset_; + void append(const char* format, ...) 
{ + va_list args; + va_start(args, format); + int len = xevsnprintfa(buffer_ + offset_, capacity_ - offset_, + format, args); + va_end(args); + offset_ += len; + buffer_[offset_] = 0; + } + + void AppendTextureHeader( + const ShaderResource::SamplerInputs& sampler_inputs); + + void AppendSrcReg(uint32_t num, uint32_t type, uint32_t swiz, uint32_t negate, + uint32_t abs); + void AppendDestRegName(uint32_t num, uint32_t dst_exp); + void AppendDestReg(uint32_t num, uint32_t mask, uint32_t dst_exp); + void AppendDestRegPost(uint32_t num, uint32_t mask, uint32_t dst_exp); + void PrintSrcReg(uint32_t num, uint32_t type, uint32_t swiz, uint32_t negate, + uint32_t abs); + void PrintDstReg(uint32_t num, uint32_t mask, uint32_t dst_exp); + void PrintExportComment(uint32_t num); + + int TranslateALU(const xenos::instr_alu_t* alu, int sync); + int TranslateALU_ADDv(const xenos::instr_alu_t& alu); + int TranslateALU_MULv(const xenos::instr_alu_t& alu); + int TranslateALU_MAXv(const xenos::instr_alu_t& alu); + int TranslateALU_MINv(const xenos::instr_alu_t& alu); + int TranslateALU_SETXXv(const xenos::instr_alu_t& alu, const char* op); + int TranslateALU_SETEv(const xenos::instr_alu_t& alu); + int TranslateALU_SETGTv(const xenos::instr_alu_t& alu); + int TranslateALU_SETGTEv(const xenos::instr_alu_t& alu); + int TranslateALU_SETNEv(const xenos::instr_alu_t& alu); + int TranslateALU_FRACv(const xenos::instr_alu_t& alu); + int TranslateALU_TRUNCv(const xenos::instr_alu_t& alu); + int TranslateALU_FLOORv(const xenos::instr_alu_t& alu); + int TranslateALU_MULADDv(const xenos::instr_alu_t& alu); + int TranslateALU_CNDXXv(const xenos::instr_alu_t& alu, const char* op); + int TranslateALU_CNDEv(const xenos::instr_alu_t& alu); + int TranslateALU_CNDGTEv(const xenos::instr_alu_t& alu); + int TranslateALU_CNDGTv(const xenos::instr_alu_t& alu); + int TranslateALU_DOT4v(const xenos::instr_alu_t& alu); + int TranslateALU_DOT3v(const xenos::instr_alu_t& alu); + int TranslateALU_DOT2ADDv(const xenos::instr_alu_t& alu); + // CUBEv + int TranslateALU_MAX4v(const xenos::instr_alu_t& alu); + // ... 
+ int TranslateALU_MAXs(const xenos::instr_alu_t& alu); + int TranslateALU_MINs(const xenos::instr_alu_t& alu); + int TranslateALU_SETXXs(const xenos::instr_alu_t& alu, const char* op); + int TranslateALU_SETEs(const xenos::instr_alu_t& alu); + int TranslateALU_SETGTs(const xenos::instr_alu_t& alu); + int TranslateALU_SETGTEs(const xenos::instr_alu_t& alu); + int TranslateALU_SETNEs(const xenos::instr_alu_t& alu); + int TranslateALU_RECIP_IEEE(const xenos::instr_alu_t& alu); + int TranslateALU_MUL_CONST_0(const xenos::instr_alu_t& alu); + int TranslateALU_MUL_CONST_1(const xenos::instr_alu_t& alu); + int TranslateALU_ADD_CONST_0(const xenos::instr_alu_t& alu); + int TranslateALU_ADD_CONST_1(const xenos::instr_alu_t& alu); + int TranslateALU_SUB_CONST_0(const xenos::instr_alu_t& alu); + int TranslateALU_SUB_CONST_1(const xenos::instr_alu_t& alu); + + void PrintDestFecth(uint32_t dst_reg, uint32_t dst_swiz); + void AppendFetchDest(uint32_t dst_reg, uint32_t dst_swiz); + int GetFormatComponentCount(uint32_t format); + + int TranslateExec(const xenos::instr_cf_exec_t& cf); + int TranslateVertexFetch(const xenos::instr_fetch_vtx_t* vtx, int sync); + int TranslateTextureFetch(const xenos::instr_fetch_tex_t* tex, int sync); +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_SHADER_TRANSLATOR_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_texture_resource.cc b/src/xenia/gpu/d3d11/d3d11_texture_resource.cc new file mode 100644 index 000000000..a90c60b0d --- /dev/null +++ b/src/xenia/gpu/d3d11/d3d11_texture_resource.cc @@ -0,0 +1,219 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include + + +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::d3d11; +using namespace xe::gpu::xenos; + + +D3D11TextureResource::D3D11TextureResource( + D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info) + : TextureResource(memory_range, info), + resource_cache_(resource_cache), + texture_(nullptr), + handle_(nullptr) { +} + +D3D11TextureResource::~D3D11TextureResource() { + XESAFERELEASE(texture_); + XESAFERELEASE(handle_); +} + +int D3D11TextureResource::CreateHandle() { + SCOPE_profile_cpu_f("gpu"); + + D3D11_SHADER_RESOURCE_VIEW_DESC srv_desc; + xe_zero_struct(&srv_desc, sizeof(srv_desc)); + // TODO(benvanik): this may need to be typed on the fetch instruction (float/int/etc?) 
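+ // For now the view just reuses the DXGI format chosen when the texture
+ // info was parsed.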
+ srv_desc.Format = info_.format; + + D3D_SRV_DIMENSION dimension = D3D11_SRV_DIMENSION_UNKNOWN; + switch (info_.dimension) { + case TEXTURE_DIMENSION_1D: + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1D; + srv_desc.Texture1D.MipLevels = 1; + srv_desc.Texture1D.MostDetailedMip = 0; + if (CreateHandle1D()) { + XELOGE("D3D11: failed to create Texture1D"); + return 1; + } + break; + case TEXTURE_DIMENSION_2D: + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + srv_desc.Texture2D.MipLevels = 1; + srv_desc.Texture2D.MostDetailedMip = 0; + if (CreateHandle2D()) { + XELOGE("D3D11: failed to create Texture2D"); + return 1; + } + break; + case TEXTURE_DIMENSION_3D: + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D; + srv_desc.Texture3D.MipLevels = 1; + srv_desc.Texture3D.MostDetailedMip = 0; + if (CreateHandle3D()) { + XELOGE("D3D11: failed to create Texture3D"); + return 1; + } + break; + case TEXTURE_DIMENSION_CUBE: + srv_desc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURECUBE; + srv_desc.TextureCube.MipLevels = 1; + srv_desc.TextureCube.MostDetailedMip = 0; + if (CreateHandleCube()) { + XELOGE("D3D11: failed to create TextureCube"); + return 1; + } + break; + } + + HRESULT hr = resource_cache_->device()->CreateShaderResourceView( + texture_, &srv_desc, &handle_); + if (FAILED(hr)) { + XELOGE("D3D11: unable to create texture resource view"); + return 1; + } + return 0; +} + +int D3D11TextureResource::CreateHandle1D() { + uint32_t width = 1 + info_.size_1d.width; + + D3D11_TEXTURE1D_DESC texture_desc; + xe_zero_struct(&texture_desc, sizeof(texture_desc)); + texture_desc.Width = width; + texture_desc.MipLevels = 1; + texture_desc.ArraySize = 1; + texture_desc.Format = info_.format; + texture_desc.Usage = D3D11_USAGE_DYNAMIC; + texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + texture_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + texture_desc.MiscFlags = 0; // D3D11_RESOURCE_MISC_GENERATE_MIPS? + HRESULT hr = resource_cache_->device()->CreateTexture1D( + &texture_desc, NULL, (ID3D11Texture1D**)&texture_); + if (FAILED(hr)) { + return 1; + } + return 0; +} + +int D3D11TextureResource::CreateHandle2D() { + D3D11_TEXTURE2D_DESC texture_desc; + xe_zero_struct(&texture_desc, sizeof(texture_desc)); + texture_desc.Width = info_.size_2d.output_width; + texture_desc.Height = info_.size_2d.output_height; + texture_desc.MipLevels = 1; + texture_desc.ArraySize = 1; + texture_desc.Format = info_.format; + texture_desc.SampleDesc.Count = 1; + texture_desc.SampleDesc.Quality = 0; + texture_desc.Usage = D3D11_USAGE_DYNAMIC; + texture_desc.BindFlags = D3D11_BIND_SHADER_RESOURCE; + texture_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + texture_desc.MiscFlags = 0; // D3D11_RESOURCE_MISC_GENERATE_MIPS? 
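+ // DYNAMIC usage with CPU write access is what lets InvalidateRegion2D()
+ // below upload texels via Map(D3D11_MAP_WRITE_DISCARD).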
+ HRESULT hr = resource_cache_->device()->CreateTexture2D(
+ &texture_desc, NULL, (ID3D11Texture2D**)&texture_);
+ if (FAILED(hr)) {
+ return 1;
+ }
+ return 0;
+}
+
+int D3D11TextureResource::CreateHandle3D() {
+ XELOGE("D3D11: CreateTexture3D not yet implemented");
+ XEASSERTALWAYS();
+ return 1;
+}
+
+int D3D11TextureResource::CreateHandleCube() {
+ XELOGE("D3D11: CreateTextureCube not yet implemented");
+ XEASSERTALWAYS();
+ return 1;
+}
+
+int D3D11TextureResource::InvalidateRegion(const MemoryRange& memory_range) {
+ SCOPE_profile_cpu_f("gpu");
+
+ switch (info_.dimension) {
+ case TEXTURE_DIMENSION_1D:
+ return InvalidateRegion1D(memory_range);
+ case TEXTURE_DIMENSION_2D:
+ return InvalidateRegion2D(memory_range);
+ case TEXTURE_DIMENSION_3D:
+ return InvalidateRegion3D(memory_range);
+ case TEXTURE_DIMENSION_CUBE:
+ return InvalidateRegionCube(memory_range);
+ }
+ return 1;
+}
+
+int D3D11TextureResource::InvalidateRegion1D(const MemoryRange& memory_range) {
+ return 1;
+}
+
+int D3D11TextureResource::InvalidateRegion2D(const MemoryRange& memory_range) {
+ // TODO(benvanik): all mip levels.
+ // NOTE: the incoming range is ignored for now; the entire texture is
+ // re-uploaded from memory_range_.
+ D3D11_MAPPED_SUBRESOURCE res;
+ HRESULT hr = resource_cache_->context()->Map(
+ texture_, 0, D3D11_MAP_WRITE_DISCARD, 0, &res);
+ if (FAILED(hr)) {
+ XELOGE("D3D11: failed to map texture");
+ return 1;
+ }
+
+ const uint8_t* src = memory_range_.host_base;
+ uint8_t* dest = (uint8_t*)res.pData;
+
+ uint32_t output_pitch = res.RowPitch; // (output_width / info.block_size) * info.texel_pitch;
+ if (!info_.is_tiled) {
+ // Linear layout: copy row by row, swapping each texel as we go.
+ for (uint32_t y = 0; y < info_.size_2d.block_height; y++) {
+ for (uint32_t x = 0; x < info_.size_2d.logical_pitch; x += info_.texel_pitch) {
+ TextureSwap(dest + x, src + x, info_.texel_pitch);
+ }
+ src += info_.size_2d.input_pitch;
+ dest += output_pitch;
+ }
+ } else {
+ // log2 of the texel pitch (1/2/4/8/16 bytes -> 0/1/2/3/4); the tiled
+ // offset helpers scale by this shift.
+ auto bpp = (info_.texel_pitch >> 2) + ((info_.texel_pitch >> 1) >> (info_.texel_pitch >> 2));
+ // Walk the output row-major, resolving each texel's tiled input offset.
+ for (uint32_t y = 0, output_base_offset = 0;
+ y < info_.size_2d.block_height;
+ y++, output_base_offset += output_pitch) {
+ auto input_base_offset = TiledOffset2DOuter(y, (info_.size_2d.input_width / info_.block_size), bpp);
+ for (uint32_t x = 0, output_offset = output_base_offset;
+ x < info_.size_2d.block_width;
+ x++, output_offset += info_.texel_pitch) {
+ auto input_offset = TiledOffset2DInner(x, y, bpp, input_base_offset) >> bpp;
+ TextureSwap(dest + output_offset,
+ src + input_offset * info_.texel_pitch,
+ info_.texel_pitch);
+ }
+ }
+ }
+ resource_cache_->context()->Unmap(texture_, 0);
+ return 0;
+}
+
+int D3D11TextureResource::InvalidateRegion3D(const MemoryRange& memory_range) {
+ return 1;
+}
+
+int D3D11TextureResource::InvalidateRegionCube(
+ const MemoryRange& memory_range) {
+ return 1;
+}
diff --git a/src/xenia/gpu/d3d11/d3d11_texture_resource.h b/src/xenia/gpu/d3d11/d3d11_texture_resource.h
new file mode 100644
index 000000000..4e59662a4
--- /dev/null
+++ b/src/xenia/gpu/d3d11/d3d11_texture_resource.h
@@ -0,0 +1,60 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved. *
+ * Released under the BSD license - see LICENSE in the root for more details.
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D11_D3D11_TEXTURE_RESOURCE_H_ +#define XENIA_GPU_D3D11_D3D11_TEXTURE_RESOURCE_H_ + +#include +#include + +#include + + +namespace xe { +namespace gpu { +namespace d3d11 { + +class D3D11ResourceCache; + + +class D3D11TextureResource : public TextureResource { +public: + D3D11TextureResource(D3D11ResourceCache* resource_cache, + const MemoryRange& memory_range, + const Info& info); + ~D3D11TextureResource() override; + + void* handle() const override { return handle_; } + +protected: + int CreateHandle() override; + int CreateHandle1D(); + int CreateHandle2D(); + int CreateHandle3D(); + int CreateHandleCube(); + + int InvalidateRegion(const MemoryRange& memory_range) override; + int InvalidateRegion1D(const MemoryRange& memory_range); + int InvalidateRegion2D(const MemoryRange& memory_range); + int InvalidateRegion3D(const MemoryRange& memory_range); + int InvalidateRegionCube(const MemoryRange& memory_range); + +private: + D3D11ResourceCache* resource_cache_; + ID3D11Resource* texture_; + ID3D11ShaderResourceView* handle_; +}; + + +} // namespace d3d11 +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_D3D11_D3D11_TEXTURE_RESOURCE_H_ diff --git a/src/xenia/gpu/d3d11/d3d11_window.cc b/src/xenia/gpu/d3d11/d3d11_window.cc index 930149694..da33ab6bb 100644 --- a/src/xenia/gpu/d3d11/d3d11_window.cc +++ b/src/xenia/gpu/d3d11/d3d11_window.cc @@ -9,6 +9,8 @@ #include +#include + using namespace xe; using namespace xe::gpu; @@ -36,6 +38,7 @@ D3D11Window::D3D11Window( } D3D11Window::~D3D11Window() { + Profiler::set_display(nullptr); if (context_) { context_->ClearState(); } @@ -100,10 +103,23 @@ int D3D11Window::Initialize(const char* title, uint32_t width, uint32_t height) } context_->OMSetRenderTargets(1, &render_target_view_, NULL); + // Setup profiler display. + if (Profiler::is_enabled()) { + std::unique_ptr profiler_display( + new D3D11ProfilerDisplay(this)); + Profiler::set_display(std::move(profiler_display)); + } + return 0; } void D3D11Window::Swap() { + SCOPE_profile_cpu_f("gpu"); + + // Present profiler. + context_->OMSetRenderTargets(1, &render_target_view_, NULL); + Profiler::Present(); + // Swap buffers. // TODO(benvanik): control vsync with flag. bool vsync = true; diff --git a/src/xenia/gpu/d3d11/d3d11_window.h b/src/xenia/gpu/d3d11/d3d11_window.h index f9e47723d..df470df0f 100644 --- a/src/xenia/gpu/d3d11/d3d11_window.h +++ b/src/xenia/gpu/d3d11/d3d11_window.h @@ -29,7 +29,9 @@ public: IDXGIFactory1* dxgi_factory, ID3D11Device* device); virtual ~D3D11Window(); + ID3D11Device* device() const { return device_; } IDXGISwapChain* swap_chain() const { return swap_chain_; } + ID3D11DeviceContext* context() const { return context_; } virtual int Initialize(const char* title, uint32_t width, uint32_t height); diff --git a/src/xenia/gpu/d3d11/sources.gypi b/src/xenia/gpu/d3d11/sources.gypi index b1ad47ff4..b6b6d76c1 100644 --- a/src/xenia/gpu/d3d11/sources.gypi +++ b/src/xenia/gpu/d3d11/sources.gypi @@ -1,6 +1,8 @@ # Copyright 2013 Ben Vanik. All Rights Reserved. 
{ 'sources': [ + 'd3d11_buffer_resource.cc', + 'd3d11_buffer_resource.h', 'd3d11_geometry_shader.cc', 'd3d11_geometry_shader.h', 'd3d11_gpu-private.h', @@ -10,10 +12,18 @@ 'd3d11_graphics_driver.h', 'd3d11_graphics_system.cc', 'd3d11_graphics_system.h', - 'd3d11_shader.cc', - 'd3d11_shader.h', - 'd3d11_shader_cache.cc', - 'd3d11_shader_cache.h', + 'd3d11_profiler_display.cc', + 'd3d11_profiler_display.h', + 'd3d11_resource_cache.cc', + 'd3d11_resource_cache.h', + 'd3d11_sampler_state_resource.cc', + 'd3d11_sampler_state_resource.h', + 'd3d11_shader_resource.cc', + 'd3d11_shader_resource.h', + 'd3d11_shader_translator.cc', + 'd3d11_shader_translator.h', + 'd3d11_texture_resource.cc', + 'd3d11_texture_resource.h', 'd3d11_window.cc', 'd3d11_window.h', ], diff --git a/src/xenia/gpu/draw_command.cc b/src/xenia/gpu/draw_command.cc new file mode 100644 index 000000000..468c4ed08 --- /dev/null +++ b/src/xenia/gpu/draw_command.cc @@ -0,0 +1,17 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + diff --git a/src/xenia/gpu/draw_command.h b/src/xenia/gpu/draw_command.h new file mode 100644 index 000000000..ac5b07fe6 --- /dev/null +++ b/src/xenia/gpu/draw_command.h @@ -0,0 +1,78 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_DRAW_COMMAND_H_ +#define XENIA_GPU_DRAW_COMMAND_H_ + +#include +#include +#include +#include +#include +#include + + +namespace xe { +namespace gpu { + + +// TODO(benvanik): move more of the enums in here? +struct DrawCommand { + xenos::XE_GPU_PRIMITIVE_TYPE prim_type; + uint32_t start_index; + uint32_t index_count; + uint32_t base_vertex; + + VertexShaderResource* vertex_shader; + PixelShaderResource* pixel_shader; + + // TODO(benvanik): dirty tracking/max ranges/etc. + struct { + float* values; + size_t count; + } float4_constants; + struct { + uint32_t* values; + size_t count; + } loop_constants; + struct { + uint32_t* values; + size_t count; + } bool_constants; + + // Index buffer, if present. If index_count > 0 then auto draw. + IndexBufferResource* index_buffer; + + // Vertex buffers. + struct { + uint32_t input_index; + VertexBufferResource* buffer; + uint32_t stride; + uint32_t offset; + } vertex_buffers[96]; + size_t vertex_buffer_count; + + // Texture samplers. 
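+ // Each entry pairs a shader input slot with the texture and sampler state
+ // resolved from the guest fetch constants (see
+ // GraphicsDriver::PopulateSamplerSet).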
+ struct SamplerInput { + uint32_t input_index; + TextureResource* texture; + SamplerStateResource* sampler_state; + }; + SamplerInput vertex_shader_samplers[32]; + size_t vertex_shader_sampler_count; + SamplerInput pixel_shader_samplers[32]; + size_t pixel_shader_sampler_count; +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_DRAW_COMMAND_H_ diff --git a/src/xenia/gpu/graphics_driver.cc b/src/xenia/gpu/graphics_driver.cc index 65dddea49..e398839b8 100644 --- a/src/xenia/gpu/graphics_driver.cc +++ b/src/xenia/gpu/graphics_driver.cc @@ -12,12 +12,300 @@ using namespace xe; using namespace xe::gpu; +using namespace xe::gpu::xenos; GraphicsDriver::GraphicsDriver(Memory* memory) : - memory_(memory), address_translation_(0) { - memset(®ister_file_, 0, sizeof(register_file_)); + memory_(memory), address_translation_(0) { } GraphicsDriver::~GraphicsDriver() { } + +int GraphicsDriver::LoadShader(XE_GPU_SHADER_TYPE type, + uint32_t address, uint32_t length, + uint32_t start) { + MemoryRange memory_range( + memory_->Translate(address), + address, length); + + ShaderResource* shader = nullptr; + if (type == XE_GPU_SHADER_TYPE_VERTEX) { + VertexShaderResource::Info info; + shader = vertex_shader_ = resource_cache()->FetchVertexShader(memory_range, + info); + if (!vertex_shader_) { + XELOGE("Unable to fetch vertex shader"); + return 1; + } + } else { + PixelShaderResource::Info info; + shader = pixel_shader_ = resource_cache()->FetchPixelShader(memory_range, + info); + if (!pixel_shader_) { + XELOGE("Unable to fetch pixel shader"); + return 1; + } + } + + if (!shader->is_prepared()) { + // Disassemble. + const char* source = shader->disasm_src(); + XELOGGPU("Set shader %d at %0.8X (%db):\n%s", + type, address, length, + source ? source : ""); + } + + return 0; +} + +int GraphicsDriver::PrepareDraw(DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); + + // Ignore copies for now. + uint32_t enable_mode = register_file_[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7; + if (enable_mode != 4) { + XELOGW("GPU: ignoring draw with enable mode %d", enable_mode); + return 1; + } + + // Reset the things we don't modify so that we have clean state. + command.prim_type = XE_GPU_PRIMITIVE_TYPE_POINT_LIST; + command.index_count = 0; + command.index_buffer = nullptr; + + // Generic stuff. 
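+ // The remaining draw parameters come straight from the register file,
+ // e.g. the starting index below from VGT_INDX_OFFSET.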
+ command.start_index = register_file_[XE_GPU_REG_VGT_INDX_OFFSET].u32; + command.base_vertex = 0; + + int ret; + ret = PopulateState(command); + if (ret) { + XELOGE("Unable to prepare draw state"); + return ret; + } + ret = PopulateConstantBuffers(command); + if (ret) { + XELOGE("Unable to prepare draw constant buffers"); + return ret; + } + ret = PopulateShaders(command); + if (ret) { + XELOGE("Unable to prepare draw shaders"); + return ret; + } + ret = PopulateInputAssembly(command); + if (ret) { + XELOGE("Unable to prepare draw input assembly"); + return ret; + } + ret = PopulateSamplers(command); + if (ret) { + XELOGE("Unable to prepare draw samplers"); + return ret; + } + return 0; +} + +int GraphicsDriver::PrepareDrawIndexBuffer( + DrawCommand& command, + uint32_t address, uint32_t length, + xenos::XE_GPU_ENDIAN endianness, + IndexFormat format) { + SCOPE_profile_cpu_f("gpu"); + + address += address_translation_; + MemoryRange memory_range(memory_->Translate(address), address, length); + + IndexBufferResource::Info info; + info.endianness = endianness; + info.format = format; + + command.index_buffer = + resource_cache()->FetchIndexBuffer(memory_range, info); + if (!command.index_buffer) { + return 1; + } + return 0; +} + +int GraphicsDriver::PopulateState(DrawCommand& command) { + return 0; +} + +int GraphicsDriver::PopulateConstantBuffers(DrawCommand& command) { + command.float4_constants.count = 512; + command.float4_constants.values = + ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_000_X].f32; + command.loop_constants.count = 32; + command.loop_constants.values = + ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_LOOP_00].u32; + command.bool_constants.count = 8; + command.bool_constants.values = + ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32; + return 0; +} + +int GraphicsDriver::PopulateShaders(DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); + + if (!vertex_shader_) { + XELOGE("No vertex shader bound; ignoring"); + return 1; + } + if (!pixel_shader_) { + XELOGE("No pixel shader bound; ignoring"); + return 1; + } + + xe_gpu_program_cntl_t program_cntl; + program_cntl.dword_0 = register_file_[XE_GPU_REG_SQ_PROGRAM_CNTL].u32; + if (!vertex_shader_->is_prepared()) { + if (vertex_shader_->Prepare(program_cntl)) { + XELOGE("Unable to prepare vertex shader"); + return 1; + } + } + if (!pixel_shader_->is_prepared()) { + if (pixel_shader_->Prepare(program_cntl, vertex_shader_)) { + XELOGE("Unable to prepare pixel shader"); + return 1; + } + } + + command.vertex_shader = vertex_shader_; + command.pixel_shader = pixel_shader_; + + return 0; +} + +int GraphicsDriver::PopulateInputAssembly(DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); + + const auto& buffer_inputs = command.vertex_shader->buffer_inputs(); + command.vertex_buffer_count = buffer_inputs.count; + for (size_t n = 0; n < buffer_inputs.count; n++) { + const auto& desc = buffer_inputs.descs[n]; + + int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (desc.fetch_slot / 3) * 6; + auto group = reinterpret_cast(®ister_file_.values[r]); + xe_gpu_vertex_fetch_t* fetch = nullptr; + switch (desc.fetch_slot % 3) { + case 0: + fetch = &group->vertex_fetch_0; + break; + case 1: + fetch = &group->vertex_fetch_1; + break; + case 2: + fetch = &group->vertex_fetch_2; + break; + } + XEASSERTNOTNULL(fetch); + // If this assert doesn't hold, maybe we just abort? 
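+ // Type 0x3 tags a vertex fetch constant; texture fetch constants use 0x2
+ // (see PopulateSamplerSet). Each group is 6 dwords holding three 2-dword
+ // vertex fetches, hence the (fetch_slot / 3) * 6 indexing above.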
+ XEASSERT(fetch->type == 0x3); + XEASSERTNOTZERO(fetch->size); + + const auto& info = desc.info; + + MemoryRange memory_range; + memory_range.guest_base = (fetch->address << 2) + address_translation_; + memory_range.host_base = memory_->Translate(memory_range.guest_base); + memory_range.length = fetch->size * 4; + // TODO(benvanik): if the memory range is within the command buffer, we + // should use a cached transient buffer. + + auto buffer = resource_cache()->FetchVertexBuffer(memory_range, info); + if (!buffer) { + XELOGE("Unable to create vertex fetch buffer"); + return 1; + } + + command.vertex_buffers[n].input_index = desc.input_index; + command.vertex_buffers[n].buffer = buffer; + command.vertex_buffers[n].stride = desc.info.stride_words * 4; + command.vertex_buffers[n].offset = 0; + } + return 0; +} + +int GraphicsDriver::PopulateSamplers(DrawCommand& command) { + SCOPE_profile_cpu_f("gpu"); + + // Vertex texture samplers. + const auto& vertex_sampler_inputs = command.vertex_shader->sampler_inputs(); + command.vertex_shader_sampler_count = vertex_sampler_inputs.count; + for (size_t i = 0; i < command.vertex_shader_sampler_count; ++i) { + if (PopulateSamplerSet(vertex_sampler_inputs.descs[i], + command.vertex_shader_samplers[i])) { + return 1; + } + } + + // Pixel shader texture sampler. + const auto& pixel_sampler_inputs = command.pixel_shader->sampler_inputs(); + command.pixel_shader_sampler_count = pixel_sampler_inputs.count; + for (size_t i = 0; i < command.pixel_shader_sampler_count; ++i) { + if (PopulateSamplerSet(pixel_sampler_inputs.descs[i], + command.pixel_shader_samplers[i])) { + return 1; + } + } + + return 0; +} + +int GraphicsDriver::PopulateSamplerSet( + const ShaderResource::SamplerDesc& src_input, + DrawCommand::SamplerInput& dst_input) { + int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + src_input.fetch_slot * 6; + const auto group = (const xe_gpu_fetch_group_t*)®ister_file_.values[r]; + const xenos::xe_gpu_texture_fetch_t& fetch = group->texture_fetch; + if (fetch.type != 0x2) { + return 0; + } + + dst_input.input_index = src_input.input_index; + dst_input.texture = nullptr; + dst_input.sampler_state = nullptr; + + TextureResource::Info info; + if (!TextureResource::Info::Prepare(fetch, info)) { + XELOGE("D3D11: unable to parse texture fetcher info"); + return 0; // invalid texture used + } + if (info.format == DXGI_FORMAT_UNKNOWN) { + XELOGW("D3D11: unknown texture format %d", info.format); + return 0; // invalid texture used + } + + // TODO(benvanik): quick validate without refetching intraframe. + // Fetch texture from the cache. 
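+ // fetch.address is in 4KB units (hence << 12), unlike vertex fetch
+ // addresses, which are word addresses (<< 2 above).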
+ MemoryRange memory_range; + memory_range.guest_base = (fetch.address << 12) + address_translation_; + memory_range.host_base = memory_->Translate(memory_range.guest_base); + memory_range.length = info.input_length; + + auto texture = resource_cache()->FetchTexture(memory_range, info); + if (!texture) { + XELOGW("D3D11: unable to fetch texture"); + return 0; // invalid texture used + } + + SamplerStateResource::Info sampler_info; + if (!SamplerStateResource::Info::Prepare(fetch, + src_input.tex_fetch, + sampler_info)) { + XELOGW("D3D11: unable to parse sampler info"); + return 0; // invalid texture used + } + auto sampler_state = resource_cache()->FetchSamplerState(sampler_info); + if (!sampler_state) { + XELOGW("D3D11: unable to fetch sampler"); + return 0; // invalid texture used + } + + dst_input.texture = texture; + dst_input.sampler_state = sampler_state; + return 0; +} diff --git a/src/xenia/gpu/graphics_driver.h b/src/xenia/gpu/graphics_driver.h index 675a5a7c2..23cb24972 100644 --- a/src/xenia/gpu/graphics_driver.h +++ b/src/xenia/gpu/graphics_driver.h @@ -11,7 +11,9 @@ #define XENIA_GPU_GRAPHICS_DRIVER_H_ #include -#include +#include +#include +#include #include @@ -24,38 +26,45 @@ public: virtual ~GraphicsDriver(); Memory* memory() const { return memory_; } - xenos::RegisterFile* register_file() { return ®ister_file_; }; + virtual ResourceCache* resource_cache() const = 0; + RegisterFile* register_file() { return ®ister_file_; }; void set_address_translation(uint32_t value) { address_translation_ = value; } - virtual void Initialize() = 0; + virtual int Initialize() = 0; - virtual void InvalidateState( - uint32_t mask) = 0; - virtual void SetShader( - xenos::XE_GPU_SHADER_TYPE type, - uint32_t address, - uint32_t start, - uint32_t length) = 0; - virtual void DrawIndexBuffer( - xenos::XE_GPU_PRIMITIVE_TYPE prim_type, - bool index_32bit, uint32_t index_count, - uint32_t index_base, uint32_t index_size, uint32_t endianness) = 0; - //virtual void DrawIndexImmediate(); - virtual void DrawIndexAuto( - xenos::XE_GPU_PRIMITIVE_TYPE prim_type, - uint32_t index_count) = 0; + int LoadShader(xenos::XE_GPU_SHADER_TYPE type, + uint32_t address, uint32_t length, + uint32_t start); + + int PrepareDraw(DrawCommand& command); + int PrepareDrawIndexBuffer(DrawCommand& command, + uint32_t address, uint32_t length, + xenos::XE_GPU_ENDIAN endianness, + IndexFormat format); + virtual int Draw(const DrawCommand& command) = 0; virtual int Resolve() = 0; +private: + int PopulateState(DrawCommand& command); + int PopulateConstantBuffers(DrawCommand& command); + int PopulateShaders(DrawCommand& command); + int PopulateInputAssembly(DrawCommand& command); + int PopulateSamplers(DrawCommand& command); + int PopulateSamplerSet(const ShaderResource::SamplerDesc& src_input, + DrawCommand::SamplerInput& dst_input); + protected: GraphicsDriver(Memory* memory); Memory* memory_; - - xenos::RegisterFile register_file_; + RegisterFile register_file_; uint32_t address_translation_; + + VertexShaderResource* vertex_shader_; + PixelShaderResource* pixel_shader_; }; diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index fbcb1d744..86905cc48 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -11,9 +11,10 @@ #include #include +#include +#include #include -#include -#include +#include using namespace xe; @@ -24,10 +25,10 @@ using namespace xe::gpu::xenos; GraphicsSystem::GraphicsSystem(Emulator* emulator) : emulator_(emulator), memory_(emulator->memory()), 
- thread_(0), running_(false), driver_(0), worker_(0), + thread_(nullptr), running_(false), driver_(nullptr), + command_processor_(nullptr), interrupt_callback_(0), interrupt_callback_data_(0), - last_interrupt_time_(0), swap_pending_(false), - thread_wait_(NULL) { + last_interrupt_time_(0), thread_wait_(nullptr) { // Create the run loop used for any windows/etc. // This must be done on the thread we create the driver. run_loop_ = xe_run_loop_create(); @@ -42,15 +43,16 @@ X_STATUS GraphicsSystem::Setup() { processor_ = emulator_->processor(); // Create worker. - worker_ = new RingBufferWorker(this, memory_); + command_processor_ = new CommandProcessor(this, memory_); // Let the processor know we want register access callbacks. - RegisterAccessCallbacks callbacks; - callbacks.context = this; - callbacks.handles = (RegisterHandlesCallback)HandlesRegisterThunk; - callbacks.read = (RegisterReadCallback)ReadRegisterThunk; - callbacks.write = (RegisterWriteCallback)WriteRegisterThunk; - emulator_->processor()->AddRegisterAccessCallbacks(callbacks); + emulator_->memory()->AddMappedRange( + 0x7FC80000, + 0xFFFF0000, + 0x0000FFFF, + this, + reinterpret_cast(MMIOReadRegisterThunk), + reinterpret_cast(MMIOWriteRegisterThunk)); // Create worker thread. // This will initialize the graphics system. @@ -76,15 +78,18 @@ void GraphicsSystem::ThreadStart() { // Main run loop. while (running_) { // Peek main run loop. - if (xe_run_loop_pump(run_loop)) { - break; + { + SCOPE_profile_cpu_i("gpu", "GraphicsSystemRunLoopPump"); + if (xe_run_loop_pump(run_loop)) { + break; + } } if (!running_) { break; } // Pump worker. - worker_->Pump(); + command_processor_->Pump(); if (!running_) { break; @@ -106,7 +111,7 @@ void GraphicsSystem::Shutdown() { xe_thread_join(thread_); xe_thread_release(thread_); - delete worker_; + delete command_processor_; xe_run_loop_release(run_loop_); } @@ -124,21 +129,19 @@ void GraphicsSystem::InitializeRingBuffer(uint32_t ptr, uint32_t page_count) { Sleep(0); } XEASSERTNOTNULL(driver_); - worker_->Initialize(driver_, ptr, page_count); + command_processor_->Initialize(driver_, ptr, page_count); } void GraphicsSystem::EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size) { - worker_->EnableReadPointerWriteBack(ptr, block_size); -} - -bool GraphicsSystem::HandlesRegister(uint64_t addr) { - return (addr & 0xFFFF0000) == 0x7FC80000; + command_processor_->EnableReadPointerWriteBack(ptr, block_size); } uint64_t GraphicsSystem::ReadRegister(uint64_t addr) { uint32_t r = addr & 0xFFFF; - XELOGGPU("ReadRegister(%.4X)", r); + if (FLAGS_trace_ring_buffer) { + XELOGGPU("ReadRegister(%.4X)", r); + } RegisterFile* regs = driver_->register_file(); @@ -151,31 +154,33 @@ uint64_t GraphicsSystem::ReadRegister(uint64_t addr) { return 1; } - XEASSERT(r >= 0 && r < kXEGpuRegisterCount); + XEASSERT(r >= 0 && r < RegisterFile::kRegisterCount); return regs->values[r].u32; } void GraphicsSystem::WriteRegister(uint64_t addr, uint64_t value) { uint32_t r = addr & 0xFFFF; - XELOGGPU("WriteRegister(%.4X, %.8X)", r, value); + if (FLAGS_trace_ring_buffer) { + XELOGGPU("WriteRegister(%.4X, %.8X)", r, value); + } RegisterFile* regs = driver_->register_file(); switch (r) { case 0x0714: // CP_RB_WPTR - worker_->UpdateWritePointer((uint32_t)value); + command_processor_->UpdateWritePointer((uint32_t)value); break; default: XELOGW("Unknown GPU register %.4X write: %.8X", r, value); break; } - XEASSERT(r >= 0 && r < kXEGpuRegisterCount); + XEASSERT(r >= 0 && r < RegisterFile::kRegisterCount); regs->values[r].u32 = 
(uint32_t)value; } void GraphicsSystem::MarkVblank() { - worker_->increment_counter(); + command_processor_->increment_counter(); } void GraphicsSystem::DispatchInterruptCallback( @@ -190,6 +195,7 @@ void GraphicsSystem::DispatchInterruptCallback( if (!interrupt_callback_) { return; } + uint64_t args[] = { source, interrupt_callback_data_ }; processor_->ExecuteInterrupt( - cpu, interrupt_callback_, source, interrupt_callback_data_); + cpu, interrupt_callback_, args, XECOUNT(args)); } diff --git a/src/xenia/gpu/graphics_system.h b/src/xenia/gpu/graphics_system.h index 5c1f03f8d..3b8fdabb1 100644 --- a/src/xenia/gpu/graphics_system.h +++ b/src/xenia/gpu/graphics_system.h @@ -21,8 +21,8 @@ XEDECLARECLASS2(xe, cpu, Processor); namespace xe { namespace gpu { +class CommandProcessor; class GraphicsDriver; -class RingBufferWorker; class GraphicsSystem { @@ -40,14 +40,12 @@ public: void InitializeRingBuffer(uint32_t ptr, uint32_t page_count); void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size); - bool HandlesRegister(uint64_t addr); virtual uint64_t ReadRegister(uint64_t addr); virtual void WriteRegister(uint64_t addr, uint64_t value); void MarkVblank(); void DispatchInterruptCallback(uint32_t source, uint32_t cpu = 0xFFFFFFFF); - bool swap_pending() const { return swap_pending_; } - void set_swap_pending(bool value) { swap_pending_ = value; } + virtual void Swap() = 0; protected: virtual void Initialize(); @@ -59,14 +57,11 @@ private: } void ThreadStart(); - static bool HandlesRegisterThunk(GraphicsSystem* gs, uint64_t addr) { - return gs->HandlesRegister(addr); - } - static uint64_t ReadRegisterThunk(GraphicsSystem* gs, uint64_t addr) { + static uint64_t MMIOReadRegisterThunk(GraphicsSystem* gs, uint64_t addr) { return gs->ReadRegister(addr); } - static void WriteRegisterThunk(GraphicsSystem* gs, uint64_t addr, - uint64_t value) { + static void MMIOWriteRegisterThunk(GraphicsSystem* gs, uint64_t addr, + uint64_t value) { gs->WriteRegister(addr, value); } @@ -82,12 +77,11 @@ protected: bool running_; GraphicsDriver* driver_; - RingBufferWorker* worker_; + CommandProcessor* command_processor_; uint32_t interrupt_callback_; uint32_t interrupt_callback_data_; double last_interrupt_time_; - bool swap_pending_; HANDLE thread_wait_; }; diff --git a/src/xenia/gpu/nop/nop_graphics_driver.cc b/src/xenia/gpu/nop/nop_graphics_driver.cc index 69f88fa95..b710b85e4 100644 --- a/src/xenia/gpu/nop/nop_graphics_driver.cc +++ b/src/xenia/gpu/nop/nop_graphics_driver.cc @@ -10,7 +10,6 @@ #include #include -#include using namespace xe; @@ -19,69 +18,19 @@ using namespace xe::gpu::nop; using namespace xe::gpu::xenos; -NopGraphicsDriver::NopGraphicsDriver(Memory* memory) : - GraphicsDriver(memory) { - shader_cache_ = new ShaderCache(); +NopGraphicsDriver::NopGraphicsDriver(Memory* memory) + : GraphicsDriver(memory), resource_cache_(nullptr) { } NopGraphicsDriver::~NopGraphicsDriver() { - delete shader_cache_; } -void NopGraphicsDriver::Initialize() { +int NopGraphicsDriver::Initialize() { + return 0; } -void NopGraphicsDriver::InvalidateState( - uint32_t mask) { - if (mask == XE_GPU_INVALIDATE_MASK_ALL) { - XELOGGPU("NOP: (invalidate all)"); - } - if (mask & XE_GPU_INVALIDATE_MASK_VERTEX_SHADER) { - XELOGGPU("NOP: invalidate vertex shader"); - } - if (mask & XE_GPU_INVALIDATE_MASK_PIXEL_SHADER) { - XELOGGPU("NOP: invalidate pixel shader"); - } -} - -void NopGraphicsDriver::SetShader( - XE_GPU_SHADER_TYPE type, - uint32_t address, - uint32_t start, - uint32_t length) { - // Find or create shader in the 
cache. - uint8_t* p = memory_->Translate(address); - Shader* shader = shader_cache_->FindOrCreate( - type, p, length); - - // Disassemble. - const char* source = shader->disasm_src(); - if (!source) { - source = ""; - } - XELOGGPU("NOP: set shader %d at %0.8X (%db):\n%s", - type, address, length, source); -} - -void NopGraphicsDriver::DrawIndexBuffer( - XE_GPU_PRIMITIVE_TYPE prim_type, - bool index_32bit, uint32_t index_count, - uint32_t index_base, uint32_t index_size, uint32_t endianness) { - XELOGGPU("NOP: draw index buffer"); -} - -void NopGraphicsDriver::DrawIndexAuto( - XE_GPU_PRIMITIVE_TYPE prim_type, - uint32_t index_count) { - XELOGGPU("NOP: draw indexed %d (%d indicies)", - prim_type, index_count); - - // TODO(benvanik): - // program control - // context misc - // interpolator control - // shader constants / bools / integers - // fetch constants +int NopGraphicsDriver::Draw(const DrawCommand& command) { + return 0; } int NopGraphicsDriver::Resolve() { diff --git a/src/xenia/gpu/nop/nop_graphics_driver.h b/src/xenia/gpu/nop/nop_graphics_driver.h index d345c8159..9463a0cd5 100644 --- a/src/xenia/gpu/nop/nop_graphics_driver.h +++ b/src/xenia/gpu/nop/nop_graphics_driver.h @@ -19,9 +19,6 @@ namespace xe { namespace gpu { - -class ShaderCache; - namespace nop { @@ -30,27 +27,16 @@ public: NopGraphicsDriver(Memory* memory); virtual ~NopGraphicsDriver(); - virtual void Initialize(); + ResourceCache* resource_cache() const override { return resource_cache_; } - virtual void InvalidateState( - uint32_t mask); - virtual void SetShader( - xenos::XE_GPU_SHADER_TYPE type, - uint32_t address, - uint32_t start, - uint32_t length); - virtual void DrawIndexBuffer( - xenos::XE_GPU_PRIMITIVE_TYPE prim_type, - bool index_32bit, uint32_t index_count, - uint32_t index_base, uint32_t index_size, uint32_t endianness); - virtual void DrawIndexAuto( - xenos::XE_GPU_PRIMITIVE_TYPE prim_type, - uint32_t index_count); + int Initialize() override; - virtual int Resolve(); + int Draw(const DrawCommand& command) override; + + int Resolve() override; protected: - ShaderCache* shader_cache_; + ResourceCache* resource_cache_; }; diff --git a/src/xenia/gpu/nop/nop_graphics_system.h b/src/xenia/gpu/nop/nop_graphics_system.h index 54f77e04e..cf5f43b8a 100644 --- a/src/xenia/gpu/nop/nop_graphics_system.h +++ b/src/xenia/gpu/nop/nop_graphics_system.h @@ -28,6 +28,8 @@ public: virtual void Shutdown(); + void Swap() override {} + protected: virtual void Initialize(); virtual void Pump(); diff --git a/src/xenia/gpu/xenos/registers.cc b/src/xenia/gpu/register_file.cc similarity index 74% rename from src/xenia/gpu/xenos/registers.cc rename to src/xenia/gpu/register_file.cc index 5d4e99106..288881d58 100644 --- a/src/xenia/gpu/xenos/registers.cc +++ b/src/xenia/gpu/register_file.cc @@ -1,27 +1,30 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::xenos; - - -const char* xe::gpu::xenos::GetRegisterName(uint32_t index) { - switch (index) { -#define XE_GPU_REGISTER(index, type, name) \ - case index: return #name; -#include -#undef XE_GPU_REGISTER - default: - return NULL; - } -} +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + + +using namespace xe; +using namespace xe::gpu; + + +RegisterFile::RegisterFile() { + xe_zero_struct(values, sizeof(values)); +} + +const char* RegisterFile::GetRegisterName(uint32_t index) { + switch (index) { +#define XE_GPU_REGISTER(index, type, name) \ + case index: return #name; +#include +#undef XE_GPU_REGISTER + default: + return NULL; + } +} diff --git a/src/xenia/gpu/xenos/registers.h b/src/xenia/gpu/register_file.h similarity index 57% rename from src/xenia/gpu/xenos/registers.h rename to src/xenia/gpu/register_file.h index 39a0d43db..3ab23b4fa 100644 --- a/src/xenia/gpu/xenos/registers.h +++ b/src/xenia/gpu/register_file.h @@ -1,51 +1,51 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#ifndef XENIA_GPU_XENOS_REGISTERS_H_ -#define XENIA_GPU_XENOS_REGISTERS_H_ - -#include - - -namespace xe { -namespace gpu { -namespace xenos { - - -static const uint32_t kXEGpuRegisterCount = 0x5003; - - -enum Registers { -#define XE_GPU_REGISTER(index, type, name) \ - XE_GPU_REG_##name = index, -#include -#undef XE_GPU_REGISTER -}; - - -const char* GetRegisterName(uint32_t index); - - -union RegisterValue { - uint32_t u32; - float f32; -}; - - -struct RegisterFile { - RegisterValue values[kXEGpuRegisterCount]; -}; - - -} // namespace xenos -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_XENOS_REGISTERS_H_ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_REGISTER_FILE_H_ +#define XENIA_GPU_REGISTER_FILE_H_ + +#include + + +namespace xe { +namespace gpu { + + +enum Register { +#define XE_GPU_REGISTER(index, type, name) \ + XE_GPU_REG_##name = index, +#include +#undef XE_GPU_REGISTER +}; + + +class RegisterFile { +public: + RegisterFile(); + + const char* GetRegisterName(uint32_t index); + + static const size_t kRegisterCount = 0x5003; + union RegisterValue { + uint32_t u32; + float f32; + }; + RegisterValue values[kRegisterCount]; + + RegisterValue& operator[](Register reg) { + return values[reg]; + } +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_REGISTER_FILE_H_ diff --git a/src/xenia/gpu/resource.cc b/src/xenia/gpu/resource.cc new file mode 100644 index 000000000..35ef82bb6 --- /dev/null +++ b/src/xenia/gpu/resource.cc @@ -0,0 +1,37 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + + +HashedResource::HashedResource(const MemoryRange& memory_range) + : memory_range_(memory_range) { +} + +HashedResource::~HashedResource() = default; + +PagedResource::PagedResource(const MemoryRange& memory_range) + : memory_range_(memory_range), dirtied_(true) { +} + +PagedResource::~PagedResource() = default; + +void PagedResource::MarkDirty(uint32_t lo_address, uint32_t hi_address) { + dirtied_ = true; +} + +StaticResource::StaticResource() = default; + +StaticResource::~StaticResource() = default; diff --git a/src/xenia/gpu/resource.h b/src/xenia/gpu/resource.h new file mode 100644 index 000000000..1fb56b3d8 --- /dev/null +++ b/src/xenia/gpu/resource.h @@ -0,0 +1,104 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_RESOURCE_H_ +#define XENIA_GPU_RESOURCE_H_ + +#include +#include + + +namespace xe { +namespace gpu { + + +struct MemoryRange { + uint8_t* host_base; + uint32_t guest_base; + uint32_t length; + + MemoryRange() : host_base(nullptr), guest_base(0), length(0) {} + MemoryRange(const MemoryRange& other) + : host_base(other.host_base), guest_base(other.guest_base), + length(other.length) {} + MemoryRange(uint8_t* _host_base, uint32_t _guest_base, uint32_t _length) + : host_base(_host_base), guest_base(_guest_base), length(_length) {} +}; + + +class Resource { +public: + virtual ~Resource() = default; + + virtual void* handle() const = 0; + + template + T* handle_as() { + return reinterpret_cast(handle()); + } + +protected: + Resource() = default; + + // last use/LRU stuff +}; + + +class HashedResource : public Resource { +public: + ~HashedResource() override; + + const MemoryRange& memory_range() const { return memory_range_; } + +protected: + HashedResource(const MemoryRange& memory_range); + + MemoryRange memory_range_; + // key +}; + + +class PagedResource : public Resource { +public: + ~PagedResource() override; + + const MemoryRange& memory_range() const { return memory_range_; } + + template + bool Equals(const T& info) { + return Equals(&info, sizeof(info)); + } + virtual bool Equals(const void* info_ptr, size_t info_length) = 0; + + bool is_dirty() const { return dirtied_; } + void MarkDirty(uint32_t lo_address, uint32_t hi_address); + +protected: + PagedResource(const MemoryRange& memory_range); + + MemoryRange memory_range_; + bool dirtied_; + // dirtied pages list +}; + + +class StaticResource : public Resource { +public: + ~StaticResource() override; + +protected: + StaticResource(); +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_RESOURCE_H_ diff --git a/src/xenia/gpu/resource_cache.cc b/src/xenia/gpu/resource_cache.cc new file mode 100644 index 000000000..5641c8318 --- /dev/null +++ b/src/xenia/gpu/resource_cache.cc @@ -0,0 +1,172 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + + +ResourceCache::ResourceCache(Memory* memory) + : memory_(memory) { +} + +ResourceCache::~ResourceCache() { + for (auto it = resources_.begin(); it != resources_.end(); ++it) { + Resource* resource = *it; + delete resource; + } + resources_.clear(); +} + +VertexShaderResource* ResourceCache::FetchVertexShader( + const MemoryRange& memory_range, + const VertexShaderResource::Info& info) { + return FetchHashedResource( + memory_range, info, &ResourceCache::CreateVertexShader); +} + +PixelShaderResource* ResourceCache::FetchPixelShader( + const MemoryRange& memory_range, + const PixelShaderResource::Info& info) { + return FetchHashedResource( + memory_range, info, &ResourceCache::CreatePixelShader); +} + +TextureResource* ResourceCache::FetchTexture( + const MemoryRange& memory_range, + const TextureResource::Info& info) { + auto resource = FetchPagedResource( + memory_range, info, &ResourceCache::CreateTexture); + if (!resource) { + return nullptr; + } + if (resource->Prepare()) { + XELOGE("Unable to prepare texture"); + return nullptr; + } + return resource; +} + +SamplerStateResource* ResourceCache::FetchSamplerState( + const SamplerStateResource::Info& info) { + auto key = info.hash(); + auto it = static_resources_.find(key); + if (it != static_resources_.end()) { + return static_cast(it->second); + } + auto resource = CreateSamplerState(info); + if (resource->Prepare()) { + XELOGE("Unable to prepare sampler state"); + return nullptr; + } + static_resources_.insert({ key, resource }); + resources_.push_back(resource); + return resource; +} + +IndexBufferResource* ResourceCache::FetchIndexBuffer( + const MemoryRange& memory_range, + const IndexBufferResource::Info& info) { + auto resource = FetchPagedResource( + memory_range, info, &ResourceCache::CreateIndexBuffer); + if (!resource) { + return nullptr; + } + if (resource->Prepare()) { + XELOGE("Unable to prepare index buffer"); + return nullptr; + } + return resource; +} + +VertexBufferResource* ResourceCache::FetchVertexBuffer( + const MemoryRange& memory_range, + const VertexBufferResource::Info& info) { + auto resource = FetchPagedResource( + memory_range, info, &ResourceCache::CreateVertexBuffer); + if (!resource) { + return nullptr; + } + if (resource->Prepare()) { + XELOGE("Unable to prepare vertex buffer"); + return nullptr; + } + return resource; +} + +uint64_t ResourceCache::HashRange(const MemoryRange& memory_range) { + // We could do something smarter here to potentially early exit. + return xe_hash64(memory_range.host_base, memory_range.length); +} + +void ResourceCache::SyncRange(uint32_t address, int length) { + SCOPE_profile_cpu_f("gpu"); + + // Scan the page table in sync with our resource list. This means + // we have O(n) complexity for updates, though we could definitely + // make this faster/cleaner. + // TODO(benvanik): actually do this right. + // For now we assume the page table in the range of our resources + // will not be changing, which allows us to do a foreach(res) and reload + // and then clear the table. + + // total bytes = (512 * 1024 * 1024) / (16 * 1024) = 32768 + // each byte = 1 page + // Walk as qwords so we can clear things up faster. 
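+ // One byte per 16KB page -> 32768 bytes of table for 512MB; reading the
+ // table as uint64_t covers 8 pages per load, which is why the loops below
+ // index with /8.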
+ uint64_t* page_table = reinterpret_cast( + memory_->Translate(memory_->page_table())); + uint32_t page_size = 16 * 1024; // 16KB pages + + uint32_t lo_address = address % 0x20000000; + uint32_t hi_address = lo_address + length; + hi_address = (hi_address / page_size) * page_size + page_size; + int start_page = lo_address / page_size; + int end_page = hi_address / page_size; + + { + SCOPE_profile_cpu_i("gpu", "SyncRange:mark"); + auto it = lo_address > page_size ? + paged_resources_.upper_bound(lo_address - page_size) : + paged_resources_.begin(); + auto end_it = paged_resources_.lower_bound(hi_address + page_size); + while (it != end_it) { + const auto& memory_range = it->second->memory_range(); + int lo_page = (memory_range.guest_base % 0x20000000) / page_size; + int hi_page = lo_page + (memory_range.length / page_size); + lo_page = std::max(lo_page, start_page); + hi_page = std::min(hi_page, end_page); + if (lo_page > hi_page) { + ++it; + continue; + } + for (int i = lo_page / 8; i <= hi_page / 8; ++i) { + uint64_t page_flags = page_table[i]; + if (page_flags) { + // Dirty! + it->second->MarkDirty(i * 8 * page_size, (i * 8 + 7) * page_size); + } + } + ++it; + } + } + + // Reset page table. + { + SCOPE_profile_cpu_i("gpu", "SyncRange:reset"); + for (auto i = start_page / 8; i <= end_page / 8; ++i) { + page_table[i] = 0; + } + } +} diff --git a/src/xenia/gpu/resource_cache.h b/src/xenia/gpu/resource_cache.h new file mode 100644 index 000000000..be95f0861 --- /dev/null +++ b/src/xenia/gpu/resource_cache.h @@ -0,0 +1,127 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_RESOURCE_CACHE_H_ +#define XENIA_GPU_RESOURCE_CACHE_H_ + +#include + +#include +#include +#include +#include +#include +#include +#include + + +namespace xe { +namespace gpu { + + +class ResourceCache { +public: + virtual ~ResourceCache(); + + VertexShaderResource* FetchVertexShader( + const MemoryRange& memory_range, + const VertexShaderResource::Info& info); + PixelShaderResource* FetchPixelShader( + const MemoryRange& memory_range, + const PixelShaderResource::Info& info); + + TextureResource* FetchTexture( + const MemoryRange& memory_range, + const TextureResource::Info& info); + SamplerStateResource* FetchSamplerState( + const SamplerStateResource::Info& info); + + IndexBufferResource* FetchIndexBuffer( + const MemoryRange& memory_range, + const IndexBufferResource::Info& info); + VertexBufferResource* FetchVertexBuffer( + const MemoryRange& memory_range, + const VertexBufferResource::Info& info); + + uint64_t HashRange(const MemoryRange& memory_range); + + void SyncRange(uint32_t address, int length); + +protected: + ResourceCache(Memory* memory); + + template + T* FetchHashedResource(const MemoryRange& memory_range, + const typename T::Info& info, + const V& factory) { + // TODO(benvanik): if there's no way it's changed and it's been checked, + // just lookup. This way we don't rehash 100x a frame. 
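+ // Shader contents do not change once uploaded, so a content hash of the
+ // memory range serves as the identity here. Paged resources
+ // (textures/buffers) are keyed by guest address instead and re-validated
+ // through the page table (see SyncRange).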
+ auto key = HashRange(memory_range); + auto it = hashed_resources_.find(key); + if (it != hashed_resources_.end()) { + return static_cast(it->second); + } + auto resource = (this->*factory)(memory_range, info); + hashed_resources_.insert({ key, resource }); + resources_.push_back(resource); + return resource; + } + + template + T* FetchPagedResource(const MemoryRange& memory_range, + const typename T::Info& info, + const V& factory) { + uint32_t lo_address = memory_range.guest_base % 0x20000000; + auto key = uint64_t(lo_address); + auto range = paged_resources_.equal_range(key); + for (auto it = range.first; it != range.second; ++it) { + if (it->second->memory_range().length == memory_range.length && + it->second->Equals(info)) { + return static_cast(it->second); + } + } + auto resource = (this->*factory)(memory_range, info); + paged_resources_.insert({ key, resource }); + resources_.push_back(resource); + return resource; + } + + virtual VertexShaderResource* CreateVertexShader( + const MemoryRange& memory_range, + const VertexShaderResource::Info& info) = 0; + virtual PixelShaderResource* CreatePixelShader( + const MemoryRange& memory_range, + const PixelShaderResource::Info& info) = 0; + virtual TextureResource* CreateTexture( + const MemoryRange& memory_range, + const TextureResource::Info& info) = 0; + virtual SamplerStateResource* CreateSamplerState( + const SamplerStateResource::Info& info) = 0; + virtual IndexBufferResource* CreateIndexBuffer( + const MemoryRange& memory_range, + const IndexBufferResource::Info& info) = 0; + virtual VertexBufferResource* CreateVertexBuffer( + const MemoryRange& memory_range, + const VertexBufferResource::Info& info) = 0; + +private: + Memory* memory_; + + std::vector resources_; + std::unordered_map hashed_resources_; + std::unordered_map static_resources_; + std::multimap paged_resources_; +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_RESOURCE_CACHE_H_ diff --git a/src/xenia/gpu/sampler_state_resource.cc b/src/xenia/gpu/sampler_state_resource.cc new file mode 100644 index 000000000..5865a6920 --- /dev/null +++ b/src/xenia/gpu/sampler_state_resource.cc @@ -0,0 +1,32 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + + +bool SamplerStateResource::Info::Prepare( + const xe_gpu_texture_fetch_t& fetch, const instr_fetch_tex_t& fetch_instr, + Info& out_info) { + out_info.min_filter = static_cast( + fetch_instr.min_filter == 3 ? fetch.min_filter : fetch_instr.min_filter); + out_info.mag_filter = static_cast( + fetch_instr.mag_filter == 3 ? fetch.mag_filter : fetch_instr.mag_filter); + out_info.mip_filter = static_cast( + fetch_instr.mip_filter == 3 ? 
fetch.mip_filter : fetch_instr.mip_filter); + out_info.clamp_u = fetch.clamp_x; + out_info.clamp_v = fetch.clamp_y; + out_info.clamp_w = fetch.clamp_z; + return true; +} diff --git a/src/xenia/gpu/sampler_state_resource.h b/src/xenia/gpu/sampler_state_resource.h new file mode 100644 index 000000000..c0a3c4ab3 --- /dev/null +++ b/src/xenia/gpu/sampler_state_resource.h @@ -0,0 +1,67 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_SAMPLER_STATE_RESOURCE_H_ +#define XENIA_GPU_SAMPLER_STATE_RESOURCE_H_ + +#include +#include +#include + + +namespace xe { +namespace gpu { + + +class SamplerStateResource : public StaticResource { +public: + struct Info { + xenos::instr_tex_filter_t min_filter; + xenos::instr_tex_filter_t mag_filter; + xenos::instr_tex_filter_t mip_filter; + uint32_t clamp_u; + uint32_t clamp_v; + uint32_t clamp_w; + + uint64_t hash() const { + return hash_combine(0, + min_filter, mag_filter, mip_filter, + clamp_u, clamp_v, clamp_w); + } + bool Equals(const Info& other) const { + return min_filter == other.min_filter && + mag_filter == other.mag_filter && + mip_filter == other.mip_filter && + clamp_u == other.clamp_u && + clamp_v == other.clamp_v && + clamp_w == other.clamp_w; + } + + static bool Prepare(const xenos::xe_gpu_texture_fetch_t& fetch, + const xenos::instr_fetch_tex_t& fetch_instr, + Info& out_info); + }; + + SamplerStateResource(const Info& info) : info_(info) {} + virtual ~SamplerStateResource() = default; + + const Info& info() const { return info_; } + + virtual int Prepare() = 0; + +protected: + Info info_; +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_SAMPLER_STATE_RESOURCE_H_ diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h deleted file mode 100644 index 1dd26b2b4..000000000 --- a/src/xenia/gpu/shader.h +++ /dev/null @@ -1,104 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#ifndef XENIA_GPU_SHADER_H_ -#define XENIA_GPU_SHADER_H_ - -#include -#include -#include - - -namespace xe { -namespace gpu { - - -class Shader { -public: - Shader(xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash); - virtual ~Shader(); - - xenos::XE_GPU_SHADER_TYPE type() const { return type_; } - const uint32_t* dwords() const { return dwords_; } - size_t dword_count() const { return dword_count_; } - uint64_t hash() const { return hash_; } - bool is_prepared() const { return is_prepared_; } - - const char* disasm_src() const { return disasm_src_; } - - typedef struct { - xenos::instr_fetch_vtx_t vtx_fetch; - uint32_t format; - uint32_t offset_words; - uint32_t size_words; - } vtx_buffer_element_t; - typedef struct { - uint32_t input_index; - uint32_t fetch_slot; - uint32_t stride_words; - uint32_t element_count; - vtx_buffer_element_t elements[16]; - } vtx_buffer_desc_t; - typedef struct { - uint32_t count; - vtx_buffer_desc_t descs[16]; - } vtx_buffer_inputs_t; - const vtx_buffer_inputs_t* GetVertexBufferInputs(); - - typedef struct { - uint32_t input_index; - uint32_t fetch_slot; - xenos::instr_fetch_tex_t tex_fetch; - uint32_t format; - } tex_buffer_desc_t; - typedef struct { - uint32_t count; - tex_buffer_desc_t descs[32]; - } tex_buffer_inputs_t; - const tex_buffer_inputs_t* GetTextureBufferInputs(); - - typedef struct { - uint32_t positions; - uint32_t params; - uint32_t memories; - bool point_size; - } alloc_counts_t; - const alloc_counts_t& alloc_counts() const { return alloc_counts_; } - -private: - void GatherIO(); - void GatherAlloc(const xenos::instr_cf_alloc_t* cf); - void GatherExec(const xenos::instr_cf_exec_t* cf); - void GatherVertexFetch(const xenos::instr_fetch_vtx_t* vtx); - void GatherTextureFetch(const xenos::instr_fetch_tex_t* tex); - -protected: - xenos::XE_GPU_SHADER_TYPE type_; - uint32_t* dwords_; - size_t dword_count_; - uint64_t hash_; - bool is_prepared_; - - char* disasm_src_; - - alloc_counts_t alloc_counts_; - std::vector execs_; - std::vector allocs_; - vtx_buffer_inputs_t vtx_buffer_inputs_; - tex_buffer_inputs_t tex_buffer_inputs_; -}; - - -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_SHADER_H_ diff --git a/src/xenia/gpu/shader_cache.cc b/src/xenia/gpu/shader_cache.cc deleted file mode 100644 index 9aee3e2b7..000000000 --- a/src/xenia/gpu/shader_cache.cc +++ /dev/null @@ -1,80 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#include - -#include - - -using namespace std; -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::xenos; - - -ShaderCache::ShaderCache() { -} - -ShaderCache::~ShaderCache() { - Clear(); -} - -Shader* ShaderCache::Create( - XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length) { - uint64_t hash = Hash(src_ptr, length); - Shader* shader = CreateCore(type, src_ptr, length, hash); - map_.insert(pair(hash, shader)); - return shader; -} - -Shader* ShaderCache::CreateCore( - XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash) { - return new Shader(type, src_ptr, length, hash); -} - -Shader* ShaderCache::Find( - XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length) { - uint64_t hash = Hash(src_ptr, length); - unordered_map::iterator it = map_.find(hash); - if (it != map_.end()) { - return it->second; - } - return NULL; -} - -Shader* ShaderCache::FindOrCreate( - XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length) { - uint64_t hash = Hash(src_ptr, length); - unordered_map::iterator it = map_.find(hash); - if (it != map_.end()) { - return it->second; - } - Shader* shader = CreateCore(type, src_ptr, length, hash); - map_.insert(pair(hash, shader)); - return shader; -} - -void ShaderCache::Clear() { - // TODO(benvanik): clear. - for (unordered_map::iterator it = map_.begin(); - it != map_.end(); ++it) { - Shader* shader = it->second; - delete shader; - } - map_.clear(); -} - -uint64_t ShaderCache::Hash(const uint8_t* src_ptr, size_t length) { - return xe_hash64(src_ptr, length, 0); -} diff --git a/src/xenia/gpu/shader_cache.h b/src/xenia/gpu/shader_cache.h deleted file mode 100644 index 97edc382f..000000000 --- a/src/xenia/gpu/shader_cache.h +++ /dev/null @@ -1,56 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. 
* - ****************************************************************************** - */ - -#ifndef XENIA_GPU_SHADER_CACHE_H_ -#define XENIA_GPU_SHADER_CACHE_H_ - -#include -#include -#include - - -namespace xe { -namespace gpu { - - -class ShaderCache { -public: - ShaderCache(); - virtual ~ShaderCache(); - - Shader* Create( - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length); - Shader* Find( - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length); - Shader* FindOrCreate( - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length); - - void Clear(); - -private: - uint64_t Hash(const uint8_t* src_ptr, size_t length); - - std::unordered_map map_; - -protected: - virtual Shader* CreateCore( - xenos::XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash); -}; - - -} // namespace gpu -} // namespace xe - - -#endif // XENIA_GPU_SHADER_CACHE_H_ diff --git a/src/xenia/gpu/shader.cc b/src/xenia/gpu/shader_resource.cc similarity index 70% rename from src/xenia/gpu/shader.cc rename to src/xenia/gpu/shader_resource.cc index 69b083a60..07b64efbe 100644 --- a/src/xenia/gpu/shader.cc +++ b/src/xenia/gpu/shader_resource.cc @@ -1,266 +1,275 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2013 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#include - -#include - - -using namespace xe; -using namespace xe::gpu; -using namespace xe::gpu::xenos; - - -Shader::Shader( - XE_GPU_SHADER_TYPE type, - const uint8_t* src_ptr, size_t length, - uint64_t hash) : - type_(type), hash_(hash), is_prepared_(false), disasm_src_(NULL) { - xe_zero_struct(&alloc_counts_, sizeof(alloc_counts_)); - xe_zero_struct(&vtx_buffer_inputs_, sizeof(vtx_buffer_inputs_)); - xe_zero_struct(&tex_buffer_inputs_, sizeof(tex_buffer_inputs_)); - - // Verify. - dword_count_ = length / 4; - XEASSERT(dword_count_ <= 512); - - // Copy bytes and swap. - size_t byte_size = dword_count_ * sizeof(uint32_t); - dwords_ = (uint32_t*)xe_malloc(byte_size); - for (uint32_t n = 0; n < dword_count_; n++) { - dwords_[n] = XEGETUINT32BE(src_ptr + n * 4); - } - - // Gather input/output registers/etc. - GatherIO(); - - // Disassemble, for debugging. - disasm_src_ = DisassembleShader(type_, dwords_, dword_count_); -} - -Shader::~Shader() { - if (disasm_src_) { - xe_free(disasm_src_); - } - xe_free(dwords_); -} - -void Shader::GatherIO() { - // Process all execution blocks. 
- instr_cf_t cfa; - instr_cf_t cfb; - for (int idx = 0; idx < dword_count_; idx += 3) { - uint32_t dword_0 = dwords_[idx + 0]; - uint32_t dword_1 = dwords_[idx + 1]; - uint32_t dword_2 = dwords_[idx + 2]; - cfa.dword_0 = dword_0; - cfa.dword_1 = dword_1 & 0xFFFF; - cfb.dword_0 = (dword_1 >> 16) | (dword_2 << 16); - cfb.dword_1 = dword_2 >> 16; - if (cfa.opc == ALLOC) { - GatherAlloc(&cfa.alloc); - } else if (cfa.is_exec()) { - GatherExec(&cfa.exec); - } - if (cfb.opc == ALLOC) { - GatherAlloc(&cfb.alloc); - } else if (cfb.is_exec()) { - GatherExec(&cfb.exec); - } - if (cfa.opc == EXEC_END || cfb.opc == EXEC_END) { - break; - } - } -} - -void Shader::GatherAlloc(const instr_cf_alloc_t* cf) { - allocs_.push_back(*cf); - - switch (cf->buffer_select) { - case SQ_POSITION: - // Position (SV_POSITION). - alloc_counts_.positions += cf->size + 1; - break; - case SQ_PARAMETER_PIXEL: - // Output to PS (if VS), or frag output (if PS). - alloc_counts_.params += cf->size + 1; - break; - case SQ_MEMORY: - // MEMEXPORT? - alloc_counts_.memories += cf->size + 1; - break; - } -} - -void Shader::GatherExec(const instr_cf_exec_t* cf) { - execs_.push_back(*cf); - - uint32_t sequence = cf->serialize; - for (uint32_t i = 0; i < cf->count; i++) { - uint32_t alu_off = (cf->address + i); - int sync = sequence & 0x2; - if (sequence & 0x1) { - const instr_fetch_t* fetch = - (const instr_fetch_t*)(dwords_ + alu_off * 3); - switch (fetch->opc) { - case VTX_FETCH: - GatherVertexFetch(&fetch->vtx); - break; - case TEX_FETCH: - GatherTextureFetch(&fetch->tex); - break; - case TEX_GET_BORDER_COLOR_FRAC: - case TEX_GET_COMP_TEX_LOD: - case TEX_GET_GRADIENTS: - case TEX_GET_WEIGHTS: - case TEX_SET_TEX_LOD: - case TEX_SET_GRADIENTS_H: - case TEX_SET_GRADIENTS_V: - default: - XEASSERTALWAYS(); - break; - } - } else { - // TODO(benvanik): gather registers used, predicate bits used, etc. - const instr_alu_t* alu = - (const instr_alu_t*)(dwords_ + alu_off * 3); - if (alu->vector_write_mask) { - if (alu->export_data && alu->vector_dest == 63) { - alloc_counts_.point_size = true; - } - } - if (alu->scalar_write_mask || !alu->vector_write_mask) { - if (alu->export_data && alu->scalar_dest == 63) { - alloc_counts_.point_size = true; - } - } - } - sequence >>= 2; - } -} - -void Shader::GatherVertexFetch(const instr_fetch_vtx_t* vtx) { - // dst_reg/dst_swiz - // src_reg/src_swiz - // format = a2xx_sq_surfaceformat - // format_comp_all ? signed : unsigned - // num_format_all ? normalized - // stride - // offset - // const_index/const_index_sel -- fetch constant register - // num_format_all ? integer : fraction - // exp_adjust_all - [-32,31] - (2^exp_adjust_all)*fetch - 0 = default - - // Sometimes games have fetches that just produce constants. We can - // ignore those. - uint32_t dst_swiz = vtx->dst_swiz; - bool fetches_any_data = false; - for (int i = 0; i < 4; i++) { - if ((dst_swiz & 0x7) == 4) { - // 0.0 - } else if ((dst_swiz & 0x7) == 5) { - // 1.0 - } else if ((dst_swiz & 0x7) == 6) { - // ? - } else if ((dst_swiz & 0x7) == 7) { - // Previous register value. 
- } else { - fetches_any_data = true; - break; - } - dst_swiz >>= 3; - } - if (!fetches_any_data) { - return; - } - - uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel; - auto& inputs = vtx_buffer_inputs_; - vtx_buffer_element_t* el = NULL; - for (size_t n = 0; n < inputs.count; n++) { - auto& input = inputs.descs[n]; - if (input.fetch_slot == fetch_slot) { - XEASSERT(input.element_count + 1 < XECOUNT(input.elements)); - // It may not hold that all strides are equal, but I hope it does. - XEASSERT(!vtx->stride || input.stride_words == vtx->stride); - el = &input.elements[input.element_count++]; - break; - } - } - if (!el) { - XEASSERTNOTZERO(vtx->stride); - XEASSERT(inputs.count + 1 < XECOUNT(inputs.descs)); - auto& input = inputs.descs[inputs.count++]; - input.input_index = inputs.count - 1; - input.fetch_slot = fetch_slot; - input.stride_words = vtx->stride; - el = &input.elements[input.element_count++]; - } - - el->vtx_fetch = *vtx; - el->format = vtx->format; - el->offset_words = vtx->offset; - el->size_words = 0; - switch (el->format) { - case FMT_8_8_8_8: - case FMT_2_10_10_10: - case FMT_10_11_11: - case FMT_11_11_10: - el->size_words = 1; - break; - case FMT_16_16: - case FMT_16_16_FLOAT: - el->size_words = 1; - break; - case FMT_16_16_16_16: - case FMT_16_16_16_16_FLOAT: - el->size_words = 2; - break; - case FMT_32: - case FMT_32_FLOAT: - el->size_words = 1; - break; - case FMT_32_32: - case FMT_32_32_FLOAT: - el->size_words = 2; - break; - case FMT_32_32_32_FLOAT: - el->size_words = 3; - break; - case FMT_32_32_32_32: - case FMT_32_32_32_32_FLOAT: - el->size_words = 4; - break; - default: - XELOGE("Unknown vertex format: %d", el->format); - XEASSERTALWAYS(); - break; - } -} - -const Shader::vtx_buffer_inputs_t* Shader::GetVertexBufferInputs() { - return &vtx_buffer_inputs_; -} - -void Shader::GatherTextureFetch(const xenos::instr_fetch_tex_t* tex) { - // TODO(benvanik): check dest_swiz to see if we are writing anything. - - auto& inputs = tex_buffer_inputs_; - XEASSERT(inputs.count + 1 < XECOUNT(inputs.descs)); - auto& input = inputs.descs[inputs.count++]; - input.input_index = inputs.count - 1; - input.fetch_slot = tex->const_idx & 0xF; // ? - input.tex_fetch = *tex; - - // Format mangling, size estimation, etc. -} - -const Shader::tex_buffer_inputs_t* Shader::GetTextureBufferInputs() { - return &tex_buffer_inputs_; -} +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + + +ShaderResource::ShaderResource(const MemoryRange& memory_range, + const Info& info, + xenos::XE_GPU_SHADER_TYPE type) + : HashedResource(memory_range), + info_(info), type_(type), is_prepared_(false), disasm_src_(nullptr) { + xe_zero_struct(&alloc_counts_, sizeof(alloc_counts_)); + xe_zero_struct(&buffer_inputs_, sizeof(buffer_inputs_)); + xe_zero_struct(&sampler_inputs_, sizeof(sampler_inputs_)); + + // Verify. + dword_count_ = memory_range.length / 4; + XEASSERT(dword_count_ <= 512); + + // Copy bytes and swap. 
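+  // Microcode is stored big-endian in guest memory; XEGETUINT32BE swaps
+  // each dword into host byte order as it is copied.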
+ size_t byte_size = dword_count_ * sizeof(uint32_t); + dwords_ = (uint32_t*)xe_malloc(byte_size); + for (uint32_t n = 0; n < dword_count_; n++) { + dwords_[n] = XEGETUINT32BE(memory_range.host_base + n * 4); + } + + // Disassemble, for debugging. + disasm_src_ = DisassembleShader(type_, dwords_, dword_count_); + + // Gather input/output registers/etc. + GatherIO(); +} + +ShaderResource::~ShaderResource() { + xe_free(disasm_src_); + xe_free(dwords_); +} + +void ShaderResource::GatherIO() { + // Process all execution blocks. + instr_cf_t cfa; + instr_cf_t cfb; + for (int idx = 0; idx < dword_count_; idx += 3) { + uint32_t dword_0 = dwords_[idx + 0]; + uint32_t dword_1 = dwords_[idx + 1]; + uint32_t dword_2 = dwords_[idx + 2]; + cfa.dword_0 = dword_0; + cfa.dword_1 = dword_1 & 0xFFFF; + cfb.dword_0 = (dword_1 >> 16) | (dword_2 << 16); + cfb.dword_1 = dword_2 >> 16; + if (cfa.opc == ALLOC) { + GatherAlloc(&cfa.alloc); + } else if (cfa.is_exec()) { + GatherExec(&cfa.exec); + } + if (cfb.opc == ALLOC) { + GatherAlloc(&cfb.alloc); + } else if (cfb.is_exec()) { + GatherExec(&cfb.exec); + } + if (cfa.opc == EXEC_END || cfb.opc == EXEC_END) { + break; + } + } +} + +void ShaderResource::GatherAlloc(const instr_cf_alloc_t* cf) { + allocs_.push_back(*cf); + + switch (cf->buffer_select) { + case SQ_POSITION: + // Position (SV_POSITION). + alloc_counts_.positions += cf->size + 1; + break; + case SQ_PARAMETER_PIXEL: + // Output to PS (if VS), or frag output (if PS). + alloc_counts_.params += cf->size + 1; + break; + case SQ_MEMORY: + // MEMEXPORT? + alloc_counts_.memories += cf->size + 1; + break; + } +} + +void ShaderResource::GatherExec(const instr_cf_exec_t* cf) { + execs_.push_back(*cf); + + uint32_t sequence = cf->serialize; + for (uint32_t i = 0; i < cf->count; i++) { + uint32_t alu_off = (cf->address + i); + int sync = sequence & 0x2; + if (sequence & 0x1) { + const instr_fetch_t* fetch = + (const instr_fetch_t*)(dwords_ + alu_off * 3); + switch (fetch->opc) { + case VTX_FETCH: + GatherVertexFetch(&fetch->vtx); + break; + case TEX_FETCH: + GatherTextureFetch(&fetch->tex); + break; + case TEX_GET_BORDER_COLOR_FRAC: + case TEX_GET_COMP_TEX_LOD: + case TEX_GET_GRADIENTS: + case TEX_GET_WEIGHTS: + case TEX_SET_TEX_LOD: + case TEX_SET_GRADIENTS_H: + case TEX_SET_GRADIENTS_V: + default: + XEASSERTALWAYS(); + break; + } + } else { + // TODO(benvanik): gather registers used, predicate bits used, etc. + const instr_alu_t* alu = + (const instr_alu_t*)(dwords_ + alu_off * 3); + if (alu->vector_write_mask) { + if (alu->export_data && alu->vector_dest == 63) { + alloc_counts_.point_size = true; + } + } + if (alu->scalar_write_mask || !alu->vector_write_mask) { + if (alu->export_data && alu->scalar_dest == 63) { + alloc_counts_.point_size = true; + } + } + } + sequence >>= 2; + } +} + +void ShaderResource::GatherVertexFetch(const instr_fetch_vtx_t* vtx) { + XEASSERT(type_ == XE_GPU_SHADER_TYPE_VERTEX); + + // dst_reg/dst_swiz + // src_reg/src_swiz + // format = a2xx_sq_surfaceformat + // format_comp_all ? signed : unsigned + // num_format_all ? normalized + // stride + // offset + // const_index/const_index_sel -- fetch constant register + // num_format_all ? integer : fraction + // exp_adjust_all - [-32,31] - (2^exp_adjust_all)*fetch - 0 = default + + // Sometimes games have fetches that just produce constants. We can + // ignore those. 
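+  // Each 3-bit dst_swiz selector: 0-3 presumably pick a fetched component,
+  // 4 writes constant 0.0, 5 writes 1.0, 6 is unknown, and 7 keeps the
+  // previous register value. Only selectors 0-3 actually consume data.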
+ uint32_t dst_swiz = vtx->dst_swiz; + bool fetches_any_data = false; + for (int i = 0; i < 4; i++) { + if ((dst_swiz & 0x7) == 4) { + // 0.0 + } else if ((dst_swiz & 0x7) == 5) { + // 1.0 + } else if ((dst_swiz & 0x7) == 6) { + // ? + } else if ((dst_swiz & 0x7) == 7) { + // Previous register value. + } else { + fetches_any_data = true; + break; + } + dst_swiz >>= 3; + } + if (!fetches_any_data) { + return; + } + + uint32_t fetch_slot = vtx->const_index * 3 + vtx->const_index_sel; + auto& inputs = buffer_inputs_; + VertexBufferResource::DeclElement* el = nullptr; + for (size_t n = 0; n < inputs.count; n++) { + auto& desc = inputs.descs[n]; + auto& info = desc.info; + if (desc.fetch_slot == fetch_slot) { + XEASSERT(info.element_count <= XECOUNT(info.elements)); + // It may not hold that all strides are equal, but I hope it does. + XEASSERT(!vtx->stride || info.stride_words == vtx->stride); + el = &info.elements[info.element_count++]; + break; + } + } + if (!el) { + XEASSERTNOTZERO(vtx->stride); + XEASSERT(inputs.count + 1 < XECOUNT(inputs.descs)); + auto& desc = inputs.descs[inputs.count++]; + desc.input_index = inputs.count - 1; + desc.fetch_slot = fetch_slot; + desc.info.stride_words = vtx->stride; + el = &desc.info.elements[desc.info.element_count++]; + } + + el->vtx_fetch = *vtx; + el->format = vtx->format; + el->is_normalized = vtx->num_format_all == 0; + el->is_signed = vtx->format_comp_all == 1; + el->offset_words = vtx->offset; + el->size_words = 0; + switch (el->format) { + case FMT_8_8_8_8: + case FMT_2_10_10_10: + case FMT_10_11_11: + case FMT_11_11_10: + el->size_words = 1; + break; + case FMT_16_16: + case FMT_16_16_FLOAT: + el->size_words = 1; + break; + case FMT_16_16_16_16: + case FMT_16_16_16_16_FLOAT: + el->size_words = 2; + break; + case FMT_32: + case FMT_32_FLOAT: + el->size_words = 1; + break; + case FMT_32_32: + case FMT_32_32_FLOAT: + el->size_words = 2; + break; + case FMT_32_32_32_FLOAT: + el->size_words = 3; + break; + case FMT_32_32_32_32: + case FMT_32_32_32_32_FLOAT: + el->size_words = 4; + break; + default: + XELOGE("Unknown vertex format: %d", el->format); + XEASSERTALWAYS(); + break; + } +} + +void ShaderResource::GatherTextureFetch(const xenos::instr_fetch_tex_t* tex) { + // TODO(benvanik): check dest_swiz to see if we are writing anything. + + XEASSERT(sampler_inputs_.count + 1 < XECOUNT(sampler_inputs_.descs)); + auto& input = sampler_inputs_.descs[sampler_inputs_.count++]; + input.input_index = sampler_inputs_.count - 1; + input.fetch_slot = tex->const_idx & 0xF; // ? + input.tex_fetch = *tex; + + // Format mangling, size estimation, etc. +} + +VertexShaderResource::VertexShaderResource( + const MemoryRange& memory_range, const Info& info) + : ShaderResource(memory_range, info, XE_GPU_SHADER_TYPE_VERTEX) { +} + +VertexShaderResource::~VertexShaderResource() = default; + +PixelShaderResource::PixelShaderResource( + const MemoryRange& memory_range, const Info& info) + : ShaderResource(memory_range, info, XE_GPU_SHADER_TYPE_PIXEL) { +} + +PixelShaderResource::~PixelShaderResource() = default; diff --git a/src/xenia/gpu/shader_resource.h b/src/xenia/gpu/shader_resource.h new file mode 100644 index 000000000..b591bfaf2 --- /dev/null +++ b/src/xenia/gpu/shader_resource.h @@ -0,0 +1,128 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. 
* + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_SHADER_RESOURCE_H_ +#define XENIA_GPU_SHADER_RESOURCE_H_ + +#include +#include +#include +#include + + +namespace xe { +namespace gpu { + + +class ShaderResource : public HashedResource { +public: + struct Info { + // type, etc? + }; + + ~ShaderResource() override; + + const Info& info() const { return info_; } + xenos::XE_GPU_SHADER_TYPE type() const { return type_; } + const uint32_t* dwords() const { return dwords_; } + const size_t dword_count() const { return dword_count_; } + + bool is_prepared() const { return is_prepared_; } + const char* disasm_src() const { return disasm_src_; } + + struct BufferDesc { + uint32_t input_index; + uint32_t fetch_slot; + VertexBufferResource::Info info; + // xenos::instr_fetch_vtx_t vtx_fetch; for each el + }; + struct BufferInputs { + uint32_t count; + BufferDesc descs[32]; + }; + const BufferInputs& buffer_inputs() { return buffer_inputs_; } + + struct SamplerDesc { + uint32_t input_index; + uint32_t fetch_slot; + uint32_t format; + xenos::instr_fetch_tex_t tex_fetch; + }; + struct SamplerInputs { + uint32_t count; + SamplerDesc descs[32]; + }; + const SamplerInputs& sampler_inputs() { return sampler_inputs_; } + + struct AllocCounts { + uint32_t positions; + uint32_t params; + uint32_t memories; + bool point_size; + }; + const AllocCounts& alloc_counts() const { return alloc_counts_; } + const std::vector& execs() const { return execs_; } + const std::vector& allocs() const { return allocs_; } + +private: + void GatherIO(); + void GatherAlloc(const xenos::instr_cf_alloc_t* cf); + void GatherExec(const xenos::instr_cf_exec_t* cf); + void GatherVertexFetch(const xenos::instr_fetch_vtx_t* vtx); + void GatherTextureFetch(const xenos::instr_fetch_tex_t* tex); + +protected: + ShaderResource(const MemoryRange& memory_range, + const Info& info, + xenos::XE_GPU_SHADER_TYPE type); + + Info info_; + xenos::XE_GPU_SHADER_TYPE type_; + size_t dword_count_; + uint32_t* dwords_; + char* disasm_src_; + + AllocCounts alloc_counts_; + std::vector execs_; + std::vector allocs_; + BufferInputs buffer_inputs_; + SamplerInputs sampler_inputs_; + + bool is_prepared_; +}; + + +class VertexShaderResource : public ShaderResource { +public: + VertexShaderResource(const MemoryRange& memory_range, + const Info& info); + ~VertexShaderResource() override; + + // buffer_inputs() matching VertexBufferResource::Info + + virtual int Prepare(const xenos::xe_gpu_program_cntl_t& program_cntl) = 0; +}; + + +class PixelShaderResource : public ShaderResource { +public: + PixelShaderResource(const MemoryRange& memory_range, + const Info& info); + ~PixelShaderResource() override; + + virtual int Prepare(const xenos::xe_gpu_program_cntl_t& program_cntl, + VertexShaderResource* vertex_shader) = 0; +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_SHADER_RESOURCE_H_ diff --git a/src/xenia/gpu/sources.gypi b/src/xenia/gpu/sources.gypi index 3d3ced141..b01f7a33b 100644 --- a/src/xenia/gpu/sources.gypi +++ b/src/xenia/gpu/sources.gypi @@ -1,7 +1,12 @@ # Copyright 2013 Ben Vanik. All Rights Reserved. 
{ 'sources': [ - 'command_buffer.h', + 'buffer_resource.cc', + 'buffer_resource.h', + 'command_processor.cc', + 'command_processor.h', + 'draw_command.cc', + 'draw_command.h', 'gpu-private.h', 'gpu.cc', 'gpu.h', @@ -9,12 +14,18 @@ 'graphics_driver.h', 'graphics_system.cc', 'graphics_system.h', - 'ring_buffer_worker.cc', - 'ring_buffer_worker.h', - 'shader.cc', - 'shader.h', - 'shader_cache.cc', - 'shader_cache.h', + 'register_file.cc', + 'register_file.h', + 'resource.cc', + 'resource.h', + 'resource_cache.cc', + 'resource_cache.h', + 'sampler_state_resource.cc', + 'sampler_state_resource.h', + 'shader_resource.cc', + 'shader_resource.h', + 'texture_resource.cc', + 'texture_resource.h', ], 'includes': [ diff --git a/src/xenia/gpu/texture_resource.cc b/src/xenia/gpu/texture_resource.cc new file mode 100644 index 000000000..531796c11 --- /dev/null +++ b/src/xenia/gpu/texture_resource.cc @@ -0,0 +1,350 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include + + +using namespace std; +using namespace xe; +using namespace xe::gpu; +using namespace xe::gpu::xenos; + + +bool TextureResource::Info::Prepare(const xe_gpu_texture_fetch_t& fetch, + Info& info) { + // http://msdn.microsoft.com/en-us/library/windows/desktop/cc308051(v=vs.85).aspx + // a2xx_sq_surfaceformat + + info.dimension = (TextureDimension)fetch.dimension; + switch (info.dimension) { + case TEXTURE_DIMENSION_1D: + info.width = fetch.size_1d.width; + break; + case TEXTURE_DIMENSION_2D: + info.width = fetch.size_2d.width; + info.height = fetch.size_2d.height; + break; + case TEXTURE_DIMENSION_3D: + case TEXTURE_DIMENSION_CUBE: + info.width = fetch.size_3d.width; + info.height = fetch.size_3d.height; + info.depth = fetch.size_3d.depth; + break; + } + info.block_size = 0; + info.texel_pitch = 0; + info.endianness = (XE_GPU_ENDIAN)fetch.endianness; + info.is_tiled = fetch.tiled; + info.is_compressed = false; + info.input_length = 0; + info.format = DXGI_FORMAT_UNKNOWN; + switch (fetch.format) { + case FMT_8: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_RRR1: + info.format = DXGI_FORMAT_R8_UNORM; + break; + case XE_GPU_SWIZZLE_000R: + info.format = DXGI_FORMAT_A8_UNORM; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_8"); + info.format = DXGI_FORMAT_A8_UNORM; + break; + } + info.block_size = 1; + info.texel_pitch = 1; + break; + case FMT_1_5_5_5: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_BGRA: + info.format = DXGI_FORMAT_B5G5R5A1_UNORM; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_1_5_5_5"); + info.format = DXGI_FORMAT_B5G5R5A1_UNORM; + break; + } + info.block_size = 1; + info.texel_pitch = 2; + break; + case FMT_8_8_8_8: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_RGBA: + info.format = DXGI_FORMAT_R8G8B8A8_UNORM; + break; + case XE_GPU_SWIZZLE_BGRA: + info.format = DXGI_FORMAT_B8G8R8A8_UNORM; + break; + case XE_GPU_SWIZZLE_RGB1: + info.format = DXGI_FORMAT_R8G8B8A8_UNORM; // ? 
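+          // DXGI has no R8G8B8X8 format (only B8G8R8X8), so RGB1 cannot
+          // force alpha to one here - presumably why this is marked '?'.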
+ break; + case XE_GPU_SWIZZLE_BGR1: + info.format = DXGI_FORMAT_B8G8R8X8_UNORM; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_8_8_8_8"); + info.format = DXGI_FORMAT_R8G8B8A8_UNORM; + break; + } + info.block_size = 1; + info.texel_pitch = 4; + break; + case FMT_4_4_4_4: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_BGRA: + info.format = DXGI_FORMAT_B4G4R4A4_UNORM; // only supported on Windows 8+ + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_4_4_4_4"); + info.format = DXGI_FORMAT_B4G4R4A4_UNORM; // only supported on Windows 8+ + break; + } + info.block_size = 1; + info.texel_pitch = 2; + break; + case FMT_16_16_16_16_FLOAT: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_RGBA: + info.format = DXGI_FORMAT_R16G16B16A16_FLOAT; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_16_16_16_16_FLOAT"); + info.format = DXGI_FORMAT_R16G16B16A16_FLOAT; + break; + } + info.block_size = 1; + info.texel_pitch = 8; + break; + case FMT_32_FLOAT: + switch (fetch.swizzle) { + case XE_GPU_SWIZZLE_R111: + info.format = DXGI_FORMAT_R32_FLOAT; + break; + default: + XELOGW("D3D11: unhandled swizzle for FMT_32_FLOAT"); + info.format = DXGI_FORMAT_R32_FLOAT; + break; + } + info.block_size = 1; + info.texel_pitch = 4; + break; + case FMT_DXT1: + info.format = DXGI_FORMAT_BC1_UNORM; + info.block_size = 4; + info.texel_pitch = 8; + info.is_compressed = true; + break; + case FMT_DXT2_3: + case FMT_DXT4_5: + info.format = (fetch.format == FMT_DXT4_5 ? DXGI_FORMAT_BC3_UNORM : DXGI_FORMAT_BC2_UNORM); + info.block_size = 4; + info.texel_pitch = 16; + info.is_compressed = true; + break; + case FMT_1_REVERSE: + case FMT_1: + case FMT_5_6_5: + case FMT_6_5_5: + case FMT_2_10_10_10: + case FMT_8_A: + case FMT_8_B: + case FMT_8_8: + case FMT_Cr_Y1_Cb_Y0: + case FMT_Y1_Cr_Y0_Cb: + case FMT_5_5_5_1: + case FMT_8_8_8_8_A: + case FMT_10_11_11: + case FMT_11_11_10: + case FMT_24_8: + case FMT_24_8_FLOAT: + case FMT_16: + case FMT_16_16: + case FMT_16_16_16_16: + case FMT_16_EXPAND: + case FMT_16_16_EXPAND: + case FMT_16_16_16_16_EXPAND: + case FMT_16_FLOAT: + case FMT_16_16_FLOAT: + case FMT_32: + case FMT_32_32: + case FMT_32_32_32_32: + case FMT_32_32_FLOAT: + case FMT_32_32_32_32_FLOAT: + case FMT_32_AS_8: + case FMT_32_AS_8_8: + case FMT_16_MPEG: + case FMT_16_16_MPEG: + case FMT_8_INTERLACED: + case FMT_32_AS_8_INTERLACED: + case FMT_32_AS_8_8_INTERLACED: + case FMT_16_INTERLACED: + case FMT_16_MPEG_INTERLACED: + case FMT_16_16_MPEG_INTERLACED: + case FMT_DXN: + case FMT_8_8_8_8_AS_16_16_16_16: + case FMT_DXT1_AS_16_16_16_16: + case FMT_DXT2_3_AS_16_16_16_16: + case FMT_DXT4_5_AS_16_16_16_16: + case FMT_2_10_10_10_AS_16_16_16_16: + case FMT_10_11_11_AS_16_16_16_16: + case FMT_11_11_10_AS_16_16_16_16: + case FMT_32_32_32_FLOAT: + case FMT_DXT3A: + case FMT_DXT5A: + case FMT_CTX1: + case FMT_DXT3A_AS_1_1_1_1: + info.format = DXGI_FORMAT_UNKNOWN; + break; + } + + if (info.format == DXGI_FORMAT_UNKNOWN) { + return false; + } + + // Must be called here when we know the format. + switch (info.dimension) { + case TEXTURE_DIMENSION_1D: + info.CalculateTextureSizes1D(fetch); + break; + case TEXTURE_DIMENSION_2D: + info.CalculateTextureSizes2D(fetch); + break; + case TEXTURE_DIMENSION_3D: + // TODO(benvanik): calculate size. + return false; + case TEXTURE_DIMENSION_CUBE: + // TODO(benvanik): calculate size. + return false; + } + return true; +} + +void TextureResource::Info::CalculateTextureSizes1D( + const xe_gpu_texture_fetch_t& fetch) { + // ? 
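+  // Unverified: unlike the 2D path below, no +1 bias or pitch alignment
+  // is applied to 1D widths yet.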
+ size_1d.width = fetch.size_1d.width; +} + +void TextureResource::Info::CalculateTextureSizes2D( + const xe_gpu_texture_fetch_t& fetch) { + size_2d.logical_width = 1 + fetch.size_2d.width; + size_2d.logical_height = 1 + fetch.size_2d.height; + + size_2d.block_width = size_2d.logical_width / block_size; + size_2d.block_height = size_2d.logical_height / block_size; + + if (!is_compressed) { + // must be 32x32 but also must have a pitch that is a multiple of 256 bytes + uint32_t bytes_per_block = block_size * block_size * texel_pitch; + uint32_t width_multiple = 32; + if (bytes_per_block) { + uint32_t minimum_multiple = 256 / bytes_per_block; + if (width_multiple < minimum_multiple) { + width_multiple = minimum_multiple; + } + } + size_2d.input_width = XEROUNDUP(size_2d.logical_width, width_multiple); + size_2d.input_height = XEROUNDUP(size_2d.logical_height, 32); + size_2d.output_width = size_2d.logical_width; + size_2d.output_height = size_2d.logical_height; + } else { + // must be 128x128 + size_2d.input_width = XEROUNDUP(size_2d.logical_width, 128); + size_2d.input_height = XEROUNDUP(size_2d.logical_height, 128); + size_2d.output_width = XENEXTPOW2(size_2d.logical_width); + size_2d.output_height = XENEXTPOW2(size_2d.logical_height); + } + + size_2d.logical_pitch = (size_2d.logical_width / block_size) * texel_pitch; + size_2d.input_pitch = (size_2d.input_width / block_size) * texel_pitch; + + if (!is_tiled) { + input_length = size_2d.block_height * size_2d.logical_pitch; + } else { + input_length = size_2d.block_height * size_2d.logical_pitch; // ? + } +} + +TextureResource::TextureResource(const MemoryRange& memory_range, + const Info& info) + : PagedResource(memory_range), + info_(info) { +} + +TextureResource::~TextureResource() { +} + +int TextureResource::Prepare() { + if (!handle()) { + if (CreateHandle()) { + XELOGE("Unable to create texture handle"); + return 1; + } + } + + if (!dirtied_) { + return 0; + } + dirtied_ = false; + + // pass dirty regions? + return InvalidateRegion(memory_range_); +} + +void TextureResource::TextureSwap(uint8_t* dest, const uint8_t* src, + uint32_t pitch) const { + // TODO(benvanik): optimize swapping paths. + switch (info_.endianness) { + case XE_GPU_ENDIAN_8IN16: + for (uint32_t i = 0; i < pitch; i += 2, src += 2, dest += 2) { + *(uint16_t*)dest = XESWAP16(*(uint16_t*)src); + } + break; + case XE_GPU_ENDIAN_8IN32: // Swap bytes. + for (uint32_t i = 0; i < pitch; i += 4, src += 4, dest += 4) { + *(uint32_t*)dest = XESWAP32(*(uint32_t*)src); + } + break; + case XE_GPU_ENDIAN_16IN32: // Swap half words. 
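+      // Swaps the 16-bit halves of each 32-bit word, e.g. 0xAABBCCDD
+      // becomes 0xCCDDAABB; bytes within each half keep their order.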
+ for (uint32_t i = 0; i < pitch; i += 4, src += 4, dest += 4) { + uint32_t value = *(uint32_t*)src; + *(uint32_t*)dest = ((value >> 16) & 0xFFFF) | (value << 16); + } + break; + default: + case XE_GPU_ENDIAN_NONE: + memcpy(dest, src, pitch); + break; + } +} + +// https://code.google.com/p/crunch/source/browse/trunk/inc/crn_decomp.h#4104 +uint32_t TextureResource::TiledOffset2DOuter(uint32_t y, uint32_t width, + uint32_t log_bpp) const { + uint32_t macro = ((y >> 5) * (width >> 5)) << (log_bpp + 7); + uint32_t micro = ((y & 6) << 2) << log_bpp; + return macro + + ((micro & ~15) << 1) + + (micro & 15) + + ((y & 8) << (3 + log_bpp)) + + ((y & 1) << 4); +} + +uint32_t TextureResource::TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp, + uint32_t base_offset) const { + uint32_t macro = (x >> 5) << (bpp + 7); + uint32_t micro = (x & 7) << bpp; + uint32_t offset = base_offset + (macro + ((micro & ~15) << 1) + (micro & 15)); + return ((offset & ~511) << 3) + ((offset & 448) << 2) + (offset & 63) + + ((y & 16) << 7) + (((((y & 8) >> 2) + (x >> 3)) & 3) << 6); +} diff --git a/src/xenia/gpu/texture_resource.h b/src/xenia/gpu/texture_resource.h new file mode 100644 index 000000000..57dc63422 --- /dev/null +++ b/src/xenia/gpu/texture_resource.h @@ -0,0 +1,110 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_TEXTURE_RESOURCE_H_ +#define XENIA_GPU_TEXTURE_RESOURCE_H_ + +#include +#include + +// TODO(benvanik): replace DXGI constants with xenia constants. +#include + + +namespace xe { +namespace gpu { + + +enum TextureDimension { + TEXTURE_DIMENSION_1D = 0, + TEXTURE_DIMENSION_2D = 1, + TEXTURE_DIMENSION_3D = 2, + TEXTURE_DIMENSION_CUBE = 3, +}; + + +class TextureResource : public PagedResource { +public: + struct Info { + TextureDimension dimension; + uint32_t width; + uint32_t height; + uint32_t depth; + uint32_t block_size; + uint32_t texel_pitch; + xenos::XE_GPU_ENDIAN endianness; + bool is_tiled; + bool is_compressed; + uint32_t input_length; + + // TODO(benvanik): replace with our own constants. 
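+    // DXGI_FORMAT couples this otherwise backend-neutral header to D3D11;
+    // a native format enum would let non-D3D drivers reuse these structs.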
+ DXGI_FORMAT format; + + union { + struct { + uint32_t width; + } size_1d; + struct { + uint32_t logical_width; + uint32_t logical_height; + uint32_t block_width; + uint32_t block_height; + uint32_t input_width; + uint32_t input_height; + uint32_t output_width; + uint32_t output_height; + uint32_t logical_pitch; + uint32_t input_pitch; + } size_2d; + struct { + } size_3d; + struct { + } size_cube; + }; + + static bool Prepare(const xenos::xe_gpu_texture_fetch_t& fetch, + Info& out_info); + + private: + void CalculateTextureSizes1D(const xenos::xe_gpu_texture_fetch_t& fetch); + void CalculateTextureSizes2D(const xenos::xe_gpu_texture_fetch_t& fetch); + }; + + TextureResource(const MemoryRange& memory_range, + const Info& info); + ~TextureResource() override; + + const Info& info() const { return info_; } + + bool Equals(const void* info_ptr, size_t info_length) override { + return info_length == sizeof(Info) && + memcmp(info_ptr, &info_, info_length) == 0; + } + + virtual int Prepare(); + +protected: + virtual int CreateHandle() = 0; + virtual int InvalidateRegion(const MemoryRange& memory_range) = 0; + + void TextureSwap(uint8_t* dest, const uint8_t* src, uint32_t pitch) const; + uint32_t TiledOffset2DOuter(uint32_t y, uint32_t width, + uint32_t log_bpp) const; + uint32_t TiledOffset2DInner(uint32_t x, uint32_t y, uint32_t bpp, + uint32_t base_offset) const; + + Info info_; +}; + + +} // namespace gpu +} // namespace xe + + +#endif // XENIA_GPU_TEXTURE_RESOURCE_H_ diff --git a/src/xenia/gpu/xenos/packets.h b/src/xenia/gpu/xenos/packets.h index 4b7124310..459ab7e6e 100644 --- a/src/xenia/gpu/xenos/packets.h +++ b/src/xenia/gpu/xenos/packets.h @@ -70,6 +70,8 @@ enum Type3Opcode { PM4_CONTEXT_UPDATE = 0x5e, // updates the current context, if needed PM4_INTERRUPT = 0x54, // generate interrupt from the command stream + PM4_XE_SWAP = 0x55, // Xenia only: VdSwap uses this to trigger a swap. + PM4_IM_STORE = 0x2c, // copy sequencer instruction memory to system memory // Tiled rendering: diff --git a/src/xenia/gpu/xenos/sources.gypi b/src/xenia/gpu/xenos/sources.gypi index c1f677682..998444938 100644 --- a/src/xenia/gpu/xenos/sources.gypi +++ b/src/xenia/gpu/xenos/sources.gypi @@ -3,8 +3,6 @@ 'sources': [ 'packets.h', 'register_table.inc', - 'registers.cc', - 'registers.h', 'ucode.h', 'ucode_disassembler.cc', 'ucode_disassembler.h', diff --git a/src/xenia/hid/hid.cc b/src/xenia/hid/hid.cc index 9aa58c618..fbd66630d 100644 --- a/src/xenia/hid/hid.cc +++ b/src/xenia/hid/hid.cc @@ -17,11 +17,12 @@ using namespace xe::hid; DEFINE_string(hid, "any", - "Input system. Use: [any, nop, xinput]"); + "Input system. Use: [any, nop, winkey, xinput]"); #include #if XE_PLATFORM_WIN32 +#include #include #endif // WIN32 @@ -33,6 +34,8 @@ InputSystem* xe::hid::Create(Emulator* emulator) { if (FLAGS_hid.compare("nop") == 0) { input_system->AddDriver(xe::hid::nop::Create(input_system)); #if XE_PLATFORM_WIN32 + } else if (FLAGS_hid.compare("winkey") == 0) { + input_system->AddDriver(xe::hid::winkey::Create(input_system)); } else if (FLAGS_hid.compare("xinput") == 0) { input_system->AddDriver(xe::hid::xinput::Create(input_system)); #endif // WIN32 @@ -48,6 +51,11 @@ InputSystem* xe::hid::Create(Emulator* emulator) { input_system->AddDriver(xinput_driver); any_created = true; } + InputDriver* winkey_driver = xe::hid::winkey::Create(input_system); + if (winkey_driver) { + input_system->AddDriver(winkey_driver); + any_created = true; + } #endif // WIN32 // Fallback to nop if none created. 
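The winkey driver registered above polls Win32 key state on demand rather than listening for key events. Below is a minimal standalone sketch of the two polling idioms its IS_KEY_DOWN/IS_KEY_TOGGLED macros rely on later in this diff: GetAsyncKeyState's high bit reports "held right now", while GetKeyState's low bit reports toggle state such as Caps Lock. The helper names and the main loop are illustrative only, not part of the patch.

    // Sketch: Win32 key-state polling as used by WinKeyInputDriver.
    #include <windows.h>
    #include <cstdio>

    static bool IsKeyDown(int vk) {
      // High bit set => key is physically held at this instant.
      return (GetAsyncKeyState(vk) & 0x8000) == 0x8000;
    }
    static bool IsKeyToggled(int vk) {
      // Low bit set => toggle state (e.g. Caps Lock) is currently on.
      return (GetKeyState(vk) & 0x0001) == 0x0001;
    }

    int main() {
      for (;;) {
        bool dpad_mode = IsKeyToggled(VK_CAPITAL);  // Caps Lock: WASD -> D-pad.
        if (IsKeyDown('W')) {
          std::printf(dpad_mode ? "DPAD_UP\n" : "left stick +Y\n");
        }
        Sleep(16);  // Poll at roughly the cadence a game queries input.
      }
    }

Because state is sampled rather than queued, the driver's GetKeystroke has nothing to report and returns X_ERROR_EMPTY.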
diff --git a/src/xenia/hid/input_system.cc b/src/xenia/hid/input_system.cc index b82ca11af..6ad1ab177 100644 --- a/src/xenia/hid/input_system.cc +++ b/src/xenia/hid/input_system.cc @@ -42,6 +42,8 @@ void InputSystem::AddDriver(InputDriver* driver) { X_RESULT InputSystem::GetCapabilities( uint32_t user_index, uint32_t flags, X_INPUT_CAPABILITIES& out_caps) { + SCOPE_profile_cpu_f("hid"); + for (auto it = drivers_.begin(); it != drivers_.end(); ++it) { InputDriver* driver = *it; if (XSUCCEEDED(driver->GetCapabilities(user_index, flags, out_caps))) { @@ -52,6 +54,8 @@ X_RESULT InputSystem::GetCapabilities( } X_RESULT InputSystem::GetState(uint32_t user_index, X_INPUT_STATE& out_state) { + SCOPE_profile_cpu_f("hid"); + for (auto it = drivers_.begin(); it != drivers_.end(); ++it) { InputDriver* driver = *it; if (driver->GetState(user_index, out_state) == X_ERROR_SUCCESS) { @@ -63,6 +67,8 @@ X_RESULT InputSystem::GetState(uint32_t user_index, X_INPUT_STATE& out_state) { X_RESULT InputSystem::SetState( uint32_t user_index, X_INPUT_VIBRATION& vibration) { + SCOPE_profile_cpu_f("hid"); + for (auto it = drivers_.begin(); it != drivers_.end(); ++it) { InputDriver* driver = *it; if (XSUCCEEDED(driver->SetState(user_index, vibration))) { @@ -74,6 +80,8 @@ X_RESULT InputSystem::SetState( X_RESULT InputSystem::GetKeystroke( uint32_t user_index, uint32_t flags, X_INPUT_KEYSTROKE& out_keystroke) { + SCOPE_profile_cpu_f("hid"); + for (auto it = drivers_.begin(); it != drivers_.end(); ++it) { InputDriver* driver = *it; if (XSUCCEEDED(driver->GetKeystroke(user_index, flags, out_keystroke))) { diff --git a/src/xenia/hid/sources.gypi b/src/xenia/hid/sources.gypi index e166ec3f0..079d059ca 100644 --- a/src/xenia/hid/sources.gypi +++ b/src/xenia/hid/sources.gypi @@ -17,6 +17,7 @@ 'conditions': [ ['OS == "win"', { 'includes': [ + 'winkey/sources.gypi', 'xinput/sources.gypi', ], }], diff --git a/src/xenia/hid/winkey/sources.gypi b/src/xenia/hid/winkey/sources.gypi new file mode 100644 index 000000000..792ac571d --- /dev/null +++ b/src/xenia/hid/winkey/sources.gypi @@ -0,0 +1,10 @@ +# Copyright 2013 Ben Vanik. All Rights Reserved. +{ + 'sources': [ + 'winkey_hid-private.h', + 'winkey_hid.cc', + 'winkey_hid.h', + 'winkey_input_driver.cc', + 'winkey_input_driver.h', + ], +} diff --git a/src/xenia/hid/winkey/winkey_hid-private.h b/src/xenia/hid/winkey/winkey_hid-private.h new file mode 100644 index 000000000..2100f9185 --- /dev/null +++ b/src/xenia/hid/winkey/winkey_hid-private.h @@ -0,0 +1,31 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#ifndef XENIA_HID_WINKEY_WINKEY_HID_PRIVATE_H_ +#define XENIA_HID_WINKEY_WINKEY_HID_PRIVATE_H_ + +#include + +#include + + +namespace xe { +namespace hid { +namespace winkey { + + + + + +} // namespace winkey +} // namespace hid +} // namespace xe + + +#endif // XENIA_HID_WINKEY_WINKEY_HID_PRIVATE_H_ diff --git a/src/xenia/hid/winkey/winkey_hid.cc b/src/xenia/hid/winkey/winkey_hid.cc new file mode 100644 index 000000000..43d363271 --- /dev/null +++ b/src/xenia/hid/winkey/winkey_hid.cc @@ -0,0 +1,44 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include + + +using namespace xe; +using namespace xe::hid; +using namespace xe::hid::winkey; + + +namespace { + void InitializeIfNeeded(); + void CleanupOnShutdown(); + + void InitializeIfNeeded() { + static bool has_initialized = false; + if (has_initialized) { + return; + } + has_initialized = true; + + // + + atexit(CleanupOnShutdown); + } + + void CleanupOnShutdown() { + } +} + + +InputDriver* xe::hid::winkey::Create(InputSystem* input_system) { + InitializeIfNeeded(); + return new WinKeyInputDriver(input_system); +} diff --git a/src/xenia/hid/winkey/winkey_hid.h b/src/xenia/hid/winkey/winkey_hid.h new file mode 100644 index 000000000..a5ed273bc --- /dev/null +++ b/src/xenia/hid/winkey/winkey_hid.h @@ -0,0 +1,33 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_HID_WINKEY_WINKEY_HID_H_ +#define XENIA_HID_WINKEY_WINKEY_HID_H_ + +#include + + +XEDECLARECLASS2(xe, hid, InputDriver); +XEDECLARECLASS2(xe, hid, InputSystem); + + +namespace xe { +namespace hid { +namespace winkey { + + +InputDriver* Create(InputSystem* input_system); + + +} // namespace winkey +} // namespace hid +} // namespace xe + + +#endif // XENIA_HID_WINKEY_WINKEY_HID_H_ diff --git a/src/xenia/hid/winkey/winkey_input_driver.cc b/src/xenia/hid/winkey/winkey_input_driver.cc new file mode 100644 index 000000000..d0e63d64b --- /dev/null +++ b/src/xenia/hid/winkey/winkey_input_driver.cc @@ -0,0 +1,181 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +#include + + +using namespace xe; +using namespace xe::hid; +using namespace xe::hid::winkey; + + +WinKeyInputDriver::WinKeyInputDriver(InputSystem* input_system) : + packet_number_(1), + InputDriver(input_system) { +} + +WinKeyInputDriver::~WinKeyInputDriver() { +} + +X_STATUS WinKeyInputDriver::Setup() { + return X_STATUS_SUCCESS; +} + +X_RESULT WinKeyInputDriver::GetCapabilities( + uint32_t user_index, uint32_t flags, X_INPUT_CAPABILITIES& out_caps) { + if (user_index != 0) { + return X_ERROR_DEVICE_NOT_CONNECTED; + } + + // TODO(benvanik): confirm with a real XInput controller. + out_caps.type = 0x01; // XINPUT_DEVTYPE_GAMEPAD + out_caps.sub_type = 0x01; // XINPUT_DEVSUBTYPE_GAMEPAD + out_caps.flags = 0; + out_caps.gamepad.buttons = 0xFFFF; + out_caps.gamepad.left_trigger = 0xFF; + out_caps.gamepad.right_trigger = 0xFF; + out_caps.gamepad.thumb_lx = (int16_t)0xFFFF; + out_caps.gamepad.thumb_ly = (int16_t)0xFFFF; + out_caps.gamepad.thumb_rx = (int16_t)0xFFFF; + out_caps.gamepad.thumb_ry = (int16_t)0xFFFF; + out_caps.vibration.left_motor_speed = 0; + out_caps.vibration.right_motor_speed = 0; + return X_ERROR_SUCCESS; +} + +#define IS_KEY_TOGGLED(key) ((GetKeyState(key) & 0x1) == 0x1) +#define IS_KEY_DOWN(key) ((GetAsyncKeyState(key) & 0x8000) == 0x8000) + +X_RESULT WinKeyInputDriver::GetState( + uint32_t user_index, X_INPUT_STATE& out_state) { + if (user_index != 0) { + return X_ERROR_DEVICE_NOT_CONNECTED; + } + + packet_number_++; + + uint16_t buttons = 0; + uint8_t left_trigger = 0; + uint8_t right_trigger = 0; + int16_t thumb_lx = 0; + int16_t thumb_ly = 0; + int16_t thumb_rx = 0; + int16_t thumb_ry = 0; + + if (IS_KEY_TOGGLED(VK_CAPITAL)) { + // dpad toggled + if (IS_KEY_DOWN(0x41)) { + // A + buttons |= 0x0004; // XINPUT_GAMEPAD_DPAD_LEFT + } + if (IS_KEY_DOWN(0x44)) { + // D + buttons |= 0x0008; // XINPUT_GAMEPAD_DPAD_RIGHT + } + if (IS_KEY_DOWN(0x53)) { + // S + buttons |= 0x0002; // XINPUT_GAMEPAD_DPAD_DOWN + } + if (IS_KEY_DOWN(0x57)) { + // W + buttons |= 0x0001; // XINPUT_GAMEPAD_DPAD_UP + } + } else { + // left stick + if (IS_KEY_DOWN(0x41)) { + // A + thumb_lx += SHRT_MIN; + } + if (IS_KEY_DOWN(0x44)) { + // D + thumb_lx += SHRT_MAX; + } + if (IS_KEY_DOWN(0x53)) { + // S + thumb_ly += SHRT_MIN; + } + if (IS_KEY_DOWN(0x57)) { + // W + thumb_ly += SHRT_MAX; + } + } + + if (IS_KEY_DOWN(0x4C)) { + // L + buttons |= 0x4000; // XINPUT_GAMEPAD_X + } + if (IS_KEY_DOWN(VK_OEM_7)) { + // ' + buttons |= 0x2000; // XINPUT_GAMEPAD_B + } + if (IS_KEY_DOWN(VK_OEM_1)) { + // ; + buttons |= 0x1000; // XINPUT_GAMEPAD_A + } + if (IS_KEY_DOWN(0x50)) { + // P + buttons |= 0x8000; // XINPUT_GAMEPAD_Y + } + + if (IS_KEY_DOWN(0x5A)) { + // Z + buttons |= 0x0020; // XINPUT_GAMEPAD_BACK + } + if (IS_KEY_DOWN(0x58)) { + // X + buttons |= 0x0010; // XINPUT_GAMEPAD_START + } + + out_state.packet_number = packet_number_; + out_state.gamepad.buttons = buttons; + out_state.gamepad.left_trigger = left_trigger; + out_state.gamepad.right_trigger = right_trigger; + out_state.gamepad.thumb_lx = thumb_lx; + out_state.gamepad.thumb_ly = thumb_ly; + out_state.gamepad.thumb_rx = thumb_rx; + out_state.gamepad.thumb_ry = thumb_ry; + + return X_ERROR_SUCCESS; +} + +X_RESULT WinKeyInputDriver::SetState( + uint32_t user_index, X_INPUT_VIBRATION& vibration) { + if (user_index != 0) { + return X_ERROR_DEVICE_NOT_CONNECTED; + } + + return X_ERROR_SUCCESS; +} + +X_RESULT WinKeyInputDriver::GetKeystroke( + uint32_t 
user_index, uint32_t flags, X_INPUT_KEYSTROKE& out_keystroke) { + if (user_index != 0) { + return X_ERROR_DEVICE_NOT_CONNECTED; + } + + X_RESULT result = X_ERROR_EMPTY; + + uint16_t virtual_key = 0; + uint16_t unicode = 0; + uint16_t keystroke_flags = 0; + uint8_t hid_code = 0; + + out_keystroke.virtual_key = virtual_key; + out_keystroke.unicode = unicode; + out_keystroke.flags = keystroke_flags; + out_keystroke.user_index = 0; + out_keystroke.hid_code = hid_code; + + // X_ERROR_EMPTY if no new keys + // X_ERROR_DEVICE_NOT_CONNECTED if no device + // X_ERROR_SUCCESS if key + return result; +} diff --git a/src/xenia/hid/winkey/winkey_input_driver.h b/src/xenia/hid/winkey/winkey_input_driver.h new file mode 100644 index 000000000..b1d00fd10 --- /dev/null +++ b/src/xenia/hid/winkey/winkey_input_driver.h @@ -0,0 +1,50 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_HID_WINKEY_WINKEY_DRIVER_H_ +#define XENIA_HID_WINKEY_WINKEY_DRIVER_H_ + +#include + +#include +#include + + +namespace xe { +namespace hid { +namespace winkey { + + +class WinKeyInputDriver : public InputDriver { +public: + WinKeyInputDriver(InputSystem* input_system); + virtual ~WinKeyInputDriver(); + + virtual X_STATUS Setup(); + + virtual X_RESULT GetCapabilities( + uint32_t user_index, uint32_t flags, X_INPUT_CAPABILITIES& out_caps); + virtual X_RESULT GetState( + uint32_t user_index, X_INPUT_STATE& out_state); + virtual X_RESULT SetState( + uint32_t user_index, X_INPUT_VIBRATION& vibration); + virtual X_RESULT GetKeystroke( + uint32_t user_index, uint32_t flags, X_INPUT_KEYSTROKE& out_keystroke); + +protected: + uint32_t packet_number_; +}; + + +} // namespace winkey +} // namespace hid +} // namespace xe + + +#endif // XENIA_HID_WINKEY_WINKEY_DRIVER_H_ diff --git a/src/xenia/kernel/fs/devices/disc_image_file.cc b/src/xenia/kernel/fs/devices/disc_image_file.cc index d98919c62..c094dbef7 100644 --- a/src/xenia/kernel/fs/devices/disc_image_file.cc +++ b/src/xenia/kernel/fs/devices/disc_image_file.cc @@ -63,6 +63,9 @@ X_STATUS DiscImageFile::ReadSync( size_t* out_bytes_read) { GDFXEntry* gdfx_entry = entry_->gdfx_entry(); xe_mmap_ref mmap = entry_->mmap(); + if (byte_offset >= gdfx_entry->size) { + return X_STATUS_END_OF_FILE; + } size_t real_offset = gdfx_entry->offset + byte_offset; size_t real_length = MIN(buffer_length, gdfx_entry->size - byte_offset); xe_copy_memory( diff --git a/src/xenia/kernel/fs/devices/host_path_file.cc b/src/xenia/kernel/fs/devices/host_path_file.cc index cf75e69e9..b36f9f890 100644 --- a/src/xenia/kernel/fs/devices/host_path_file.cc +++ b/src/xenia/kernel/fs/devices/host_path_file.cc @@ -71,6 +71,6 @@ X_STATUS HostPathFile::ReadSync( *out_bytes_read = bytes_read; return X_STATUS_SUCCESS; } else { - return X_STATUS_UNSUCCESSFUL; + return X_STATUS_END_OF_FILE; } } diff --git a/src/xenia/kernel/fs/devices/stfs_container_file.cc b/src/xenia/kernel/fs/devices/stfs_container_file.cc index 05b1a21a8..4f9f25a53 100644 --- a/src/xenia/kernel/fs/devices/stfs_container_file.cc +++ b/src/xenia/kernel/fs/devices/stfs_container_file.cc @@ -64,6 +64,9 @@ X_STATUS STFSContainerFile::ReadSync( STFSEntry* stfs_entry = 
entry_->stfs_entry(); xe_mmap_ref mmap = entry_->mmap(); uint8_t* map_ptr = xe_mmap_get_addr(mmap); + if (byte_offset >= stfs_entry->size) { + return X_STATUS_END_OF_FILE; + } // Each block is 4096. // Blocks may not be sequential, so we need to read by blocks and handle the diff --git a/src/xenia/kernel/fs/filesystem.cc b/src/xenia/kernel/fs/filesystem.cc index 6efa53de6..e83d409c8 100644 --- a/src/xenia/kernel/fs/filesystem.cc +++ b/src/xenia/kernel/fs/filesystem.cc @@ -70,7 +70,7 @@ int FileSystem::CreateSymbolicLink(const char* path, const char* target) { } int FileSystem::DeleteSymbolicLink(const char* path) { - std::tr1::unordered_map::iterator it = + std::unordered_map::iterator it = symlinks_.find(std::string(path)); if (it != symlinks_.end()) { symlinks_.erase(it); @@ -93,7 +93,7 @@ Entry* FileSystem::ResolvePath(const char* path) { // drive path -> device mappings with nothing nested. char full_path[XE_MAX_PATH]; XEIGNORE(xestrcpya(full_path, XECOUNT(full_path), path)); - for (std::tr1::unordered_map::iterator it = + for (std::unordered_map::iterator it = symlinks_.begin(); it != symlinks_.end(); ++it) { if (xestrcasestra(path, it->first.c_str()) == path) { // Found symlink, fixup. diff --git a/src/xenia/kernel/fs/filesystem.h b/src/xenia/kernel/fs/filesystem.h index acc6dbda6..94b9d787f 100644 --- a/src/xenia/kernel/fs/filesystem.h +++ b/src/xenia/kernel/fs/filesystem.h @@ -43,7 +43,7 @@ public: private: std::vector devices_; - std::tr1::unordered_map symlinks_; + std::unordered_map symlinks_; }; diff --git a/src/xenia/kernel/kernel_state.cc b/src/xenia/kernel/kernel_state.cc index 5cd6110b7..bc10751d7 100644 --- a/src/xenia/kernel/kernel_state.cc +++ b/src/xenia/kernel/kernel_state.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -135,3 +136,30 @@ XThread* KernelState::GetThreadByID(uint32_t thread_id) { xe_mutex_unlock(object_mutex_); return thread; } + +void KernelState::RegisterNotifyListener(XNotifyListener* listener) { + xe_mutex_lock(object_mutex_); + notify_listeners_.push_back(listener); + xe_mutex_unlock(object_mutex_); +} + +void KernelState::UnregisterNotifyListener(XNotifyListener* listener) { + xe_mutex_lock(object_mutex_); + for (auto it = notify_listeners_.begin(); it != notify_listeners_.end(); + ++it) { + if (*it == listener) { + notify_listeners_.erase(it); + break; + } + } + xe_mutex_unlock(object_mutex_); +} + +void KernelState::BroadcastNotification(XNotificationID id, uint32_t data) { + xe_mutex_lock(object_mutex_); + for (auto it = notify_listeners_.begin(); it != notify_listeners_.end(); + ++it) { + (*it)->EnqueueNotification(id, data); + } + xe_mutex_unlock(object_mutex_); +} diff --git a/src/xenia/kernel/kernel_state.h b/src/xenia/kernel/kernel_state.h index 944169918..f2fb3f47e 100644 --- a/src/xenia/kernel/kernel_state.h +++ b/src/xenia/kernel/kernel_state.h @@ -23,6 +23,7 @@ XEDECLARECLASS1(xe, Emulator); XEDECLARECLASS2(xe, cpu, Processor); XEDECLARECLASS2(xe, kernel, Dispatcher); XEDECLARECLASS2(xe, kernel, XModule); +XEDECLARECLASS2(xe, kernel, XNotifyListener); XEDECLARECLASS2(xe, kernel, XThread); XEDECLARECLASS2(xe, kernel, XUserModule); XEDECLARECLASS3(xe, kernel, fs, FileSystem); @@ -56,6 +57,10 @@ public: void UnregisterThread(XThread* thread); XThread* GetThreadByID(uint32_t thread_id); + void RegisterNotifyListener(XNotifyListener* listener); + void UnregisterNotifyListener(XNotifyListener* listener); + void BroadcastNotification(XNotificationID id, uint32_t data); + private: Emulator* emulator_; Memory* 
memory_; @@ -67,6 +72,7 @@ private: ObjectTable* object_table_; xe_mutex_t* object_mutex_; std::unordered_map threads_by_id_; + std::vector notify_listeners_; XUserModule* executable_module_; diff --git a/src/xenia/kernel/native_list.cc b/src/xenia/kernel/native_list.cc index fbf792124..bc922d31d 100644 --- a/src/xenia/kernel/native_list.cc +++ b/src/xenia/kernel/native_list.cc @@ -66,3 +66,7 @@ uint32_t NativeList::Shift() { Remove(ptr); return ptr; } + +bool NativeList::HasPending() { + return head_ != kInvalidPointer; +} diff --git a/src/xenia/kernel/native_list.h b/src/xenia/kernel/native_list.h index e242d822f..d521ea937 100644 --- a/src/xenia/kernel/native_list.h +++ b/src/xenia/kernel/native_list.h @@ -38,6 +38,7 @@ public: bool IsQueued(uint32_t list_entry_ptr); void Remove(uint32_t list_entry_ptr); uint32_t Shift(); + bool HasPending(); private: const uint32_t kInvalidPointer = 0xE0FE0FFF; diff --git a/src/xenia/kernel/objects/xnotify_listener.cc b/src/xenia/kernel/objects/xnotify_listener.cc index b9d45dafb..7e9ffb704 100644 --- a/src/xenia/kernel/objects/xnotify_listener.cc +++ b/src/xenia/kernel/objects/xnotify_listener.cc @@ -20,6 +20,7 @@ XNotifyListener::XNotifyListener(KernelState* kernel_state) : } XNotifyListener::~XNotifyListener() { + kernel_state_->UnregisterNotifyListener(this); xe_mutex_free(lock_); if (wait_handle_) { CloseHandle(wait_handle_); @@ -32,9 +33,16 @@ void XNotifyListener::Initialize(uint64_t mask) { lock_ = xe_mutex_alloc(); wait_handle_ = CreateEvent(NULL, TRUE, FALSE, NULL); mask_ = mask; + + kernel_state_->RegisterNotifyListener(this); } void XNotifyListener::EnqueueNotification(XNotificationID id, uint32_t data) { + // Ignore if the notification doesn't match our mask. + if ((mask_ & uint64_t(1 << ((id >> 25) + 1))) == 0) { + return; + } + xe_mutex_lock(lock_); auto existing = notifications_.find(id); if (existing != notifications_.end()) { diff --git a/src/xenia/kernel/objects/xnotify_listener.h b/src/xenia/kernel/objects/xnotify_listener.h index a7aa16eee..436c4434a 100644 --- a/src/xenia/kernel/objects/xnotify_listener.h +++ b/src/xenia/kernel/objects/xnotify_listener.h @@ -21,9 +21,6 @@ namespace xe { namespace kernel { -// Values seem to be all over the place - GUIDs? -typedef uint32_t XNotificationID; - class XNotifyListener : public XObject { public: diff --git a/src/xenia/kernel/objects/xthread.cc b/src/xenia/kernel/objects/xthread.cc index 33f5aa378..d8e4bac85 100644 --- a/src/xenia/kernel/objects/xthread.cc +++ b/src/xenia/kernel/objects/xthread.cc @@ -78,6 +78,9 @@ XThread::~XThread() { if (thread_state_) { delete thread_state_; } + if (scratch_address_) { + kernel_state()->memory()->HeapFree(scratch_address_, 0); + } if (tls_address_) { kernel_state()->memory()->HeapFree(tls_address_, 0); } @@ -194,6 +197,12 @@ X_STATUS XThread::Create() { XUserModule* module = kernel_state()->GetExecutableModule(); + // Allocate thread scratch. + // This is used by interrupts/APCs/etc so we can round-trip pointers through. + scratch_size_ = 4 * 16; + scratch_address_ = (uint32_t)memory()->HeapAlloc( + 0, scratch_size_, MEMORY_FLAG_ZERO); + // Allocate TLS block. 
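+  // TLS block size comes straight from the XEX header read below:
+  // slot_count * data_size bytes reserved for this guest thread.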
const xe_xex2_header_t* header = module->xex_header(); uint32_t tls_size = header->tls_info.slot_count * header->tls_info.data_size; @@ -231,6 +240,10 @@ X_STATUS XThread::Create() { return return_code; } + char thread_name[32]; + xesnprintfa(thread_name, XECOUNT(thread_name), "XThread%04X", handle()); + set_name(thread_name); + module->Release(); return X_STATUS_SUCCESS; } @@ -240,6 +253,7 @@ X_STATUS XThread::Exit(int exit_code) { // TODO(benvanik); dispatch events? waiters? etc? event_->Set(0, false); + RundownAPCs(); // NOTE: unless PlatformExit fails, expect it to never return! X_STATUS return_code = PlatformExit(exit_code); @@ -253,10 +267,12 @@ X_STATUS XThread::Exit(int exit_code) { static uint32_t __stdcall XThreadStartCallbackWin32(void* param) { XThread* thread = reinterpret_cast(param); + xe::Profiler::ThreadEnter(thread->name()); xeKeTlsSetValue(current_thread_tls, (uint64_t)thread); thread->Execute(); xeKeTlsSetValue(current_thread_tls, NULL); thread->Release(); + xe::Profiler::ThreadExit(); return 0; } @@ -293,10 +309,12 @@ X_STATUS XThread::PlatformExit(int exit_code) { static void* XThreadStartCallbackPthreads(void* param) { XThread* thread = reinterpret_cast(param); + xe::Profiler::ThreadEnter(thread->name()); xeKeTlsSetValue(current_thread_tls, (uint64_t)thread); thread->Execute(); xeKeTlsSetValue(current_thread_tls, NULL); thread->Release(); + xe::Profiler::ThreadExit(); return 0; } @@ -357,15 +375,21 @@ void XThread::Execute() { // If a XapiThreadStartup value is present, we use that as a trampoline. // Otherwise, we are a raw thread. if (creation_params_.xapi_thread_startup) { + uint64_t args[] = { + creation_params_.start_address, + creation_params_.start_context + }; kernel_state()->processor()->Execute( thread_state_, - creation_params_.xapi_thread_startup, - creation_params_.start_address, creation_params_.start_context); + creation_params_.xapi_thread_startup, args, XECOUNT(args)); } else { // Run user code. + uint64_t args[] = { + creation_params_.start_context + }; int exit_code = (int)kernel_state()->processor()->Execute( thread_state_, - creation_params_.start_address, creation_params_.start_context); + creation_params_.start_address, args, XECOUNT(args)); // If we got here it means the execute completed without an exit being called. // Treat the return code as an implicit exit code. Exit(exit_code); @@ -394,7 +418,99 @@ void XThread::LockApc() { } void XThread::UnlockApc() { + bool needs_apc = apc_list_->HasPending(); xe_mutex_unlock(apc_lock_); + if (needs_apc) { + QueueUserAPC(reinterpret_cast(DeliverAPCs), + thread_handle_, + reinterpret_cast(this)); + } +} + +void XThread::DeliverAPCs(void* data) { + // http://www.drdobbs.com/inside-nts-asynchronous-procedure-call/184416590?pgno=1 + // http://www.drdobbs.com/inside-nts-asynchronous-procedure-call/184416590?pgno=7 + XThread* thread = reinterpret_cast(data); + auto membase = thread->memory()->membase(); + auto processor = thread->kernel_state()->processor(); + auto apc_list = thread->apc_list(); + thread->LockApc(); + while (apc_list->HasPending()) { + // Get APC entry (offset for LIST_ENTRY offset) and cache what we need. + // Calling the routine may delete the memory/overwrite it. 
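+    // Guest KAPC layout assumed below: LIST_ENTRY at +8 (hence the -8
+    // rebase), kernel_routine +16, normal_routine +24, normal_context +28,
+    // system args +32/+36, and the inserted flag at +40 (cleared so the
+    // routine may re-queue the APC).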
+ uint32_t apc_address = apc_list->Shift() - 8; + uint8_t* apc_ptr = membase + apc_address; + uint32_t kernel_routine = XEGETUINT32BE(apc_ptr + 16); + uint32_t normal_routine = XEGETUINT32BE(apc_ptr + 24); + uint32_t normal_context = XEGETUINT32BE(apc_ptr + 28); + uint32_t system_arg1 = XEGETUINT32BE(apc_ptr + 32); + uint32_t system_arg2 = XEGETUINT32BE(apc_ptr + 36); + + // Mark as uninserted so that it can be reinserted again by the routine. + uint32_t old_flags = XEGETUINT32BE(apc_ptr + 40); + XESETUINT32BE(apc_ptr + 40, old_flags & ~0xFF00); + + // Call kernel routine. + // The routine can modify all of its arguments before passing it on. + // Since we need to give guest accessible pointers over, we copy things + // into and out of scratch. + uint8_t* scratch_ptr = membase + thread->scratch_address_; + XESETUINT32BE(scratch_ptr + 0, normal_routine); + XESETUINT32BE(scratch_ptr + 4, normal_context); + XESETUINT32BE(scratch_ptr + 8, system_arg1); + XESETUINT32BE(scratch_ptr + 12, system_arg2); + // kernel_routine(apc_address, &normal_routine, &normal_context, &system_arg1, &system_arg2) + uint64_t kernel_args[] = { + apc_address, + thread->scratch_address_ + 0, + thread->scratch_address_ + 4, + thread->scratch_address_ + 8, + thread->scratch_address_ + 12, + }; + processor->ExecuteInterrupt( + 0, kernel_routine, kernel_args, XECOUNT(kernel_args)); + normal_routine = XEGETUINT32BE(scratch_ptr + 0); + normal_context = XEGETUINT32BE(scratch_ptr + 4); + system_arg1 = XEGETUINT32BE(scratch_ptr + 8); + system_arg2 = XEGETUINT32BE(scratch_ptr + 12); + + // Call the normal routine. Note that it may have been killed by the kernel + // routine. + if (normal_routine) { + thread->UnlockApc(); + // normal_routine(normal_context, system_arg1, system_arg2) + uint64_t normal_args[] = { normal_context, system_arg1, system_arg2 }; + processor->ExecuteInterrupt( + 0, normal_routine, normal_args, XECOUNT(normal_args)); + thread->LockApc(); + } + } + thread->UnlockApc(); +} + +void XThread::RundownAPCs() { + auto membase = memory()->membase(); + LockApc(); + while (apc_list_->HasPending()) { + // Get APC entry (offset for LIST_ENTRY offset) and cache what we need. + // Calling the routine may delete the memory/overwrite it. + uint32_t apc_address = apc_list_->Shift() - 8; + uint8_t* apc_ptr = membase + apc_address; + uint32_t rundown_routine = XEGETUINT32BE(apc_ptr + 20); + + // Mark as uninserted so that it can be reinserted again by the routine. + uint32_t old_flags = XEGETUINT32BE(apc_ptr + 40); + XESETUINT32BE(apc_ptr + 40, old_flags & ~0xFF00); + + // Call the rundown routine. 
+ if (rundown_routine) { + // rundown_routine(apc) + uint64_t args[] = { apc_address }; + kernel_state()->processor()->ExecuteInterrupt( + 0, rundown_routine, args, XECOUNT(args)); + } + } + UnlockApc(); } int32_t XThread::QueryPriority() { diff --git a/src/xenia/kernel/objects/xthread.h b/src/xenia/kernel/objects/xthread.h index a11bad9ea..8b403429b 100644 --- a/src/xenia/kernel/objects/xthread.h +++ b/src/xenia/kernel/objects/xthread.h @@ -73,6 +73,9 @@ private: void PlatformDestroy(); X_STATUS PlatformExit(int exit_code); + static void DeliverAPCs(void* data); + void RundownAPCs(); + struct { uint32_t stack_size; uint32_t xapi_thread_startup; @@ -83,6 +86,8 @@ private: uint32_t thread_id_; void* thread_handle_; + uint32_t scratch_address_; + uint32_t scratch_size_; uint32_t tls_address_; uint32_t thread_state_address_; cpu::XenonThreadState* thread_state_; diff --git a/src/xenia/kernel/objects/xuser_module.cc b/src/xenia/kernel/objects/xuser_module.cc index 86d25847e..3a07b209a 100644 --- a/src/xenia/kernel/objects/xuser_module.cc +++ b/src/xenia/kernel/objects/xuser_module.cc @@ -345,8 +345,6 @@ void XUserModule::Dump() { } } - - xe_free(import_infos); } printf("\n"); diff --git a/src/xenia/kernel/util/xex2.cc b/src/xenia/kernel/util/xex2.cc index 5532200d8..1aca590e3 100644 --- a/src/xenia/kernel/util/xex2.cc +++ b/src/xenia/kernel/util/xex2.cc @@ -11,6 +11,7 @@ #include +#include #include #include #include @@ -20,15 +21,22 @@ using namespace alloy; +DEFINE_bool(xex_dev_key, false, "Use the devkit key."); + typedef struct xe_xex2 { xe_ref_t ref; - Memory* memory; + Memory* memory; - xe_xex2_header_t header; + xe_xex2_header_t header; std::vector* sections; + + struct { + size_t count; + xe_xex2_import_info_t* infos; + } library_imports[16]; } xe_xex2_t; @@ -39,6 +47,8 @@ int xe_xex2_read_image(xe_xex2_ref xex, const uint8_t *xex_addr, const size_t xex_length, Memory* memory); int xe_xex2_load_pe(xe_xex2_ref xex); +int xe_xex2_find_import_infos(xe_xex2_ref xex, + const xe_xex2_import_library_t* library); xe_xex2_ref xe_xex2_load(Memory* memory, @@ -58,6 +68,11 @@ xe_xex2_ref xe_xex2_load(Memory* memory, XEEXPECTZERO(xe_xex2_load_pe(xex)); + for (size_t n = 0; n < xex->header.import_library_count; n++) { + auto library = &xex->header.import_libraries[n]; + XEEXPECTZERO(xe_xex2_find_import_infos(xex, library)); + } + return xex; XECLEANUP: @@ -422,7 +437,7 @@ int xe_xex2_decrypt_key(xe_xex2_header_t *header) { // Guess key based on file info. // TODO: better way to finding out which key to use? const uint8_t *xexkey; - if (header->execution_info.title_id) { + if (header->execution_info.title_id && !FLAGS_xex_dev_key) { xexkey = xe_xex2_retail_key; } else { xexkey = xe_xex2_devkit_key; @@ -894,12 +909,10 @@ const PESection* xe_xex2_get_pe_section(xe_xex2_ref xex, const char* name) { return NULL; } -int xe_xex2_get_import_infos(xe_xex2_ref xex, - const xe_xex2_import_library_t *library, - xe_xex2_import_info_t **out_import_infos, - size_t *out_import_info_count) { - uint8_t *mem = xex->memory->membase(); - const xe_xex2_header_t *header = xe_xex2_get_header(xex); +int xe_xex2_find_import_infos(xe_xex2_ref xex, + const xe_xex2_import_library_t *library) { + uint8_t* mem = xex->memory->membase(); + auto header = xe_xex2_get_header(xex); // Find library index for verification. 
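With imports resolved once at load time, the public getter rebuilt below becomes a cheap table lookup instead of a re-scan. A hypothetical caller, sketched with only the fields and functions visible in this change (error handling elided):

    // Hypothetical sketch: walking the cached import infos of a loaded xex.
    const xe_xex2_header_t* header = xe_xex2_get_header(xex);
    for (size_t n = 0; n < header->import_library_count; n++) {
      const xe_xex2_import_library_t* library = &header->import_libraries[n];
      xe_xex2_import_info_t* infos = NULL;
      size_t info_count = 0;
      if (!xe_xex2_get_import_infos(xex, library, &infos, &info_count)) {
        XELOGD("import library %d: %d entries", (int)n, (int)info_count);
      }
    }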
size_t library_index = -1; @@ -970,13 +983,34 @@ int xe_xex2_get_import_infos(xe_xex2_ref xex, } } - *out_import_info_count = info_count; - *out_import_infos = infos; + xex->library_imports[library_index].count = info_count; + xex->library_imports[library_index].infos = infos; return 0; XECLEANUP: xe_free(infos); - *out_import_info_count = 0; - *out_import_infos = NULL; return 1; } + +int xe_xex2_get_import_infos(xe_xex2_ref xex, + const xe_xex2_import_library_t *library, + xe_xex2_import_info_t **out_import_infos, + size_t *out_import_info_count) { + auto header = xe_xex2_get_header(xex); + + // Find library index for verification. + size_t library_index = -1; + for (size_t n = 0; n < header->import_library_count; n++) { + if (&header->import_libraries[n] == library) { + library_index = n; + break; + } + } + if (library_index == (size_t)-1) { + return 1; + } + + *out_import_info_count = xex->library_imports[library_index].count; + *out_import_infos = xex->library_imports[library_index].infos; + return 0; +} diff --git a/src/xenia/kernel/xam_content.cc b/src/xenia/kernel/xam_content.cc index 97469d0ca..5f53bc4ea 100644 --- a/src/xenia/kernel/xam_content.cc +++ b/src/xenia/kernel/xam_content.cc @@ -44,6 +44,7 @@ SHIM_CALL XamContentGetLicenseMask_shim( } +// http://gameservice.googlecode.com/svn-history/r14/trunk/ContentManager.cpp SHIM_CALL XamContentCreateEnumerator_shim( PPCContext* ppc_state, KernelState* state) { uint32_t arg0 = SHIM_GET_ARG_32(0); @@ -52,12 +53,15 @@ SHIM_CALL XamContentCreateEnumerator_shim( uint32_t arg3 = SHIM_GET_ARG_32(3); uint32_t arg4 = SHIM_GET_ARG_32(4); uint32_t arg5 = SHIM_GET_ARG_32(5); - uint32_t arg6 = SHIM_GET_ARG_32(6); + uint32_t handle_ptr = SHIM_GET_ARG_32(6); XELOGD( "XamContentCreateEnumerator(%.8X, %.8X, %.8X, %.8X, %.8X, %.8X, %.8X)", - arg0, arg1, arg2, arg3, arg4, arg5, arg6); - SHIM_SET_RETURN_32(X_ERROR_DEVICE_NOT_CONNECTED); + arg0, arg1, arg2, arg3, arg4, arg5, handle_ptr); + + SHIM_SET_MEM_32(handle_ptr, X_INVALID_HANDLE_VALUE); + + SHIM_SET_RETURN_32(X_ERROR_NO_MORE_FILES); } diff --git a/src/xenia/kernel/xam_net.cc b/src/xenia/kernel/xam_net.cc index 4ca580cce..18094a2ed 100644 --- a/src/xenia/kernel/xam_net.cc +++ b/src/xenia/kernel/xam_net.cc @@ -52,12 +52,15 @@ SHIM_CALL NetDll_WSAStartup_shim( if (data_ptr) { SHIM_SET_MEM_16(data_ptr + 0x000, version); - SHIM_SET_MEM_16(data_ptr + 0x002, 0); + SHIM_SET_MEM_16(data_ptr + 0x002, version); SHIM_SET_MEM_32(data_ptr + 0x004, 0); SHIM_SET_MEM_32(data_ptr + 0x105, 0); SHIM_SET_MEM_16(data_ptr + 0x186, 0); SHIM_SET_MEM_16(data_ptr + 0x188, 0); - SHIM_SET_MEM_32(data_ptr + 0x190, 0); + // Some games (PoG) want this value round-tripped - they'll compare if it + // changes and bugcheck if it does. 
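The socket shims later in this file all share one shape: decode the guest arguments (most NetDll exports carry an extra leading caller argument, visible in setsockopt and send below), log them, and fail with a guest-visible error code rather than a host winsock constant. A hypothetical new stub would follow the same template (NetDll_shutdown is illustrative only, not part of this change):

    // Hypothetical stub for an unimplemented NetDll export.
    SHIM_CALL NetDll_shutdown_shim(
        PPCContext* ppc_state, KernelState* state) {
      uint32_t arg0 = SHIM_GET_ARG_32(0);  // leading caller argument
      uint32_t socket_handle = SHIM_GET_ARG_32(1);
      uint32_t how = SHIM_GET_ARG_32(2);
      XELOGD("NetDll_shutdown(%d, %.8X, %d)", arg0, socket_handle, how);
      SHIM_SET_RETURN_32(X_SOCKET_ERROR);
    }
    // Plus a SHIM_SET_MAPPING("xam.xex", NetDll_shutdown, state); line in
    // RegisterNetExports.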
+ uint32_t vendor_ptr = SHIM_MEM_32(data_ptr + 0x190); + SHIM_SET_MEM_32(data_ptr + 0x190, vendor_ptr); } SHIM_SET_RETURN_64(0); @@ -66,7 +69,18 @@ SHIM_CALL NetDll_WSAStartup_shim( SHIM_CALL NetDll_WSAGetLastError_shim( PPCContext* ppc_state, KernelState* state) { XELOGD("NetDll_WSAGetLastError()"); - SHIM_SET_RETURN_32(WSAENETDOWN); + SHIM_SET_RETURN_32(10093L); // WSANOTINITIALISED +} + +SHIM_CALL NetDll_XNetGetTitleXnAddr_shim( + PPCContext* ppc_state, KernelState* state) { + uint32_t arg0 = SHIM_GET_ARG_32(0); + uint32_t arg1 = SHIM_GET_ARG_32(1); + XELOGD( + "NetDll_XNetGetTitleXnAddr(%d, %.8X)", + arg0, + arg1); + SHIM_SET_RETURN_32(0x00000001); } SHIM_CALL NetDll_XNetGetEthernetLinkStatus_shim( @@ -86,7 +100,7 @@ SHIM_CALL NetDll_inet_addr_shim( XELOGD( "NetDll_inet_addr(%.8X)", cp_ptr); - SHIM_SET_RETURN_32(INADDR_NONE); + SHIM_SET_RETURN_32(0xFFFFFFFF); // X_INADDR_NONE } SHIM_CALL NetDll_socket_shim( @@ -101,7 +115,7 @@ SHIM_CALL NetDll_socket_shim( af, type, protocol); - SHIM_SET_RETURN_32(SOCKET_ERROR); + SHIM_SET_RETURN_32(X_SOCKET_ERROR); } SHIM_CALL NetDll_setsockopt_shim( @@ -113,14 +127,14 @@ SHIM_CALL NetDll_setsockopt_shim( uint32_t optval_ptr = SHIM_GET_ARG_32(4); uint32_t optlen = SHIM_GET_ARG_32(5); XELOGD( - "NetDll_send(%d, %.8X, %d, %d, %.8X, %d)", + "NetDll_setsockopt(%d, %.8X, %d, %d, %.8X, %d)", arg0, socket_ptr, level, optname, optval_ptr, optlen); - SHIM_SET_RETURN_32(SOCKET_ERROR); + SHIM_SET_RETURN_32(X_SOCKET_ERROR); } SHIM_CALL NetDll_connect_shim( @@ -133,7 +147,7 @@ SHIM_CALL NetDll_connect_shim( socket_ptr, sockaddr_ptr, namelen); - SHIM_SET_RETURN_32(SOCKET_ERROR); + SHIM_SET_RETURN_32(X_SOCKET_ERROR); } SHIM_CALL NetDll_recv_shim( @@ -150,7 +164,7 @@ SHIM_CALL NetDll_recv_shim( buf_ptr, len, flags); - SHIM_SET_RETURN_32(SOCKET_ERROR); + SHIM_SET_RETURN_32(X_SOCKET_ERROR); } SHIM_CALL NetDll_recvfrom_shim( @@ -171,22 +185,24 @@ SHIM_CALL NetDll_recvfrom_shim( flags, from_ptr, fromlen_ptr); - SHIM_SET_RETURN_32(SOCKET_ERROR); + SHIM_SET_RETURN_32(X_SOCKET_ERROR); } SHIM_CALL NetDll_send_shim( PPCContext* ppc_state, KernelState* state) { - uint32_t socket_ptr = SHIM_GET_ARG_32(0); - uint32_t buf_ptr = SHIM_GET_ARG_32(1); - uint32_t len = SHIM_GET_ARG_32(2); - uint32_t flags = SHIM_GET_ARG_32(3); + uint32_t arg0 = SHIM_GET_ARG_32(0); + uint32_t socket_ptr = SHIM_GET_ARG_32(1); + uint32_t buf_ptr = SHIM_GET_ARG_32(2); + uint32_t len = SHIM_GET_ARG_32(3); + uint32_t flags = SHIM_GET_ARG_32(4); XELOGD( - "NetDll_send(%.8X, %.8X, %d, %d)", + "NetDll_send(%d,%.8X, %.8X, %d, %d)", + arg0, socket_ptr, buf_ptr, len, flags); - SHIM_SET_RETURN_32(SOCKET_ERROR); + SHIM_SET_RETURN_32(X_SOCKET_ERROR); } @@ -199,6 +215,7 @@ void xe::kernel::xam::RegisterNetExports( SHIM_SET_MAPPING("xam.xex", NetDll_XNetStartup, state); SHIM_SET_MAPPING("xam.xex", NetDll_WSAStartup, state); SHIM_SET_MAPPING("xam.xex", NetDll_WSAGetLastError, state); + SHIM_SET_MAPPING("xam.xex", NetDll_XNetGetTitleXnAddr, state); SHIM_SET_MAPPING("xam.xex", NetDll_XNetGetEthernetLinkStatus, state); SHIM_SET_MAPPING("xam.xex", NetDll_inet_addr, state); SHIM_SET_MAPPING("xam.xex", NetDll_socket, state); diff --git a/src/xenia/kernel/xam_user.cc b/src/xenia/kernel/xam_user.cc index ebe48c2cb..005475812 100644 --- a/src/xenia/kernel/xam_user.cc +++ b/src/xenia/kernel/xam_user.cc @@ -174,6 +174,23 @@ SHIM_CALL XamUserReadProfileSettings_shim( } +SHIM_CALL XamShowSigninUI_shim( + PPCContext* ppc_state, KernelState* state) { + uint32_t unk_0 = SHIM_GET_ARG_32(0); + uint32_t unk_mask = SHIM_GET_ARG_32(1); + + 
XELOGD( + "XamShowSigninUI(%d, %.8X)", + unk_0, unk_mask); + + // Mask values vary. Probably matching user types? Local/remote? + // Games seem to sit and loop until we trigger this notification. + state->BroadcastNotification(0x00000009, 0); + + SHIM_SET_RETURN_32(X_ERROR_SUCCESS); +} + + } // namespace kernel } // namespace xe @@ -185,4 +202,5 @@ void xe::kernel::xam::RegisterUserExports( SHIM_SET_MAPPING("xam.xex", XamUserGetSigninInfo, state); SHIM_SET_MAPPING("xam.xex", XamUserGetName, state); SHIM_SET_MAPPING("xam.xex", XamUserReadProfileSettings, state); + SHIM_SET_MAPPING("xam.xex", XamShowSigninUI, state); } diff --git a/src/xenia/kernel/xboxkrnl_audio.cc b/src/xenia/kernel/xboxkrnl_audio.cc index c1ba83ed5..e048dad1a 100644 --- a/src/xenia/kernel/xboxkrnl_audio.cc +++ b/src/xenia/kernel/xboxkrnl_audio.cc @@ -75,7 +75,9 @@ SHIM_CALL XAudioGetVoiceCategoryVolumeChangeMask_shim( "XAudioGetVoiceCategoryVolumeChangeMask(%.8X, %.8X)", driver_ptr, out_ptr); - XEASSERT(driver_ptr == 0xAADD1100); + XEASSERT((driver_ptr & 0xFFFF0000) == 0x41550000); + + auto audio_system = state->emulator()->audio_system(); // Checking these bits to see if any voice volume changed. // I think. diff --git a/src/xenia/kernel/xboxkrnl_debug.cc b/src/xenia/kernel/xboxkrnl_debug.cc index e6cc023c3..c0a9e7771 100644 --- a/src/xenia/kernel/xboxkrnl_debug.cc +++ b/src/xenia/kernel/xboxkrnl_debug.cc @@ -288,9 +288,13 @@ SHIM_CALL RtlRaiseException_shim( } if (thread) { + XELOGD("SetThreadName(%d, %s)", thread->thread_id(), name); thread->set_name(name); thread->Release(); } + + // TODO(benvanik): unwinding required here? + return; } // TODO(benvanik): unwinding. diff --git a/src/xenia/kernel/xboxkrnl_io.cc b/src/xenia/kernel/xboxkrnl_io.cc index b93e3a104..cb3f4f2ca 100644 --- a/src/xenia/kernel/xboxkrnl_io.cc +++ b/src/xenia/kernel/xboxkrnl_io.cc @@ -392,6 +392,13 @@ SHIM_CALL NtQueryInformationFile_shim( if (XSUCCEEDED(result)) { result = X_STATUS_SUCCESS; switch (file_info_class) { + case XFileInternalInformation: + // Internal unique file pointer. Not sure why anyone would want this. + XEASSERT(length == 8); + info = 8; + // TODO(benvanik): use pointer to fs:: entry? 
+ SHIM_SET_MEM_64(file_info_ptr, hash_combine(0, file->absolute_path())); + break; case XFilePositionInformation: // struct FILE_POSITION_INFORMATION { // LARGE_INTEGER CurrentByteOffset; diff --git a/src/xenia/kernel/xboxkrnl_rtl.cc b/src/xenia/kernel/xboxkrnl_rtl.cc index 18310e6a7..f5d00f19a 100644 --- a/src/xenia/kernel/xboxkrnl_rtl.cc +++ b/src/xenia/kernel/xboxkrnl_rtl.cc @@ -633,12 +633,13 @@ spin: cs->recursion_count = 1; } - SHIM_CALL RtlEnterCriticalSection_shim( PPCContext* ppc_state, KernelState* state) { + SCOPE_profile_cpu_f("kernel"); + uint32_t cs_ptr = SHIM_GET_ARG_32(0); - XELOGD("RtlEnterCriticalSection(%.8X)", cs_ptr); + // XELOGD("RtlEnterCriticalSection(%.8X)", cs_ptr); const uint8_t* thread_state_block = ppc_state->membase + ppc_state->r[13]; uint32_t thread_id = XThread::GetCurrentThreadId(thread_state_block); @@ -716,7 +717,7 @@ SHIM_CALL RtlLeaveCriticalSection_shim( PPCContext* ppc_state, KernelState* state) { uint32_t cs_ptr = SHIM_GET_ARG_32(0); - XELOGD("RtlLeaveCriticalSection(%.8X)", cs_ptr); + // XELOGD("RtlLeaveCriticalSection(%.8X)", cs_ptr); xeRtlLeaveCriticalSection(cs_ptr); } diff --git a/src/xenia/kernel/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl_threading.cc index 533a05d2c..064f34d7b 100644 --- a/src/xenia/kernel/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl_threading.cc @@ -368,8 +368,8 @@ uint32_t xeKeGetCurrentProcessType() { SHIM_CALL KeGetCurrentProcessType_shim( PPCContext* ppc_state, KernelState* state) { - XELOGD( - "KeGetCurrentProcessType()"); + // XELOGD( + // "KeGetCurrentProcessType()"); int result = xeKeGetCurrentProcessType(); SHIM_SET_RETURN_64(result); @@ -550,9 +550,10 @@ SHIM_CALL KeTlsGetValue_shim( PPCContext* ppc_state, KernelState* state) { uint32_t tls_index = SHIM_GET_ARG_32(0); - XELOGD( - "KeTlsGetValue(%.8X)", - tls_index); + // Logging disabled, as some games spam this. + //XELOGD( + // "KeTlsGetValue(%.8X)", + // tls_index); uint64_t result = xeKeTlsGetValue(tls_index); SHIM_SET_RETURN_64(result); @@ -1128,7 +1129,7 @@ SHIM_CALL NtWaitForSingleObjectEx_shim( uint64_t timeout = timeout_ptr ? SHIM_MEM_64(timeout_ptr) : 0; result = object->Wait( 3, wait_mode, alertable, - timeout_ptr ? &timeout : NULL); + timeout_ptr ? &timeout : NULL); object->Release(); } diff --git a/src/xenia/kernel/xboxkrnl_video.cc b/src/xenia/kernel/xboxkrnl_video.cc index 951606bde..6519a067c 100644 --- a/src/xenia/kernel/xboxkrnl_video.cc +++ b/src/xenia/kernel/xboxkrnl_video.cc @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -422,19 +423,16 @@ SHIM_CALL VdSwap_shim( unk6, unk7); - KernelState* kernel_state = shared_kernel_state_; - XEASSERTNOTNULL(kernel_state); - GraphicsSystem* gs = kernel_state->emulator()->graphics_system(); - if (!gs) { - return; - } - - gs->set_swap_pending(true); - // The caller seems to reserve 64 words (256b) in the primary ringbuffer - // for this method to do what it needs. We just zero them out. We could - // encode the parameters in the stream for the ringbuffer, if needed. + // for this method to do what it needs. We just zero them out and send a + // token value. It'd be nice to figure out what this is really doing so + // that we could simulate it, though due to TCR I bet all games need to + // use this method. 
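The token written just below is a PM4 type-3 packet header: bits 31:30 carry the packet type (0x03), bits 29:16 the DWORD count minus one, and bits 15:8 the opcode, byte-swapped into the guest ringbuffer. The encoding, pulled out into a helper (hypothetical function name):

    // Hypothetical helper showing the PM4 type-3 header encoding used below:
    // type 3 in bits 31:30, (count - 1) in bits 29:16, opcode in bits 15:8.
    static inline uint32_t MakePacketType3(uint32_t opcode, uint32_t count) {
      return (0x03u << 30) | ((count - 1) << 16) | (opcode << 8);
    }
    // Equivalent to the line below:
    //   dwords[0] = XESWAP32(MakePacketType3(xenos::PM4_XE_SWAP, 1));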
xe_zero_struct(SHIM_MEM_ADDR(unk0), 64 * 4); + auto dwords = reinterpret_cast(SHIM_MEM_ADDR(unk0)); + dwords[0] = XESWAP32((0x03 << 30) | + ((1 - 1) << 16) | + (xenos::PM4_XE_SWAP << 8)); SHIM_SET_RETURN_64(0); } diff --git a/src/xenia/logging.cc b/src/xenia/logging.cc index 99f3963e6..3f9d840f9 100644 --- a/src/xenia/logging.cc +++ b/src/xenia/logging.cc @@ -10,6 +10,18 @@ #include #include +#include + +#include + + +DEFINE_bool(fast_stdout, false, + "Don't lock around stdout/stderr. May introduce weirdness."); + + +namespace { +xe_mutex_t* log_lock = xe_mutex_alloc(); +} // namespace void xe_format_log_line( @@ -46,6 +58,8 @@ void xe_format_log_line( void xe_log_line(const char* file_path, const uint32_t line_number, const char* function_name, const char level_char, const char* fmt, ...) { + SCOPE_profile_cpu_i("emu", "log_line"); + char buffer[2048]; va_list args; va_start(args, fmt); @@ -54,15 +68,18 @@ void xe_log_line(const char* file_path, const uint32_t line_number, fmt, args); va_end(args); - fprintf(stderr, buffer); - fflush(stderr); - + if (!FLAGS_fast_stdout) { + xe_mutex_lock(log_lock); + } #if 0// defined(OutputDebugString) OutputDebugStringA(buffer); #else XEIGNORE(fprintf(stdout, buffer)); fflush(stdout); #endif // OutputDebugString + if (!FLAGS_fast_stdout) { + xe_mutex_unlock(log_lock); + } } void xe_handle_fatal( @@ -76,12 +93,18 @@ void xe_handle_fatal( fmt, args); va_end(args); + if (!FLAGS_fast_stdout) { + xe_mutex_lock(log_lock); + } #if defined(OutputDebugString) OutputDebugStringA(buffer); -#endif // OutputDebugString - - fprintf(stderr, buffer); +#else + XEIGNORE(fprintf(stderr, buffer)); fflush(stderr); +#endif // OutputDebugString + if (!FLAGS_fast_stdout) { + xe_mutex_unlock(log_lock); + } #if XE_LIKE_WIN32 if (!xe_has_console()) { diff --git a/src/xenia/platform_includes.h b/src/xenia/platform_includes.h index e9d513316..6aaf86811 100644 --- a/src/xenia/platform_includes.h +++ b/src/xenia/platform_includes.h @@ -56,13 +56,8 @@ #include #include -#if XE_COMPILER_MSVC #include #include -#else -#include -#include -#endif // MSVC #endif // XENIA_PLATFORM_INCLUDES_H_ diff --git a/src/xenia/profiling.cc b/src/xenia/profiling.cc new file mode 100644 index 000000000..ce3a4ece0 --- /dev/null +++ b/src/xenia/profiling.cc @@ -0,0 +1,170 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#define MICRO_PROFILE_IMPL +#define MICROPROFILE_USE_THREAD_NAME_CALLBACK 1 +#include + +namespace xe { + +std::unique_ptr Profiler::display_ = nullptr; + +#if XE_OPTION_PROFILING + +void Profiler::Initialize() { + MicroProfileInit(); + MicroProfileSetDisplayMode(1); +} + +void Profiler::Dump() { + MicroProfileDumpTimers(); +} + +void Profiler::Shutdown() { + display_.reset(); + MicroProfileShutdown(); +} + +uint32_t Profiler::GetColor(const char* str) { + std::hash fn; + size_t value = fn(str); + return value & 0xFFFFFF; +} + +void Profiler::ThreadEnter(const char* name) { + MicroProfileOnThreadCreate(name); +} + +void Profiler::ThreadExit() { + MicroProfileOnThreadExit(); +} + +bool Profiler::OnKeyDown(int key_code) { + // http://msdn.microsoft.com/en-us/library/windows/desktop/dd375731(v=vs.85).aspx + switch (key_code) { + case VK_TAB: + MicroProfileToggleDisplayMode(); + return true; + case VK_OEM_3: // ` + MicroProfileTogglePause(); + return true; + case 0x31: // 1 + MicroProfileModKey(1); + return true; + } + return false; +} + +bool Profiler::OnKeyUp(int key_code) { + switch (key_code) { + case 0x31: // 1 + MicroProfileModKey(0); + return true; + } + return false; +} + +void Profiler::OnMouseDown(bool left_button, bool right_button) { + MicroProfileMouseButton(left_button, right_button); +} + +void Profiler::OnMouseUp() { + MicroProfileMouseButton(0, 0); +} + +void Profiler::OnMouseMove(int x, int y) { + MicroProfileMousePosition(x, y, 0); +} + +void Profiler::OnMouseWheel(int x, int y, int dy) { + MicroProfileMousePosition(x, y, dy); +} + +void Profiler::set_display(std::unique_ptr display) { + display_ = std::move(display); +} + +void Profiler::Present() { + MicroProfileFlip(); + if (!display_) { + return; + } + + display_->Begin(); + MicroProfileDraw(display_->width(), display_->height()); + display_->End(); +} + +#else + +void Profiler::Initialize() {} +void Profiler::Dump() {} +void Profiler::Shutdown() {} +uint32_t Profiler::GetColor(const char* str) { return 0; } +void Profiler::ThreadEnter(const char* name) {} +void Profiler::ThreadExit() {} +bool Profiler::OnKeyDown(int key_code) { return false; } +bool Profiler::OnKeyUp(int key_code) { return false; } +void Profiler::OnMouseDown(bool left_button, bool right_button) {} +void Profiler::OnMouseUp() {} +void Profiler::OnMouseMove(int x, int y) {} +void Profiler::OnMouseWheel(int x, int y, int dy) {} +void Profiler::set_display(std::unique_ptr display) {} +void Profiler::Present() {} + +#endif // XE_OPTION_PROFILING + +} // namespace xe + +#if XE_OPTION_PROFILING + +uint32_t MicroProfileGpuInsertTimeStamp() { + return 0; +} + +uint64_t MicroProfileGpuGetTimeStamp(uint32_t nKey) { + return 0; +} + +uint64_t MicroProfileTicksPerSecondGpu() { + return 0; +} + +const char* MicroProfileGetThreadName() { + return "TODO: get thread name!"; +} + +void MicroProfileDrawBox(int nX, int nY, int nX1, int nY1, uint32_t nColor, MicroProfileBoxType type) { + auto display = xe::Profiler::display(); + if (!display) { + return; + } + display->DrawBox( + nX, nY, nX1, nY1, + nColor, + static_cast(type)); +} + +void MicroProfileDrawLine2D(uint32_t nVertices, float* pVertices, uint32_t nColor) { + auto display = xe::Profiler::display(); + if (!display) { + return; + } + display->DrawLine2D(nVertices, pVertices, nColor); +} + +void MicroProfileDrawText(int nX, int nY, uint32_t nColor, const char* pText, uint32_t nLen) { + auto display = xe::Profiler::display(); 
+ if (!display) { + return; + } + display->DrawText(nX, nY, nColor, pText, nLen); +} + +#endif // XE_OPTION_PROFILING diff --git a/src/xenia/profiling.h b/src/xenia/profiling.h new file mode 100644 index 000000000..30ab996ac --- /dev/null +++ b/src/xenia/profiling.h @@ -0,0 +1,172 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_PROFILING_H_ +#define XENIA_PROFILING_H_ + +#include + +#include +#include +#include +#include +#include + +#if XE_OPTION_PROFILING +// Pollutes the global namespace. Yuck. +#include +#endif // XE_OPTION_PROFILING + +namespace xe { + +#if XE_OPTION_PROFILING + +// Defines a profiling scope for CPU tasks. +// Use `SCOPE_profile_cpu(name)` to activate the scope. +#define DEFINE_profile_cpu(name, group_name, scope_name) \ + MICROPROFILE_DEFINE(name, group_name, scope_name, xe::Profiler::GetColor(scope_name)) + +// Declares a previously defined profile scope. Use in a translation unit. +#define DECLARE_profile_cpu(name) MICROPROFILE_DECLARE(name) + +// Defines a profiling scope for GPU tasks. +// Use `COUNT_profile_gpu(name)` to activate the scope. +#define DEFINE_profile_gpu(name, group_name, scope_name) \ + MICROPROFILE_DEFINE_GPU(name, group_name, scope_name, xe::Profiler::GetColor(scope_name)) + +// Declares a previously defined profile scope. Use in a translation unit. +#define DECLARE_profile_gpu(name) MICROPROFILE_DECLARE_GPU(name) + +// Enters a previously defined CPU profiling scope, active for the duration +// of the containing block. +#define SCOPE_profile_cpu(name) \ + MICROPROFILE_SCOPE(name) + +// Enters a CPU profiling scope, active for the duration of the containing +// block. No previous definition required. +#define SCOPE_profile_cpu_i(group_name, scope_name) \ + MICROPROFILE_SCOPEI(group_name, scope_name, xe::Profiler::GetColor(scope_name)) + +// Enters a CPU profiling scope by function name, active for the duration of +// the containing block. No previous definition required. +#define SCOPE_profile_cpu_f(group_name) \ + MICROPROFILE_SCOPEI(group_name, XE_CURRENT_FUNCTION, xe::Profiler::GetColor(XE_CURRENT_FUNCTION)) + +// Enters a previously defined GPU profiling scope, active for the duration +// of the containing block. +#define SCOPE_profile_gpu(name) \ + MICROPROFILE_SCOPEGPU(name) + +// Enters a GPU profiling scope, active for the duration of the containing +// block. No previous definition required. +#define SCOPE_profile_gpu_i(group_name, scope_name) \ + MICROPROFILE_SCOPEGPUI(group_name, scope_name, xe::Profiler::GetColor(scope_name)) + +// Enters a GPU profiling scope by function name, active for the duration of +// the containing block. No previous definition required. +#define SCOPE_profile_gpu_f(group_name) \ + MICROPROFILE_SCOPEGPUI(group_name, XE_CURRENT_FUNCTION, xe::Profiler::GetColor(XE_CURRENT_FUNCTION)) + +// Tracks a CPU value counter. +#define COUNT_profile_cpu(name, count) MICROPROFILE_META_CPU(name, count) + +// Tracks a GPU value counter. 
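Usage mirrors what this change already does elsewhere: SCOPE_profile_cpu_f("kernel") in RtlEnterCriticalSection and SCOPE_profile_cpu_i("emu", "log_line") in xe_log_line. A short sketch (the GPU counter define continues below):

    // Sketch: instrumenting a host function with the macros above.
    void ProcessSomething() {
      // Times the whole function body under the "kernel" group, using the
      // enclosing function name as the scope name.
      SCOPE_profile_cpu_f("kernel");
      // ...work...
      // Records a counter alongside the active scope.
      COUNT_profile_cpu("items", 1);
    }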
+#define COUNT_profile_gpu(name, count) MICROPROFILE_META_GPU(name, count) + +#else + +#define DEFINE_profile_cpu(name, group_name, scope_name) +#define DEFINE_profile_gpu(name, group_name, scope_name) +#define DECLARE_profile_cpu(name) +#define DECLARE_profile_gpu(name) +#define SCOPE_profile_cpu(name) do {} while (false) +#define SCOPE_profile_cpu_f(name) do {} while (false) +#define SCOPE_profile_cpu_i(group_name, scope_name) do {} while (false) +#define SCOPE_profile_gpu(name) do {} while (false) +#define SCOPE_profile_gpu_f(name) do {} while (false) +#define SCOPE_profile_gpu_i(group_name, scope_name) do {} while (false) +#define COUNT_profile_cpu(name, count) do {} while (false) +#define COUNT_profile_gpu(name, count) do {} while (false) + +#define MICROPROFILE_TEXT_WIDTH 1 +#define MICROPROFILE_TEXT_HEIGHT 1 + +#endif // XE_OPTION_PROFILING + +class ProfilerDisplay { +public: + enum BoxType { +#if XE_OPTION_PROFILING + BOX_TYPE_BAR = MicroProfileBoxTypeBar, + BOX_TYPE_FLAT = MicroProfileBoxTypeFlat, +#else + BOX_TYPE_BAR, + BOX_TYPE_FLAT, +#endif // XE_OPTION_PROFILING + }; + + virtual uint32_t width() const = 0; + virtual uint32_t height() const = 0; + + // TODO(benvanik): GPU timestamping. + + virtual void Begin() = 0; + virtual void End() = 0; + virtual void DrawBox(int x, int y, int x1, int y1, uint32_t color, BoxType type) = 0; + virtual void DrawLine2D(uint32_t count, float* vertices, uint32_t color) = 0; + virtual void DrawText(int x, int y, uint32_t color, const char* text, size_t text_length) = 0; +}; + +class Profiler { +public: +#if XE_OPTION_PROFILING + static bool is_enabled() { return true; } +#else + static bool is_enabled() { return false; } +#endif // XE_OPTION_PROFILING + + // Initializes the profiler. Call at startup. + static void Initialize(); + // Dumps data to stdout. + static void Dump(); + // Cleans up profiling, releasing all memory. + static void Shutdown(); + + // Computes a color from the given string. + static uint32_t GetColor(const char* str); + + // Activates the calling thread for profiling. + // This must be called immediately after launching a thread. + static void ThreadEnter(const char* name = nullptr); + // Deactivates the calling thread for profiling. + static void ThreadExit(); + + static bool OnKeyDown(int key_code); + static bool OnKeyUp(int key_code); + static void OnMouseDown(bool left_button, bool right_button); + static void OnMouseUp(); + static void OnMouseMove(int x, int y); + static void OnMouseWheel(int x, int y, int dy); + + // Gets the current display, if any. + static ProfilerDisplay* display() { return display_.get(); } + // Initializes drawing with the given display. + static void set_display(std::unique_ptr display); + // Presents the profiler to the bound display, if any. + static void Present(); + + // TODO(benvanik): display mode/pause/etc? 
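Renderer backends feed MicroProfile's draw callbacks by registering a ProfilerDisplay. A minimal do-nothing implementation to show the contract (hypothetical class, for illustration; a real backend would issue draw calls against its swap chain between Begin()/End()):

    // Hypothetical no-op display implementing the interface above.
    class NullProfilerDisplay : public xe::ProfilerDisplay {
    public:
      virtual uint32_t width() const { return 1280; }
      virtual uint32_t height() const { return 720; }
      virtual void Begin() {}
      virtual void End() {}
      virtual void DrawBox(int x, int y, int x1, int y1, uint32_t color,
                           BoxType type) {}
      virtual void DrawLine2D(uint32_t count, float* vertices,
                              uint32_t color) {}
      virtual void DrawText(int x, int y, uint32_t color, const char* text,
                            size_t text_length) {}
    };
    // xe::Profiler::set_display(
    //     std::unique_ptr<xe::ProfilerDisplay>(new NullProfilerDisplay()));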
+ +private: + static std::unique_ptr display_; +}; + +} // namespace xe + +#endif // XENIA_PROFILING_H_ diff --git a/src/xenia/sources.gypi b/src/xenia/sources.gypi index 5d2c066b0..0d20898e0 100644 --- a/src/xenia/sources.gypi +++ b/src/xenia/sources.gypi @@ -18,6 +18,8 @@ 'platform.cc', 'platform.h', 'platform_includes.h', + 'profiling.cc', + 'profiling.h', 'string.cc', 'string.h', 'types.h', diff --git a/src/xenia/types.h b/src/xenia/types.h index f4356e94a..4cd3f5daf 100644 --- a/src/xenia/types.h +++ b/src/xenia/types.h @@ -16,7 +16,7 @@ namespace xe { // TODO(benvanik): support other compilers/etc using std::auto_ptr; -using std::tr1::shared_ptr; +using std::shared_ptr; } // namespace xe @@ -134,6 +134,16 @@ typedef XECACHEALIGN volatile void xe_aligned_void_t; #endif // GNUC #endif // !MIN +XEFORCEINLINE size_t hash_combine(size_t seed) { + return seed; +} +template +size_t hash_combine(size_t seed, const T& v, const Ts&... vs) { + std::hash hasher; + seed ^= hasher(v) + 0x9E3779B9 + (seed << 6) + (seed >> 2); + return hash_combine(seed, vs...); +} + #if XE_PLATFORM_WIN32 #define XESAFERELEASE(p) if (p) { p->Release(); } #endif // WIN32 @@ -145,6 +155,7 @@ typedef XECACHEALIGN volatile void xe_aligned_void_t; static inline uint32_t XENEXTPOW2(uint32_t v) { v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v++; return v; } +#define XEALIGN(value, align) ((value + align - 1) & ~(align - 1)) #define XESUCCEED() goto XECLEANUP #define XEFAIL() goto XECLEANUP diff --git a/src/xenia/ui/ui_event.h b/src/xenia/ui/ui_event.h index 0c973b86f..07fae340e 100644 --- a/src/xenia/ui/ui_event.h +++ b/src/xenia/ui/ui_event.h @@ -32,6 +32,18 @@ private: Window* window_; }; +class KeyEvent : public UIEvent { +public: + KeyEvent(Window* window, int key_code) : + key_code_(key_code), + UIEvent(window) {} + virtual ~KeyEvent() {} + + int key_code() const { return key_code_; } + +private: + int key_code_; +}; class MouseEvent : public UIEvent { public: diff --git a/src/xenia/ui/win32/win32_window.cc b/src/xenia/ui/win32/win32_window.cc index 4bf5cc0cc..5e4b12738 100644 --- a/src/xenia/ui/win32/win32_window.cc +++ b/src/xenia/ui/win32/win32_window.cc @@ -281,12 +281,13 @@ bool Win32Window::HandleMouse(UINT message, WPARAM wParam, LPARAM lParam) { } bool Win32Window::HandleKeyboard(UINT message, WPARAM wParam, LPARAM lParam) { + auto e = KeyEvent(this, (int)wParam); switch (message) { case WM_KEYDOWN: - (byte)wParam; + key_down(e); return true; case WM_KEYUP: - (byte)wParam; + key_up(e); return true; default: return false; diff --git a/src/xenia/ui/window.h b/src/xenia/ui/window.h index 49fcbf17a..cdc7b2c39 100644 --- a/src/xenia/ui/window.h +++ b/src/xenia/ui/window.h @@ -48,6 +48,9 @@ public: alloy::Delegate closing; alloy::Delegate closed; + alloy::Delegate key_down; + alloy::Delegate key_up; + alloy::Delegate mouse_down; alloy::Delegate mouse_move; alloy::Delegate mouse_up; diff --git a/src/xenia/xbox.h b/src/xenia/xbox.h index ebf2f816f..fcf079751 100644 --- a/src/xenia/xbox.h +++ b/src/xenia/xbox.h @@ -43,6 +43,7 @@ typedef uint32_t X_STATUS; #define X_STATUS_INVALID_HANDLE ((X_STATUS)0xC0000008L) #define X_STATUS_INVALID_PARAMETER ((X_STATUS)0xC000000DL) #define X_STATUS_NO_SUCH_FILE ((X_STATUS)0xC000000FL) +#define X_STATUS_END_OF_FILE ((X_STATUS)0xC0000011L) #define X_STATUS_NO_MEMORY ((X_STATUS)0xC0000017L) #define X_STATUS_ALREADY_COMMITTED ((X_STATUS)0xC0000021L) #define X_STATUS_ACCESS_DENIED ((X_STATUS)0xC0000022L) @@ -62,6 +63,7 @@ typedef uint32_t X_RESULT; #define 
X_HRESULT_FROM_WIN32(x) ((X_RESULT)(x) <= 0 ? ((X_RESULT)(x)) : ((X_RESULT) (((x) & 0x0000FFFF) | (X_FACILITY_WIN32 << 16) | 0x80000000))) #define X_ERROR_SUCCESS X_HRESULT_FROM_WIN32(0x00000000L) #define X_ERROR_ACCESS_DENIED X_HRESULT_FROM_WIN32(0x00000005L) +#define X_ERROR_NO_MORE_FILES X_HRESULT_FROM_WIN32(0x00000018L) #define X_ERROR_INSUFFICIENT_BUFFER X_HRESULT_FROM_WIN32(0x0000007AL) #define X_ERROR_BAD_ARGUMENTS X_HRESULT_FROM_WIN32(0x000000A0L) #define X_ERROR_BUSY X_HRESULT_FROM_WIN32(0x000000AAL) @@ -114,6 +116,11 @@ typedef uint32_t X_RESULT; #define X_PROCTYPE_SYSTEM 2 +// Sockets/networking. +#define X_INVALID_SOCKET (uint32_t)(~0) +#define X_SOCKET_ERROR (uint32_t)(-1) + + // Thread enums. #define X_CREATE_SUSPENDED 0x00000004 @@ -253,6 +260,10 @@ public: }; +// Values seem to be all over the place - GUIDs? +typedef uint32_t XNotificationID; + + typedef enum _X_INPUT_FLAG { X_INPUT_FLAG_GAMEPAD = 0x00000001, } X_INPUT_FLAG; diff --git a/third_party/llvm.gypi b/third_party/llvm.gypi new file mode 100644 index 000000000..3b8449729 --- /dev/null +++ b/third_party/llvm.gypi @@ -0,0 +1,35 @@ +# Copyright 2014 Ben Vanik. All Rights Reserved. +{ + 'targets': [ + { + 'target_name': 'llvm', + 'type': '<(library)', + + 'direct_dependent_settings': { + 'include_dirs': [ + 'llvm/include/', + ], + + 'defines': [ + ], + }, + + 'msvs_disabled_warnings': [4267], + + 'defines': [ + ], + + 'include_dirs': [ + 'llvm/include/', + ], + + 'sources': [ + 'llvm/dummy.cc', + 'llvm/include/llvm/ADT/BitVector.h', + 'llvm/include/llvm/Support/Compiler.h', + 'llvm/include/llvm/Support/MathExtras.h', + 'llvm/include/llvm/Support/type_traits.h', + ], + } + ] +} diff --git a/third_party/llvm/LICENSE.txt b/third_party/llvm/LICENSE.txt new file mode 100644 index 000000000..37d3c2552 --- /dev/null +++ b/third_party/llvm/LICENSE.txt @@ -0,0 +1,71 @@ +============================================================================== +LLVM Release License +============================================================================== +University of Illinois/NCSA +Open Source License + +Copyright (c) 2003-2014 University of Illinois at Urbana-Champaign. +All rights reserved. + +Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. + +============================================================================== +Copyrights and Licenses for Third Party Software Distributed with LLVM: +============================================================================== +The LLVM software contains code written by third parties. Such software will +have its own individual LICENSE.TXT file in the directory in which it appears. +This file will describe the copyrights, license, and restrictions which apply +to that code. + +The disclaimer of warranty in the University of Illinois Open Source License +applies to all code in the LLVM Distribution, and nothing in any of the +other licenses gives permission to use the names of the LLVM Team or the +University of Illinois to endorse or promote products derived from this +Software. + +The following pieces of software have additional or alternate copyrights, +licenses, and/or restrictions: + +Program Directory +------- --------- +Autoconf llvm/autoconf + llvm/projects/ModuleMaker/autoconf + llvm/projects/sample/autoconf +Google Test llvm/utils/unittest/googletest +OpenBSD regex llvm/lib/Support/{reg*, COPYRIGHT.regex} +pyyaml tests llvm/test/YAMLParser/{*.data, LICENSE.TXT} +ARM contributions llvm/lib/Target/ARM/LICENSE.TXT +md5 contributions llvm/lib/Support/MD5.cpp llvm/include/llvm/Support/MD5.h diff --git a/third_party/llvm/dummy.cc b/third_party/llvm/dummy.cc new file mode 100644 index 000000000..ef866db23 --- /dev/null +++ b/third_party/llvm/dummy.cc @@ -0,0 +1 @@ +// here just to keep gyp happy diff --git a/third_party/llvm/include/llvm/ADT/BitVector.h b/third_party/llvm/include/llvm/ADT/BitVector.h new file mode 100644 index 000000000..90e6d3652 --- /dev/null +++ b/third_party/llvm/include/llvm/ADT/BitVector.h @@ -0,0 +1,602 @@ +//===- llvm/ADT/BitVector.h - Bit vectors -----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the BitVector class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ADT_BITVECTOR_H +#define LLVM_ADT_BITVECTOR_H + +#include "llvm/Support/Compiler.h" +#ifdef LLVM_IGNORE_XENIA +#include "llvm/Support/ErrorHandling.h" +#else +#define llvm_unreachable(msg) assert(false) +#endif // LLVM_IGNORE_XENIA +#include "llvm/Support/MathExtras.h" +#include +#include +#include +#include + +namespace llvm { + +class BitVector { + typedef unsigned long BitWord; + + enum { BITWORD_SIZE = (unsigned)sizeof(BitWord) * CHAR_BIT }; + + BitWord *Bits; // Actual bits. + unsigned Size; // Size of bitvector in bits. + unsigned Capacity; // Size of allocated memory in BitWord. + +public: + // Encapsulation of a single bit. 
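Only the handful of headers listed in llvm.gypi are vendored, so BitVector is usable without building LLVM proper. A quick tour of the API defined in the rest of this header:

    #include "llvm/ADT/BitVector.h"

    // Sketch: typical use of the vendored BitVector.
    void BitVectorDemo() {
      llvm::BitVector bits(128);       // 128 bits, all false
      bits.set(3);
      bits.set(10, 20);                // sets the half-open range [10, 20)
      bool is_set = bits.test(3);      // true
      int first = bits.find_first();   // 3
      int next = bits.find_next(3);    // 10
      unsigned count = bits.count();   // 11 bits set
      bits.reset(3);
      (void)is_set; (void)first; (void)next; (void)count;
    }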
+ class reference { + friend class BitVector; + + BitWord *WordRef; + unsigned BitPos; + + reference(); // Undefined + + public: + reference(BitVector &b, unsigned Idx) { + WordRef = &b.Bits[Idx / BITWORD_SIZE]; + BitPos = Idx % BITWORD_SIZE; + } + + ~reference() {} + + reference &operator=(reference t) { + *this = bool(t); + return *this; + } + + reference& operator=(bool t) { + if (t) + *WordRef |= 1L << BitPos; + else + *WordRef &= ~(1L << BitPos); + return *this; + } + + operator bool() const { + return ((*WordRef) & (1L << BitPos)) ? true : false; + } + }; + + + /// BitVector default ctor - Creates an empty bitvector. + BitVector() : Size(0), Capacity(0) { + Bits = 0; + } + + /// BitVector ctor - Creates a bitvector of specified number of bits. All + /// bits are initialized to the specified value. + explicit BitVector(unsigned s, bool t = false) : Size(s) { + Capacity = NumBitWords(s); + Bits = (BitWord *)std::malloc(Capacity * sizeof(BitWord)); + init_words(Bits, Capacity, t); + if (t) + clear_unused_bits(); + } + + /// BitVector copy ctor. + BitVector(const BitVector &RHS) : Size(RHS.size()) { + if (Size == 0) { + Bits = 0; + Capacity = 0; + return; + } + + Capacity = NumBitWords(RHS.size()); + Bits = (BitWord *)std::malloc(Capacity * sizeof(BitWord)); + std::memcpy(Bits, RHS.Bits, Capacity * sizeof(BitWord)); + } + +#if LLVM_HAS_RVALUE_REFERENCES + BitVector(BitVector &&RHS) + : Bits(RHS.Bits), Size(RHS.Size), Capacity(RHS.Capacity) { + RHS.Bits = 0; + } +#endif + + ~BitVector() { + std::free(Bits); + } + + /// empty - Tests whether there are no bits in this bitvector. + bool empty() const { return Size == 0; } + + /// size - Returns the number of bits in this bitvector. + unsigned size() const { return Size; } + + /// count - Returns the number of bits which are set. + unsigned count() const { + unsigned NumBits = 0; + for (unsigned i = 0; i < NumBitWords(size()); ++i) + if (sizeof(BitWord) == 4) + NumBits += CountPopulation_32((uint32_t)Bits[i]); + else if (sizeof(BitWord) == 8) + NumBits += CountPopulation_64(Bits[i]); + else + llvm_unreachable("Unsupported!"); + return NumBits; + } + + /// any - Returns true if any bit is set. + bool any() const { + for (unsigned i = 0; i < NumBitWords(size()); ++i) + if (Bits[i] != 0) + return true; + return false; + } + + /// all - Returns true if all bits are set. + bool all() const { + for (unsigned i = 0; i < Size / BITWORD_SIZE; ++i) + if (Bits[i] != ~0UL) + return false; + + // If bits remain check that they are ones. The unused bits are always zero. + if (unsigned Remainder = Size % BITWORD_SIZE) + return Bits[Size / BITWORD_SIZE] == (1UL << Remainder) - 1; + + return true; + } + + /// none - Returns true if none of the bits are set. + bool none() const { + return !any(); + } + + /// find_first - Returns the index of the first set bit, -1 if none + /// of the bits are set. + int find_first() const { + for (unsigned i = 0; i < NumBitWords(size()); ++i) + if (Bits[i] != 0) { + if (sizeof(BitWord) == 4) + return i * BITWORD_SIZE + countTrailingZeros((uint32_t)Bits[i]); + if (sizeof(BitWord) == 8) + return i * BITWORD_SIZE + countTrailingZeros(Bits[i]); + llvm_unreachable("Unsupported!"); + } + return -1; + } + + /// find_next - Returns the index of the next set bit following the + /// "Prev" bit. Returns -1 if the next set bit is not found. 
+ int find_next(unsigned Prev) const { + ++Prev; + if (Prev >= Size) + return -1; + + unsigned WordPos = Prev / BITWORD_SIZE; + unsigned BitPos = Prev % BITWORD_SIZE; + BitWord Copy = Bits[WordPos]; + // Mask off previous bits. + Copy &= ~0UL << BitPos; + + if (Copy != 0) { + if (sizeof(BitWord) == 4) + return WordPos * BITWORD_SIZE + countTrailingZeros((uint32_t)Copy); + if (sizeof(BitWord) == 8) + return WordPos * BITWORD_SIZE + countTrailingZeros(Copy); + llvm_unreachable("Unsupported!"); + } + + // Check subsequent words. + for (unsigned i = WordPos+1; i < NumBitWords(size()); ++i) + if (Bits[i] != 0) { + if (sizeof(BitWord) == 4) + return i * BITWORD_SIZE + countTrailingZeros((uint32_t)Bits[i]); + if (sizeof(BitWord) == 8) + return i * BITWORD_SIZE + countTrailingZeros(Bits[i]); + llvm_unreachable("Unsupported!"); + } + return -1; + } + + /// clear - Clear all bits. + void clear() { + Size = 0; + } + + /// resize - Grow or shrink the bitvector. + void resize(unsigned N, bool t = false) { + if (N > Capacity * BITWORD_SIZE) { + unsigned OldCapacity = Capacity; + grow(N); + init_words(&Bits[OldCapacity], (Capacity-OldCapacity), t); + } + + // Set any old unused bits that are now included in the BitVector. This + // may set bits that are not included in the new vector, but we will clear + // them back out below. + if (N > Size) + set_unused_bits(t); + + // Update the size, and clear out any bits that are now unused + unsigned OldSize = Size; + Size = N; + if (t || N < OldSize) + clear_unused_bits(); + } + + void reserve(unsigned N) { + if (N > Capacity * BITWORD_SIZE) + grow(N); + } + + // Set, reset, flip + BitVector &set() { + init_words(Bits, Capacity, true); + clear_unused_bits(); + return *this; + } + + BitVector &set(unsigned Idx) { + Bits[Idx / BITWORD_SIZE] |= 1L << (Idx % BITWORD_SIZE); + return *this; + } + + /// set - Efficiently set a range of bits in [I, E) + BitVector &set(unsigned I, unsigned E) { + assert(I <= E && "Attempted to set backwards range!"); + assert(E <= size() && "Attempted to set out-of-bounds range!"); + + if (I == E) return *this; + + if (I / BITWORD_SIZE == E / BITWORD_SIZE) { + BitWord EMask = 1UL << (E % BITWORD_SIZE); + BitWord IMask = 1UL << (I % BITWORD_SIZE); + BitWord Mask = EMask - IMask; + Bits[I / BITWORD_SIZE] |= Mask; + return *this; + } + + BitWord PrefixMask = ~0UL << (I % BITWORD_SIZE); + Bits[I / BITWORD_SIZE] |= PrefixMask; + I = RoundUpToAlignment(I, BITWORD_SIZE); + + for (; I + BITWORD_SIZE <= E; I += BITWORD_SIZE) + Bits[I / BITWORD_SIZE] = ~0UL; + + BitWord PostfixMask = (1UL << (E % BITWORD_SIZE)) - 1; + if (I < E) + Bits[I / BITWORD_SIZE] |= PostfixMask; + + return *this; + } + + BitVector &reset() { + init_words(Bits, Capacity, false); + return *this; + } + + BitVector &reset(unsigned Idx) { + Bits[Idx / BITWORD_SIZE] &= ~(1L << (Idx % BITWORD_SIZE)); + return *this; + } + + /// reset - Efficiently reset a range of bits in [I, E) + BitVector &reset(unsigned I, unsigned E) { + assert(I <= E && "Attempted to reset backwards range!"); + assert(E <= size() && "Attempted to reset out-of-bounds range!"); + + if (I == E) return *this; + + if (I / BITWORD_SIZE == E / BITWORD_SIZE) { + BitWord EMask = 1UL << (E % BITWORD_SIZE); + BitWord IMask = 1UL << (I % BITWORD_SIZE); + BitWord Mask = EMask - IMask; + Bits[I / BITWORD_SIZE] &= ~Mask; + return *this; + } + + BitWord PrefixMask = ~0UL << (I % BITWORD_SIZE); + Bits[I / BITWORD_SIZE] &= ~PrefixMask; + I = RoundUpToAlignment(I, BITWORD_SIZE); + + for (; I + BITWORD_SIZE <= E; I += 
BITWORD_SIZE) + Bits[I / BITWORD_SIZE] = 0UL; + + BitWord PostfixMask = (1UL << (E % BITWORD_SIZE)) - 1; + if (I < E) + Bits[I / BITWORD_SIZE] &= ~PostfixMask; + + return *this; + } + + BitVector &flip() { + for (unsigned i = 0; i < NumBitWords(size()); ++i) + Bits[i] = ~Bits[i]; + clear_unused_bits(); + return *this; + } + + BitVector &flip(unsigned Idx) { + Bits[Idx / BITWORD_SIZE] ^= 1L << (Idx % BITWORD_SIZE); + return *this; + } + + // Indexing. + reference operator[](unsigned Idx) { + assert (Idx < Size && "Out-of-bounds Bit access."); + return reference(*this, Idx); + } + + bool operator[](unsigned Idx) const { + assert (Idx < Size && "Out-of-bounds Bit access."); + BitWord Mask = 1L << (Idx % BITWORD_SIZE); + return (Bits[Idx / BITWORD_SIZE] & Mask) != 0; + } + + bool test(unsigned Idx) const { + return (*this)[Idx]; + } + + /// Test if any common bits are set. + bool anyCommon(const BitVector &RHS) const { + unsigned ThisWords = NumBitWords(size()); + unsigned RHSWords = NumBitWords(RHS.size()); + for (unsigned i = 0, e = std::min(ThisWords, RHSWords); i != e; ++i) + if (Bits[i] & RHS.Bits[i]) + return true; + return false; + } + + // Comparison operators. + bool operator==(const BitVector &RHS) const { + unsigned ThisWords = NumBitWords(size()); + unsigned RHSWords = NumBitWords(RHS.size()); + unsigned i; + for (i = 0; i != std::min(ThisWords, RHSWords); ++i) + if (Bits[i] != RHS.Bits[i]) + return false; + + // Verify that any extra words are all zeros. + if (i != ThisWords) { + for (; i != ThisWords; ++i) + if (Bits[i]) + return false; + } else if (i != RHSWords) { + for (; i != RHSWords; ++i) + if (RHS.Bits[i]) + return false; + } + return true; + } + + bool operator!=(const BitVector &RHS) const { + return !(*this == RHS); + } + + /// Intersection, union, disjoint union. + BitVector &operator&=(const BitVector &RHS) { + unsigned ThisWords = NumBitWords(size()); + unsigned RHSWords = NumBitWords(RHS.size()); + unsigned i; + for (i = 0; i != std::min(ThisWords, RHSWords); ++i) + Bits[i] &= RHS.Bits[i]; + + // Any bits that are just in this bitvector become zero, because they aren't + // in the RHS bit vector. Any words only in RHS are ignored because they + // are already zero in the LHS. + for (; i != ThisWords; ++i) + Bits[i] = 0; + + return *this; + } + + /// reset - Reset bits that are set in RHS. Same as *this &= ~RHS. + BitVector &reset(const BitVector &RHS) { + unsigned ThisWords = NumBitWords(size()); + unsigned RHSWords = NumBitWords(RHS.size()); + unsigned i; + for (i = 0; i != std::min(ThisWords, RHSWords); ++i) + Bits[i] &= ~RHS.Bits[i]; + return *this; + } + + /// test - Check if (This - RHS) is zero. + /// This is the same as reset(RHS) and any(). + bool test(const BitVector &RHS) const { + unsigned ThisWords = NumBitWords(size()); + unsigned RHSWords = NumBitWords(RHS.size()); + unsigned i; + for (i = 0; i != std::min(ThisWords, RHSWords); ++i) + if ((Bits[i] & ~RHS.Bits[i]) != 0) + return true; + + for (; i != ThisWords ; ++i) + if (Bits[i] != 0) + return true; + + return false; + } + + BitVector &operator|=(const BitVector &RHS) { + if (size() < RHS.size()) + resize(RHS.size()); + for (size_t i = 0, e = NumBitWords(RHS.size()); i != e; ++i) + Bits[i] |= RHS.Bits[i]; + return *this; + } + + BitVector &operator^=(const BitVector &RHS) { + if (size() < RHS.size()) + resize(RHS.size()); + for (size_t i = 0, e = NumBitWords(RHS.size()); i != e; ++i) + Bits[i] ^= RHS.Bits[i]; + return *this; + } + + // Assignment operator. 
+ const BitVector &operator=(const BitVector &RHS) { + if (this == &RHS) return *this; + + Size = RHS.size(); + unsigned RHSWords = NumBitWords(Size); + if (Size <= Capacity * BITWORD_SIZE) { + if (Size) + std::memcpy(Bits, RHS.Bits, RHSWords * sizeof(BitWord)); + clear_unused_bits(); + return *this; + } + + // Grow the bitvector to have enough elements. + Capacity = RHSWords; + BitWord *NewBits = (BitWord *)std::malloc(Capacity * sizeof(BitWord)); + std::memcpy(NewBits, RHS.Bits, Capacity * sizeof(BitWord)); + + // Destroy the old bits. + std::free(Bits); + Bits = NewBits; + + return *this; + } + +#if LLVM_HAS_RVALUE_REFERENCES + const BitVector &operator=(BitVector &&RHS) { + if (this == &RHS) return *this; + + std::free(Bits); + Bits = RHS.Bits; + Size = RHS.Size; + Capacity = RHS.Capacity; + + RHS.Bits = 0; + + return *this; + } +#endif + + void swap(BitVector &RHS) { + std::swap(Bits, RHS.Bits); + std::swap(Size, RHS.Size); + std::swap(Capacity, RHS.Capacity); + } + + //===--------------------------------------------------------------------===// + // Portable bit mask operations. + //===--------------------------------------------------------------------===// + // + // These methods all operate on arrays of uint32_t, each holding 32 bits. The + // fixed word size makes it easier to work with literal bit vector constants + // in portable code. + // + // The LSB in each word is the lowest numbered bit. The size of a portable + // bit mask is always a whole multiple of 32 bits. If no bit mask size is + // given, the bit mask is assumed to cover the entire BitVector. + + /// setBitsInMask - Add '1' bits from Mask to this vector. Don't resize. + /// This computes "*this |= Mask". + void setBitsInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) { + applyMask(Mask, MaskWords); + } + + /// clearBitsInMask - Clear any bits in this vector that are set in Mask. + /// Don't resize. This computes "*this &= ~Mask". + void clearBitsInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) { + applyMask(Mask, MaskWords); + } + + /// setBitsNotInMask - Add a bit to this vector for every '0' bit in Mask. + /// Don't resize. This computes "*this |= ~Mask". + void setBitsNotInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) { + applyMask(Mask, MaskWords); + } + + /// clearBitsNotInMask - Clear a bit in this vector for every '0' bit in Mask. + /// Don't resize. This computes "*this &= Mask". + void clearBitsNotInMask(const uint32_t *Mask, unsigned MaskWords = ~0u) { + applyMask(Mask, MaskWords); + } + +private: + unsigned NumBitWords(unsigned S) const { + return (S + BITWORD_SIZE-1) / BITWORD_SIZE; + } + + // Set the unused bits in the high words. + void set_unused_bits(bool t = true) { + // Set high words first. + unsigned UsedWords = NumBitWords(Size); + if (Capacity > UsedWords) + init_words(&Bits[UsedWords], (Capacity-UsedWords), t); + + // Then set any stray high bits of the last used word. + unsigned ExtraBits = Size % BITWORD_SIZE; + if (ExtraBits) { + BitWord ExtraBitMask = ~0UL << ExtraBits; + if (t) + Bits[UsedWords-1] |= ExtraBitMask; + else + Bits[UsedWords-1] &= ~ExtraBitMask; + } + } + + // Clear the unused bits in the high words. 
+ void clear_unused_bits() { + set_unused_bits(false); + } + + void grow(unsigned NewSize) { + Capacity = std::max(NumBitWords(NewSize), Capacity * 2); + Bits = (BitWord *)std::realloc(Bits, Capacity * sizeof(BitWord)); + + clear_unused_bits(); + } + + void init_words(BitWord *B, unsigned NumWords, bool t) { + memset(B, 0 - (int)t, NumWords*sizeof(BitWord)); + } + + template + void applyMask(const uint32_t *Mask, unsigned MaskWords) { + assert(BITWORD_SIZE % 32 == 0 && "Unsupported BitWord size."); + MaskWords = std::min(MaskWords, (size() + 31) / 32); + const unsigned Scale = BITWORD_SIZE / 32; + unsigned i; + for (i = 0; MaskWords >= Scale; ++i, MaskWords -= Scale) { + BitWord BW = Bits[i]; + // This inner loop should unroll completely when BITWORD_SIZE > 32. + for (unsigned b = 0; b != BITWORD_SIZE; b += 32) { + uint32_t M = *Mask++; + if (InvertMask) M = ~M; + if (AddBits) BW |= BitWord(M) << b; + else BW &= ~(BitWord(M) << b); + } + Bits[i] = BW; + } + for (unsigned b = 0; MaskWords; b += 32, --MaskWords) { + uint32_t M = *Mask++; + if (InvertMask) M = ~M; + if (AddBits) Bits[i] |= BitWord(M) << b; + else Bits[i] &= ~(BitWord(M) << b); + } + if (AddBits) + clear_unused_bits(); + } +}; + +} // End llvm namespace + +namespace std { + /// Implement std::swap in terms of BitVector swap. + inline void + swap(llvm::BitVector &LHS, llvm::BitVector &RHS) { + LHS.swap(RHS); + } +} + +#endif diff --git a/third_party/llvm/include/llvm/Support/Compiler.h b/third_party/llvm/include/llvm/Support/Compiler.h new file mode 100644 index 000000000..806e75917 --- /dev/null +++ b/third_party/llvm/include/llvm/Support/Compiler.h @@ -0,0 +1,446 @@ +//===-- llvm/Support/Compiler.h - Compiler abstraction support --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines several macros, based on the current compiler. This allows +// use of compiler-specific features in a way that remains portable. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_COMPILER_H +#define LLVM_SUPPORT_COMPILER_H + +//#include "llvm/Config/llvm-config.h" + +#ifndef __has_feature +# define __has_feature(x) 0 +#endif + +#ifndef __has_extension +# define __has_extension(x) 0 +#endif + +#ifndef __has_attribute +# define __has_attribute(x) 0 +#endif + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +/// \macro __GNUC_PREREQ +/// \brief Defines __GNUC_PREREQ if glibc's features.h isn't available. +#ifndef __GNUC_PREREQ +# if defined(__GNUC__) && defined(__GNUC_MINOR__) +# define __GNUC_PREREQ(maj, min) \ + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) +# else +# define __GNUC_PREREQ(maj, min) 0 +# endif +#endif + +/// \macro LLVM_MSC_PREREQ +/// \brief Is the compiler MSVC of at least the specified version? +/// The common \param version values to check for are: +/// * 1600: Microsoft Visual Studio 2010 / 10.0 +/// * 1700: Microsoft Visual Studio 2012 / 11.0 +/// * 1800: Microsoft Visual Studio 2013 / 12.0 +#ifdef _MSC_VER +#define LLVM_MSC_PREREQ(version) (_MSC_VER >= (version)) +#else +#define LLVM_MSC_PREREQ(version) 0 +#endif + +/// \brief Does the compiler support r-value references? +/// This implies that provides the one-argument std::move; it +/// does not imply the existence of any other C++ library features. 
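The pattern throughout this header: probe with __has_feature or a compiler-version check, pin the answer in a 0/1 macro, and let call sites use a plain #if. This is exactly how the vendored BitVector guards its move constructor. A sketch of the consumption side (hypothetical class, using the macros defined just below):

    #include "llvm/Support/Compiler.h"

    // Sketch: move support compiled in only when the 0/1 feature macro says
    // the toolchain can handle r-value references.
    template <typename T>
    class Box {
    public:
      explicit Box(const T& value) : value_(value) {}
    #if LLVM_HAS_RVALUE_REFERENCES
      explicit Box(T&& value) : value_(llvm_move(value)) {}
    #endif
    private:
      T value_;
    };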
+#if __has_feature(cxx_rvalue_references) || \ + defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1600) +#define LLVM_HAS_RVALUE_REFERENCES 1 +#else +#define LLVM_HAS_RVALUE_REFERENCES 0 +#endif + +/// \brief Does the compiler support r-value reference *this? +/// +/// Sadly, this is separate from just r-value reference support because GCC +/// implemented everything but this thus far. No release of GCC yet has support +/// for this feature so it is enabled with Clang only. +/// FIXME: This should change to a version check when GCC grows support for it. +#if __has_feature(cxx_rvalue_references) +#define LLVM_HAS_RVALUE_REFERENCE_THIS 1 +#else +#define LLVM_HAS_RVALUE_REFERENCE_THIS 0 +#endif + +/// \macro LLVM_HAS_CXX11_TYPETRAITS +/// \brief Does the compiler have the C++11 type traits. +/// +/// #include +/// +/// * enable_if +/// * {true,false}_type +/// * is_constructible +/// * etc... +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1700) +#define LLVM_HAS_CXX11_TYPETRAITS 1 +#else +#define LLVM_HAS_CXX11_TYPETRAITS 0 +#endif + +/// \macro LLVM_HAS_CXX11_STDLIB +/// \brief Does the compiler have the C++11 standard library. +/// +/// Implies LLVM_HAS_RVALUE_REFERENCES, LLVM_HAS_CXX11_TYPETRAITS +#if defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1700) +#define LLVM_HAS_CXX11_STDLIB 1 +#else +#define LLVM_HAS_CXX11_STDLIB 0 +#endif + +/// \macro LLVM_HAS_VARIADIC_TEMPLATES +/// \brief Does this compiler support variadic templates. +/// +/// Implies LLVM_HAS_RVALUE_REFERENCES and the existence of std::forward. +#if __has_feature(cxx_variadic_templates) || LLVM_MSC_PREREQ(1800) +# define LLVM_HAS_VARIADIC_TEMPLATES 1 +#else +# define LLVM_HAS_VARIADIC_TEMPLATES 0 +#endif + +/// llvm_move - Expands to ::std::move if the compiler supports +/// r-value references; otherwise, expands to the argument. +#if LLVM_HAS_RVALUE_REFERENCES +#define llvm_move(value) (::std::move(value)) +#else +#define llvm_move(value) (value) +#endif + +/// Expands to '&' if r-value references are supported. +/// +/// This can be used to provide l-value/r-value overrides of member functions. +/// The r-value override should be guarded by LLVM_HAS_RVALUE_REFERENCE_THIS +#if LLVM_HAS_RVALUE_REFERENCE_THIS +#define LLVM_LVALUE_FUNCTION & +#else +#define LLVM_LVALUE_FUNCTION +#endif + +/// LLVM_DELETED_FUNCTION - Expands to = delete if the compiler supports it. +/// Use to mark functions as uncallable. Member functions with this should +/// be declared private so that some behavior is kept in C++03 mode. +/// +/// class DontCopy { +/// private: +/// DontCopy(const DontCopy&) LLVM_DELETED_FUNCTION; +/// DontCopy &operator =(const DontCopy&) LLVM_DELETED_FUNCTION; +/// public: +/// ... +/// }; +#if __has_feature(cxx_deleted_functions) || \ + defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1800) +#define LLVM_DELETED_FUNCTION = delete +#else +#define LLVM_DELETED_FUNCTION +#endif + +/// LLVM_FINAL - Expands to 'final' if the compiler supports it. +/// Use to mark classes or virtual methods as final. +#if __has_feature(cxx_override_control) || \ + defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1700) +#define LLVM_FINAL final +#else +#define LLVM_FINAL +#endif + +/// LLVM_OVERRIDE - Expands to 'override' if the compiler supports it. +/// Use to mark virtual methods as overriding a base class method. 
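Taken together, `llvm_move`, `LLVM_DELETED_FUNCTION`, `LLVM_FINAL`, and `LLVM_OVERRIDE` (defined immediately below) let a class opt into C++11 semantics while still compiling as C++03. A hedged sketch with hypothetical class names:

```cpp
#include "llvm/Support/Compiler.h"

class Base {
public:
  virtual ~Base() {}
  virtual int id() const { return 0; }
};

// 'final'/'override'/'= delete' where the compiler has them, no-ops otherwise.
class Leaf LLVM_FINAL : public Base {
public:
  Leaf() {}
  virtual int id() const LLVM_OVERRIDE { return 1; }

private:
  // Kept private so copying still fails to compile in pure C++03 mode.
  Leaf(const Leaf &) LLVM_DELETED_FUNCTION;
  Leaf &operator=(const Leaf &) LLVM_DELETED_FUNCTION;
};

int main() {
  Leaf L;
  return L.id() == 1 ? 0 : 1;
}
```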
+#if __has_feature(cxx_override_control) || \ + defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1700) +#define LLVM_OVERRIDE override +#else +#define LLVM_OVERRIDE +#endif + +#if __has_feature(cxx_constexpr) || defined(__GXX_EXPERIMENTAL_CXX0X__) +# define LLVM_CONSTEXPR constexpr +#else +# define LLVM_CONSTEXPR +#endif + +/// LLVM_LIBRARY_VISIBILITY - If a class marked with this attribute is linked +/// into a shared library, then the class should be private to the library and +/// not accessible from outside it. Can also be used to mark variables and +/// functions, making them private to any shared library they are linked into. +/// On PE/COFF targets, library visibility is the default, so this isn't needed. +#if (__has_attribute(visibility) || __GNUC_PREREQ(4, 0)) && \ + !defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(LLVM_ON_WIN32) +#define LLVM_LIBRARY_VISIBILITY __attribute__ ((visibility("hidden"))) +#else +#define LLVM_LIBRARY_VISIBILITY +#endif + +#if __has_attribute(used) || __GNUC_PREREQ(3, 1) +#define LLVM_ATTRIBUTE_USED __attribute__((__used__)) +#else +#define LLVM_ATTRIBUTE_USED +#endif + +#if __has_attribute(warn_unused_result) || __GNUC_PREREQ(3, 4) +#define LLVM_ATTRIBUTE_UNUSED_RESULT __attribute__((__warn_unused_result__)) +#else +#define LLVM_ATTRIBUTE_UNUSED_RESULT +#endif + +// Some compilers warn about unused functions. When a function is sometimes +// used or not depending on build settings (e.g. a function only called from +// within "assert"), this attribute can be used to suppress such warnings. +// +// However, it shouldn't be used for unused *variables*, as those have a much +// more portable solution: +// (void)unused_var_name; +// Prefer cast-to-void wherever it is sufficient. +#if __has_attribute(unused) || __GNUC_PREREQ(3, 1) +#define LLVM_ATTRIBUTE_UNUSED __attribute__((__unused__)) +#else +#define LLVM_ATTRIBUTE_UNUSED +#endif + +// FIXME: Provide this for PE/COFF targets. +#if (__has_attribute(weak) || __GNUC_PREREQ(4, 0)) && \ + (!defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(LLVM_ON_WIN32)) +#define LLVM_ATTRIBUTE_WEAK __attribute__((__weak__)) +#else +#define LLVM_ATTRIBUTE_WEAK +#endif + +// Prior to clang 3.2, clang did not accept any spelling of +// __has_attribute(const), so assume it is supported. +#if defined(__clang__) || defined(__GNUC__) +// aka 'CONST' but following LLVM Conventions. +#define LLVM_READNONE __attribute__((__const__)) +#else +#define LLVM_READNONE +#endif + +#if __has_attribute(pure) || defined(__GNUC__) +// aka 'PURE' but following LLVM Conventions. +#define LLVM_READONLY __attribute__((__pure__)) +#else +#define LLVM_READONLY +#endif + +#if __has_builtin(__builtin_expect) || __GNUC_PREREQ(4, 0) +#define LLVM_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true) +#define LLVM_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false) +#else +#define LLVM_LIKELY(EXPR) (EXPR) +#define LLVM_UNLIKELY(EXPR) (EXPR) +#endif + +// C++ doesn't support 'extern template' of template specializations. GCC does, +// but requires __extension__ before it. 
In the header, use this: +// EXTERN_TEMPLATE_INSTANTIATION(class foo); +// in the .cpp file, use this: +// TEMPLATE_INSTANTIATION(class foo); +#ifdef __GNUC__ +#define EXTERN_TEMPLATE_INSTANTIATION(X) __extension__ extern template X +#define TEMPLATE_INSTANTIATION(X) template X +#else +#define EXTERN_TEMPLATE_INSTANTIATION(X) +#define TEMPLATE_INSTANTIATION(X) +#endif + +/// LLVM_ATTRIBUTE_NOINLINE - On compilers where we have a directive to do so, +/// mark a method "not for inlining". +#if __has_attribute(noinline) || __GNUC_PREREQ(3, 4) +#define LLVM_ATTRIBUTE_NOINLINE __attribute__((noinline)) +#elif defined(_MSC_VER) +#define LLVM_ATTRIBUTE_NOINLINE __declspec(noinline) +#else +#define LLVM_ATTRIBUTE_NOINLINE +#endif + +/// LLVM_ATTRIBUTE_ALWAYS_INLINE - On compilers where we have a directive to do +/// so, mark a method "always inline" because it is performance sensitive. GCC +/// 3.4 supported this but is buggy in various cases and produces unimplemented +/// errors, just use it in GCC 4.0 and later. +#if __has_attribute(always_inline) || __GNUC_PREREQ(4, 0) +#define LLVM_ATTRIBUTE_ALWAYS_INLINE inline __attribute__((always_inline)) +#elif defined(_MSC_VER) +#define LLVM_ATTRIBUTE_ALWAYS_INLINE __forceinline +#else +#define LLVM_ATTRIBUTE_ALWAYS_INLINE +#endif + +#ifdef __GNUC__ +#define LLVM_ATTRIBUTE_NORETURN __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define LLVM_ATTRIBUTE_NORETURN __declspec(noreturn) +#else +#define LLVM_ATTRIBUTE_NORETURN +#endif + +/// LLVM_EXTENSION - Support compilers where we have a keyword to suppress +/// pedantic diagnostics. +#ifdef __GNUC__ +#define LLVM_EXTENSION __extension__ +#else +#define LLVM_EXTENSION +#endif + +// LLVM_ATTRIBUTE_DEPRECATED(decl, "message") +#if __has_feature(attribute_deprecated_with_message) +# define LLVM_ATTRIBUTE_DEPRECATED(decl, message) \ + decl __attribute__((deprecated(message))) +#elif defined(__GNUC__) +# define LLVM_ATTRIBUTE_DEPRECATED(decl, message) \ + decl __attribute__((deprecated)) +#elif defined(_MSC_VER) +# define LLVM_ATTRIBUTE_DEPRECATED(decl, message) \ + __declspec(deprecated(message)) decl +#else +# define LLVM_ATTRIBUTE_DEPRECATED(decl, message) \ + decl +#endif + +/// LLVM_BUILTIN_UNREACHABLE - On compilers which support it, expands +/// to an expression which states that it is undefined behavior for the +/// compiler to reach this point. Otherwise is not defined. +#if __has_builtin(__builtin_unreachable) || __GNUC_PREREQ(4, 5) +# define LLVM_BUILTIN_UNREACHABLE __builtin_unreachable() +#elif defined(_MSC_VER) +# define LLVM_BUILTIN_UNREACHABLE __assume(false) +#endif + +/// LLVM_BUILTIN_TRAP - On compilers which support it, expands to an expression +/// which causes the program to exit abnormally. +#if __has_builtin(__builtin_trap) || __GNUC_PREREQ(4, 3) +# define LLVM_BUILTIN_TRAP __builtin_trap() +#else +# define LLVM_BUILTIN_TRAP *(volatile int*)0x11 = 0 +#endif + +/// \macro LLVM_ASSUME_ALIGNED +/// \brief Returns a pointer with an assumed alignment. +#if __has_builtin(__builtin_assume_aligned) && __GNUC_PREREQ(4, 7) +# define LLVM_ASSUME_ALIGNED(p, a) __builtin_assume_aligned(p, a) +#elif defined(LLVM_BUILTIN_UNREACHABLE) +// As of today, clang does not support __builtin_assume_aligned. +# define LLVM_ASSUME_ALIGNED(p, a) \ + (((uintptr_t(p) % (a)) == 0) ? (p) : (LLVM_BUILTIN_UNREACHABLE, (p))) +#else +# define LLVM_ASSUME_ALIGNED(p, a) (p) +#endif + +/// \macro LLVM_FUNCTION_NAME +/// \brief Expands to __func__ on compilers which support it. 
Otherwise, +/// expands to a compiler-dependent replacement. +#if defined(_MSC_VER) +# define LLVM_FUNCTION_NAME __FUNCTION__ +#else +# define LLVM_FUNCTION_NAME __func__ +#endif + +#if defined(HAVE_SANITIZER_MSAN_INTERFACE_H) +# include +#else +# define __msan_allocated_memory(p, size) +# define __msan_unpoison(p, size) +#endif + +/// \macro LLVM_MEMORY_SANITIZER_BUILD +/// \brief Whether LLVM itself is built with MemorySanitizer instrumentation. +#if __has_feature(memory_sanitizer) +# define LLVM_MEMORY_SANITIZER_BUILD 1 +#else +# define LLVM_MEMORY_SANITIZER_BUILD 0 +#endif + +/// \macro LLVM_ADDRESS_SANITIZER_BUILD +/// \brief Whether LLVM itself is built with AddressSanitizer instrumentation. +#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) +# define LLVM_ADDRESS_SANITIZER_BUILD 1 +#else +# define LLVM_ADDRESS_SANITIZER_BUILD 0 +#endif + +/// \macro LLVM_IS_UNALIGNED_ACCESS_FAST +/// \brief Is unaligned memory access fast on the host machine. +/// +/// Don't specialize on alignment for platforms where unaligned memory accesses +/// generates the same code as aligned memory accesses for common types. +#if defined(_M_AMD64) || defined(_M_IX86) || defined(__amd64) || \ + defined(__amd64__) || defined(__x86_64) || defined(__x86_64__) || \ + defined(_X86_) || defined(__i386) || defined(__i386__) +# define LLVM_IS_UNALIGNED_ACCESS_FAST 1 +#else +# define LLVM_IS_UNALIGNED_ACCESS_FAST 0 +#endif + +/// \macro LLVM_EXPLICIT +/// \brief Expands to explicit on compilers which support explicit conversion +/// operators. Otherwise expands to nothing. +#if __has_feature(cxx_explicit_conversions) || \ + defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1800) +#define LLVM_EXPLICIT explicit +#else +#define LLVM_EXPLICIT +#endif + +/// \macro LLVM_STATIC_ASSERT +/// \brief Expands to C/C++'s static_assert on compilers which support it. +#if __has_feature(cxx_static_assert) || \ + defined(__GXX_EXPERIMENTAL_CXX0X__) || LLVM_MSC_PREREQ(1600) +# define LLVM_STATIC_ASSERT(expr, msg) static_assert(expr, msg) +#elif __has_feature(c_static_assert) +# define LLVM_STATIC_ASSERT(expr, msg) _Static_assert(expr, msg) +#elif __has_extension(c_static_assert) +# define LLVM_STATIC_ASSERT(expr, msg) LLVM_EXTENSION _Static_assert(expr, msg) +#else +# define LLVM_STATIC_ASSERT(expr, msg) +#endif + +/// \macro LLVM_ENUM_INT_TYPE +/// \brief Expands to colon followed by the given integral type on compilers +/// which support C++11 strong enums. This can be used to make enums unsigned +/// with MSVC. +#if __has_feature(cxx_strong_enums) || LLVM_MSC_PREREQ(1600) +# define LLVM_ENUM_INT_TYPE(intty) : intty +#else +# define LLVM_ENUM_INT_TYPE(intty) +#endif + +/// \brief Does the compiler support C++11 semantics for strongly typed forward +/// declared enums? +#if __has_feature(cxx_strong_enums) || LLVM_MSC_PREREQ(1700) +#define LLVM_HAS_STRONG_ENUMS 1 +#else +#define LLVM_HAS_STRONG_ENUMS 0 +#endif + +/// \brief Does the compiler support generalized initializers (using braced +/// lists and std::initializer_list). While clang may claim it supports general +/// initializers, if we're using MSVC's headers, we might not have a usable +/// std::initializer list type from the STL. Disable this for now. +#if __has_feature(cxx_generalized_initializers) && !defined(_MSC_VER) +#define LLVM_HAS_INITIALIZER_LISTS 1 +#else +#define LLVM_HAS_INITIALIZER_LISTS 0 +#endif + +/// \brief Mark debug helper function definitions like dump() that should not be +/// stripped from debug builds. 
+// FIXME: Move this to a private config.h as it's not usable in public headers.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+#define LLVM_DUMP_METHOD LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED
+#else
+#define LLVM_DUMP_METHOD LLVM_ATTRIBUTE_NOINLINE
+#endif
+
+#endif
diff --git a/third_party/llvm/include/llvm/Support/MathExtras.h b/third_party/llvm/include/llvm/Support/MathExtras.h
new file mode 100644
index 000000000..4d2ff0989
--- /dev/null
+++ b/third_party/llvm/include/llvm/Support/MathExtras.h
@@ -0,0 +1,626 @@
+//===-- llvm/Support/MathExtras.h - Useful math functions -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains some functions that are useful for math stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_MATHEXTRAS_H
+#define LLVM_SUPPORT_MATHEXTRAS_H
+
+#include "llvm/Support/Compiler.h"
+#ifdef IGNORED_LLVM_XENIA
+#include "llvm/Support/SwapByteOrder.h"
+#endif  // IGNORED_LLVM_XENIA
+#include "llvm/Support/type_traits.h"
+#include <limits>
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#include <limits.h>
+#endif
+
+namespace llvm {
+/// \brief The behavior an operation has on an input of 0.
+enum ZeroBehavior {
+  /// \brief The returned value is undefined.
+  ZB_Undefined,
+  /// \brief The returned value is numeric_limits<T>::max()
+  ZB_Max,
+  /// \brief The returned value is numeric_limits<T>::digits
+  ZB_Width
+};
+
+/// \brief Count number of 0's from the least significant bit to the most
+/// stopping at the first 1.
+///
+/// Only unsigned integral types are allowed.
+///
+/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
+/// valid arguments.
+template <typename T>
+typename enable_if_c<std::numeric_limits<T>::is_integer &&
+                     !std::numeric_limits<T>::is_signed, std::size_t>::type
+countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
+  (void)ZB;
+
+  if (!Val)
+    return std::numeric_limits<T>::digits;
+  if (Val & 0x1)
+    return 0;
+
+  // Bisection method.
+  std::size_t ZeroBits = 0;
+  T Shift = std::numeric_limits<T>::digits >> 1;
+  T Mask = std::numeric_limits<T>::max() >> Shift;
+  while (Shift) {
+    if ((Val & Mask) == 0) {
+      Val >>= Shift;
+      ZeroBits |= Shift;
+    }
+    Shift >>= 1;
+    Mask >>= Shift;
+  }
+  return ZeroBits;
+}
+
+// Disable signed.
+template <typename T>
+typename enable_if_c<std::numeric_limits<T>::is_integer &&
+                     std::numeric_limits<T>::is_signed, std::size_t>::type
+countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) LLVM_DELETED_FUNCTION;
+
+#if __GNUC__ >= 4 || _MSC_VER
+template <>
+inline std::size_t countTrailingZeros<uint32_t>(uint32_t Val, ZeroBehavior ZB) {
+  if (ZB != ZB_Undefined && Val == 0)
+    return 32;
+
+#if __has_builtin(__builtin_ctz) || __GNUC_PREREQ(4, 0)
+  return __builtin_ctz(Val);
+#elif _MSC_VER
+  unsigned long Index;
+  _BitScanForward(&Index, Val);
+  return Index;
+#endif
+}
+
+#if !defined(_MSC_VER) || defined(_M_X64)
+template <>
+inline std::size_t countTrailingZeros<uint64_t>(uint64_t Val, ZeroBehavior ZB) {
+  if (ZB != ZB_Undefined && Val == 0)
+    return 64;
+
+#if __has_builtin(__builtin_ctzll) || __GNUC_PREREQ(4, 0)
+  return __builtin_ctzll(Val);
+#elif _MSC_VER
+  unsigned long Index;
+  _BitScanForward64(&Index, Val);
+  return Index;
+#endif
+}
+#endif
+#endif
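With the trailing-zero family complete, a small self-check illustrates the contract, including the `ZB_Width` default on a zero input (include path taken from this diff):

```cpp
#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

int main() {
  assert(llvm::countTrailingZeros(uint32_t(0x00000008u)) == 3);
  assert(llvm::countTrailingZeros(uint64_t(1) << 40) == 40);
  // ZB_Width (the default) maps a zero input to the bit width of the type.
  assert(llvm::countTrailingZeros(uint32_t(0)) == 32);
  return 0;
}
```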
+/// \brief Count number of 0's from the most significant bit to the least
+/// stopping at the first 1.
+///
+/// Only unsigned integral types are allowed.
+///
+/// \param ZB the behavior on an input of 0. Only ZB_Width and ZB_Undefined are
+/// valid arguments.
+template <typename T>
+typename enable_if_c<std::numeric_limits<T>::is_integer &&
+                     !std::numeric_limits<T>::is_signed, std::size_t>::type
+countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
+  (void)ZB;
+
+  if (!Val)
+    return std::numeric_limits<T>::digits;
+
+  // Bisection method.
+  std::size_t ZeroBits = 0;
+  for (T Shift = std::numeric_limits<T>::digits >> 1; Shift; Shift >>= 1) {
+    T Tmp = Val >> Shift;
+    if (Tmp)
+      Val = Tmp;
+    else
+      ZeroBits |= Shift;
+  }
+  return ZeroBits;
+}
+
+// Disable signed.
+template <typename T>
+typename enable_if_c<std::numeric_limits<T>::is_integer &&
+                     std::numeric_limits<T>::is_signed, std::size_t>::type
+countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) LLVM_DELETED_FUNCTION;
+
+#if __GNUC__ >= 4 || _MSC_VER
+template <>
+inline std::size_t countLeadingZeros<uint32_t>(uint32_t Val, ZeroBehavior ZB) {
+  if (ZB != ZB_Undefined && Val == 0)
+    return 32;
+
+#if __has_builtin(__builtin_clz) || __GNUC_PREREQ(4, 0)
+  return __builtin_clz(Val);
+#elif _MSC_VER
+  unsigned long Index;
+  _BitScanReverse(&Index, Val);
+  return Index ^ 31;
+#endif
+}
+
+#if !defined(_MSC_VER) || defined(_M_X64)
+template <>
+inline std::size_t countLeadingZeros<uint64_t>(uint64_t Val, ZeroBehavior ZB) {
+  if (ZB != ZB_Undefined && Val == 0)
+    return 64;
+
+#if __has_builtin(__builtin_clzll) || __GNUC_PREREQ(4, 0)
+  return __builtin_clzll(Val);
+#elif _MSC_VER
+  unsigned long Index;
+  _BitScanReverse64(&Index, Val);
+  return Index ^ 63;
+#endif
+}
+#endif
+#endif
+
+/// \brief Get the index of the first set bit starting from the least
+/// significant bit.
+///
+/// Only unsigned integral types are allowed.
+///
+/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
+/// valid arguments.
+template <typename T>
+typename enable_if_c<std::numeric_limits<T>::is_integer &&
+                     !std::numeric_limits<T>::is_signed, T>::type
+findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) {
+  if (ZB == ZB_Max && Val == 0)
+    return std::numeric_limits<T>::max();
+
+  return countTrailingZeros(Val, ZB_Undefined);
+}
+
+// Disable signed.
+template <typename T>
+typename enable_if_c<std::numeric_limits<T>::is_integer &&
+                     std::numeric_limits<T>::is_signed, T>::type
+findFirstSet(T Val, ZeroBehavior ZB = ZB_Max) LLVM_DELETED_FUNCTION;
+
+/// \brief Get the index of the last set bit starting from the least
+/// significant bit.
+///
+/// Only unsigned integral types are allowed.
+///
+/// \param ZB the behavior on an input of 0. Only ZB_Max and ZB_Undefined are
+/// valid arguments.
+template <typename T>
+typename enable_if_c<std::numeric_limits<T>::is_integer &&
+                     !std::numeric_limits<T>::is_signed, T>::type
+findLastSet(T Val, ZeroBehavior ZB = ZB_Max) {
+  if (ZB == ZB_Max && Val == 0)
+    return std::numeric_limits<T>::max();
+
+  // Use ^ instead of - because both gcc and llvm can remove the associated ^
+  // in the __builtin_clz intrinsic on x86.
+  return countLeadingZeros(Val, ZB_Undefined) ^
+         (std::numeric_limits<T>::digits - 1);
+}
+
+// Disable signed.
+template <typename T>
+typename enable_if_c<std::numeric_limits<T>::is_integer &&
+                     std::numeric_limits<T>::is_signed, T>::type
+findLastSet(T Val, ZeroBehavior ZB = ZB_Max) LLVM_DELETED_FUNCTION;
+
+/// \brief Macro compressed bit reversal table for 256 bits.
+///
+/// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
+static const unsigned char BitReverseTable256[256] = {
+#define R2(n) n, n + 2 * 64, n + 1 * 64, n + 3 * 64
+#define R4(n) R2(n), R2(n + 2 * 16), R2(n + 1 * 16), R2(n + 3 * 16)
+#define R6(n) R4(n), R4(n + 2 * 4), R4(n + 1 * 4), R4(n + 3 * 4)
+  R6(0), R6(2), R6(1), R6(3)
+};
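`findFirstSet` and `findLastSet` return bit indices rather than masks, and the default `ZB_Max` policy makes a zero input saturate to the type's maximum value. A quick sketch:

```cpp
#include <cassert>
#include <cstdint>
#include "llvm/Support/MathExtras.h"

int main() {
  assert(llvm::findFirstSet(uint32_t(0x50u)) == 4);  // lowest set bit of 0b1010000
  assert(llvm::findLastSet(uint32_t(0x50u)) == 6);   // highest set bit
  assert(llvm::findFirstSet(uint32_t(0)) == UINT32_MAX);  // ZB_Max default
  return 0;
}
```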
+/// \brief Reverse the bits in \p Val.
+template <typename T>
+T reverseBits(T Val) {
+  unsigned char in[sizeof(Val)];
+  unsigned char out[sizeof(Val)];
+  std::memcpy(in, &Val, sizeof(Val));
+  for (unsigned i = 0; i < sizeof(Val); ++i)
+    out[(sizeof(Val) - i) - 1] = BitReverseTable256[in[i]];
+  std::memcpy(&Val, out, sizeof(Val));
+  return Val;
+}
+
+// NOTE: The following support functions use the _32/_64 extensions instead of
+// type overloading so that signed and unsigned integers can be used without
+// ambiguity.
+
+/// Hi_32 - This function returns the high 32 bits of a 64 bit value.
+inline uint32_t Hi_32(uint64_t Value) {
+  return static_cast<uint32_t>(Value >> 32);
+}
+
+/// Lo_32 - This function returns the low 32 bits of a 64 bit value.
+inline uint32_t Lo_32(uint64_t Value) {
+  return static_cast<uint32_t>(Value);
+}
+
+/// isInt - Checks if an integer fits into the given bit width.
+template<unsigned N>
+inline bool isInt(int64_t x) {
+  return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
+}
+// Template specializations to get better code for common cases.
+template<>
+inline bool isInt<8>(int64_t x) {
+  return static_cast<int8_t>(x) == x;
+}
+template<>
+inline bool isInt<16>(int64_t x) {
+  return static_cast<int16_t>(x) == x;
+}
+template<>
+inline bool isInt<32>(int64_t x) {
+  return static_cast<int32_t>(x) == x;
+}
+
+/// isShiftedInt - Checks if a signed integer is an N bit number shifted
+/// left by S.
+template<unsigned N, unsigned S>
+inline bool isShiftedInt(int64_t x) {
+  return isInt<N+S>(x) && (x % (1<<S) == 0);
+}
+
+/// isUInt - Checks if an unsigned integer fits into the given bit width.
+template<unsigned N>
+inline bool isUInt(uint64_t x) {
+  return N >= 64 || x < (UINT64_C(1)<<(N));
+}
+// Template specializations to get better code for common cases.
+template<>
+inline bool isUInt<8>(uint64_t x) {
+  return static_cast<uint8_t>(x) == x;
+}
+template<>
+inline bool isUInt<16>(uint64_t x) {
+  return static_cast<uint16_t>(x) == x;
+}
+template<>
+inline bool isUInt<32>(uint64_t x) {
+  return static_cast<uint32_t>(x) == x;
+}
+
+/// isShiftedUInt - Checks if a unsigned integer is an N bit number shifted
+/// left by S.
+template<unsigned N, unsigned S>
+inline bool isShiftedUInt(uint64_t x) {
+  return isUInt<N+S>(x) && (x % (1<<S) == 0);
+}
+
+/// isUIntN - Checks if an unsigned integer fits into the given (dynamic)
+/// bit width.
+inline bool isUIntN(unsigned N, uint64_t x) {
+  return x == (x & (~0ULL >> (64 - N)));
+}
+
+/// isIntN - Checks if an signed integer fits into the given (dynamic)
+/// bit width.
+inline bool isIntN(unsigned N, int64_t x) {
+  return N >= 64 || (-(INT64_C(1)<<(N-1)) <= x && x < (INT64_C(1)<<(N-1)));
+}
+
+/// isMask_32 - This function returns true if the argument is a sequence of ones
+/// starting at the least significant bit with the remainder zero (32 bit
+/// version). Ex. isMask_32(0x0000FFFFU) == true.
+inline bool isMask_32(uint32_t Value) {
+  return Value && ((Value + 1) & Value) == 0;
+}
+
+/// isMask_64 - This function returns true if the argument is a sequence of ones
+/// starting at the least significant bit with the remainder zero (64 bit
+/// version).
+inline bool isMask_64(uint64_t Value) {
+  return Value && ((Value + 1) & Value) == 0;
+}
+
+/// isShiftedMask_32 - This function returns true if the argument contains a
+/// sequence of ones with the remainder zero (32 bit version.)
+/// Ex. isShiftedMask_32(0x0000FF00U) == true.
+inline bool isShiftedMask_32(uint32_t Value) {
+  return isMask_32((Value - 1) | Value);
+}
+
+/// isShiftedMask_64 - This function returns true if the argument contains a
+/// sequence of ones with the remainder zero (64 bit version.)
+inline bool isShiftedMask_64(uint64_t Value) {
+  return isMask_64((Value - 1) | Value);
+}
+
+/// isPowerOf2_32 - This function returns true if the argument is a power of
+/// two > 0. Ex. isPowerOf2_32(0x00100000U) == true (32 bit edition.)
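Before the power-of-two predicates that follow, the width and mask checks above can be sanity-checked like so (a sketch; note the extra parentheses `assert` needs around template commas):

```cpp
#include <cassert>
#include "llvm/Support/MathExtras.h"

int main() {
  assert(llvm::isInt<8>(127) && !llvm::isInt<8>(128));    // signed 8-bit range
  assert(llvm::isUInt<8>(255) && !llvm::isUInt<8>(256));  // unsigned 8-bit range
  assert((llvm::isShiftedInt<4, 2>(28)));  // 28 == 7 << 2, and 7 fits in 4 bits
  assert(llvm::isMask_32(0x0000FFFFu));         // ones from bit 0 upward
  assert(llvm::isShiftedMask_32(0x0000FF00u));  // contiguous ones anywhere
  return 0;
}
```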
+inline bool isPowerOf2_32(uint32_t Value) { + return Value && !(Value & (Value - 1)); +} + +/// isPowerOf2_64 - This function returns true if the argument is a power of two +/// > 0 (64 bit edition.) +inline bool isPowerOf2_64(uint64_t Value) { + return Value && !(Value & (Value - int64_t(1L))); +} + +#ifdef IGNORED_LLVM_XENIA +/// ByteSwap_16 - This function returns a byte-swapped representation of the +/// 16-bit argument, Value. +inline uint16_t ByteSwap_16(uint16_t Value) { + return sys::SwapByteOrder_16(Value); +} + +/// ByteSwap_32 - This function returns a byte-swapped representation of the +/// 32-bit argument, Value. +inline uint32_t ByteSwap_32(uint32_t Value) { + return sys::SwapByteOrder_32(Value); +} + +/// ByteSwap_64 - This function returns a byte-swapped representation of the +/// 64-bit argument, Value. +inline uint64_t ByteSwap_64(uint64_t Value) { + return sys::SwapByteOrder_64(Value); +} +#endif // IGNORED_LLVM_XENIA + +/// CountLeadingOnes_32 - this function performs the operation of +/// counting the number of ones from the most significant bit to the first zero +/// bit. Ex. CountLeadingOnes_32(0xFF0FFF00) == 8. +/// Returns 32 if the word is all ones. +inline unsigned CountLeadingOnes_32(uint32_t Value) { + return countLeadingZeros(~Value); +} + +/// CountLeadingOnes_64 - This function performs the operation +/// of counting the number of ones from the most significant bit to the first +/// zero bit (64 bit edition.) +/// Returns 64 if the word is all ones. +inline unsigned CountLeadingOnes_64(uint64_t Value) { + return countLeadingZeros(~Value); +} + +/// CountTrailingOnes_32 - this function performs the operation of +/// counting the number of ones from the least significant bit to the first zero +/// bit. Ex. CountTrailingOnes_32(0x00FF00FF) == 8. +/// Returns 32 if the word is all ones. +inline unsigned CountTrailingOnes_32(uint32_t Value) { + return countTrailingZeros(~Value); +} + +/// CountTrailingOnes_64 - This function performs the operation +/// of counting the number of ones from the least significant bit to the first +/// zero bit (64 bit edition.) +/// Returns 64 if the word is all ones. +inline unsigned CountTrailingOnes_64(uint64_t Value) { + return countTrailingZeros(~Value); +} + +/// CountPopulation_32 - this function counts the number of set bits in a value. +/// Ex. CountPopulation(0xF000F000) = 8 +/// Returns 0 if the word is zero. +inline unsigned CountPopulation_32(uint32_t Value) { +#if __GNUC__ >= 4 + return __builtin_popcount(Value); +#else + uint32_t v = Value - ((Value >> 1) & 0x55555555); + v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; +#endif +} + +/// CountPopulation_64 - this function counts the number of set bits in a value, +/// (64 bit edition.) +inline unsigned CountPopulation_64(uint64_t Value) { +#if __GNUC__ >= 4 + return __builtin_popcountll(Value); +#else + uint64_t v = Value - ((Value >> 1) & 0x5555555555555555ULL); + v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL); + v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); +#endif +} + +/// Log2_32 - This function returns the floor log base 2 of the specified value, +/// -1 if the value is zero. (32 bit edition.) +/// Ex. 
Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 +inline unsigned Log2_32(uint32_t Value) { + return 31 - countLeadingZeros(Value); +} + +/// Log2_64 - This function returns the floor log base 2 of the specified value, +/// -1 if the value is zero. (64 bit edition.) +inline unsigned Log2_64(uint64_t Value) { + return 63 - countLeadingZeros(Value); +} + +/// Log2_32_Ceil - This function returns the ceil log base 2 of the specified +/// value, 32 if the value is zero. (32 bit edition). +/// Ex. Log2_32_Ceil(32) == 5, Log2_32_Ceil(1) == 0, Log2_32_Ceil(6) == 3 +inline unsigned Log2_32_Ceil(uint32_t Value) { + return 32 - countLeadingZeros(Value - 1); +} + +/// Log2_64_Ceil - This function returns the ceil log base 2 of the specified +/// value, 64 if the value is zero. (64 bit edition.) +inline unsigned Log2_64_Ceil(uint64_t Value) { + return 64 - countLeadingZeros(Value - 1); +} + +/// GreatestCommonDivisor64 - Return the greatest common divisor of the two +/// values using Euclid's algorithm. +inline uint64_t GreatestCommonDivisor64(uint64_t A, uint64_t B) { + while (B) { + uint64_t T = B; + B = A % B; + A = T; + } + return A; +} + +/// BitsToDouble - This function takes a 64-bit integer and returns the bit +/// equivalent double. +inline double BitsToDouble(uint64_t Bits) { + union { + uint64_t L; + double D; + } T; + T.L = Bits; + return T.D; +} + +/// BitsToFloat - This function takes a 32-bit integer and returns the bit +/// equivalent float. +inline float BitsToFloat(uint32_t Bits) { + union { + uint32_t I; + float F; + } T; + T.I = Bits; + return T.F; +} + +/// DoubleToBits - This function takes a double and returns the bit +/// equivalent 64-bit integer. Note that copying doubles around +/// changes the bits of NaNs on some hosts, notably x86, so this +/// routine cannot be used if these bits are needed. +inline uint64_t DoubleToBits(double Double) { + union { + uint64_t L; + double D; + } T; + T.D = Double; + return T.L; +} + +/// FloatToBits - This function takes a float and returns the bit +/// equivalent 32-bit integer. Note that copying floats around +/// changes the bits of NaNs on some hosts, notably x86, so this +/// routine cannot be used if these bits are needed. +inline uint32_t FloatToBits(float Float) { + union { + uint32_t I; + float F; + } T; + T.F = Float; + return T.I; +} + +/// Platform-independent wrappers for the C99 isnan() function. +int IsNAN(float f); +int IsNAN(double d); + +/// Platform-independent wrappers for the C99 isinf() function. +int IsInf(float f); +int IsInf(double d); + +/// MinAlign - A and B are either alignments or offsets. Return the minimum +/// alignment that may be assumed after adding the two together. +inline uint64_t MinAlign(uint64_t A, uint64_t B) { + // The largest power of 2 that divides both A and B. + // + // Replace "-Value" by "1+~Value" in the following commented code to avoid + // MSVC warning C4146 + // return (A | B) & -(A | B); + return (A | B) & (1 + ~(A | B)); +} + +/// NextPowerOf2 - Returns the next power of two (in 64-bits) +/// that is strictly greater than A. Returns zero on overflow. +inline uint64_t NextPowerOf2(uint64_t A) { + A |= (A >> 1); + A |= (A >> 2); + A |= (A >> 4); + A |= (A >> 8); + A |= (A >> 16); + A |= (A >> 32); + return A + 1; +} + +/// Returns the power of two which is less than or equal to the given value. +/// Essentially, it is a floor operation across the domain of powers of two. 
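A worked check of the log2/rounding helpers, including `PowerOf2Floor` defined next; the expected values follow directly from the doc comments above:

```cpp
#include <cassert>
#include "llvm/Support/MathExtras.h"

int main() {
  assert(llvm::Log2_32(32) == 5 && llvm::Log2_32(6) == 2);  // floor semantics
  assert(llvm::Log2_32_Ceil(6) == 3);                       // ceil semantics
  assert(llvm::NextPowerOf2(16) == 32);   // strictly greater than the input
  assert(llvm::PowerOf2Floor(20) == 16);  // floor across powers of two
  assert(llvm::MinAlign(24, 16) == 8);    // largest power of 2 dividing both
  return 0;
}
```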
+inline uint64_t PowerOf2Floor(uint64_t A) { + if (!A) return 0; + return 1ull << (63 - countLeadingZeros(A, ZB_Undefined)); +} + +/// Returns the next integer (mod 2**64) that is greater than or equal to +/// \p Value and is a multiple of \p Align. \p Align must be non-zero. +/// +/// Examples: +/// \code +/// RoundUpToAlignment(5, 8) = 8 +/// RoundUpToAlignment(17, 8) = 24 +/// RoundUpToAlignment(~0LL, 8) = 0 +/// \endcode +inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align) { + return ((Value + Align - 1) / Align) * Align; +} + +/// Returns the offset to the next integer (mod 2**64) that is greater than +/// or equal to \p Value and is a multiple of \p Align. \p Align must be +/// non-zero. +inline uint64_t OffsetToAlignment(uint64_t Value, uint64_t Align) { + return RoundUpToAlignment(Value, Align) - Value; +} + +/// abs64 - absolute value of a 64-bit int. Not all environments support +/// "abs" on whatever their name for the 64-bit int type is. The absolute +/// value of the largest negative number is undefined, as with "abs". +inline int64_t abs64(int64_t x) { + return (x < 0) ? -x : x; +} + +/// SignExtend32 - Sign extend B-bit number x to 32-bit int. +/// Usage int32_t r = SignExtend32<5>(x); +template inline int32_t SignExtend32(uint32_t x) { + return int32_t(x << (32 - B)) >> (32 - B); +} + +/// \brief Sign extend number in the bottom B bits of X to a 32-bit int. +/// Requires 0 < B <= 32. +inline int32_t SignExtend32(uint32_t X, unsigned B) { + return int32_t(X << (32 - B)) >> (32 - B); +} + +/// SignExtend64 - Sign extend B-bit number x to 64-bit int. +/// Usage int64_t r = SignExtend64<5>(x); +template inline int64_t SignExtend64(uint64_t x) { + return int64_t(x << (64 - B)) >> (64 - B); +} + +/// \brief Sign extend number in the bottom B bits of X to a 64-bit int. +/// Requires 0 < B <= 64. +inline int64_t SignExtend64(uint64_t X, unsigned B) { + return int64_t(X << (64 - B)) >> (64 - B); +} + +#if defined(_MSC_VER) + // Visual Studio defines the HUGE_VAL class of macros using purposeful + // constant arithmetic overflow, which it then warns on when encountered. + const float huge_valf = std::numeric_limits::infinity(); +#else + const float huge_valf = HUGE_VALF; +#endif +} // End llvm namespace + +#endif diff --git a/third_party/llvm/include/llvm/Support/type_traits.h b/third_party/llvm/include/llvm/Support/type_traits.h new file mode 100644 index 000000000..ad812de98 --- /dev/null +++ b/third_party/llvm/include/llvm/Support/type_traits.h @@ -0,0 +1,244 @@ +//===- llvm/Support/type_traits.h - Simplfied type traits -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides a template class that determines if a type is a class or +// not. The basic mechanism, based on using the pointer to member function of +// a zero argument to a function was "boosted" from the boost type_traits +// library. See http://www.boost.org/ for all the gory details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_TYPE_TRAITS_H +#define LLVM_SUPPORT_TYPE_TRAITS_H + +//#include "llvm/Support/DataTypes.h" +#include +#include + +#ifndef __has_feature +#define LLVM_DEFINED_HAS_FEATURE +#define __has_feature(x) 0 +#endif + +// This is actually the conforming implementation which works with abstract +// classes. However, enough compilers have trouble with it that most will use +// the one in boost/type_traits/object_traits.hpp. This implementation actually +// works with VC7.0, but other interactions seem to fail when we use it. + +namespace llvm { + +namespace dont_use +{ + // These two functions should never be used. They are helpers to + // the is_class template below. They cannot be located inside + // is_class because doing so causes at least GCC to think that + // the value of the "value" enumerator is not constant. Placing + // them out here (for some strange reason) allows the sizeof + // operator against them to magically be constant. This is + // important to make the is_class::value idiom zero cost. it + // evaluates to a constant 1 or 0 depending on whether the + // parameter T is a class or not (respectively). + template char is_class_helper(void(T::*)()); + template double is_class_helper(...); +} + +template +struct is_class +{ + // is_class<> metafunction due to Paul Mensonides (leavings@attbi.com). For + // more details: + // http://groups.google.com/groups?hl=en&selm=000001c1cc83%24e154d5e0%247772e50c%40c161550a&rnum=1 +public: + static const bool value = + sizeof(char) == sizeof(dont_use::is_class_helper(0)); +}; + + +/// isPodLike - This is a type trait that is used to determine whether a given +/// type can be copied around with memcpy instead of running ctors etc. +template +struct isPodLike { +#if __has_feature(is_trivially_copyable) + // If the compiler supports the is_trivially_copyable trait use it, as it + // matches the definition of isPodLike closely. + static const bool value = __is_trivially_copyable(T); +#else + // If we don't know anything else, we can (at least) assume that all non-class + // types are PODs. + static const bool value = !is_class::value; +#endif +}; + +// std::pair's are pod-like if their elements are. +template +struct isPodLike > { + static const bool value = isPodLike::value && isPodLike::value; +}; + + +template +struct integral_constant { + typedef T value_type; + static const value_type value = v; + typedef integral_constant type; + operator value_type() { return value; } +}; + +typedef integral_constant true_type; +typedef integral_constant false_type; + +/// \brief Metafunction that determines whether the two given types are +/// equivalent. +template struct is_same : public false_type {}; +template struct is_same : public true_type {}; + +/// \brief Metafunction that removes const qualification from a type. +template struct remove_const { typedef T type; }; +template struct remove_const { typedef T type; }; + +/// \brief Metafunction that removes volatile qualification from a type. +template struct remove_volatile { typedef T type; }; +template struct remove_volatile { typedef T type; }; + +/// \brief Metafunction that removes both const and volatile qualification from +/// a type. +template struct remove_cv { + typedef typename remove_const::type>::type type; +}; + +/// \brief Helper to implement is_integral metafunction. 
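These hand-rolled traits mirror a subset of C++11 `<type_traits>` for pre-C++11 toolchains. The char-array trick below is the classic C++03 way to turn them into compile-time checks (a sketch; the typedef names are hypothetical):

```cpp
#include "llvm/Support/type_traits.h"

// A negative array size is ill-formed, so each typedef compiles only if the
// trait evaluates the way its name claims.
typedef char AssertSame[llvm::is_same<int, int>::value ? 1 : -1];
typedef char AssertDiff[llvm::is_same<int, long>::value ? -1 : 1];
typedef char AssertPod[llvm::isPodLike<int>::value ? 1 : -1];
typedef char AssertNotClass[llvm::is_class<int>::value ? -1 : 1];

int main() { return 0; }
```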
+template struct is_integral_impl : false_type {}; +template <> struct is_integral_impl< bool> : true_type {}; +template <> struct is_integral_impl< char> : true_type {}; +template <> struct is_integral_impl< signed char> : true_type {}; +template <> struct is_integral_impl : true_type {}; +template <> struct is_integral_impl< wchar_t> : true_type {}; +template <> struct is_integral_impl< short> : true_type {}; +template <> struct is_integral_impl : true_type {}; +template <> struct is_integral_impl< int> : true_type {}; +template <> struct is_integral_impl : true_type {}; +template <> struct is_integral_impl< long> : true_type {}; +template <> struct is_integral_impl : true_type {}; +template <> struct is_integral_impl< long long> : true_type {}; +template <> struct is_integral_impl : true_type {}; + +/// \brief Metafunction that determines whether the given type is an integral +/// type. +template +struct is_integral : is_integral_impl {}; + +/// \brief Metafunction to remove reference from a type. +template struct remove_reference { typedef T type; }; +template struct remove_reference { typedef T type; }; + +/// \brief Metafunction that determines whether the given type is a pointer +/// type. +template struct is_pointer : false_type {}; +template struct is_pointer : true_type {}; +template struct is_pointer : true_type {}; +template struct is_pointer : true_type {}; +template struct is_pointer : true_type {}; + +/// \brief Metafunction that determines wheather the given type is a reference. +template struct is_reference : false_type {}; +template struct is_reference : true_type {}; + +/// \brief Metafunction that determines whether the given type is either an +/// integral type or an enumeration type. +/// +/// Note that this accepts potentially more integral types than we whitelist +/// above for is_integral because it is based on merely being convertible +/// implicitly to an integral type. +template class is_integral_or_enum { + // Provide an overload which can be called with anything implicitly + // convertible to an unsigned long long. This should catch integer types and + // enumeration types at least. We blacklist classes with conversion operators + // below. + static double check_int_convertible(unsigned long long); + static char check_int_convertible(...); + + typedef typename remove_reference::type UnderlyingT; + static UnderlyingT &nonce_instance; + +public: + static const bool + value = (!is_class::value && !is_pointer::value && + !is_same::value && + !is_same::value && + sizeof(char) != sizeof(check_int_convertible(nonce_instance))); +}; + +// enable_if_c - Enable/disable a template based on a metafunction +template +struct enable_if_c { + typedef T type; +}; + +template struct enable_if_c { }; + +// enable_if - Enable/disable a template based on a metafunction +template +struct enable_if : public enable_if_c { }; + +namespace dont_use { + template char base_of_helper(const volatile Base*); + template double base_of_helper(...); +} + +/// is_base_of - Metafunction to determine whether one type is a base class of +/// (or identical to) another type. +template +struct is_base_of { + static const bool value + = is_class::value && is_class::value && + sizeof(char) == sizeof(dont_use::base_of_helper((Derived*)0)); +}; + +// remove_pointer - Metafunction to turn Foo* into Foo. Defined in +// C++0x [meta.trans.ptr]. 
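`enable_if_c` is what gates the unsigned-only overloads in MathExtras.h above: when the condition is false the nested `type` member vanishes and the overload drops out of resolution (SFINAE). A hypothetical gate in the same style:

```cpp
#include "llvm/Support/type_traits.h"

// Only participates in overload resolution for integral T; twice("x") would
// simply fail to find a viable overload rather than produce a deep error.
template <typename T>
typename llvm::enable_if_c<llvm::is_integral<T>::value, T>::type
twice(T Val) {
  return static_cast<T>(Val * 2);
}

int main() {
  return twice(21) == 42 ? 0 : 1;
}
```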
+template struct remove_pointer { typedef T type; }; +template struct remove_pointer { typedef T type; }; +template struct remove_pointer { typedef T type; }; +template struct remove_pointer { typedef T type; }; +template struct remove_pointer { + typedef T type; }; + +// If T is a pointer, just return it. If it is not, return T&. +template +struct add_lvalue_reference_if_not_pointer { typedef T &type; }; + +template +struct add_lvalue_reference_if_not_pointer >::type> { + typedef T type; +}; + +// If T is a pointer to X, return a pointer to const X. If it is not, return +// const T. +template +struct add_const_past_pointer { typedef const T type; }; + +template +struct add_const_past_pointer >::type> { + typedef const typename remove_pointer::type *type; +}; + +template +struct conditional { typedef T type; }; + +template +struct conditional { typedef F type; }; + +} + +#ifdef LLVM_DEFINED_HAS_FEATURE +#undef __has_feature +#endif + +#endif diff --git a/third_party/microprofile/README.md b/third_party/microprofile/README.md new file mode 100644 index 000000000..8b8040b20 --- /dev/null +++ b/third_party/microprofile/README.md @@ -0,0 +1 @@ +https://bitbucket.org/jonasmeyer/microprofile diff --git a/third_party/microprofile/microprofile.h b/third_party/microprofile/microprofile.h new file mode 100644 index 000000000..3e9dca24a --- /dev/null +++ b/third_party/microprofile/microprofile.h @@ -0,0 +1,4075 @@ +#pragma once +// This is free and unencumbered software released into the public domain. +// Anyone is free to copy, modify, publish, use, compile, sell, or +// distribute this software, either in source code form or as a compiled +// binary, for any purpose, commercial or non-commercial, and by any +// means. +// In jurisdictions that recognize copyright laws, the author or authors +// of this software dedicate any and all copyright interest in the +// software to the public domain. We make this dedication for the benefit +// of the public at large and to the detriment of our heirs and +// successors. We intend this dedication to be an overt act of +// relinquishment in perpetuity of all present and future rights to this +// software under copyright law. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+// For more information, please refer to <http://unlicense.org/>
+//
+// ***********************************************************************
+//
+//
+//
+//
+// Howto:
+// Call these functions from your code:
+//  MicroProfileOnThreadCreate
+//  MicroProfileMouseButton
+//  MicroProfileMousePosition
+//  MicroProfileModKey
+//  MicroProfileFlip                <-- Call this once per frame
+//  MicroProfileDraw                <-- Call this once per frame
+//  MicroProfileToggleDisplayMode   <-- Bind to a key to toggle profiling
+//  MicroProfileTogglePause         <-- Bind to a key to toggle pause
+//
+// Use these macros in your code in blocks you want to time:
+//
+//  MICROPROFILE_DECLARE
+//  MICROPROFILE_DEFINE
+//  MICROPROFILE_DECLARE_GPU
+//  MICROPROFILE_DEFINE_GPU
+//  MICROPROFILE_SCOPE
+//  MICROPROFILE_SCOPEI
+//  MICROPROFILE_SCOPEGPU
+//  MICROPROFILE_SCOPEGPUI
+//  MICROPROFILE_META
+//
+//
+// Usage:
+//
+//  {
+//    MICROPROFILE_SCOPEI("GroupName", "TimerName", nColorRgb);
+//    ..Code to be timed..
+//  }
+//
+// MICROPROFILE_DECLARE / MICROPROFILE_DEFINE allow defining groups in a
+// shared place, to ensure sorting of the timers
+//
+//  (in global scope)
+//  MICROPROFILE_DEFINE(g_ProfileFisk, "Fisk", "Skalle", nSomeColorRgb);
+//
+//  (in some other file)
+//  MICROPROFILE_DECLARE(g_ProfileFisk);
+//
+//  void foo(){
+//    MICROPROFILE_SCOPE(g_ProfileFisk);
+//  }
+//
+// Once code is instrumented the gui is activated by calling
+// MicroProfileToggleDisplayMode or by clicking in the upper left corner of
+// the screen.
+//
+// The following functions must be implemented before the profiler is usable
+//  debug render:
+//   void MicroProfileDrawText(int nX, int nY, uint32_t nColor, const char* pText, uint32_t nNumCharacters);
+//   void MicroProfileDrawBox(int nX, int nY, int nX1, int nY1, uint32_t nColor, MicroProfileBoxType = MicroProfileBoxTypeFlat);
+//   void MicroProfileDrawLine2D(uint32_t nVertices, float* pVertices, uint32_t nColor);
+//  Gpu time stamps:
+//   uint32_t MicroProfileGpuInsertTimeStamp();
+//   uint64_t MicroProfileGpuGetTimeStamp(uint32_t nKey);
+//   uint64_t MicroProfileTicksPerSecondGpu();
+//  threading:
+//   const char* MicroProfileGetThreadName(); Threadnames in detailed view
+
+
+#ifndef MICROPROFILE_ENABLED
+#define MICROPROFILE_ENABLED 1
+#endif
+
+#if 0 == MICROPROFILE_ENABLED
+
+#define MICROPROFILE_DECLARE(var)
+#define MICROPROFILE_DEFINE(var, group, name, color)
+#define MICROPROFILE_DECLARE_GPU(var)
+#define MICROPROFILE_DEFINE_GPU(var, group, name, color)
+#define MICROPROFILE_SCOPE(var) do{}while(0)
+#define MICROPROFILE_SCOPEI(group, name, color) do{}while(0)
+#define MICROPROFILE_SCOPEGPU(var) do{}while(0)
+#define MICROPROFILE_SCOPEGPUI(group, name, color) do{}while(0)
+#define MICROPROFILE_META(name, count)
+#define MICROPROFILE_FORCEENABLECPUGROUP(s) do{} while(0)
+#define MICROPROFILE_FORCEDISABLECPUGROUP(s) do{} while(0)
+#define MICROPROFILE_FORCEENABLEGPUGROUP(s) do{} while(0)
+#define MICROPROFILE_FORCEDISABLEGPUGROUP(s) do{} while(0)
+
+#define MicroProfileGetTime(group, name) 0.f
+#define MicroProfileOnThreadCreate(foo) do{}while(0)
+#define MicroProfileMouseButton(foo, bar) do{}while(0)
+#define MicroProfileMousePosition(foo, bar) do{}while(0)
+#define MicroProfileModKey(key) do{}while(0)
+#define MicroProfileFlip() do{}while(0)
+#define MicroProfileDraw(foo, bar) do{}while(0)
+#define MicroProfileIsDrawing() 0
+#define MicroProfileToggleDisplayMode() do{}while(0)
+#define MicroProfileSetDisplayMode() do{}while(0)
+#define MicroProfileTogglePause() do{}while(0)
+#define MicroProfileDumpTimers() do{}while(0)
+
+#else
+
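The enabled branch begins here. As the howto above notes, integration boils down to one scope macro per timed block plus a Flip/Draw pair per frame. A hedged sketch of a host loop (function names hypothetical; the MicroProfileDrawText/DrawBox/DrawLine2D callbacks must still be supplied by the application before anything renders):

```cpp
#define MICRO_PROFILE_IMPL  // in exactly one translation unit
#include "third_party/microprofile/microprofile.h"  // path per this diff

void UpdateSimulation() {
  MICROPROFILE_SCOPEI("App", "Update", 0xff00ff00);  // group, timer, color
  // ... work to be timed ...
}

void RunFrameLoop(bool (*PumpEvents)(), uint32_t Width, uint32_t Height) {
  MicroProfileOnThreadCreate("Main");
  MicroProfileToggleDisplayMode();  // or bind to a key, as suggested above
  while (PumpEvents()) {
    UpdateSimulation();
    MicroProfileFlip();               // once per frame: rotate per-thread logs
    MicroProfileDraw(Width, Height);  // once per frame: render the overlay
  }
}
```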
+#include +#include + +#if defined(__APPLE__) +#include +#include +#include +#include +#include +#if TARGET_OS_IPHONE +#define MICROPROFILE_IOS +#endif + +#define MP_TICK() mach_absolute_time() +inline int64_t MicroProfileTicksPerSecondCpu() +{ + static int64_t nTicksPerSecond = 0; + if(nTicksPerSecond == 0) + { + mach_timebase_info_data_t sTimebaseInfo; + mach_timebase_info(&sTimebaseInfo); + nTicksPerSecond = 1000000000ll * sTimebaseInfo.denom / sTimebaseInfo.numer; + } + return nTicksPerSecond; +} + +#define MP_BREAK() __builtin_trap() +#define MP_THREAD_LOCAL __thread +#define MP_STRCASECMP strcasecmp +#define MP_GETCURRENTTHREADID() (uint64_t)pthread_self() +typedef uint64_t ThreadIdType; + +#elif defined(_WIN32) +int64_t MicroProfileTicksPerSecondCpu(); +int64_t MicroProfileGetTick(); +#define MP_TICK() MicroProfileGetTick() +#define MP_BREAK() __debugbreak() +#define MP_THREAD_LOCAL __declspec(thread) +#define MP_STRCASECMP _stricmp +#define MP_GETCURRENTTHREADID() GetCurrentThreadId() +typedef uint32_t ThreadIdType; + +#elif defined(__linux__) +#include +#include +inline int64_t MicroProfileTicksPerSecondCpu() +{ + return 1000000000ll; +} + +inline int64_t MicroProfileGetTick() +{ + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + return 1000000000ll * ts.tv_sec + ts.tv_nsec; +} +#define MP_TICK() MicroProfileGetTick() +#define MP_BREAK() __builtin_trap() +#define MP_THREAD_LOCAL __thread +#define MP_STRCASECMP strcasecmp +#define MP_GETCURRENTTHREADID() (uint64_t)pthread_self() +typedef uint64_t ThreadIdType; +#endif + +#ifndef MP_GETCURRENTTHREADID +#define MP_GETCURRENTTHREADID() 0 +typedef uint32_t ThreadIdType; +#endif + +#ifndef MICROPROFILE_API +#define MICROPROFILE_API +#endif + +#define MP_ASSERT(a) do{if(!(a)){MP_BREAK();} }while(0) +#define MICROPROFILE_DECLARE(var) extern MicroProfileToken g_mp_##var +#define MICROPROFILE_DEFINE(var, group, name, color) MicroProfileToken g_mp_##var = MicroProfileGetToken(group, name, color, MicroProfileTokenTypeCpu) +#define MICROPROFILE_DECLARE_GPU(var) extern MicroProfileToken g_mp_##var +#define MICROPROFILE_DEFINE_GPU(var, group, name, color) MicroProfileToken g_mp_##var = MicroProfileGetToken(group, name, color, MicroProfileTokenTypeGpu) +#define MICROPROFILE_TOKEN_PASTE0(a, b) a ## b +#define MICROPROFILE_TOKEN_PASTE(a, b) MICROPROFILE_TOKEN_PASTE0(a,b) +#define MICROPROFILE_SCOPE(var) MicroProfileScopeHandler MICROPROFILE_TOKEN_PASTE(foo, __LINE__)(g_mp_##var) +#define MICROPROFILE_SCOPEI(group, name, color) static MicroProfileToken MICROPROFILE_TOKEN_PASTE(g_mp,__LINE__) = MicroProfileGetToken(group, name, color, MicroProfileTokenTypeCpu); MicroProfileScopeHandler MICROPROFILE_TOKEN_PASTE(foo,__LINE__)( MICROPROFILE_TOKEN_PASTE(g_mp,__LINE__)) +#define MICROPROFILE_SCOPEGPU(var) MicroProfileScopeGpuHandler MICROPROFILE_TOKEN_PASTE(foo, __LINE__)(g_mp_##var) +#define MICROPROFILE_SCOPEGPUI(group, name, color) static MicroProfileToken MICROPROFILE_TOKEN_PASTE(g_mp,__LINE__) = MicroProfileGetToken(group, name, color, MicroProfileTokenTypeGpu); MicroProfileScopeGpuHandler MICROPROFILE_TOKEN_PASTE(foo,__LINE__)( MICROPROFILE_TOKEN_PASTE(g_mp,__LINE__)) +#define MICROPROFILE_META_CPU(name, count) static MicroProfileToken MICROPROFILE_TOKEN_PASTE(g_mp_meta,__LINE__) = MicroProfileGetMetaToken(name); MicroProfileMetaUpdate(MICROPROFILE_TOKEN_PASTE(g_mp_meta,__LINE__), count, MicroProfileTokenTypeCpu) +#define MICROPROFILE_META_GPU(name, count) static MicroProfileToken MICROPROFILE_TOKEN_PASTE(g_mp_meta,__LINE__) = 
MicroProfileGetMetaToken(name); MicroProfileMetaUpdate(MICROPROFILE_TOKEN_PASTE(g_mp_meta,__LINE__), count, MicroProfileTokenTypeGpu) + +///configuration + +#ifndef MICROPROFILE_TEXT_WIDTH +#define MICROPROFILE_TEXT_WIDTH 5 +#endif + +#ifndef MICROPROFILE_TEXT_HEIGHT +#define MICROPROFILE_TEXT_HEIGHT 8 +#endif + +#ifndef MICROPROFILE_DETAILED_BAR_HEIGHT +#define MICROPROFILE_DETAILED_BAR_HEIGHT 12 +#endif + +#ifndef MICROPROFILE_DETAILED_CONTEXT_SWITCH_HEIGHT +#define MICROPROFILE_DETAILED_CONTEXT_SWITCH_HEIGHT 7 +#endif + +#ifndef MICROPROFILE_GRAPH_WIDTH +#define MICROPROFILE_GRAPH_WIDTH 256 +#endif + +#ifndef MICROPROFILE_GRAPH_HEIGHT +#define MICROPROFILE_GRAPH_HEIGHT 256 +#endif + +#ifndef MICROPROFILE_BORDER_SIZE +#define MICROPROFILE_BORDER_SIZE 1 +#endif + +#ifndef MICROPROFILE_USE_THREAD_NAME_CALLBACK +#define MICROPROFILE_USE_THREAD_NAME_CALLBACK 0 +#endif + +#ifndef MICROPROFILE_DRAWCURSOR +#define MICROPROFILE_DRAWCURSOR 0 +#endif + +#ifndef MICROPROFILE_DETAILED_BAR_NAMES +#define MICROPROFILE_DETAILED_BAR_NAMES 1 +#endif + +#ifndef MICROPROFILE_GPU_FRAME_DELAY +#define MICROPROFILE_GPU_FRAME_DELAY 3 //must be > 0 +#endif + +#ifndef MICROPROFILE_PER_THREAD_BUFFER_SIZE +#define MICROPROFILE_PER_THREAD_BUFFER_SIZE (2048<<10) +#endif + +#ifndef MICROPROFILE_HELP_LEFT +#define MICROPROFILE_HELP_LEFT "Left-Click" +#endif + +#ifndef MICROPROFILE_HELP_ALT +#define MICROPROFILE_HELP_ALT "Alt-Click" +#endif + +#ifndef MICROPROFILE_HELP_MOD +#define MICROPROFILE_HELP_MOD "Mod" +#endif + +#ifndef MICROPROFILE_PRINTF +#define MICROPROFILE_PRINTF printf +#endif + +#ifndef MICROPROFILE_META_MAX +#define MICROPROFILE_META_MAX 8 +#endif + + + +#define MICROPROFILE_FORCEENABLECPUGROUP(s) MicroProfileForceEnableGroup(s, MicroProfileTokenTypeCpu) +#define MICROPROFILE_FORCEDISABLECPUGROUP(s) MicroProfileForceDisableGroup(s, MicroProfileTokenTypeCpu) +#define MICROPROFILE_FORCEENABLEGPUGROUP(s) MicroProfileForceEnableGroup(s, MicroProfileTokenTypeGpu) +#define MICROPROFILE_FORCEDISABLEGPUGROUP(s) MicroProfileForceDisableGroup(s, MicroProfileTokenTypeGpu) + +#define MICROPROFILE_INVALID_TICK ((uint64_t)-1) +#define MICROPROFILE_GROUP_MASK_ALL 0xffffffffffff + + +typedef uint64_t MicroProfileToken; +typedef uint16_t MicroProfileGroupId; + +#define MICROPROFILE_INVALID_TOKEN (uint64_t)-1 + +enum MicroProfileTokenType +{ + MicroProfileTokenTypeCpu, + MicroProfileTokenTypeGpu, +}; +enum MicroProfileBoxType +{ + MicroProfileBoxTypeBar, + MicroProfileBoxTypeFlat, +}; + +struct MicroProfileState +{ + uint32_t nDisplay; + uint32_t nMenuAllGroups; + uint64_t nMenuActiveGroup; + uint32_t nMenuAllThreads; + uint32_t nAggregateFlip; + uint32_t nBars; + float fReferenceTime; +}; + + +MICROPROFILE_API void MicroProfileInit(); +MICROPROFILE_API void MicroProfileShutdown(); +MICROPROFILE_API MicroProfileToken MicroProfileFindToken(const char* sGroup, const char* sName); +MICROPROFILE_API MicroProfileToken MicroProfileGetToken(const char* sGroup, const char* sName, uint32_t nColor, MicroProfileTokenType Token = MicroProfileTokenTypeCpu); +MICROPROFILE_API MicroProfileToken MicroProfileGetMetaToken(const char* pName); +MICROPROFILE_API void MicroProfileMetaUpdate(MicroProfileToken, int nCount, MicroProfileTokenType eTokenType); +MICROPROFILE_API uint64_t MicroProfileEnter(MicroProfileToken nToken); +MICROPROFILE_API void MicroProfileLeave(MicroProfileToken nToken, uint64_t nTick); +MICROPROFILE_API uint64_t MicroProfileGpuEnter(MicroProfileToken nToken); +MICROPROFILE_API void MicroProfileGpuLeave(MicroProfileToken 
nToken, uint64_t nTick); +inline uint16_t MicroProfileGetTimerIndex(MicroProfileToken t){ return (t&0xffff); } +inline uint64_t MicroProfileGetGroupMask(MicroProfileToken t){ return ((t>>16)&MICROPROFILE_GROUP_MASK_ALL);} +inline MicroProfileToken MicroProfileMakeToken(uint64_t nGroupMask, uint16_t nTimer){ return (nGroupMask<<16) | nTimer;} + +MICROPROFILE_API void MicroProfileFlip(); //! called once per frame. +MICROPROFILE_API void MicroProfileDraw(uint32_t nWidth, uint32_t nHeight); //! call if drawing microprofilers +MICROPROFILE_API bool MicroProfileIsDrawing(); +MICROPROFILE_API void MicroProfileToggleGraph(MicroProfileToken nToken); +MICROPROFILE_API bool MicroProfileDrawGraph(uint32_t nScreenWidth, uint32_t nScreenHeight); +MICROPROFILE_API void MicroProfileSetAggregateCount(uint32_t nCount); //!Set no. of frames to aggregate over. 0 for infinite +MICROPROFILE_API void MicroProfileToggleDisplayMode(); //switch between off, bars, detailed +MICROPROFILE_API void MicroProfileSetDisplayMode(int); //switch between off, bars, detailed +MICROPROFILE_API void MicroProfileClearGraph(); +MICROPROFILE_API void MicroProfileTogglePause(); +MICROPROFILE_API void MicroProfileGetState(MicroProfileState* pStateOut); +MICROPROFILE_API void MicroProfileSetState(MicroProfileState* pStateIn); +MICROPROFILE_API void MicroProfileForceEnableGroup(const char* pGroup, MicroProfileTokenType Type); +MICROPROFILE_API void MicroProfileForceDisableGroup(const char* pGroup, MicroProfileTokenType Type); +MICROPROFILE_API float MicroProfileGetTime(const char* pGroup, const char* pName); +MICROPROFILE_API void MicroProfileMousePosition(uint32_t nX, uint32_t nY, int nWheelDelta); +MICROPROFILE_API void MicroProfileModKey(uint32_t nKeyState); +MICROPROFILE_API void MicroProfileMouseButton(uint32_t nLeft, uint32_t nRight); +MICROPROFILE_API void MicroProfileOnThreadCreate(const char* pThreadName); //should be called from newly created threads +MICROPROFILE_API void MicroProfileOnThreadExit(); //call on exit to reuse log +MICROPROFILE_API void MicroProfileInitThreadLog(); +MICROPROFILE_API void MicroProfileDrawLineVertical(int nX, int nTop, int nBottom, uint32_t nColor); +MICROPROFILE_API void MicroProfileDrawLineHorizontal(int nLeft, int nRight, int nY, uint32_t nColor); +MICROPROFILE_API void MicroProfileDumpTimers(); + + + +//UNDEFINED: MUST BE IMPLEMENTED ELSEWHERE +MICROPROFILE_API void MicroProfileDrawText(int nX, int nY, uint32_t nColor, const char* pText, uint32_t nNumCharacters); +MICROPROFILE_API void MicroProfileDrawBox(int nX, int nY, int nX1, int nY1, uint32_t nColor, MicroProfileBoxType = MicroProfileBoxTypeFlat); +MICROPROFILE_API void MicroProfileDrawLine2D(uint32_t nVertices, float* pVertices, uint32_t nColor); +MICROPROFILE_API uint32_t MicroProfileGpuInsertTimeStamp(); +MICROPROFILE_API uint64_t MicroProfileGpuGetTimeStamp(uint32_t nKey); +MICROPROFILE_API uint64_t MicroProfileTicksPerSecondGpu(); +#if MICROPROFILE_USE_THREAD_NAME_CALLBACK +MICROPROFILE_API const char* MicroProfileGetThreadName(); +#else +#define MicroProfileGetThreadName() "" +#endif + +struct MicroProfileScopeHandler +{ + MicroProfileToken nToken; + uint64_t nTick; + MicroProfileScopeHandler(MicroProfileToken Token):nToken(Token) + { + nTick = MicroProfileEnter(nToken); + } + ~MicroProfileScopeHandler() + { + MicroProfileLeave(nToken, nTick); + } +}; + +struct MicroProfileScopeGpuHandler +{ + MicroProfileToken nToken; + uint64_t nTick; + MicroProfileScopeGpuHandler(MicroProfileToken Token):nToken(Token) + { + nTick = 
MicroProfileGpuEnter(nToken);
+	}
+	~MicroProfileScopeGpuHandler()
+	{
+		MicroProfileGpuLeave(nToken, nTick);
+	}
+};
+
+
+#ifdef MICRO_PROFILE_IMPL
+
+#ifdef _WIN32
+#include <windows.h>
+#define snprintf _snprintf
+
+#pragma warning(push)
+#pragma warning(disable: 4244)
+int64_t MicroProfileTicksPerSecondCpu()
+{
+	static int64_t nTicksPerSecond = 0;
+	if(nTicksPerSecond == 0)
+	{
+		QueryPerformanceFrequency((LARGE_INTEGER*)&nTicksPerSecond);
+	}
+	return nTicksPerSecond;
+}
+int64_t MicroProfileGetTick()
+{
+	int64_t ticks;
+	QueryPerformanceCounter((LARGE_INTEGER*)&ticks);
+	return ticks;
+}
+
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <thread>
+#include <mutex>
+#include <atomic>
+#include <algorithm>
+
+#define S g_MicroProfile
+#define MICROPROFILE_MAX_TIMERS 1024
+#define MICROPROFILE_MAX_GROUPS 48 //don't bump! no. of bits used in bitmask
+#define MICROPROFILE_MAX_GRAPHS 5
+#define MICROPROFILE_GRAPH_HISTORY 128
+#define MICROPROFILE_BUFFER_SIZE ((MICROPROFILE_PER_THREAD_BUFFER_SIZE)/sizeof(MicroProfileLogEntry))
+#define MICROPROFILE_MAX_THREADS 32
+#define MICROPROFILE_MAX_CONTEXT_SWITCH_THREADS 256
+#define MICROPROFILE_STACK_MAX 128
+#define MICROPROFILE_MAX_PRESETS 5
+#define MICROPROFILE_DEBUG 0
+#define MICROPROFILE_TOOLTIP_MAX_STRINGS (32 + MICROPROFILE_MAX_GROUPS*2)
+#define MICROPROFILE_TOOLTIP_STRING_BUFFER_SIZE 1024
+#define MICROPROFILE_TOOLTIP_MAX_LOCKED 3
+#define MICROPROFILE_MAX_FRAME_HISTORY 512
+#define MICROPROFILE_ANIM_DELAY_PRC 0.5f
+#define MICROPROFILE_GAP_TIME 50 //extra ms to fetch, to close timers from earlier frames
+
+#ifndef MICROPROFILE_CONTEXT_SWITCH_TRACE
+#ifdef _WIN32
+#define MICROPROFILE_CONTEXT_SWITCH_TRACE 1
+#else
+#define MICROPROFILE_CONTEXT_SWITCH_TRACE 0
+#endif
+#endif
+
+#if MICROPROFILE_CONTEXT_SWITCH_TRACE
+#define MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE (128*1024) //2mb with 16 byte entry size
+#else
+#define MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE (1)
+#endif
+
+enum MicroProfileDrawMask
+{
+	MP_DRAW_OFF = 0x0,
+	MP_DRAW_BARS = 0x1,
+	MP_DRAW_DETAILED = 0x2,
+	MP_DRAW_HIDDEN = 0x3,
+};
+
+enum MicroProfileDrawBarsMask
+{
+	MP_DRAW_TIMERS = 0x1,
+	MP_DRAW_AVERAGE = 0x2,
+	MP_DRAW_MAX = 0x4,
+	MP_DRAW_CALL_COUNT = 0x8,
+	MP_DRAW_TIMERS_EXCLUSIVE = 0x10,
+	MP_DRAW_AVERAGE_EXCLUSIVE = 0x20,
+	MP_DRAW_MAX_EXCLUSIVE = 0x40,
+	MP_DRAW_META_FIRST = 0x80,
+	MP_DRAW_ALL = 0xffffffff,
+};
+
+struct MicroProfileTimer
+{
+	uint64_t nTicks;
+	uint32_t nCount;
+};
+
+struct MicroProfileGroupInfo
+{
+	const char* pName;
+	uint32_t nNameLen;
+	uint32_t nGroupIndex;
+	uint32_t nNumTimers;
+	uint32_t nMaxTimerNameLen;
+	MicroProfileTokenType Type;
+};
+
+struct MicroProfileTimerInfo
+{
+	MicroProfileToken nToken;
+	uint32_t nTimerIndex;
+	uint32_t nGroupIndex;
+	const char* pName;
+	uint32_t nNameLen;
+	uint32_t nColor;
+};
+
+struct MicroProfileGraphState
+{
+	int64_t nHistory[MICROPROFILE_GRAPH_HISTORY];
+	MicroProfileToken nToken;
+	int32_t nKey;
+};
+
+struct MicroProfileContextSwitch
+{
+	ThreadIdType nThreadOut;
+	ThreadIdType nThreadIn;
+	int64_t nCpu : 8;
+	int64_t nTicks : 56;
+};
+
+#define MP_LOG_TICK_MASK 0x0000ffffffffffff
+#define MP_LOG_INDEX_MASK 0x3fff000000000000
+#define MP_LOG_BEGIN_MASK 0xc000000000000000
+#define MP_LOG_META 0x1
+#define MP_LOG_ENTER 0x2
+#define MP_LOG_LEAVE 0x0
+typedef uint64_t MicroProfileLogEntry;
+
+inline int MicroProfileLogType(MicroProfileLogEntry Index)
+{
+	return ((MP_LOG_BEGIN_MASK & Index)>>62) & 0x3;
+}
+
+inline uint64_t MicroProfileLogTimerIndex(MicroProfileLogEntry Index)
+{
+	return (0x3fff&(Index>>48));
+}
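+// A log entry packs three fields into one uint64_t:
+//   bits 62..63  entry type (MP_LOG_ENTER / MP_LOG_LEAVE / MP_LOG_META)
+//   bits 48..61  timer index (14 bits, mask 0x3fff)
+//   bits  0..47  timestamp tick (MP_LOG_TICK_MASK)
+// e.g. an enter event for timer 5 at tick 1000 encodes as
+// ((uint64_t)MP_LOG_ENTER<<62) | (5ull<<48) | 1000.
+// MicroProfileLogTickDifference below shifts both ticks up by 16 bits
+// before subtracting, so a wrapped 48-bit tick still yields the correct
+// signed difference after the arithmetic shift back down.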
+inline MicroProfileLogEntry MicroProfileMakeLogIndex(uint64_t nBegin, MicroProfileToken nToken, int64_t nTick)
+{
+	MicroProfileLogEntry Entry = (nBegin<<62) | ((0x3fff&nToken)<<48) | (MP_LOG_TICK_MASK&nTick);
+	int t = MicroProfileLogType(Entry);
+	uint64_t nTimerIndex = MicroProfileLogTimerIndex(Entry);
+	MP_ASSERT(t == nBegin);
+	MP_ASSERT(nTimerIndex == (nToken&0x3fff));
+	return Entry;
+}
+
+inline int64_t MicroProfileLogTickDifference(MicroProfileLogEntry Start, MicroProfileLogEntry End)
+{
+	uint64_t nStart = Start;
+	uint64_t nEnd = End;
+	int64_t nDifference = ((nEnd<<16) - (nStart<<16));
+	return nDifference >> 16;
+}
+
+inline int64_t MicroProfileLogGetTick(MicroProfileLogEntry e)
+{
+	return MP_LOG_TICK_MASK & e;
+}
+
+inline int64_t MicroProfileLogSetTick(MicroProfileLogEntry e, int64_t nTick)
+{
+	return (MP_LOG_TICK_MASK & nTick) | (e & ~MP_LOG_TICK_MASK);
+}
+
+struct MicroProfileFrameState
+{
+	int64_t nFrameStartCpu;
+	int64_t nFrameStartGpu;
+	uint32_t nLogStart[MICROPROFILE_MAX_GROUPS];
+};
+
+struct MicroProfileThreadLog
+{
+	MicroProfileThreadLog* pNext;
+	MicroProfileLogEntry Log[MICROPROFILE_BUFFER_SIZE];
+
+	std::atomic<uint32_t> nPut;
+	std::atomic<uint32_t> nGet;
+	uint32_t nActive;
+	uint32_t nGpu;
+	ThreadIdType nThreadId;
+	enum
+	{
+		THREAD_MAX_LEN = 64,
+	};
+	char ThreadName[64];
+	int nFreeListNext;
+};
+
+struct MicroProfileStringArray
+{
+	const char* ppStrings[MICROPROFILE_TOOLTIP_MAX_STRINGS];
+	char Buffer[MICROPROFILE_TOOLTIP_STRING_BUFFER_SIZE];
+	char* pBufferPos;
+	uint32_t nNumStrings;
+};
+
+struct
+{
+	uint32_t nTotalTimers;
+	uint32_t nGroupCount;
+	uint32_t nAggregateFlip;
+	uint32_t nAggregateFlipCount;
+	uint32_t nAggregateFrames;
+
+	uint32_t nDisplay;
+	uint32_t nBars;
+	uint64_t nActiveGroup;
+	uint32_t nActiveBars;
+
+	uint64_t nForceGroup;
+
+	//menu/mouse over stuff
+	uint64_t nMenuActiveGroup;
+	uint32_t nMenuAllGroups;
+	uint32_t nMenuAllThreads;
+	uint64_t nHoverToken;
+	int64_t nHoverTime;
+	int nHoverFrame;
+#if MICROPROFILE_DEBUG
+	uint64_t nHoverAddressEnter;
+	uint64_t nHoverAddressLeave;
+#endif
+	uint32_t nOverflow;
+
+	uint64_t nGroupMask;
+	uint32_t nRunning;
+	uint32_t nMaxGroupSize;
+
+	float fGraphBaseTime; //old, kill
+	float fGraphBaseTimePos; //old, kill
+	float fReferenceTime;
+	float fRcpReferenceTime;
+	uint32_t nOpacityBackground;
+	uint32_t nOpacityForeground;
+
+	float fDetailedOffset; //display offset relative to start of latest displayable frame.
+	float fDetailedRange; //no. of ms to display
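+	// fDetailedOffset/fDetailedRange are eased toward the *Target values
+	// below: each MicroProfileFlip() moves them by MICROPROFILE_ANIM_DELAY_PRC
+	// of the remaining distance, which is what makes the detailed view pan
+	// and zoom smoothly.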
+
+	float fDetailedOffsetTarget;
+	float fDetailedRangeTarget;
+
+	int nOffsetY;
+
+	uint32_t nWidth;
+	uint32_t nHeight;
+
+	uint32_t nBarWidth;
+	uint32_t nBarHeight;
+
+	MicroProfileGroupInfo GroupInfo[MICROPROFILE_MAX_GROUPS];
+	MicroProfileTimerInfo TimerInfo[MICROPROFILE_MAX_TIMERS];
+
+	MicroProfileTimer AggregateTimers[MICROPROFILE_MAX_TIMERS];
+	uint64_t MaxTimers[MICROPROFILE_MAX_TIMERS];
+	uint64_t AggregateTimersExclusive[MICROPROFILE_MAX_TIMERS];
+	uint64_t MaxTimersExclusive[MICROPROFILE_MAX_TIMERS];
+
+	MicroProfileTimer Frame[MICROPROFILE_MAX_TIMERS];
+	uint64_t FrameExclusive[MICROPROFILE_MAX_TIMERS];
+
+	MicroProfileTimer Aggregate[MICROPROFILE_MAX_TIMERS];
+	uint64_t AggregateMax[MICROPROFILE_MAX_TIMERS];
+	uint64_t AggregateExclusive[MICROPROFILE_MAX_TIMERS];
+	uint64_t AggregateMaxExclusive[MICROPROFILE_MAX_TIMERS];
+
+	struct
+	{
+		uint64_t nCounters[MICROPROFILE_MAX_TIMERS];
+		const char* pName;
+	} MetaCounters[MICROPROFILE_META_MAX];
+
+	MicroProfileGraphState Graph[MICROPROFILE_MAX_GRAPHS];
+	uint32_t nGraphPut;
+
+	uint32_t nMouseX;
+	uint32_t nMouseY;
+	int nMouseWheelDelta;
+	uint32_t nMouseDownLeft;
+	uint32_t nMouseDownRight;
+	uint32_t nMouseLeft;
+	uint32_t nMouseRight;
+	uint32_t nMouseLeftMod;
+	uint32_t nMouseRightMod;
+	uint32_t nModDown;
+	uint32_t nActiveMenu;
+
+	uint32_t nThreadActive[MICROPROFILE_MAX_THREADS];
+	MicroProfileThreadLog* Pool[MICROPROFILE_MAX_THREADS];
+	uint32_t nNumLogs;
+	uint32_t nMemUsage;
+	int nFreeListHead;
+
+	uint32_t nFrameCurrent;
+	uint32_t nFramePut;
+
+	MicroProfileFrameState Frames[MICROPROFILE_MAX_FRAME_HISTORY];
+
+	MicroProfileLogEntry* pDisplayMouseOver;
+
+	uint64_t nFlipTicks;
+	uint64_t nFlipAggregate;
+	uint64_t nFlipMax;
+	uint64_t nFlipAggregateDisplay;
+	uint64_t nFlipMaxDisplay;
+
+	MicroProfileStringArray LockedToolTips[MICROPROFILE_TOOLTIP_MAX_LOCKED];
+	uint32_t nLockedToolTipColor[MICROPROFILE_TOOLTIP_MAX_LOCKED];
+	int LockedToolTipFront;
+
+	int64_t nRangeBegin;
+	int64_t nRangeEnd;
+	int64_t nRangeBeginGpu;
+	int64_t nRangeEndGpu;
+	uint32_t nRangeBeginIndex;
+	uint32_t nRangeEndIndex;
+	MicroProfileThreadLog* pRangeLog;
+	uint32_t nHoverColor;
+	uint32_t nHoverColorShared;
+
+	std::thread* pContextSwitchThread;
+	bool bContextSwitchRunning;
+	bool bContextSwitchStop;
+	bool bContextSwitchAllThreads;
+	bool bContextSwitchNoBars;
+	uint32_t nContextSwitchUsage;
+	uint32_t nContextSwitchLastPut;
+
+	int64_t nContextSwitchHoverTickIn;
+	int64_t nContextSwitchHoverTickOut;
+	uint32_t nContextSwitchHoverThread;
+	uint32_t nContextSwitchHoverThreadBefore;
+	uint32_t nContextSwitchHoverThreadAfter;
+	uint8_t nContextSwitchHoverCpu;
+	uint8_t nContextSwitchHoverCpuNext;
+
+	uint32_t nContextSwitchPut;
+	MicroProfileContextSwitch ContextSwitch[MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE];
+
+} g_MicroProfile;
+
+MicroProfileThreadLog* g_MicroProfileGpuLog = 0;
+#ifdef MICROPROFILE_IOS
+// iOS doesn't support __thread
+static pthread_key_t g_MicroProfileThreadLogKey;
+static pthread_once_t g_MicroProfileThreadLogKeyOnce = PTHREAD_ONCE_INIT;
+static void MicroProfileCreateThreadLogKey()
+{
+	pthread_key_create(&g_MicroProfileThreadLogKey, NULL);
+}
+#else
+MP_THREAD_LOCAL MicroProfileThreadLog* g_MicroProfileThreadLog = 0;
+#endif
+static bool g_bUseLock = false; /// This is used because Windows does not support using mutexes during DLL init (which is where global initialization is handled)
+static uint32_t g_nMicroProfileBackColors[2] = { 0x474747, 0x313131 };
+#define MICROPROFILE_NUM_CONTEXT_SWITCH_COLORS 16
+static uint32_t g_nMicroProfileContextSwitchThreadColors[MICROPROFILE_NUM_CONTEXT_SWITCH_COLORS] = //palette generated by http://tools.medialab.sciences-po.fr/iwanthue/index.php
+{
+	0x63607B,
+	0x755E2B,
+	0x326A55,
+	0x523135,
+	0x904F42,
+	0x87536B,
+	0x346875,
+	0x5E6046,
+	0x35404C,
+	0x224038,
+	0x413D1E,
+	0x5E3A26,
+	0x5D6161,
+	0x4C6234,
+	0x7D564F,
+	0x5C4352,
+};
+static uint32_t g_MicroProfileAggregatePresets[] = {0, 10, 20, 30, 60, 120};
+static float g_MicroProfileReferenceTimePresets[] = {5.f, 10.f, 15.f, 20.f, 33.33f, 66.66f, 100.f};
+static uint32_t g_MicroProfileOpacityPresets[] = {0x40, 0x80, 0xc0, 0xff};
+static const char* g_MicroProfilePresetNames[] =
+{
+	"Default",
+	"Render",
+	"GPU",
+	"Lighting",
+	"AI",
+	"Visibility",
+	"Sound",
+};
+
+MICROPROFILE_DEFINE(g_MicroProfileDetailed, "MicroProfile", "Detailed View", 0x8888000);
+MICROPROFILE_DEFINE(g_MicroProfileDrawGraph, "MicroProfile", "Draw Graph", 0xff44ee00);
+MICROPROFILE_DEFINE(g_MicroProfileFlip, "MicroProfile", "MicroProfileFlip", 0x3355ee);
+MICROPROFILE_DEFINE(g_MicroProfileThreadLoop, "MicroProfile", "ThreadLoop", 0x3355ee);
+MICROPROFILE_DEFINE(g_MicroProfileClear, "MicroProfile", "Clear", 0x3355ee);
+MICROPROFILE_DEFINE(g_MicroProfileAccumulate, "MicroProfile", "Accumulate", 0x3355ee);
+MICROPROFILE_DEFINE(g_MicroProfileDrawBarView, "MicroProfile", "DrawBarView", 0x00dd77);
+MICROPROFILE_DEFINE(g_MicroProfileDraw, "MicroProfile", "Draw", 0x737373);
+MICROPROFILE_DEFINE(g_MicroProfileContextSwitchDraw, "MicroProfile", "ContextSwitchDraw", 0x730073);
+MICROPROFILE_DEFINE(g_MicroProfileContextSwitchSearch, "MicroProfile", "ContextSwitchSearch", 0xDD7300);
+
+void MicroProfileStartContextSwitchTrace();
+void MicroProfileStopContextSwitchTrace();
+bool MicroProfileIsLocalThread(uint32_t nThreadId);
+
+inline std::recursive_mutex& MicroProfileMutex()
+{
+	static std::recursive_mutex Mutex;
+	return Mutex;
+}
+
+template<typename T>
+T MicroProfileMin(T a, T b)
+{ return a < b ? a : b; }
+
+template<typename T>
+T MicroProfileMax(T a, T b)
+{ return a > b ? a : b; }
+
+void MicroProfileStringArrayClear(MicroProfileStringArray* pArray)
+{
+	pArray->nNumStrings = 0;
+	pArray->pBufferPos = &pArray->Buffer[0];
+}
+
+void MicroProfileStringArrayAddLiteral(MicroProfileStringArray* pArray, const char* pLiteral)
+{
+	pArray->ppStrings[pArray->nNumStrings++] = pLiteral;
+}
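+// Tooltip/float-window text is built as label/value pairs: even indices
+// in ppStrings are labels, odd indices are values, and the float window
+// code below draws one pair per row. Formatted strings are packed into
+// the fixed Buffer; string literals are referenced directly.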
+void MicroProfileStringArrayFormat(MicroProfileStringArray* pArray, const char* fmt, ...)
+{
+	pArray->ppStrings[pArray->nNumStrings++] = pArray->pBufferPos;
+	va_list args;
+	va_start(args, fmt);
+	pArray->pBufferPos += 1 + vsprintf(pArray->pBufferPos, fmt, args);
+	va_end(args);
+	MP_ASSERT(pArray->pBufferPos < pArray->Buffer + MICROPROFILE_TOOLTIP_STRING_BUFFER_SIZE);
+}
+void MicroProfileStringArrayCopy(MicroProfileStringArray* pDest, MicroProfileStringArray* pSrc)
+{
+	memcpy(&pDest->ppStrings[0], &pSrc->ppStrings[0], sizeof(pDest->ppStrings));
+	memcpy(&pDest->Buffer[0], &pSrc->Buffer[0], sizeof(pDest->Buffer));
+	for(uint32_t i = 0; i < MICROPROFILE_TOOLTIP_MAX_STRINGS; ++i)
+	{
+		if(i < pSrc->nNumStrings)
+		{
+			if(pSrc->ppStrings[i] >= &pSrc->Buffer[0] && pSrc->ppStrings[i] < &pSrc->Buffer[0] + MICROPROFILE_TOOLTIP_STRING_BUFFER_SIZE)
+			{
+				pDest->ppStrings[i] += &pDest->Buffer[0] - &pSrc->Buffer[0];
+			}
+		}
+	}
+	pDest->nNumStrings = pSrc->nNumStrings;
+}
+
+MicroProfileThreadLog* MicroProfileCreateThreadLog(const char* pName);
+void MicroProfileLoadPreset(const char* pSuffix);
+void MicroProfileSavePreset(const char* pSuffix);
+
+inline int64_t MicroProfileMsToTick(float fMs, int64_t nTicksPerSecond)
+{
+	return (int64_t)(fMs*0.001f*nTicksPerSecond);
+}
+
+inline float MicroProfileTickToMsMultiplier(int64_t nTicksPerSecond)
+{
+	return 1000.f / nTicksPerSecond;
+}
+
+inline uint16_t MicroProfileGetGroupIndex(MicroProfileToken t)
+{
+	return (uint16_t)S.TimerInfo[MicroProfileGetTimerIndex(t)].nGroupIndex;
+}
+
+void MicroProfileInit()
+{
+	std::recursive_mutex& mutex = MicroProfileMutex();
+	bool bUseLock = g_bUseLock;
+	if(bUseLock)
+		mutex.lock();
+	static bool bOnce = true;
+	if(bOnce)
+	{
+		S.nMemUsage += sizeof(S);
+		bOnce = false;
+		memset(&S, 0, sizeof(S));
+		S.nGroupCount = 0;
+		S.nBarWidth = 100;
+		S.nBarHeight = MICROPROFILE_TEXT_HEIGHT;
+		S.nActiveGroup = 0;
+		S.nActiveBars = 0;
+		S.nForceGroup = 0;
+		S.nMenuAllGroups = 0;
+		S.nMenuActiveGroup = 0;
+		S.nMenuAllThreads = 1;
+		S.nAggregateFlip = 30;
+		S.nTotalTimers = 0;
+		for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i)
+		{
+			S.Graph[i].nToken = MICROPROFILE_INVALID_TOKEN;
+		}
+		S.nBars = MP_DRAW_ALL;
+		S.nRunning = 1;
+		S.fGraphBaseTime = 40.f;
+		S.nWidth = 100;
+		S.nHeight = 100;
+		S.nActiveMenu = (uint32_t)-1;
+		S.fReferenceTime = 33.33f;
+		S.fRcpReferenceTime = 1.f / S.fReferenceTime;
+		S.nFreeListHead = -1;
+		int64_t nTick = MP_TICK();
+		for(int i = 0; i < MICROPROFILE_MAX_FRAME_HISTORY; ++i)
+		{
+			S.Frames[i].nFrameStartCpu = nTick;
+			S.Frames[i].nFrameStartGpu = -1;
+		}
+
+		MicroProfileThreadLog* pGpu = MicroProfileCreateThreadLog("GPU");
+		g_MicroProfileGpuLog = pGpu;
+		MP_ASSERT(S.Pool[0] == pGpu);
+		pGpu->nGpu = 1;
+		pGpu->nThreadId = 0;
+
+		S.fDetailedOffsetTarget = S.fDetailedOffset = 0.f;
+		S.fDetailedRangeTarget = S.fDetailedRange = 50.f;
+
+		S.nOpacityBackground = 0xff<<24;
+		S.nOpacityForeground = 0xff<<24;
+	}
+	if(bUseLock)
+		mutex.unlock();
+}
+
+void MicroProfileShutdown()
+{
+#if MICROPROFILE_CONTEXT_SWITCH_TRACE
+	std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex());
+	if(S.pContextSwitchThread)
+	{
+		if(S.pContextSwitchThread->joinable())
+		{
+			S.bContextSwitchStop = true;
+			S.pContextSwitchThread->join();
+		}
+		delete S.pContextSwitchThread;
+	}
+#endif
+}
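+// Thread-local log lookup: iOS has no __thread support, so the log
+// pointer is kept in pthread thread-specific data there; everywhere
+// else it lives in an MP_THREAD_LOCAL global.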
+#ifdef MICROPROFILE_IOS
+inline MicroProfileThreadLog* MicroProfileGetThreadLog()
+{
+	pthread_once(&g_MicroProfileThreadLogKeyOnce, MicroProfileCreateThreadLogKey);
+	return (MicroProfileThreadLog*)pthread_getspecific(g_MicroProfileThreadLogKey);
+}
+
+inline void MicroProfileSetThreadLog(MicroProfileThreadLog* pLog)
+{
+	pthread_once(&g_MicroProfileThreadLogKeyOnce, MicroProfileCreateThreadLogKey);
+	pthread_setspecific(g_MicroProfileThreadLogKey, pLog);
+}
+#else
+MicroProfileThreadLog* MicroProfileGetThreadLog()
+{
+	return g_MicroProfileThreadLog;
+}
+inline void MicroProfileSetThreadLog(MicroProfileThreadLog* pLog)
+{
+	g_MicroProfileThreadLog = pLog;
+}
+#endif
+
+MicroProfileThreadLog* MicroProfileCreateThreadLog(const char* pName)
+{
+	MicroProfileThreadLog* pLog = 0;
+	if(S.nFreeListHead != -1)
+	{
+		pLog = S.Pool[S.nFreeListHead];
+		S.nFreeListHead = S.Pool[S.nFreeListHead]->nFreeListNext;
+	}
+	else
+	{
+		pLog = new MicroProfileThreadLog;
+		S.nMemUsage += sizeof(MicroProfileThreadLog);
+		S.Pool[S.nNumLogs++] = pLog;
+	}
+	memset(pLog, 0, sizeof(*pLog));
+	int len = (int)strlen(pName);
+	int maxlen = sizeof(pLog->ThreadName)-1;
+	len = len < maxlen ? len : maxlen;
+	memcpy(&pLog->ThreadName[0], pName, len);
+	pLog->ThreadName[len] = '\0';
+	pLog->nThreadId = MP_GETCURRENTTHREADID();
+	pLog->nFreeListNext = -1;
+	return pLog;
+}
+
+void MicroProfileOnThreadCreate(const char* pThreadName)
+{
+	g_bUseLock = true;
+	MicroProfileInit();
+	std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex());
+	MP_ASSERT(MicroProfileGetThreadLog() == 0);
+	MicroProfileThreadLog* pLog = MicroProfileCreateThreadLog(pThreadName ? pThreadName : MicroProfileGetThreadName());
+	MP_ASSERT(pLog);
+	MicroProfileSetThreadLog(pLog);
+}
+
+void MicroProfileOnThreadExit()
+{
+	MicroProfileThreadLog* pLog = MicroProfileGetThreadLog();
+	if(pLog)
+	{
+		int32_t nLogIndex = -1;
+		for(int i = 0; i < MICROPROFILE_MAX_THREADS; ++i)
+		{
+			if(pLog == S.Pool[i])
+			{
+				nLogIndex = i;
+				break;
+			}
+		}
+		MP_ASSERT(nLogIndex < MICROPROFILE_MAX_THREADS && nLogIndex > 0);
+		pLog->nFreeListNext = S.nFreeListHead;
+		pLog->nThreadId = 0;
+		S.nFreeListHead = nLogIndex;
+	}
+}
+
+void MicroProfileInitThreadLog()
+{
+	MicroProfileOnThreadCreate(nullptr);
+}
+
+struct MicroProfileScopeLock
+{
+	bool bUseLock;
+	std::recursive_mutex& m;
+	MicroProfileScopeLock(std::recursive_mutex& m) : bUseLock(g_bUseLock), m(m)
+	{
+		if(bUseLock)
+			m.lock();
+	}
+	~MicroProfileScopeLock()
+	{
+		if(bUseLock)
+			m.unlock();
+	}
+};
+
+MicroProfileToken MicroProfileFindToken(const char* pGroup, const char* pName)
+{
+	MicroProfileInit();
+	MicroProfileScopeLock L(MicroProfileMutex());
+	for(uint32_t i = 0; i < S.nTotalTimers; ++i)
+	{
+		if(!MP_STRCASECMP(pName, S.TimerInfo[i].pName) && !MP_STRCASECMP(pGroup, S.GroupInfo[S.TimerInfo[i].nGroupIndex].pName))
+		{
+			return S.TimerInfo[i].nToken;
+		}
+	}
+	return MICROPROFILE_INVALID_TOKEN;
+}
+
+uint16_t MicroProfileGetGroup(const char* pGroup, MicroProfileTokenType Type)
+{
+	for(uint32_t i = 0; i < S.nGroupCount; ++i)
+	{
+		if(!MP_STRCASECMP(pGroup, S.GroupInfo[i].pName))
+		{
+			return i;
+		}
+	}
+	uint16_t nGroupIndex = 0xffff;
+	S.GroupInfo[S.nGroupCount].pName = pGroup;
+	S.GroupInfo[S.nGroupCount].nNameLen = (uint32_t)strlen(pGroup);
+	S.GroupInfo[S.nGroupCount].nGroupIndex = S.nGroupCount;
+	S.GroupInfo[S.nGroupCount].nNumTimers = 0;
+	S.GroupInfo[S.nGroupCount].Type = Type;
+	S.GroupInfo[S.nGroupCount].nMaxTimerNameLen = 0;
+	nGroupIndex = S.nGroupCount++;
+	S.nGroupMask = (S.nGroupMask<<1)|1;
+	MP_ASSERT(nGroupIndex < MICROPROFILE_MAX_GROUPS);
+	return nGroupIndex;
+}
+
+MicroProfileToken MicroProfileGetToken(const char* pGroup, const char* pName, uint32_t nColor, MicroProfileTokenType Type)
+{
+	MicroProfileInit();
+	MicroProfileScopeLock L(MicroProfileMutex());
+	MicroProfileToken ret = MicroProfileFindToken(pGroup, pName);
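+	// Tokens are interned: the same (group, name) pair always resolves to
+	// one timer slot, so scopes declared at different call sites share a
+	// single set of counters.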
+	if(ret != MICROPROFILE_INVALID_TOKEN)
+		return ret;
+	uint16_t nGroupIndex = MicroProfileGetGroup(pGroup, Type);
+	uint16_t nTimerIndex = (uint16_t)(S.nTotalTimers++);
+	uint64_t nGroupMask = 1ll << nGroupIndex;
+	MicroProfileToken nToken = MicroProfileMakeToken(nGroupMask, nTimerIndex);
+	S.GroupInfo[nGroupIndex].nNumTimers++;
+	S.GroupInfo[nGroupIndex].nMaxTimerNameLen = MicroProfileMax(S.GroupInfo[nGroupIndex].nMaxTimerNameLen, (uint32_t)strlen(pName));
+	MP_ASSERT(S.GroupInfo[nGroupIndex].Type == Type); //don't mix cpu & gpu timers in the same group
+	S.nMaxGroupSize = MicroProfileMax(S.nMaxGroupSize, S.GroupInfo[nGroupIndex].nNumTimers);
+	S.TimerInfo[nTimerIndex].nToken = nToken;
+	S.TimerInfo[nTimerIndex].pName = pName;
+	S.TimerInfo[nTimerIndex].nNameLen = (uint32_t)strlen(pName);
+	S.TimerInfo[nTimerIndex].nColor = nColor&0xffffff;
+	S.TimerInfo[nTimerIndex].nGroupIndex = nGroupIndex;
+	return nToken;
+}
+
+MicroProfileToken MicroProfileGetMetaToken(const char* pName)
+{
+	MicroProfileInit();
+	MicroProfileScopeLock L(MicroProfileMutex());
+	for(uint32_t i = 0; i < MICROPROFILE_META_MAX; ++i)
+	{
+		if(!S.MetaCounters[i].pName)
+		{
+			S.MetaCounters[i].pName = pName;
+			return i;
+		}
+		else if(!MP_STRCASECMP(pName, S.MetaCounters[i].pName))
+		{
+			return i;
+		}
+	}
+	MP_ASSERT(0); //out of slots, increase MICROPROFILE_META_MAX
+	return (MicroProfileToken)-1;
+}
+
+inline void MicroProfileLogPut(MicroProfileToken nToken_, uint64_t nTick, uint64_t nBegin, MicroProfileThreadLog* pLog)
+{
+	MP_ASSERT(pLog != 0); //this assert is hit if MicroProfileOnCreateThread is not called
+	uint32_t nPos = pLog->nPut.load(std::memory_order_relaxed);
+	uint32_t nNextPos = (nPos+1) % MICROPROFILE_BUFFER_SIZE;
+	if(nNextPos == pLog->nGet.load(std::memory_order_relaxed))
+	{
+		S.nOverflow = 100;
+	}
+	else
+	{
+		int64_t test = MicroProfileMakeLogIndex(nBegin, nToken_, nTick);
+		MP_ASSERT(MicroProfileLogType(test) == nBegin);
+		MP_ASSERT(MicroProfileLogTimerIndex(test) == MicroProfileGetTimerIndex(nToken_));
+		pLog->Log[nPos] = MicroProfileMakeLogIndex(nBegin, nToken_, nTick);
+		pLog->nPut.store(nNextPos, std::memory_order_release);
+	}
+}
+
+uint64_t MicroProfileEnter(MicroProfileToken nToken_)
+{
+	if(MicroProfileGetGroupMask(nToken_) & S.nActiveGroup)
+	{
+		if(!MicroProfileGetThreadLog())
+		{
+			MicroProfileInitThreadLog();
+		}
+		uint64_t nTick = MP_TICK();
+		MicroProfileLogPut(nToken_, nTick, MP_LOG_ENTER, MicroProfileGetThreadLog());
+		return nTick;
+	}
+	return MICROPROFILE_INVALID_TICK;
+}
+
+void MicroProfileMetaUpdate(MicroProfileToken nToken, int nCount, MicroProfileTokenType eTokenType)
+{
+	if((MP_DRAW_META_FIRST<<nToken) & S.nActiveBars)
+	{
+		MicroProfileThreadLog* pLog = MicroProfileTokenTypeCpu == eTokenType ? MicroProfileGetThreadLog() : g_MicroProfileGpuLog;
+		if(pLog)
+		{
+			MP_ASSERT(nToken < MICROPROFILE_META_MAX);
+			MicroProfileLogPut(nToken, nCount, MP_LOG_META, pLog);
+		}
+	}
+}
+
+void MicroProfileLeave(MicroProfileToken nToken_, uint64_t nTickStart)
+{
+	if(MICROPROFILE_INVALID_TICK != nTickStart)
+	{
+		if(!MicroProfileGetThreadLog())
+		{
+			MicroProfileInitThreadLog();
+		}
+		uint64_t nTick = MP_TICK();
+		MicroProfileLogPut(nToken_, nTick, MP_LOG_LEAVE, MicroProfileGetThreadLog());
+	}
+}
+
+uint64_t MicroProfileGpuEnter(MicroProfileToken nToken_)
+{
+	if(MicroProfileGetGroupMask(nToken_) & S.nActiveGroup)
+	{
+		uint64_t nTimer = MicroProfileGpuInsertTimeStamp();
+		MicroProfileLogPut(nToken_, nTimer, MP_LOG_ENTER, g_MicroProfileGpuLog);
+		return 1;
+	}
+	return 0;
+}
+
+void MicroProfileGpuLeave(MicroProfileToken nToken_, uint64_t nTickStart)
+{
+	if(nTickStart)
+	{
+		uint64_t nTimer = MicroProfileGpuInsertTimeStamp();
+		MicroProfileLogPut(nToken_, nTimer, MP_LOG_LEAVE, g_MicroProfileGpuLog);
+	}
+}
+
+void MicroProfileGetRange(uint32_t nPut, uint32_t nGet, uint32_t nRange[2][2])
+{
+	if(nPut > nGet)
+	{
+		nRange[0][0] = nGet;
+		nRange[0][1] = nPut;
+		nRange[1][0] = nRange[1][1] = 0;
+	}
+	else if(nPut != nGet)
+	{
+		MP_ASSERT(nGet != MICROPROFILE_BUFFER_SIZE);
+		uint32_t nCountEnd = MICROPROFILE_BUFFER_SIZE - nGet;
+		nRange[0][0] = nGet;
+		nRange[0][1] = nGet + nCountEnd;
+		nRange[1][0] = 0;
+		nRange[1][1] = nPut;
+	}
+}
+
+void MicroProfileFlip()
+{
+	#if 0
+	//verify LogEntry wraps correctly
+	MicroProfileLogEntry c = MP_LOG_TICK_MASK-5000;
+	for(int i = 0; i < 10000; ++i, c += 1)
+	{
+		MicroProfileLogEntry l2 = (c+2500) & MP_LOG_TICK_MASK;
+		MP_ASSERT(2500 == MicroProfileLogTickDifference(c, l2));
+	}
+	#endif
+	MICROPROFILE_SCOPE(g_MicroProfileFlip);
+	std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex());
+
+	{
+		static int once = 0;
+		if(0 == once)
+		{
+			uint32_t nDisplay = S.nDisplay;
+			MicroProfileLoadPreset(g_MicroProfilePresetNames[0]);
+			once++;
+			S.nDisplay = nDisplay; //don't load the display mode, just the state
+		}
+	}
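+	// The frame that is parsed and aggregated (nFrameCurrent) trails the
+	// frame being recorded (nFramePut) by MICROPROFILE_GPU_FRAME_DELAY+1
+	// entries in the history ring, so GPU timestamp queries issued for a
+	// frame have had time to resolve before that frame is read back.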
+
+	if(S.nRunning)
+	{
+		S.nFramePut = (S.nFramePut+1) % MICROPROFILE_MAX_FRAME_HISTORY;
+		S.nFrameCurrent = (S.nFramePut + MICROPROFILE_MAX_FRAME_HISTORY - MICROPROFILE_GPU_FRAME_DELAY - 1) % MICROPROFILE_MAX_FRAME_HISTORY;
+		uint32_t nFrameNext = (S.nFrameCurrent+1) % MICROPROFILE_MAX_FRAME_HISTORY;
+
+		uint32_t nContextSwitchPut = S.nContextSwitchPut;
+		if(S.nContextSwitchLastPut < nContextSwitchPut)
+		{
+			S.nContextSwitchUsage = (nContextSwitchPut - S.nContextSwitchLastPut);
+		}
+		else
+		{
+			S.nContextSwitchUsage = MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE - S.nContextSwitchLastPut + nContextSwitchPut;
+		}
+		S.nContextSwitchLastPut = nContextSwitchPut;
+
+		MicroProfileFrameState* pFramePut = &S.Frames[S.nFramePut];
+		MicroProfileFrameState* pFrameCurrent = &S.Frames[S.nFrameCurrent];
+		MicroProfileFrameState* pFrameNext = &S.Frames[nFrameNext];
+
+		pFramePut->nFrameStartCpu = MP_TICK();
+		pFramePut->nFrameStartGpu = (uint32_t)MicroProfileGpuInsertTimeStamp();
+		if(pFrameNext->nFrameStartGpu != (uint64_t)-1)
+			pFrameNext->nFrameStartGpu = MicroProfileGpuGetTimeStamp((uint32_t)pFrameNext->nFrameStartGpu);
+
+		if(pFrameCurrent->nFrameStartGpu == (uint64_t)-1)
+			pFrameCurrent->nFrameStartGpu = pFrameNext->nFrameStartGpu + 1;
+
+		uint64_t nFrameStartCpu = pFrameCurrent->nFrameStartCpu;
+		uint64_t nFrameEndCpu = pFrameNext->nFrameStartCpu;
+		uint64_t nFrameStartGpu = pFrameCurrent->nFrameStartGpu;
+		uint64_t nFrameEndGpu = pFrameNext->nFrameStartGpu;
+
+		{
+			uint64_t nTick = nFrameEndCpu - nFrameStartCpu;
+			S.nFlipTicks = nTick;
+			S.nFlipAggregate += nTick;
+			S.nFlipMax = MicroProfileMax(S.nFlipMax, nTick);
+		}
+
+		for(uint32_t i = 0; i < MICROPROFILE_MAX_THREADS; ++i)
+		{
+			MicroProfileThreadLog* pLog = S.Pool[i];
+			if(!pLog)
+			{
+				pFramePut->nLogStart[i] = 0;
+			}
+			else
+			{
+				pFramePut->nLogStart[i] = pLog->nPut.load(std::memory_order_acquire);
+				//need to keep the last frame around to close timers; timers more than 1 frame old are ditched.
+				pLog->nGet.store(pFrameCurrent->nLogStart[i], std::memory_order_relaxed);
+			}
+		}
+
+		if(S.nRunning)
+		{
+			{
+				MICROPROFILE_SCOPE(g_MicroProfileClear);
+				for(uint32_t i = 0; i < S.nTotalTimers; ++i)
+				{
+					S.Frame[i].nTicks = 0;
+					S.Frame[i].nCount = 0;
+					S.FrameExclusive[i] = 0;
+				}
+				for(uint32_t j = 0; j < MICROPROFILE_META_MAX; ++j)
+				{
+					if(S.MetaCounters[j].pName)
+					{
+						for(uint32_t i = 0; i < S.nTotalTimers; ++i)
+						{
+							S.MetaCounters[j].nCounters[i] = 0;
+						}
+					}
+				}
+			}
+			{
+				MICROPROFILE_SCOPE(g_MicroProfileThreadLoop);
+				for(uint32_t i = 0; i < MICROPROFILE_MAX_THREADS; ++i)
+				{
+					MicroProfileThreadLog* pLog = S.Pool[i];
+					if(!pLog)
+						continue;
+
+					uint32_t nPut = pFrameNext->nLogStart[i];
+					uint32_t nGet = pFrameCurrent->nLogStart[i];
+					uint32_t nRange[2][2] = { {0, 0}, {0, 0}, };
+					MicroProfileGetRange(nPut, nGet, nRange);
+
+					uint64_t nFrameStart = pLog->nGpu ? nFrameStartGpu : nFrameStartCpu;
+					uint64_t nFrameEnd = pLog->nGpu ? nFrameEndGpu : nFrameEndCpu;
+					//fetch gpu results.
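+					// GPU entries were logged with the query key returned by
+					// MicroProfileGpuInsertTimeStamp(); swap each key for the
+					// resolved GPU tick now that the results are available.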
+ if(pLog->nGpu) + { + for(uint32_t j = 0; j < 2; ++j) + { + uint32_t nStart = nRange[j][0]; + uint32_t nEnd = nRange[j][1]; + for(uint32_t k = nStart; k < nEnd; ++k) + { + MicroProfileLogEntry L = pLog->Log[k]; + pLog->Log[k] = MicroProfileLogSetTick(L, MicroProfileGpuGetTimeStamp((uint32_t)MicroProfileLogGetTick(L))); + } + } + } + uint32_t nStack[MICROPROFILE_STACK_MAX]; + int64_t nChildTickStack[MICROPROFILE_STACK_MAX]; + uint32_t nStackPos = 0; + nChildTickStack[0] = 0; + + for(uint32_t j = 0; j < 2; ++j) + { + uint32_t nStart = nRange[j][0]; + uint32_t nEnd = nRange[j][1]; + for(uint32_t k = nStart; k < nEnd; ++k) + { + MicroProfileLogEntry LE = pLog->Log[k]; + int nType = MicroProfileLogType(LE); + if(MP_LOG_ENTER == nType) + { + MP_ASSERT(nStackPos < MICROPROFILE_STACK_MAX); + nStack[nStackPos++] = k; + nChildTickStack[nStackPos] = 0; + } + else if(MP_LOG_META == nType) + { + if(nStackPos) + { + int64_t nMetaIndex = MicroProfileLogTimerIndex(LE); + int64_t nMetaCount = MicroProfileLogGetTick(LE); + MP_ASSERT(nMetaIndex < MICROPROFILE_META_MAX); + int64_t nCounter = MicroProfileLogTimerIndex(pLog->Log[nStack[nStackPos-1]]); + S.MetaCounters[nMetaIndex].nCounters[nCounter] += nMetaCount; + } + } + else + { + MP_ASSERT(nType == MP_LOG_LEAVE); + //todo: reconsider the fallback for Leaves without enters + int64_t nTickStart = 0 != nStackPos ? pLog->Log[nStack[nStackPos-1]] : nFrameStart; + int64_t nTicks = MicroProfileLogTickDifference(nTickStart, LE); + int64_t nChildTicks = nChildTickStack[nStackPos]; + if(0 != nStackPos) + { + MP_ASSERT(MicroProfileLogTimerIndex(pLog->Log[nStack[nStackPos-1]]) == MicroProfileLogTimerIndex(LE)); + nStackPos--; + nChildTickStack[nStackPos] += nTicks; + } + uint32_t nTimerIndex = MicroProfileLogTimerIndex(LE); + S.Frame[nTimerIndex].nTicks += nTicks; + S.FrameExclusive[nTimerIndex] += (nTicks-nChildTicks); + S.Frame[nTimerIndex].nCount += 1; + } + } + } + //todo: reconsider the fallback for enters without leaves + for(uint32_t j = 0; j < nStackPos; ++j) + { + MicroProfileLogEntry LE = pLog->Log[nStack[j]]; + uint64_t nTicks = MicroProfileLogTickDifference(LE, nFrameEnd); + uint32_t nTimerIndex = MicroProfileLogTimerIndex(LE); + S.Frame[nTimerIndex].nTicks += nTicks; + } + } + } + { + MICROPROFILE_SCOPE(g_MicroProfileAccumulate); + for(uint32_t i = 0; i < S.nTotalTimers; ++i) + { + S.AggregateTimers[i].nTicks += S.Frame[i].nTicks; + S.AggregateTimers[i].nCount += S.Frame[i].nCount; + S.MaxTimers[i] = MicroProfileMax(S.MaxTimers[i], S.Frame[i].nTicks); + S.AggregateTimersExclusive[i] += S.FrameExclusive[i]; + S.MaxTimersExclusive[i] = MicroProfileMax(S.MaxTimersExclusive[i], S.FrameExclusive[i]); + } + } + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + if(S.Graph[i].nToken != MICROPROFILE_INVALID_TOKEN) + { + MicroProfileToken nToken = S.Graph[i].nToken; + S.Graph[i].nHistory[S.nGraphPut] = S.Frame[MicroProfileGetTimerIndex(nToken)].nTicks; + } + } + S.nGraphPut = (S.nGraphPut+1) % MICROPROFILE_GRAPH_HISTORY; + + } + + + if(S.nRunning && S.nAggregateFlip <= ++S.nAggregateFlipCount) + { + memcpy(&S.Aggregate[0], &S.AggregateTimers[0], sizeof(S.Aggregate[0]) * S.nTotalTimers); + memcpy(&S.AggregateMax[0], &S.MaxTimers[0], sizeof(S.AggregateMax[0]) * S.nTotalTimers); + memcpy(&S.AggregateExclusive[0], &S.AggregateTimersExclusive[0], sizeof(S.AggregateExclusive[0]) * S.nTotalTimers); + memcpy(&S.AggregateMaxExclusive[0], &S.MaxTimersExclusive[0], sizeof(S.AggregateMaxExclusive[0]) * S.nTotalTimers); + + S.nAggregateFrames = S.nAggregateFlipCount; 
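+		// The copies above snapshot the running accumulators into the
+		// Aggregate* arrays that the UI reads from; the if(S.nAggregateFlip)
+		// block below then resets the accumulators, unless nAggregateFlip
+		// is 0, which means accumulate indefinitely.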
+ S.nFlipAggregateDisplay = S.nFlipAggregate; + S.nFlipMaxDisplay = S.nFlipMax; + + + if(S.nAggregateFlip) // if 0 accumulate indefinitely + { + memset(&S.AggregateTimers[0], 0, sizeof(S.Aggregate[0]) * S.nTotalTimers); + memset(&S.MaxTimers[0], 0, sizeof(S.MaxTimers[0]) * S.nTotalTimers); + memset(&S.AggregateTimersExclusive[0], 0, sizeof(S.AggregateExclusive[0]) * S.nTotalTimers); + memset(&S.MaxTimersExclusive[0], 0, sizeof(S.MaxTimersExclusive[0]) * S.nTotalTimers); + S.nAggregateFlipCount = 0; + S.nFlipAggregate = 0; + S.nFlipMax = 0; + } + } + } + uint64_t nNewActiveGroup = 0; + if(S.nDisplay && S.nRunning) + nNewActiveGroup = S.nMenuAllGroups ? S.nGroupMask : S.nMenuActiveGroup; + nNewActiveGroup |= S.nForceGroup; + if(S.nActiveGroup != nNewActiveGroup) + S.nActiveGroup = nNewActiveGroup; + uint32_t nNewActiveBars = 0; + if(S.nDisplay && S.nRunning) + nNewActiveBars = S.nBars; + if(nNewActiveBars != S.nActiveBars) + S.nActiveBars = nNewActiveBars; + + S.fDetailedOffset = S.fDetailedOffset + (S.fDetailedOffsetTarget - S.fDetailedOffset) * MICROPROFILE_ANIM_DELAY_PRC; + S.fDetailedRange = S.fDetailedRange + (S.fDetailedRangeTarget - S.fDetailedRange) * MICROPROFILE_ANIM_DELAY_PRC; + +} + +void MicroProfileSetDisplayMode(int nValue) +{ + nValue = nValue >= 0 && nValue < 4 ? nValue : S.nDisplay; + S.nDisplay = nValue; + S.fGraphBaseTime = 40.f; + S.nOffsetY = 0; +} + +void MicroProfileToggleDisplayMode() +{ + S.nDisplay = (S.nDisplay + 1) % 4; + S.nOffsetY = 0; + +} + + +void MicroProfileFloatWindowSize(const char** ppStrings, uint32_t nNumStrings, uint32_t* pColors, uint32_t& nWidth, uint32_t& nHeight, uint32_t* pStringLengths = 0) +{ + uint32_t* nStringLengths = pStringLengths ? pStringLengths : (uint32_t*)alloca(nNumStrings * sizeof(uint32_t)); + uint32_t nTextCount = nNumStrings/2; + for(uint32_t i = 0; i < nTextCount; ++i) + { + uint32_t i0 = i * 2; + uint32_t s0, s1; + nStringLengths[i0] = s0 = (uint32_t)strlen(ppStrings[i0]); + nStringLengths[i0+1] = s1 = (uint32_t)strlen(ppStrings[i0+1]); + nWidth = MicroProfileMax(s0+s1, nWidth); + } + nWidth = (MICROPROFILE_TEXT_WIDTH+1) * (2+nWidth) + 2 * MICROPROFILE_BORDER_SIZE; + if(pColors) + nWidth += MICROPROFILE_TEXT_WIDTH + 1; + nHeight = (MICROPROFILE_TEXT_HEIGHT+1) * nTextCount + 2 * MICROPROFILE_BORDER_SIZE; +} + +void MicroProfileDrawFloatWindow(uint32_t nX, uint32_t nY, const char** ppStrings, uint32_t nNumStrings, uint32_t nColor, uint32_t* pColors = 0) +{ + uint32_t nWidth = 0, nHeight = 0; + uint32_t* nStringLengths = (uint32_t*)alloca(nNumStrings * sizeof(uint32_t)); + MicroProfileFloatWindowSize(ppStrings, nNumStrings, pColors, nWidth, nHeight, nStringLengths); + uint32_t nTextCount = nNumStrings/2; + if(nX + nWidth > S.nWidth) + nX = S.nWidth - nWidth; + if(nY + nHeight > S.nHeight) + nY = S.nHeight - nHeight; + MicroProfileDrawBox(nX-1, nY-1, nX + nWidth+1, nY + nHeight+1, 0xff000000|nColor); + MicroProfileDrawBox(nX, nY, nX + nWidth, nY + nHeight, 0xff000000); + if(pColors) + { + nX += MICROPROFILE_TEXT_WIDTH+1; + nWidth -= MICROPROFILE_TEXT_WIDTH+1; + } + for(uint32_t i = 0; i < nTextCount; ++i) + { + int i0 = i * 2; + if(pColors) + { + MicroProfileDrawBox(nX-MICROPROFILE_TEXT_WIDTH, nY, nX, nY + MICROPROFILE_TEXT_WIDTH, pColors[i]|0xff000000); + } + MicroProfileDrawText(nX + 1, nY + 1, (uint32_t)-1, ppStrings[i0], (uint32_t)strlen(ppStrings[i0])); + MicroProfileDrawText(nX + nWidth - nStringLengths[i0+1] * (MICROPROFILE_TEXT_WIDTH+1), nY + 1, (uint32_t)-1, ppStrings[i0+1], (uint32_t)strlen(ppStrings[i0+1])); + nY += 
(MICROPROFILE_TEXT_HEIGHT+1); + } +} + +void MicroProfileDrawTextBox(uint32_t nX, uint32_t nY, const char** ppStrings, uint32_t nNumStrings, uint32_t nColor, uint32_t* pColors = 0) +{ + uint32_t nWidth = 0, nHeight = 0; + uint32_t* nStringLengths = (uint32_t*)alloca(nNumStrings * sizeof(uint32_t)); + for(uint32_t i = 0; i < nNumStrings; ++i) + { + nStringLengths[i] = (uint32_t)strlen(ppStrings[i]); + nWidth = MicroProfileMax(nWidth, nStringLengths[i]); + nHeight++; + } + nWidth = (MICROPROFILE_TEXT_WIDTH+1) * (2+nWidth) + 2 * MICROPROFILE_BORDER_SIZE; + nHeight = (MICROPROFILE_TEXT_HEIGHT+1) * nHeight + 2 * MICROPROFILE_BORDER_SIZE; + if(nX + nWidth > S.nWidth) + nX = S.nWidth - nWidth; + if(nY + nHeight > S.nHeight) + nY = S.nHeight - nHeight; + MicroProfileDrawBox(nX, nY, nX + nWidth, nY + nHeight, 0xff000000); + for(uint32_t i = 0; i < nNumStrings; ++i) + { + MicroProfileDrawText(nX + 1, nY + 1, (uint32_t)-1, ppStrings[i], (uint32_t)strlen(ppStrings[i])); + nY += (MICROPROFILE_TEXT_HEIGHT+1); + } +} + + + +void MicroProfileToolTipMeta(MicroProfileStringArray* pToolTip) +{ + if(S.nRangeBeginIndex != S.nRangeEndIndex && S.pRangeLog) + { + uint64_t nMetaSum[MICROPROFILE_META_MAX] = {0}; + + uint32_t nRange[2][2]; + MicroProfileThreadLog* pLog = S.pRangeLog; + + + MicroProfileGetRange(S.nRangeEndIndex, S.nRangeBeginIndex, nRange); + for(uint32_t i = 0; i < 2; ++i) + { + uint32_t nStart = nRange[i][0]; + uint32_t nEnd = nRange[i][1]; + for(uint32_t j = nStart; j < nEnd; ++j) + { + MicroProfileLogEntry LE = pLog->Log[j]; + int nType = MicroProfileLogType(LE); + if(MP_LOG_META == nType) + { + int64_t nMetaIndex = MicroProfileLogTimerIndex(LE); + int64_t nMetaCount = MicroProfileLogGetTick(LE); + MP_ASSERT(nMetaIndex < MICROPROFILE_META_MAX); + nMetaSum[nMetaIndex] += nMetaCount; + } + } + } + bool bSpaced = false; + for(int i = 0; i < MICROPROFILE_META_MAX; ++i) + { + if(S.MetaCounters[i].pName && nMetaSum[i]) + { + if(!bSpaced) + { + bSpaced = true; + MicroProfileStringArrayAddLiteral(pToolTip, ""); + MicroProfileStringArrayAddLiteral(pToolTip, ""); + } + MicroProfileStringArrayFormat(pToolTip, "%s", S.MetaCounters[i].pName); + MicroProfileStringArrayFormat(pToolTip, "%5d", nMetaSum[i]); + } + } + } +} + + +void MicroProfileDrawFloatTooltip(uint32_t nX, uint32_t nY, uint32_t nToken, uint64_t nTime) +{ + uint32_t nIndex = MicroProfileGetTimerIndex(nToken); + uint32_t nAggregateFrames = S.nAggregateFrames ? S.nAggregateFrames : 1; + uint32_t nAggregateCount = S.Aggregate[nIndex].nCount ? S.Aggregate[nIndex].nCount : 1; + + uint32_t nGroupId = MicroProfileGetGroupIndex(nToken); + uint32_t nTimerId = MicroProfileGetTimerIndex(nToken); + bool bGpu = S.GroupInfo[nGroupId].Type == MicroProfileTokenTypeGpu; + + float fToMs = MicroProfileTickToMsMultiplier(bGpu ? 
MicroProfileTicksPerSecondGpu() : MicroProfileTicksPerSecondCpu()); + + float fMs = fToMs * (nTime); + float fFrameMs = fToMs * (S.Frame[nIndex].nTicks); + float fAverage = fToMs * (S.Aggregate[nIndex].nTicks/nAggregateFrames); + float fCallAverage = fToMs * (S.Aggregate[nIndex].nTicks / nAggregateCount); + float fMax = fToMs * (S.AggregateMax[nIndex]); + + float fFrameMsExclusive = fToMs * (S.FrameExclusive[nIndex]); + float fAverageExclusive = fToMs * (S.AggregateExclusive[nIndex]/nAggregateFrames); + float fMaxExclusive = fToMs * (S.AggregateMaxExclusive[nIndex]); + + + MicroProfileStringArray ToolTip; + MicroProfileStringArrayClear(&ToolTip); + const char* pGroupName = S.GroupInfo[nGroupId].pName; + const char* pTimerName = S.TimerInfo[nTimerId].pName; + MicroProfileStringArrayFormat(&ToolTip, "%s", pGroupName); + MicroProfileStringArrayFormat(&ToolTip,"%s", pTimerName); + +#if MICROPROFILE_DEBUG + MicroProfileStringArrayFormat(&ToolTip,"0x%p", S.nHoverAddressEnter); + MicroProfileStringArrayFormat(&ToolTip,"0x%p", S.nHoverAddressLeave); +#endif + + if(nTime != (uint64_t)0) + { + MicroProfileStringArrayAddLiteral(&ToolTip, "Time:"); + MicroProfileStringArrayFormat(&ToolTip,"%6.3fms", fMs); + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + } + + MicroProfileStringArrayAddLiteral(&ToolTip, "Frame Time:"); + MicroProfileStringArrayFormat(&ToolTip,"%6.3fms", fFrameMs); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Average:"); + MicroProfileStringArrayFormat(&ToolTip,"%6.3fms", fAverage); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Max:"); + MicroProfileStringArrayFormat(&ToolTip,"%6.3fms", fMax); + + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Frame Call Average:"); + MicroProfileStringArrayFormat(&ToolTip,"%6.3fms", fCallAverage); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Frame Call Count:"); + MicroProfileStringArrayFormat(&ToolTip, "%6d", nAggregateCount / nAggregateFrames); + + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Exclusive Frame Time:"); + MicroProfileStringArrayFormat(&ToolTip, "%6.3fms", fFrameMsExclusive); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Exclusive Average:"); + MicroProfileStringArrayFormat(&ToolTip, "%6.3fms", fAverageExclusive); + + MicroProfileStringArrayAddLiteral(&ToolTip, "Exclusive Max:"); + MicroProfileStringArrayFormat(&ToolTip, "%6.3fms", fMaxExclusive); + + MicroProfileToolTipMeta(&ToolTip); + + + MicroProfileDrawFloatWindow(nX, nY+20, &ToolTip.ppStrings[0], ToolTip.nNumStrings, S.TimerInfo[nTimerId].nColor); + + if(S.nMouseLeftMod) + { + int nIndex = (S.LockedToolTipFront + MICROPROFILE_TOOLTIP_MAX_LOCKED - 1) % MICROPROFILE_TOOLTIP_MAX_LOCKED; + S.nLockedToolTipColor[nIndex] = S.TimerInfo[nTimerId].nColor; + MicroProfileStringArrayCopy(&S.LockedToolTips[nIndex], &ToolTip); + S.LockedToolTipFront = nIndex; + + } +} + +#define MICROPROFILE_FRAME_HISTORY_HEIGHT 50 +#define MICROPROFILE_FRAME_HISTORY_WIDTH 7 +#define MICROPROFILE_FRAME_HISTORY_COLOR_CPU 0xffff7f27 //255 127 39 +#define MICROPROFILE_FRAME_HISTORY_COLOR_GPU 0xff37a0ee //55 160 238 +#define MICROPROFILE_FRAME_HISTORY_COLOR_HIGHTLIGHT 0x7733bb44 +#define MICROPROFILE_FRAME_COLOR_HIGHTLIGHT 0x20009900 +#define MICROPROFILE_FRAME_COLOR_HIGHTLIGHT_GPU 0x20996600 +#define MICROPROFILE_NUM_FRAMES 
(MICROPROFILE_MAX_FRAME_HISTORY - (MICROPROFILE_GPU_FRAME_DELAY+1)) + +void MicroProfileZoomTo(int64_t nTickStart, int64_t nTickEnd) +{ + int64_t nStart = S.Frames[S.nFrameCurrent].nFrameStartCpu; + float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu()); + S.fDetailedOffsetTarget = MicroProfileLogTickDifference(nStart, nTickStart) * fToMs; + S.fDetailedRangeTarget = MicroProfileLogTickDifference(nTickStart, nTickEnd) * fToMs; + + +} + +void MicroProfileCenter(int64_t nTickCenter) +{ + int64_t nStart = S.Frames[S.nFrameCurrent].nFrameStartCpu; + float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu()); + float fCenter = MicroProfileLogTickDifference(nStart, nTickCenter) * fToMs; + S.fDetailedOffsetTarget = S.fDetailedOffset = fCenter - 0.5f * S.fDetailedRange; +} +#ifdef MICROPROFILE_DEBUG +uint64_t* g_pMicroProfileDumpStart = 0; +uint64_t* g_pMicroProfileDumpEnd = 0; +void MicroProfileDebugDumpRange() +{ + if(g_pMicroProfileDumpStart != g_pMicroProfileDumpEnd) + { + uint64_t* pStart = g_pMicroProfileDumpStart; + uint64_t* pEnd = g_pMicroProfileDumpEnd; + while(pStart != pEnd) + { + uint64_t nTick = MicroProfileLogGetTick(*pStart); + uint64_t nToken = MicroProfileLogTimerIndex(*pStart); + uint32_t nTimerId = MicroProfileGetTimerIndex(nToken); + + const char* pTimerName = S.TimerInfo[nTimerId].pName; + char buffer[256]; + int type = MicroProfileLogType(*pStart); + + const char* pBegin = type == MP_LOG_LEAVE ? "END" : + (type == MP_LOG_ENTER ? "BEGIN" : "META"); + snprintf(buffer, 255, "DUMP 0x%p: %s :: %llx: %s\n", pStart, pBegin, nTick, pTimerName); +#ifdef _WIN32 + OutputDebugStringA(buffer); +#else + printf("%s", buffer); +#endif + pStart++; + } + + g_pMicroProfileDumpStart = g_pMicroProfileDumpEnd; + } +} +#define MP_DEBUG_DUMP_RANGE() MicroProfileDebugDumpRange(); +#else +#define MP_DEBUG_DUMP_RANGE() do{} while(0) +#endif + +#define MICROPROFILE_HOVER_DIST 0.5f + +void MicroProfileDrawDetailedContextSwitchBars(uint32_t nY, uint32_t nThreadId, uint32_t nContextSwitchStart, uint32_t nContextSwitchEnd, int64_t nBaseTicks, uint32_t nBaseY) +{ + MICROPROFILE_SCOPE(g_MicroProfileContextSwitchDraw); + int64_t nTickIn = -1; + uint32_t nThreadBefore = -1; + float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu()); + float fMsToScreen = S.nWidth / S.fDetailedRange; + float fMouseX = (float)S.nMouseX; + float fMouseY = (float)S.nMouseY; + + + for(uint32_t j = nContextSwitchStart; j != nContextSwitchEnd; j = (j+1) % MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE) + { + MP_ASSERT(j < MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE); + MicroProfileContextSwitch CS = S.ContextSwitch[j]; + + if(nTickIn == -1) + { + if(CS.nThreadIn == nThreadId) + { + nTickIn = CS.nTicks; + nThreadBefore = CS.nThreadOut; + } + } + else + { + if(CS.nThreadOut == nThreadId) + { + int64_t nTickOut = CS.nTicks; + float fMsStart = fToMs * MicroProfileLogTickDifference(nBaseTicks, nTickIn); + float fMsEnd = fToMs * MicroProfileLogTickDifference(nBaseTicks, nTickOut); + if(fMsStart <= fMsEnd) + { + float fXStart = fMsStart * fMsToScreen; + float fXEnd = fMsEnd * fMsToScreen; + float fYStart = (float)nY; + float fYEnd = fYStart + (MICROPROFILE_DETAILED_CONTEXT_SWITCH_HEIGHT); + uint32_t nColor = g_nMicroProfileContextSwitchThreadColors[CS.nCpu%MICROPROFILE_NUM_CONTEXT_SWITCH_COLORS]; + float fXDist = MicroProfileMax(fXStart - fMouseX, fMouseX - fXEnd); + bool bHover = fXDist < MICROPROFILE_HOVER_DIST && fYStart <= fMouseY && fMouseY <= fYEnd && nBaseY < fMouseY; + if(bHover) + 
{ + S.nRangeBegin = nTickIn; + S.nRangeEnd = nTickOut; + S.nContextSwitchHoverTickIn = nTickIn; + S.nContextSwitchHoverTickOut = nTickOut; + S.nContextSwitchHoverThread = CS.nThreadOut; + S.nContextSwitchHoverThreadBefore = nThreadBefore; + S.nContextSwitchHoverThreadAfter = CS.nThreadIn; + S.nContextSwitchHoverCpuNext = CS.nCpu; + nColor = S.nHoverColor; + } + if(CS.nCpu == S.nContextSwitchHoverCpu) + { + nColor = S.nHoverColorShared; + } + MicroProfileDrawBox(fXStart, fYStart, fXEnd, fYEnd, nColor|S.nOpacityForeground, MicroProfileBoxTypeFlat); + } + nTickIn = -1; + } + } + } +} + +void MicroProfileDrawDetailedBars(uint32_t nWidth, uint32_t nHeight, int nBaseY, int nSelectedFrame) +{ + MP_DEBUG_DUMP_RANGE(); + int nY = nBaseY - S.nOffsetY; + int64_t nNumBoxes = 0; + int64_t nNumLines = 0; + + uint32_t nFrameNext = (S.nFrameCurrent+1) % MICROPROFILE_MAX_FRAME_HISTORY; + MicroProfileFrameState* pFrameCurrent = &S.Frames[S.nFrameCurrent]; + MicroProfileFrameState* pFrameNext = &S.Frames[nFrameNext]; + + S.nRangeBegin = 0; + S.nRangeEnd = 0; + S.nRangeBeginGpu = 0; + S.nRangeEndGpu = 0; + S.nRangeBeginIndex = S.nRangeEndIndex = 0; + S.pRangeLog = 0; + uint64_t nFrameStartCpu = pFrameCurrent->nFrameStartCpu; + uint64_t nFrameStartGpu = pFrameCurrent->nFrameStartGpu; + float fToMsCpu = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu()); + float fToMsGpu = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondGpu()); + + float fDetailedOffset = S.fDetailedOffset; + float fDetailedRange = S.fDetailedRange; + int64_t nDetailedOffsetTicksCpu = MicroProfileMsToTick(fDetailedOffset, MicroProfileTicksPerSecondCpu()); + int64_t nDetailedOffsetTicksGpu = MicroProfileMsToTick(fDetailedOffset, MicroProfileTicksPerSecondGpu()); + int64_t nBaseTicksCpu = nDetailedOffsetTicksCpu + nFrameStartCpu; + int64_t nBaseTicksGpu = nDetailedOffsetTicksGpu + nFrameStartGpu; + int64_t nBaseTicksEndCpu = nBaseTicksCpu + MicroProfileMsToTick(fDetailedRange, MicroProfileTicksPerSecondCpu()); + + MicroProfileFrameState* pFrameFirst = pFrameCurrent; + int64_t nGapTime = MicroProfileTicksPerSecondCpu() * MICROPROFILE_GAP_TIME / 1000; + for(uint32_t i = 0; i < MICROPROFILE_MAX_FRAME_HISTORY - MICROPROFILE_GPU_FRAME_DELAY; ++i) + { + uint32_t nNextIndex = (S.nFrameCurrent + MICROPROFILE_MAX_FRAME_HISTORY - i) % MICROPROFILE_MAX_FRAME_HISTORY; + pFrameFirst = &S.Frames[nNextIndex]; + if(pFrameFirst->nFrameStartCpu <= nBaseTicksCpu-nGapTime) + break; + } + + float fMsBase = fToMsCpu * nDetailedOffsetTicksCpu; + float fMs = fDetailedRange; + float fMsEnd = fMs + fMsBase; + float fWidth = (float)nWidth; + float fMsToScreen = fWidth / fMs; + + { + float fRate = floor(2*(log10(fMs)-1))/2; + float fStep = powf(10.f, fRate); + float fRcpStep = 1.f / fStep; + int nColorIndex = (int)(floor(fMsBase*fRcpStep)); + float fStart = floor(fMsBase*fRcpStep) * fStep; + for(float f = fStart; f < fMsEnd; ) + { + float fStart = f; + float fNext = f + fStep; + MicroProfileDrawBox(((fStart-fMsBase) * fMsToScreen), nBaseY, (fNext-fMsBase) * fMsToScreen+1, nBaseY + nHeight, S.nOpacityBackground | g_nMicroProfileBackColors[nColorIndex++ & 1]); + f = fNext; + } + } + + nY += MICROPROFILE_TEXT_HEIGHT+1; + MicroProfileLogEntry* pMouseOver = S.pDisplayMouseOver; + MicroProfileLogEntry* pMouseOverNext = 0; + uint64_t nMouseOverToken = pMouseOver ? 
MicroProfileLogTimerIndex(*pMouseOver) : MICROPROFILE_INVALID_TOKEN; + float fMouseX = (float)S.nMouseX; + float fMouseY = (float)S.nMouseY; + uint64_t nHoverToken = MICROPROFILE_INVALID_TOKEN; + int64_t nHoverTime = 0; + + static int nHoverCounter = 155; + static int nHoverCounterDelta = 10; + nHoverCounter += nHoverCounterDelta; + if(nHoverCounter >= 245) + nHoverCounterDelta = -10; + else if(nHoverCounter < 100) + nHoverCounterDelta = 10; + S.nHoverColor = (nHoverCounter<<24)|(nHoverCounter<<16)|(nHoverCounter<<8)|nHoverCounter; + uint32_t nHoverCounterShared = nHoverCounter>>2; + S.nHoverColorShared = (nHoverCounterShared<<24)|(nHoverCounterShared<<16)|(nHoverCounterShared<<8)|nHoverCounterShared; + + uint32_t nLinesDrawn[MICROPROFILE_STACK_MAX]={0}; + + uint32_t nContextSwitchHoverThreadAfter = S.nContextSwitchHoverThreadAfter; + uint32_t nContextSwitchHoverThreadBefore = S.nContextSwitchHoverThreadBefore; + S.nContextSwitchHoverThread = S.nContextSwitchHoverThreadAfter = S.nContextSwitchHoverThreadBefore = -1; + + uint32_t nContextSwitchStart = -1; + uint32_t nContextSwitchEnd = -1; + S.nContextSwitchHoverCpuNext = 0xff; + S.nContextSwitchHoverTickIn = -1; + S.nContextSwitchHoverTickOut = -1; + if(S.bContextSwitchRunning) + { + MICROPROFILE_SCOPE(g_MicroProfileContextSwitchSearch); + uint32_t nContextSwitchPut = S.nContextSwitchPut; + nContextSwitchStart = nContextSwitchEnd = (nContextSwitchPut + MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE - 1) % MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE; + int64_t nSearchEnd = nBaseTicksEndCpu + MicroProfileMsToTick(30.f, MicroProfileTicksPerSecondCpu()); + int64_t nSearchBegin = nBaseTicksCpu - MicroProfileMsToTick(30.f, MicroProfileTicksPerSecondCpu()); + for(uint32_t i = 0; i < MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE; ++i) + { + uint32_t nIndex = (nContextSwitchPut + MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE - (i+1)) % MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE; + MicroProfileContextSwitch& CS = S.ContextSwitch[nIndex]; + if(CS.nTicks > nSearchEnd) + { + nContextSwitchEnd = nIndex; + } + if(CS.nTicks > nSearchBegin) + { + nContextSwitchStart = nIndex; + } + } + } + + bool bSkipBarView = S.bContextSwitchRunning && S.bContextSwitchNoBars; + + if(!bSkipBarView) + { + for(uint32_t i = 0; i < MICROPROFILE_MAX_THREADS; ++i) + { + MicroProfileThreadLog* pLog = S.Pool[i]; + if(!pLog) + continue; + + uint32_t nPut = pFrameNext->nLogStart[i]; + ///note: this may display new samples as old data, but this will only happen when + // unpaused, where the detailed view is hardly perceptible + uint32_t nFront = S.Pool[i]->nPut.load(std::memory_order_relaxed); + MicroProfileFrameState* pFrameLogFirst = pFrameCurrent; + MicroProfileFrameState* pFrameLogLast = pFrameNext; + uint32_t nGet = pFrameLogFirst->nLogStart[i]; + do + { + MP_ASSERT(pFrameLogFirst >= &S.Frames[0] && pFrameLogFirst < &S.Frames[MICROPROFILE_MAX_FRAME_HISTORY]); + uint32_t nNewGet = pFrameLogFirst->nLogStart[i]; + bool bIsValid = false; + if(nPut < nFront) + { + bIsValid = nNewGet <= nPut || nNewGet >= nFront; + } + else + { + bIsValid = nNewGet <= nPut && nNewGet >= nFront; + } + if(bIsValid) + { + nGet = nNewGet; + if(pFrameLogFirst->nFrameStartCpu > nBaseTicksEndCpu) + { + pFrameLogLast = pFrameLogFirst;//pick the last frame that ends after + } + + + pFrameLogFirst--; + if(pFrameLogFirst < &S.Frames[0]) + pFrameLogFirst = &S.Frames[MICROPROFILE_MAX_FRAME_HISTORY-1]; + } + else + { + break; + } + }while(pFrameLogFirst != pFrameFirst); + + + if(nGet == (uint32_t)-1) + continue; + MP_ASSERT(nGet != 
(uint32_t)-1); + + nPut = pFrameLogLast->nLogStart[i]; + + uint32_t nRange[2][2] = { {0, 0}, {0, 0}, }; + + MicroProfileGetRange(nPut, nGet, nRange); + if(nPut == nGet) + continue; + if(0==S.nThreadActive[i] && 0==S.nMenuAllThreads) + continue; + uint32_t nMaxStackDepth = 0; + + bool bGpu = pLog->nGpu != 0; + float fToMs = bGpu ? fToMsGpu : fToMsCpu; + int64_t nBaseTicks = bGpu ? nBaseTicksGpu : nBaseTicksCpu; + char ThreadName[MicroProfileThreadLog::THREAD_MAX_LEN + 16]; + uint64_t nThreadId = pLog->nThreadId; + snprintf(ThreadName, sizeof(ThreadName)-1, "%04llx: %s", nThreadId, &pLog->ThreadName[0] ); + nY += 3; + uint32_t nThreadColor = -1; + if(pLog->nThreadId == nContextSwitchHoverThreadAfter || pLog->nThreadId == nContextSwitchHoverThreadBefore) + nThreadColor = S.nHoverColorShared|0x906060; + MicroProfileDrawText(0, nY, nThreadColor, &ThreadName[0], (uint32_t)strlen(&ThreadName[0])); + nY += 3; + nY += MICROPROFILE_TEXT_HEIGHT + 1; + + if(S.bContextSwitchRunning) + { + MicroProfileDrawDetailedContextSwitchBars(nY, pLog->nThreadId, nContextSwitchStart, nContextSwitchEnd, nBaseTicks, nBaseY); + nY -= MICROPROFILE_DETAILED_BAR_HEIGHT; + nY += MICROPROFILE_DETAILED_CONTEXT_SWITCH_HEIGHT+1; + } + + uint32_t nYDelta = MICROPROFILE_DETAILED_BAR_HEIGHT; + uint32_t nStack[MICROPROFILE_STACK_MAX]; + uint32_t nStackPos = 0; + for(uint32_t j = 0; j < 2; ++j) + { + uint32_t nStart = nRange[j][0]; + uint32_t nEnd = nRange[j][1]; + for(uint32_t k = nStart; k < nEnd; ++k) + { + MicroProfileLogEntry* pEntry = pLog->Log + k; + int nType = MicroProfileLogType(*pEntry); + if(MP_LOG_ENTER == nType) + { + MP_ASSERT(nStackPos < MICROPROFILE_STACK_MAX); + nStack[nStackPos++] = k; + } + else if(MP_LOG_META == nType) + { + + } + else if(MP_LOG_LEAVE == nType) + { + if(0 == nStackPos) + { + continue; + } + + MicroProfileLogEntry* pEntryEnter = pLog->Log + nStack[nStackPos-1]; + if(MicroProfileLogTimerIndex(*pEntryEnter) != MicroProfileLogTimerIndex(*pEntry)) + { + //uprintf("mismatch %llx %llx\n", pEntryEnter->nToken, pEntry->nToken); + continue; + } + int64_t nTickStart = MicroProfileLogGetTick(*pEntryEnter); + int64_t nTickEnd = MicroProfileLogGetTick(*pEntry); + uint64_t nTimerIndex = MicroProfileLogTimerIndex(*pEntry); + uint32_t nColor = S.TimerInfo[nTimerIndex].nColor; + if(nMouseOverToken == nTimerIndex) + { + if(pEntry == pMouseOver) + { + nColor = S.nHoverColor; + if(bGpu) + { + S.nRangeBeginGpu = *pEntryEnter; + S.nRangeEndGpu = *pEntry; + S.nRangeBeginIndex = nStack[nStackPos-1]; + S.nRangeEndIndex = k; + S.pRangeLog = pLog; + } + else + { + S.nRangeBegin = *pEntryEnter; + S.nRangeEnd = *pEntry; + S.nRangeBeginIndex = nStack[nStackPos-1]; + S.nRangeEndIndex = k; + S.pRangeLog = pLog; + + } + } + else + { + nColor = S.nHoverColorShared; + } + } + + nMaxStackDepth = MicroProfileMax(nMaxStackDepth, nStackPos); + float fMsStart = fToMs * MicroProfileLogTickDifference(nBaseTicks, nTickStart); + float fMsEnd = fToMs * MicroProfileLogTickDifference(nBaseTicks, nTickEnd); + MP_ASSERT(fMsStart <= fMsEnd); + float fXStart = fMsStart * fMsToScreen; + float fXEnd = fMsEnd * fMsToScreen; + float fYStart = (float)(nY + nStackPos * nYDelta); + float fYEnd = fYStart + (MICROPROFILE_DETAILED_BAR_HEIGHT); + float fXDist = MicroProfileMax(fXStart - fMouseX, fMouseX - fXEnd); + bool bHover = fXDist < MICROPROFILE_HOVER_DIST && fYStart <= fMouseY && fMouseY <= fYEnd && nBaseY < fMouseY; + uint32_t nIntegerWidth = (uint32_t)(fXEnd - fXStart); + if(nIntegerWidth) + { + if(bHover && S.nActiveMenu == -1) + { + nHoverToken 
= MicroProfileLogTimerIndex(*pEntry); + #if MICROPROFILE_DEBUG + S.nHoverAddressEnter = (uint64_t)pEntryEnter; + S.nHoverAddressLeave = (uint64_t)pEntry; + #endif + nHoverTime = MicroProfileLogTickDifference(nTickStart, nTickEnd); + pMouseOverNext = pEntry; + } + + MicroProfileDrawBox(fXStart, fYStart, fXEnd, fYEnd, nColor|S.nOpacityForeground, MicroProfileBoxTypeBar); +#if MICROPROFILE_DETAILED_BAR_NAMES + if(nIntegerWidth>3*MICROPROFILE_TEXT_WIDTH) + { + int nCharacters = (nIntegerWidth - 2*MICROPROFILE_TEXT_WIDTH) / MICROPROFILE_TEXT_WIDTH; + MicroProfileDrawText(fXStart+1, fYStart+1, -1, S.TimerInfo[nTimerIndex].pName, MicroProfileMin(S.TimerInfo[nTimerIndex].nNameLen, nCharacters)); + } +#endif + ++nNumBoxes; + } + else + { + float fXAvg = 0.5f * (fXStart + fXEnd); + int nLineX = (int)floor(fXAvg+0.5f); + if(nLineX != (int)nLinesDrawn[nStackPos]) + { + if(bHover && S.nActiveMenu == -1) + { + nHoverToken = (uint32_t)MicroProfileLogTimerIndex(*pEntry); + nHoverTime = MicroProfileLogTickDifference(nTickStart, nTickEnd); + pMouseOverNext = pEntry; + } + nLinesDrawn[nStackPos] = nLineX; + MicroProfileDrawLineVertical(nLineX, fYStart + 0.5f, fYEnd + 0.5f, nColor|S.nOpacityForeground); + ++nNumLines; + } + } + nStackPos--; + } + } + } + nY += nMaxStackDepth * nYDelta + MICROPROFILE_DETAILED_BAR_HEIGHT+1; + } + } + if(S.bContextSwitchRunning && (S.bContextSwitchAllThreads||S.bContextSwitchNoBars)) + { + uint32_t nNumThreads = 0; + uint32_t nThreads[MICROPROFILE_MAX_CONTEXT_SWITCH_THREADS]; + for(uint32_t i = 0; i < MICROPROFILE_MAX_THREADS && S.Pool[i]; ++i) + nThreads[nNumThreads++] = S.Pool[i]->nThreadId; + uint32_t nNumThreadsBase = nNumThreads; + if(S.bContextSwitchAllThreads) + { + for(uint32_t i = nContextSwitchStart; i != nContextSwitchEnd; i = (i+1) % MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE) + { + MicroProfileContextSwitch CS = S.ContextSwitch[i]; + uint32_t nThreadId = CS.nThreadIn; + if(nThreadId) + { + bool bSeen = false; + for(uint32_t j = 0; j < nNumThreads; ++j) + { + if(nThreads[j] == nThreadId) + { + bSeen = true; + break; + } + } + if(!bSeen) + { + nThreads[nNumThreads++] = nThreadId; + } + } + if(nNumThreads == MICROPROFILE_MAX_CONTEXT_SWITCH_THREADS) + { + S.nOverflow = 10; + break; + } + } + std::sort(&nThreads[nNumThreadsBase], &nThreads[nNumThreads]); + } + uint32_t nStart = nNumThreadsBase; + if(S.bContextSwitchNoBars) + nStart = 0; + for(uint32_t i = nStart; i < nNumThreads; ++i) + { + uint32_t nThreadId = nThreads[i]; + if(nThreadId) + { + char ThreadName[MicroProfileThreadLog::THREAD_MAX_LEN + 16]; + const char* cLocal = MicroProfileIsLocalThread(nThreadId) ? "*": " "; + int nStrLen = snprintf(ThreadName, sizeof(ThreadName)-1, "%04x: %s", nThreadId, i < nNumThreadsBase ? 
&S.Pool[i]->ThreadName[0] : cLocal ); + uint32_t nThreadColor = -1; + if(nThreadId == nContextSwitchHoverThreadAfter || nThreadId == nContextSwitchHoverThreadBefore) + nThreadColor = S.nHoverColorShared|0x906060; + MicroProfileDrawDetailedContextSwitchBars(nY+2, nThreadId, nContextSwitchStart, nContextSwitchEnd, nBaseTicksCpu, nBaseY); + MicroProfileDrawText(0, nY, nThreadColor, &ThreadName[0], nStrLen); + nY += MICROPROFILE_TEXT_HEIGHT+1; + } + } + } + + S.nContextSwitchHoverCpu = S.nContextSwitchHoverCpuNext; + + + + + S.pDisplayMouseOver = pMouseOverNext; + + if(!S.nRunning) + { + if(nHoverToken != MICROPROFILE_INVALID_TOKEN && nHoverTime) + { + S.nHoverToken = nHoverToken; + S.nHoverTime = nHoverTime; + } + + if(nSelectedFrame != -1) + { + S.nRangeBegin = S.Frames[nSelectedFrame].nFrameStartCpu; + S.nRangeEnd = S.Frames[(nSelectedFrame+1)%MICROPROFILE_MAX_FRAME_HISTORY].nFrameStartCpu; + S.nRangeBeginGpu = S.Frames[nSelectedFrame].nFrameStartGpu; + S.nRangeEndGpu = S.Frames[(nSelectedFrame+1)%MICROPROFILE_MAX_FRAME_HISTORY].nFrameStartGpu; + } + if(S.nRangeBegin != S.nRangeEnd) + { + float fMsStart = fToMsCpu * MicroProfileLogTickDifference(nBaseTicksCpu, S.nRangeBegin); + float fMsEnd = fToMsCpu * MicroProfileLogTickDifference(nBaseTicksCpu, S.nRangeEnd); + float fXStart = fMsStart * fMsToScreen; + float fXEnd = fMsEnd * fMsToScreen; + MicroProfileDrawBox(fXStart, nBaseY, fXEnd, nHeight, MICROPROFILE_FRAME_COLOR_HIGHTLIGHT, MicroProfileBoxTypeFlat); + MicroProfileDrawLineVertical(fXStart, nBaseY, nHeight, MICROPROFILE_FRAME_COLOR_HIGHTLIGHT | 0x44000000); + MicroProfileDrawLineVertical(fXEnd, nBaseY, nHeight, MICROPROFILE_FRAME_COLOR_HIGHTLIGHT | 0x44000000); + + fMsStart += fDetailedOffset; + fMsEnd += fDetailedOffset; + char sBuffer[32]; + uint32_t nLenStart = snprintf(sBuffer, sizeof(sBuffer)-1, "%.2fms", fMsStart); + float fStartTextWidth = (float)((1+MICROPROFILE_TEXT_WIDTH) * nLenStart); + float fStartTextX = fXStart - fStartTextWidth - 2; + MicroProfileDrawBox(fStartTextX, nBaseY, fStartTextX + fStartTextWidth + 2, MICROPROFILE_TEXT_HEIGHT + 2 + nBaseY, 0x33000000, MicroProfileBoxTypeFlat); + MicroProfileDrawText(fStartTextX+1, nBaseY, (uint32_t)-1, sBuffer, nLenStart); + uint32_t nLenEnd = snprintf(sBuffer, sizeof(sBuffer)-1, "%.2fms", fMsEnd); + MicroProfileDrawBox(fXEnd+1, nBaseY, fXEnd+1+(1+MICROPROFILE_TEXT_WIDTH) * nLenEnd + 3, MICROPROFILE_TEXT_HEIGHT + 2 + nBaseY, 0x33000000, MicroProfileBoxTypeFlat); + MicroProfileDrawText(fXEnd+2, nBaseY+1, (uint32_t)-1, sBuffer, nLenEnd); + + if(S.nMouseRight) + { + MicroProfileZoomTo(S.nRangeBegin, S.nRangeEnd); + } + } + + if(S.nRangeBeginGpu != S.nRangeEndGpu) + { + float fMsStart = fToMsGpu * MicroProfileLogTickDifference(nBaseTicksGpu, S.nRangeBeginGpu); + float fMsEnd = fToMsGpu * MicroProfileLogTickDifference(nBaseTicksGpu, S.nRangeEndGpu); + float fXStart = fMsStart * fMsToScreen; + float fXEnd = fMsEnd * fMsToScreen; + MicroProfileDrawBox(fXStart, nBaseY, fXEnd, nHeight, MICROPROFILE_FRAME_COLOR_HIGHTLIGHT_GPU, MicroProfileBoxTypeFlat); + MicroProfileDrawLineVertical(fXStart, nBaseY, nHeight, MICROPROFILE_FRAME_COLOR_HIGHTLIGHT_GPU | 0x44000000); + MicroProfileDrawLineVertical(fXEnd, nBaseY, nHeight, MICROPROFILE_FRAME_COLOR_HIGHTLIGHT_GPU | 0x44000000); + + nBaseY += MICROPROFILE_TEXT_HEIGHT+1; + + fMsStart += fDetailedOffset; + fMsEnd += fDetailedOffset; + char sBuffer[32]; + uint32_t nLenStart = snprintf(sBuffer, sizeof(sBuffer)-1, "%.2fms", fMsStart); + float fStartTextWidth = (float)((1+MICROPROFILE_TEXT_WIDTH) * 
nLenStart); + float fStartTextX = fXStart - fStartTextWidth - 2; + MicroProfileDrawBox(fStartTextX, nBaseY, fStartTextX + fStartTextWidth + 2, MICROPROFILE_TEXT_HEIGHT + 2 + nBaseY, 0x33000000, MicroProfileBoxTypeFlat); + MicroProfileDrawText(fStartTextX+1, nBaseY, (uint32_t)-1, sBuffer, nLenStart); + uint32_t nLenEnd = snprintf(sBuffer, sizeof(sBuffer)-1, "%.2fms", fMsEnd); + MicroProfileDrawBox(fXEnd+1, nBaseY, fXEnd+1+(1+MICROPROFILE_TEXT_WIDTH) * nLenEnd + 3, MICROPROFILE_TEXT_HEIGHT + 2 + nBaseY, 0x33000000, MicroProfileBoxTypeFlat); + MicroProfileDrawText(fXEnd+2, nBaseY+1, (uint32_t)-1, sBuffer, nLenEnd); + } + } +} + + +void MicroProfileDrawDetailedFrameHistory(uint32_t nWidth, uint32_t nHeight, uint32_t nBaseY, uint32_t nSelectedFrame) +{ + const uint32_t nBarHeight = MICROPROFILE_FRAME_HISTORY_HEIGHT; + float fBaseX = (float)nWidth; + float fDx = fBaseX / MICROPROFILE_NUM_FRAMES; + + uint32_t nLastIndex = (S.nFrameCurrent+1) % MICROPROFILE_MAX_FRAME_HISTORY; + MicroProfileDrawBox(0, nBaseY, nWidth, nBaseY+MICROPROFILE_FRAME_HISTORY_HEIGHT, 0xff000000 | g_nMicroProfileBackColors[0], MicroProfileBoxTypeFlat); + float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu()) * S.fRcpReferenceTime; + float fToMsGpu = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondGpu()) * S.fRcpReferenceTime; + + + MicroProfileFrameState* pFrameCurrent = &S.Frames[S.nFrameCurrent]; + uint64_t nFrameStartCpu = pFrameCurrent->nFrameStartCpu; + int64_t nDetailedOffsetTicksCpu = MicroProfileMsToTick(S.fDetailedOffset, MicroProfileTicksPerSecondCpu()); + int64_t nCpuStart = nDetailedOffsetTicksCpu + nFrameStartCpu; + int64_t nCpuEnd = nCpuStart + MicroProfileMsToTick(S.fDetailedRange, MicroProfileTicksPerSecondCpu()); + + + float fSelectionStart = (float)nWidth; + float fSelectionEnd = 0.f; + for(uint32_t i = 0; i < MICROPROFILE_NUM_FRAMES; ++i) + { + uint32_t nIndex = (S.nFrameCurrent + MICROPROFILE_MAX_FRAME_HISTORY - i) % MICROPROFILE_MAX_FRAME_HISTORY; + MicroProfileFrameState* pCurrent = &S.Frames[nIndex]; + MicroProfileFrameState* pNext = &S.Frames[nLastIndex]; + + int64_t nTicks = pNext->nFrameStartCpu - pCurrent->nFrameStartCpu; + int64_t nTicksGpu = pNext->nFrameStartGpu - pCurrent->nFrameStartGpu; + float fScale = fToMs * nTicks; + float fScaleGpu = fToMsGpu * nTicksGpu; + fScale = fScale > 1.f ? 0.f : 1.f - fScale; + fScaleGpu = fScaleGpu > 1.f ? 
0.f : 1.f - fScaleGpu; + float fXEnd = fBaseX; + float fXStart = fBaseX - fDx; + fBaseX = fXStart; + uint32_t nColor = MICROPROFILE_FRAME_HISTORY_COLOR_CPU; + if(nIndex == nSelectedFrame) + nColor = (uint32_t)-1; + MicroProfileDrawBox(fXStart, nBaseY + fScale * nBarHeight, fXEnd, nBaseY+MICROPROFILE_FRAME_HISTORY_HEIGHT, nColor, MicroProfileBoxTypeBar); + if(pNext->nFrameStartCpu > nCpuStart) + { + fSelectionStart = fXStart; + } + if(pCurrent->nFrameStartCpu < nCpuEnd && fSelectionEnd == 0.f) + { + fSelectionEnd = fXEnd; + } + nLastIndex = nIndex; + } + MicroProfileDrawBox(fSelectionStart, nBaseY, fSelectionEnd, nBaseY+MICROPROFILE_FRAME_HISTORY_HEIGHT, MICROPROFILE_FRAME_HISTORY_COLOR_HIGHTLIGHT, MicroProfileBoxTypeFlat); +} +void MicroProfileDrawDetailedView(uint32_t nWidth, uint32_t nHeight) +{ + MICROPROFILE_SCOPE(g_MicroProfileDetailed); + uint32_t nBaseY = S.nBarHeight + 1; + + int nSelectedFrame = -1; + if(S.nMouseY > nBaseY && S.nMouseY <= nBaseY + MICROPROFILE_FRAME_HISTORY_HEIGHT && S.nActiveMenu == -1) + { + + nSelectedFrame = ((MICROPROFILE_NUM_FRAMES) * (S.nWidth-S.nMouseX) / S.nWidth); + nSelectedFrame = (S.nFrameCurrent + MICROPROFILE_MAX_FRAME_HISTORY - nSelectedFrame) % MICROPROFILE_MAX_FRAME_HISTORY; + S.nHoverFrame = nSelectedFrame; + if(S.nMouseRight) + { + int64_t nRangeBegin = S.Frames[nSelectedFrame].nFrameStartCpu; + int64_t nRangeEnd = S.Frames[(nSelectedFrame+1)%MICROPROFILE_MAX_FRAME_HISTORY].nFrameStartCpu; + MicroProfileZoomTo(nRangeBegin, nRangeEnd); + } + if(S.nMouseDownLeft) + { + uint64_t nFrac = (1024 * (MICROPROFILE_NUM_FRAMES) * (S.nMouseX) / S.nWidth) % 1024; + int64_t nRangeBegin = S.Frames[nSelectedFrame].nFrameStartCpu; + int64_t nRangeEnd = S.Frames[(nSelectedFrame+1)%MICROPROFILE_MAX_FRAME_HISTORY].nFrameStartCpu; + MicroProfileCenter(nRangeBegin + (nRangeEnd-nRangeBegin) * nFrac / 1024); + } + } + else + { + S.nHoverFrame = -1; + } + + MicroProfileDrawDetailedBars(nWidth, nHeight, nBaseY + MICROPROFILE_FRAME_HISTORY_HEIGHT, nSelectedFrame); + MicroProfileDrawDetailedFrameHistory(nWidth, nHeight, nBaseY, nSelectedFrame); +} + +template<typename T> +void MicroProfileLoopActiveGroupsDraw(int32_t nX, int32_t nY, const char* pName, T CB) +{ + if(pName) + MicroProfileDrawText(nX, nY, (uint32_t)-1, pName, (uint32_t)strlen(pName)); + + nY += S.nBarHeight + 2; + uint64_t nGroup = S.nActiveGroup = S.nMenuAllGroups ? S.nGroupMask : S.nMenuActiveGroup; + uint32_t nCount = 0; + for(uint32_t j = 0; j < MICROPROFILE_MAX_GROUPS; ++j) + { + uint64_t nMask = 1ll << j; + if(nMask & nGroup) + { + nY += S.nBarHeight + 1; + for(uint32_t i = 0; i < S.nTotalTimers;++i) + { + uint64_t nTokenMask = MicroProfileGetGroupMask(S.TimerInfo[i].nToken); + if(nTokenMask & nMask) + { + if(nY >= 0) + CB(i, nCount, nMask, nX, nY); + + nCount += 2; + nY += S.nBarHeight + 1; + + if(nY > (int)S.nHeight) + return; + } + } + + } + } +} + + +void MicroProfileCalcTimers(float* pTimers, float* pAverage, float* pMax, float* pCallAverage, float* pExclusive, float* pAverageExclusive, float* pMaxExclusive, uint64_t nGroup, uint32_t nSize) +{ + uint32_t nCount = 0; + uint64_t nMask = 1; + + for(uint32_t j = 0; j < MICROPROFILE_MAX_GROUPS; ++j) + { + if(nMask & nGroup) + { + const float fToMs = MicroProfileTickToMsMultiplier(S.GroupInfo[j].Type == MicroProfileTokenTypeGpu ? 
MicroProfileTicksPerSecondGpu() : MicroProfileTicksPerSecondCpu()); + for(uint32_t i = 0; i < S.nTotalTimers;++i) + { + uint64_t nTokenMask = MicroProfileGetGroupMask(S.TimerInfo[i].nToken); + if(nTokenMask & nMask) + { + { + uint32_t nTimer = i; + uint32_t nIdx = nCount; + uint32_t nAggregateFrames = S.nAggregateFrames ? S.nAggregateFrames : 1; + uint32_t nAggregateCount = S.Aggregate[nTimer].nCount ? S.Aggregate[nTimer].nCount : 1; + float fToPrc = S.fRcpReferenceTime; + float fMs = fToMs * (S.Frame[nTimer].nTicks); + float fPrc = MicroProfileMin(fMs * fToPrc, 1.f); + float fAverageMs = fToMs * (S.Aggregate[nTimer].nTicks / nAggregateFrames); + float fAveragePrc = MicroProfileMin(fAverageMs * fToPrc, 1.f); + float fMaxMs = fToMs * (S.AggregateMax[nTimer]); + float fMaxPrc = MicroProfileMin(fMaxMs * fToPrc, 1.f); + float fCallAverageMs = fToMs * (S.Aggregate[nTimer].nTicks / nAggregateCount); + float fCallAveragePrc = MicroProfileMin(fCallAverageMs * fToPrc, 1.f); + float fMsExclusive = fToMs * (S.FrameExclusive[nTimer]); + float fPrcExclusive = MicroProfileMin(fMsExclusive * fToPrc, 1.f); + float fAverageMsExclusive = fToMs * (S.AggregateExclusive[nTimer] / nAggregateFrames); + float fAveragePrcExclusive = MicroProfileMin(fAverageMsExclusive * fToPrc, 1.f); + float fMaxMsExclusive = fToMs * (S.AggregateMaxExclusive[nTimer]); + float fMaxPrcExclusive = MicroProfileMin(fMaxMsExclusive * fToPrc, 1.f); + pTimers[nIdx] = fMs; + pTimers[nIdx+1] = fPrc; + pAverage[nIdx] = fAverageMs; + pAverage[nIdx+1] = fAveragePrc; + pMax[nIdx] = fMaxMs; + pMax[nIdx+1] = fMaxPrc; + pCallAverage[nIdx] = fCallAverageMs; + pCallAverage[nIdx+1] = fCallAveragePrc; + pExclusive[nIdx] = fMsExclusive; + pExclusive[nIdx+1] = fPrcExclusive; + pAverageExclusive[nIdx] = fAverageMsExclusive; + pAverageExclusive[nIdx+1] = fAveragePrcExclusive; + pMaxExclusive[nIdx] = fMaxMsExclusive; + pMaxExclusive[nIdx+1] = fMaxPrcExclusive; + } + nCount += 2; + } + } + } + nMask <<= 1ll; + } +} + +#define SBUF_MAX 32 + +uint32_t MicroProfileDrawBarArray(int32_t nX, int32_t nY, float* pTimers, const char* pName, uint32_t nTotalHeight) +{ + const uint32_t nHeight = S.nBarHeight; + const uint32_t nWidth = S.nBarWidth; + const uint32_t nTextWidth = 6 * (1+MICROPROFILE_TEXT_WIDTH); + const float fWidth = (float)S.nBarWidth; + + MicroProfileDrawLineVertical(nX-5, nY, nTotalHeight, S.nOpacityBackground|g_nMicroProfileBackColors[0]|g_nMicroProfileBackColors[1]); + + MicroProfileLoopActiveGroupsDraw(nX, nY, pName, + [=](uint32_t nTimer, uint32_t nIdx, uint64_t nGroupMask, uint32_t nX, uint32_t nY){ + char sBuffer[SBUF_MAX]; + int nLen = snprintf(sBuffer, SBUF_MAX-1, "%5.2f", pTimers[nIdx]); + MicroProfileDrawBox(nX + nTextWidth, nY, nX + nTextWidth + fWidth * pTimers[nIdx+1], nY + nHeight, S.nOpacityForeground|S.TimerInfo[nTimer].nColor, MicroProfileBoxTypeBar); + MicroProfileDrawText(nX, nY, (uint32_t)-1, sBuffer, nLen); + }); + return nWidth + 5 + nTextWidth; + +} + +uint32_t MicroProfileDrawBarCallCount(int32_t nX, int32_t nY, const char* pName) +{ + MicroProfileLoopActiveGroupsDraw(nX, nY, pName, + [](uint32_t nTimer, uint32_t nIdx, uint64_t nGroupMask, uint32_t nX, uint32_t nY){ + char sBuffer[SBUF_MAX]; + int nLen = snprintf(sBuffer, SBUF_MAX-1, "%5d", S.Frame[nTimer].nCount);//fix + MicroProfileDrawText(nX, nY, (uint32_t)-1, sBuffer, nLen); + }); + uint32_t nTextWidth = 6 * MICROPROFILE_TEXT_WIDTH; + return 5 + nTextWidth; +} + +uint32_t MicroProfileDrawBarMetaCount(int32_t nX, int32_t nY, uint64_t* pCounters, const char* pName, uint32_t 
nTotalHeight) +{ + MicroProfileDrawLineVertical(nX-5, nY, nTotalHeight, S.nOpacityBackground|g_nMicroProfileBackColors[0]|g_nMicroProfileBackColors[1]); + uint32_t nTextWidth = (1+MICROPROFILE_TEXT_WIDTH) * MicroProfileMax(6, (uint32_t)strlen(pName)); + + + MicroProfileLoopActiveGroupsDraw(nX, nY, pName, + [=](uint32_t nTimer, uint32_t nIdx, uint64_t nGroupMask, uint32_t nX, uint32_t nY){ + char sBuffer[SBUF_MAX]; + int nLen = snprintf(sBuffer, SBUF_MAX-1, "%5llu", pCounters[nTimer]); + MicroProfileDrawText(nX + nTextWidth - nLen * (MICROPROFILE_TEXT_WIDTH+1), nY, (uint32_t)-1, sBuffer, nLen); + }); + return 5 + nTextWidth; +} + + +uint32_t MicroProfileDrawBarLegend(int32_t nX, int32_t nY, uint32_t nTotalHeight) +{ + MicroProfileDrawLineVertical(nX-5, nY, nTotalHeight, S.nOpacityBackground | g_nMicroProfileBackColors[0]|g_nMicroProfileBackColors[1]); + MicroProfileLoopActiveGroupsDraw(nX, nY, 0, + [](uint32_t nTimer, uint32_t nIdx, uint64_t nGroupMask, uint32_t nX, uint32_t nY){ + MicroProfileDrawText(nX, nY, S.TimerInfo[nTimer].nColor, S.TimerInfo[nTimer].pName, (uint32_t)strlen(S.TimerInfo[nTimer].pName)); + if(S.nMouseY >= nY && S.nMouseY < nY + MICROPROFILE_TEXT_HEIGHT+1 && S.nMouseX < nX + 20 * (MICROPROFILE_TEXT_WIDTH+1)) + { + S.nHoverToken = nTimer; + S.nHoverTime = 0; + } + }); + return nX; +} + +bool MicroProfileDrawGraph(uint32_t nScreenWidth, uint32_t nScreenHeight) +{ + MICROPROFILE_SCOPE(g_MicroProfileDrawGraph); + bool bEnabled = false; + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + if(S.Graph[i].nToken != MICROPROFILE_INVALID_TOKEN) + bEnabled = true; + if(!bEnabled) + return false; + + uint32_t nX = nScreenWidth - MICROPROFILE_GRAPH_WIDTH; + uint32_t nY = nScreenHeight - MICROPROFILE_GRAPH_HEIGHT; + MicroProfileDrawBox(nX, nY, nX + MICROPROFILE_GRAPH_WIDTH, nY + MICROPROFILE_GRAPH_HEIGHT, S.nOpacityBackground | g_nMicroProfileBackColors[0]|g_nMicroProfileBackColors[1]); + bool bMouseOver = S.nMouseX >= nX && S.nMouseY >= nY; + float fMouseXPrc =(float(S.nMouseX - nX)) / MICROPROFILE_GRAPH_WIDTH; + if(bMouseOver) + { + float fXAvg = fMouseXPrc * MICROPROFILE_GRAPH_WIDTH + nX; + MicroProfileDrawLineVertical(fXAvg, nY, nY + MICROPROFILE_GRAPH_HEIGHT, (uint32_t)-1); + } + + + float fY = (float)nScreenHeight; + float fDX = MICROPROFILE_GRAPH_WIDTH * 1.f / MICROPROFILE_GRAPH_HISTORY; + float fDY = MICROPROFILE_GRAPH_HEIGHT; + uint32_t nPut = S.nGraphPut; + float* pGraphData = (float*)alloca(sizeof(float)* MICROPROFILE_GRAPH_HISTORY*2); + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + if(S.Graph[i].nToken != MICROPROFILE_INVALID_TOKEN) + { + uint32_t nGroupId = MicroProfileGetGroupIndex(S.Graph[i].nToken); + bool bGpu = S.GroupInfo[nGroupId].Type == MicroProfileTokenTypeGpu; + float fToMs = MicroProfileTickToMsMultiplier(bGpu ? 
MicroProfileTicksPerSecondGpu() : MicroProfileTicksPerSecondCpu()); + float fToPrc = fToMs * S.fRcpReferenceTime * 3 / 4; + + float fX = (float)nX; + for(uint32_t j = 0; j < MICROPROFILE_GRAPH_HISTORY; ++j) + { + float fWeight = MicroProfileMin(fToPrc * (S.Graph[i].nHistory[(j+nPut)%MICROPROFILE_GRAPH_HISTORY]), 1.f); + pGraphData[(j*2)] = fX; + pGraphData[(j*2)+1] = fY - fDY * fWeight; + fX += fDX; + } + MicroProfileDrawLine2D(MICROPROFILE_GRAPH_HISTORY, pGraphData, S.TimerInfo[MicroProfileGetTimerIndex(S.Graph[i].nToken)].nColor); + } + } + { + float fY1 = 0.25f * MICROPROFILE_GRAPH_HEIGHT + nY; + float fY2 = 0.50f * MICROPROFILE_GRAPH_HEIGHT + nY; + float fY3 = 0.75f * MICROPROFILE_GRAPH_HEIGHT + nY; + MicroProfileDrawLineHorizontal(nX, nX + MICROPROFILE_GRAPH_WIDTH, fY1, 0xffdd4444); + MicroProfileDrawLineHorizontal(nX, nX + MICROPROFILE_GRAPH_WIDTH, fY2, 0xff000000| g_nMicroProfileBackColors[0]); + MicroProfileDrawLineHorizontal(nX, nX + MICROPROFILE_GRAPH_WIDTH, fY3, 0xff000000|g_nMicroProfileBackColors[0]); + + char buf[32]; + int nLen = snprintf(buf, sizeof(buf)-1, "%5.2fms", S.fReferenceTime); + MicroProfileDrawText(nX+1, fY1 - (2+MICROPROFILE_TEXT_HEIGHT), (uint32_t)-1, buf, nLen); + } + + + + if(bMouseOver) + { + uint32_t pColors[MICROPROFILE_MAX_GRAPHS]; + MicroProfileStringArray Strings; + MicroProfileStringArrayClear(&Strings); + uint32_t nTextCount = 0; + uint32_t nGraphIndex = (S.nGraphPut + MICROPROFILE_GRAPH_HISTORY - int(MICROPROFILE_GRAPH_HISTORY*(1.f - fMouseXPrc))) % MICROPROFILE_GRAPH_HISTORY; + + uint32_t nX = S.nMouseX; + uint32_t nY = S.nMouseY + 20; + + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + if(S.Graph[i].nToken != MICROPROFILE_INVALID_TOKEN) + { + uint32_t nGroupId = MicroProfileGetGroupIndex(S.Graph[i].nToken); + bool bGpu = S.GroupInfo[nGroupId].Type == MicroProfileTokenTypeGpu; + float fToMs = MicroProfileTickToMsMultiplier(bGpu ? 
MicroProfileTicksPerSecondGpu() : MicroProfileTicksPerSecondCpu()); + uint32_t nIndex = MicroProfileGetTimerIndex(S.Graph[i].nToken); + uint32_t nColor = S.TimerInfo[nIndex].nColor; + const char* pName = S.TimerInfo[nIndex].pName; + pColors[nTextCount++] = nColor; + MicroProfileStringArrayAddLiteral(&Strings, pName); + MicroProfileStringArrayFormat(&Strings, "%5.2fms", fToMs * (S.Graph[i].nHistory[nGraphIndex])); + } + } + if(nTextCount) + { + MicroProfileDrawFloatWindow(nX, nY, Strings.ppStrings, Strings.nNumStrings, 0, pColors); + } + + if(S.nMouseRight) + { + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + S.Graph[i].nToken = MICROPROFILE_INVALID_TOKEN; + } + } + } + + return bMouseOver; +} + +void MicroProfileDumpTimers() +{ + uint64_t nActiveGroup = S.nGroupMask; + + uint32_t nNumTimers = S.nTotalTimers; + uint32_t nBlockSize = 2 * nNumTimers; + float* pTimers = (float*)alloca(nBlockSize * 7 * sizeof(float)); + float* pAverage = pTimers + nBlockSize; + float* pMax = pTimers + 2 * nBlockSize; + float* pCallAverage = pTimers + 3 * nBlockSize; + float* pTimersExclusive = pTimers + 4 * nBlockSize; + float* pAverageExclusive = pTimers + 5 * nBlockSize; + float* pMaxExclusive = pTimers + 6 * nBlockSize; + MicroProfileCalcTimers(pTimers, pAverage, pMax, pCallAverage, pTimersExclusive, pAverageExclusive, pMaxExclusive, nActiveGroup, nNumTimers); + + MICROPROFILE_PRINTF("%11s, ", "Time"); + MICROPROFILE_PRINTF("%11s, ", "Average"); + MICROPROFILE_PRINTF("%11s, ", "Max"); + MICROPROFILE_PRINTF("%11s, ", "Call Avg"); + MICROPROFILE_PRINTF("%9s, ", "Count"); + MICROPROFILE_PRINTF("%11s, ", "Excl"); + MICROPROFILE_PRINTF("%11s, ", "Avg Excl"); + MICROPROFILE_PRINTF("%11s, \n", "Max Excl"); + + for(uint32_t j = 0; j < MICROPROFILE_MAX_GROUPS; ++j) + { + uint64_t nMask = 1ll << j; + if(nMask & nActiveGroup) + { + MICROPROFILE_PRINTF("%s\n", S.GroupInfo[j].pName); + for(uint32_t i = 0; i < S.nTotalTimers;++i) + { + uint64_t nTokenMask = MicroProfileGetGroupMask(S.TimerInfo[i].nToken); + if(nTokenMask & nMask) + { + uint32_t nIdx = i * 2; + MICROPROFILE_PRINTF("%9.2fms, ", pTimers[nIdx]); + MICROPROFILE_PRINTF("%9.2fms, ", pAverage[nIdx]); + MICROPROFILE_PRINTF("%9.2fms, ", pMax[nIdx]); + MICROPROFILE_PRINTF("%9.2fms, ", pCallAverage[nIdx]); + MICROPROFILE_PRINTF("%9d, ", S.Frame[i].nCount); + MICROPROFILE_PRINTF("%9.2fms, ", pTimersExclusive[nIdx]); + MICROPROFILE_PRINTF("%9.2fms, ", pAverageExclusive[nIdx]); + MICROPROFILE_PRINTF("%9.2fms, ", pMaxExclusive[nIdx]); + MICROPROFILE_PRINTF("%s\n", S.TimerInfo[i].pName); + } + } + } + } +} + +void MicroProfileDrawBarView(uint32_t nScreenWidth, uint32_t nScreenHeight) +{ + uint64_t nActiveGroup = S.nMenuAllGroups ? 
S.nGroupMask : S.nMenuActiveGroup; + if(!nActiveGroup) + return; + MICROPROFILE_SCOPE(g_MicroProfileDrawBarView); + + const uint32_t nHeight = S.nBarHeight; + int nColorIndex = 0; + uint32_t nX = 0; + uint32_t nY = nHeight + 1 - S.nOffsetY; + uint32_t nNumTimers = 0; + uint32_t nNumGroups = 0; + uint32_t nMaxTimerNameLen = 1; + for(uint32_t j = 0; j < MICROPROFILE_MAX_GROUPS; ++j) + { + if(nActiveGroup & (1ll << j)) + { + nNumTimers += S.GroupInfo[j].nNumTimers; + nNumGroups += 1; + nMaxTimerNameLen = MicroProfileMax(nMaxTimerNameLen, S.GroupInfo[j].nMaxTimerNameLen); + } + } + uint32_t nBlockSize = 2 * nNumTimers; + float* pTimers = (float*)alloca(nBlockSize * 7 * sizeof(float)); + float* pAverage = pTimers + nBlockSize; + float* pMax = pTimers + 2 * nBlockSize; + float* pCallAverage = pTimers + 3 * nBlockSize; + float* pTimersExclusive = pTimers + 4 * nBlockSize; + float* pAverageExclusive = pTimers + 5 * nBlockSize; + float* pMaxExclusive = pTimers + 6 * nBlockSize; + MicroProfileCalcTimers(pTimers, pAverage, pMax, pCallAverage, pTimersExclusive, pAverageExclusive, pMaxExclusive, nActiveGroup, nNumTimers); + { + uint32_t nWidth = 0; + for(uint32_t i = 1; i ; i <<= 1) + { + if(S.nBars & i) + { + nWidth += S.nBarWidth + 5 + 6 * (1+MICROPROFILE_TEXT_WIDTH); + if(i & MP_DRAW_CALL_COUNT) + nWidth += 5 + 6 * MICROPROFILE_TEXT_WIDTH; + } + } + nWidth += (1+nMaxTimerNameLen) * (MICROPROFILE_TEXT_WIDTH+1); + for(uint32_t i = 0; i < nNumTimers+nNumGroups+1; ++i) + { + int nY0 = nY + i * (nHeight + 1); + MicroProfileDrawBox(nX, nY0, nWidth, nY0 + (nHeight+1)+1, S.nOpacityBackground | g_nMicroProfileBackColors[nColorIndex++ & 1]); + } + } + int nTotalHeight = (nNumTimers+nNumGroups+2) * (nHeight+1); + uint32_t nLegendOffset = 1; + for(uint32_t j = 0; j < MICROPROFILE_MAX_GROUPS; ++j) + { + if(nActiveGroup & (1ll << j)) + { + MicroProfileDrawText(nX, nY + (1+nHeight) * nLegendOffset, (uint32_t)-1, S.GroupInfo[j].pName, S.GroupInfo[j].nNameLen); + nLegendOffset += S.GroupInfo[j].nNumTimers+1; + } + } + if(S.nBars & MP_DRAW_TIMERS) + nX += MicroProfileDrawBarArray(nX, nY, pTimers, "Time", nTotalHeight) + 1; + if(S.nBars & MP_DRAW_AVERAGE) + nX += MicroProfileDrawBarArray(nX, nY, pAverage, "Average", nTotalHeight) + 1; + if(S.nBars & MP_DRAW_MAX) + nX += MicroProfileDrawBarArray(nX, nY, pMax, "Max Time", nTotalHeight) + 1; + if(S.nBars & MP_DRAW_CALL_COUNT) + { + nX += MicroProfileDrawBarArray(nX, nY, pCallAverage, "Call Average", nTotalHeight) + 1; + nX += MicroProfileDrawBarCallCount(nX, nY, "Count") + 1; + } + if(S.nBars & MP_DRAW_TIMERS_EXCLUSIVE) + nX += MicroProfileDrawBarArray(nX, nY, pTimersExclusive, "Exclusive Time", nTotalHeight) + 1; + if(S.nBars & MP_DRAW_AVERAGE_EXCLUSIVE) + nX += MicroProfileDrawBarArray(nX, nY, pAverageExclusive, "Exclusive Average", nTotalHeight) + 1; + if(S.nBars & MP_DRAW_MAX_EXCLUSIVE) + nX += MicroProfileDrawBarArray(nX, nY, pMaxExclusive, "Exclusive Max Time", nTotalHeight) + 1; + + for(int i = 0; i < MICROPROFILE_META_MAX; ++i) + { + if(0 != (S.nBars & (MP_DRAW_META_FIRST<<i))) + { + nX += MicroProfileDrawBarMetaCount(nX, nY, &S.MetaCounters[i].nCounters[0], S.MetaCounters[i].pName, nTotalHeight) + 1; + } + } + MicroProfileDrawBarLegend(nX, nY, nTotalHeight); +} + +void MicroProfileDrawMenu(uint32_t nWidth, uint32_t nHeight) +{ + typedef std::function<const char* (int, bool&)> SubmenuCallback; + typedef std::function<void (int)> ClickCallback; + SubmenuCallback GroupCallback[] = + { [] (int index, bool& bSelected) -> const char*{ + switch(index) + { + case 0: + bSelected = S.nDisplay == MP_DRAW_DETAILED; + return "Detailed"; + case 1: + bSelected = S.nDisplay == MP_DRAW_BARS; + return "Timers"; + case 2: + bSelected = S.nDisplay == MP_DRAW_HIDDEN; + return "Hidden"; + case 3: + bSelected = false; + return "Off"; + + default: return 0; + } + }, + [] (int index, bool& bSelected) 
-> const char*{ + if(index == 0) + { + bSelected = S.nMenuAllGroups != 0; + return "ALL"; + } + else + { + index = index-1; + bSelected = 0 != (S.nMenuActiveGroup & (1ll << index)); + if(index < MICROPROFILE_MAX_GROUPS && S.GroupInfo[index].pName) + return S.GroupInfo[index].pName; + else + return 0; + } + }, + [] (int index, bool& bSelected) -> const char*{ + if(index < sizeof(g_MicroProfileAggregatePresets)/sizeof(g_MicroProfileAggregatePresets[0])) + { + int val = g_MicroProfileAggregatePresets[index]; + bSelected = (int)S.nAggregateFlip == val; + if(0 == val) + return "Infinite"; + else + { + static char buf[128]; + snprintf(buf, sizeof(buf)-1, "%7d", val); + return buf; + } + } + return 0; + }, + [] (int index, bool& bSelected) -> const char*{ + bSelected = 0 != (S.nBars & (1 << index)); + switch(index) + { + case 0: return "Time"; + case 1: return "Average"; + case 2: return "Max"; + case 3: return "Call Count"; + case 4: return "Exclusive Timers"; + case 5: return "Exclusive Average"; + case 6: return "Exclusive Max"; + } + int nMetaIndex = index - 7; + if(nMetaIndex < MICROPROFILE_META_MAX) + { + return S.MetaCounters[nMetaIndex].pName; + } + return 0; + }, + [] (int index, bool& bSelected) -> const char*{ + if(index >= nOptionSize) return 0; + switch(Options[index].nSubType) + { + case 0: + bSelected = S.fReferenceTime == g_MicroProfileReferenceTimePresets[Options[index].nIndex]; + break; + case 1: + bSelected = S.nOpacityBackground>>24 == g_MicroProfileOpacityPresets[Options[index].nIndex]; + break; + case 2: + bSelected = S.nOpacityForeground>>24 == g_MicroProfileOpacityPresets[Options[index].nIndex]; + break; +#if MICROPROFILE_CONTEXT_SWITCH_TRACE + case 3: + { + switch(Options[index].nIndex) + { + case 0: + bSelected = S.bContextSwitchRunning; + break; + case 1: + bSelected = S.bContextSwitchAllThreads; + break; + case 2: + bSelected = S.bContextSwitchNoBars; + break; + } + } + break; +#endif + } + return Options[index].Text; + }, + + [] (int index, bool& bSelected) -> const char*{ + static char buf[128]; + bSelected = false; + int nNumPresets = sizeof(g_MicroProfilePresetNames) / sizeof(g_MicroProfilePresetNames[0]); + int nIndexSave = index - nNumPresets - 1; + if(index == nNumPresets) + return "--"; + else if(nIndexSave >=0 && nIndexSave < nNumPresets) + { + snprintf(buf, sizeof(buf)-1, "Save '%s'", g_MicroProfilePresetNames[nIndexSave]); + return buf; + } + else if(index < nNumPresets) + { + snprintf(buf, sizeof(buf)-1, "Load '%s'", g_MicroProfilePresetNames[index]); + return buf; + } + return 0; + }, + [] (int index, bool& bSelected) -> const char*{ + return 0; + }, + [] (int index, bool& bSelected) -> const char*{ + return 0; + }, + [] (int index, bool& bSelected) -> const char*{ + return 0; + }, + + + }; + ClickCallback CBClick[] = + { + [](int nIndex) + { + switch(nIndex) + { + case 0: + S.nDisplay = MP_DRAW_DETAILED; + break; + case 1: + S.nDisplay = MP_DRAW_BARS; + break; + case 2: + S.nDisplay = MP_DRAW_HIDDEN; + break; + case 3: + S.nDisplay = 0; + break; + } + }, + [](int nIndex) + { + if(nIndex == 0) + S.nMenuAllGroups = 1-S.nMenuAllGroups; + else + S.nMenuActiveGroup ^= (1ll << (nIndex-1)); + }, + [](int nIndex) + { + S.nAggregateFlip = g_MicroProfileAggregatePresets[nIndex]; + if(0 == S.nAggregateFlip) + { + memset(S.AggregateTimers, 0, sizeof(S.AggregateTimers)); + memset(S.MaxTimers, 0, sizeof(S.MaxTimers)); + memset(S.AggregateTimersExclusive, 0, sizeof(S.AggregateTimersExclusive)); + memset(S.MaxTimersExclusive, 0, sizeof(S.MaxTimersExclusive)); + S.nFlipAggregate = 0; + S.nFlipMax = 0; + S.nAggregateFlipCount = 0; + } + }, + [](int nIndex) + { + S.nBars ^= (1 << nIndex); + }, + [](int nIndex) + { + switch(Options[nIndex].nSubType) + { + case 0: + S.fReferenceTime = g_MicroProfileReferenceTimePresets[Options[nIndex].nIndex]; + S.fRcpReferenceTime = 1.f / 
S.fReferenceTime; + break; + case 1: + S.nOpacityBackground = g_MicroProfileOpacityPresets[Options[nIndex].nIndex]<<24; + break; + case 2: + S.nOpacityForeground = g_MicroProfileOpacityPresets[Options[nIndex].nIndex]<<24; + break; +#if MICROPROFILE_CONTEXT_SWITCH_TRACE + case 3: + { + switch(Options[nIndex].nIndex) + { + case 0: + if(S.bContextSwitchRunning) + { + MicroProfileStopContextSwitchTrace(); + } + else + { + MicroProfileStartContextSwitchTrace(); + } + break; + case 1: + S.bContextSwitchAllThreads = !S.bContextSwitchAllThreads; + break; + case 2: + S.bContextSwitchNoBars= !S.bContextSwitchNoBars; + break; + + } + } + break; +#endif + } + }, + [](int nIndex) + { + int nNumPresets = sizeof(g_MicroProfilePresetNames) / sizeof(g_MicroProfilePresetNames[0]); + int nIndexSave = nIndex - nNumPresets - 1; + if(nIndexSave >= 0 && nIndexSave < nNumPresets) + { + MicroProfileSavePreset(g_MicroProfilePresetNames[nIndexSave]); + } + else if(nIndex >= 0 && nIndex < nNumPresets) + { + MicroProfileLoadPreset(g_MicroProfilePresetNames[nIndex]); + } + }, + [](int nIndex) + { + }, + [](int nIndex) + { + }, + [](int nIndex) + { + }, + }; + + uint32_t nSelectMenu = (uint32_t)-1; + for(uint32_t i = 0; i < nNumMenuItems; ++i) + { + nMenuX[i] = nX; + uint32_t nLen = (uint32_t)strlen(pMenuText[i]); + uint32_t nEnd = nX + nLen * (MICROPROFILE_TEXT_WIDTH+1); + if(S.nMouseY <= MICROPROFILE_TEXT_HEIGHT && S.nMouseX <= nEnd && S.nMouseX >= nX) + { + MicroProfileDrawBox(nX-1, nY, nX + nLen * (MICROPROFILE_TEXT_WIDTH+1), nY +(S.nBarHeight+1)+1, 0xff888888); + nSelectMenu = i; + if((S.nMouseLeft || S.nMouseRight) && i == (int)nPauseIndex) + { + S.nRunning = !S.nRunning; + } + } + MicroProfileDrawText(nX, nY, (uint32_t)-1, pMenuText[i], (uint32_t)strlen(pMenuText[i])); + nX += (nLen+1) * (MICROPROFILE_TEXT_WIDTH+1); + } + uint32_t nMenu = nSelectMenu != (uint32_t)-1 ? nSelectMenu : S.nActiveMenu; + S.nActiveMenu = nMenu; + if((uint32_t)-1 != nMenu) + { + nX = nMenuX[nMenu]; + nY += MICROPROFILE_TEXT_HEIGHT+1; + SubmenuCallback CB = GroupCallback[nMenu]; + int nNumLines = 0; + bool bSelected = false; + const char* pString = CB(nNumLines, bSelected); + uint32_t nWidth = 0, nHeight = 0; + while(pString) + { + nWidth = MicroProfileMax(nWidth, (int)strlen(pString)); + nNumLines++; + pString = CB(nNumLines, bSelected); + } + nWidth = (2+nWidth) * (MICROPROFILE_TEXT_WIDTH+1); + nHeight = nNumLines * (MICROPROFILE_TEXT_HEIGHT+1); + if(S.nMouseY <= nY + nHeight+0 && S.nMouseY >= nY-0 && S.nMouseX <= nX + nWidth + 0 && S.nMouseX >= nX - 0) + { + S.nActiveMenu = nMenu; + } + else if(nSelectMenu == (uint32_t)-1) + { + S.nActiveMenu = (uint32_t)-1; + } + MicroProfileDrawBox(nX, nY, nX + nWidth, nY + nHeight, 0xff000000|g_nMicroProfileBackColors[1]); + for(int i = 0; i < nNumLines; ++i) + { + bool bSelected = false; + const char* pString = CB(i, bSelected); + if(S.nMouseY >= nY && S.nMouseY < nY + MICROPROFILE_TEXT_HEIGHT + 1) + { + bMouseOver = true; + if(S.nMouseLeft || S.nMouseRight) + { + CBClick[nMenu](i); + } + MicroProfileDrawBox(nX, nY, nX + nWidth, nY + MICROPROFILE_TEXT_HEIGHT + 1, 0xff888888); + } + int nLen = snprintf(buffer, SBUF_SIZE-1, "%c %s", bSelected ? '*' : ' ' ,pString); + MicroProfileDrawText(nX, nY, (uint32_t)-1, buffer, nLen); + nY += MICROPROFILE_TEXT_HEIGHT+1; + } + } + + + { + static char FrameTimeMessage[64]; + float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu()); + uint32_t nAggregateFrames = S.nAggregateFrames ? 
S.nAggregateFrames : 1; + float fMs = fToMs * (S.nFlipTicks); + float fAverageMs = fToMs * (S.nFlipAggregateDisplay / nAggregateFrames); + float fMaxMs = fToMs * S.nFlipMaxDisplay; + int nLen = snprintf(FrameTimeMessage, sizeof(FrameTimeMessage)-1, "Time[%6.2f] Avg[%6.2f] Max[%6.2f]", fMs, fAverageMs, fMaxMs); + pMenuText[nNumMenuItems++] = &FrameTimeMessage[0]; + MicroProfileDrawText(nWidth - nLen * (MICROPROFILE_TEXT_WIDTH+1), 0, -1, FrameTimeMessage, nLen); + } +} + + +void MicroProfileMoveGraph() +{ + int nZoom = S.nMouseWheelDelta; + int nPanX = 0; + int nPanY = 0; + static int X = 0, Y = 0; + if(S.nMouseDownLeft && !S.nModDown) + { + nPanX = S.nMouseX - X; + nPanY = S.nMouseY - Y; + } + X = S.nMouseX; + Y = S.nMouseY; + + if(nZoom) + { + float fOldRange = S.fDetailedRange; + if(nZoom>0) + { + S.fDetailedRangeTarget = S.fDetailedRange *= S.nModDown ? 1.40 : 1.05f; + } + else + { + S.fDetailedRangeTarget = S.fDetailedRange /= S.nModDown ? 1.40 : 1.05f; + } + + float fDiff = fOldRange - S.fDetailedRange; + float fMousePrc = MicroProfileMax((float)S.nMouseX / S.nWidth ,0.f); + S.fDetailedOffsetTarget = S.fDetailedOffset += fDiff * fMousePrc; + + } + if(nPanX) + { + S.fDetailedOffsetTarget = S.fDetailedOffset += -nPanX * S.fDetailedRange / S.nWidth; + } + S.nOffsetY -= nPanY; + if(S.nOffsetY<0) + S.nOffsetY = 0; +} + +bool MicroProfileIsDrawing() +{ + return S.nDisplay != 0; +} +void MicroProfileDraw(uint32_t nWidth, uint32_t nHeight) +{ + MICROPROFILE_SCOPE(g_MicroProfileDraw); + + if(S.nDisplay) + { + MicroProfileScopeLock L(MicroProfileMutex()); + S.nWidth = nWidth; + S.nHeight = nHeight; + S.nHoverToken = MICROPROFILE_INVALID_TOKEN; + S.nHoverTime = 0; + S.nHoverFrame = -1; + if(S.nDisplay != MP_DRAW_DETAILED) + S.nContextSwitchHoverThread = S.nContextSwitchHoverThreadAfter = S.nContextSwitchHoverThreadBefore = -1; + MicroProfileMoveGraph(); + + + if(S.nDisplay == MP_DRAW_DETAILED) + { + MicroProfileDrawDetailedView(nWidth, nHeight); + } + else if(S.nDisplay == MP_DRAW_BARS && S.nBars) + { + MicroProfileDrawBarView(nWidth, nHeight); + } + + MicroProfileDrawMenu(nWidth, nHeight); + bool bMouseOverGraph = MicroProfileDrawGraph(nWidth, nHeight); + bool bHidden = S.nDisplay == MP_DRAW_HIDDEN; + if(!bHidden) + { + uint32_t nLockedToolTipX = 3; + bool bDeleted = false; + for(int i = 0; i < MICROPROFILE_TOOLTIP_MAX_LOCKED; ++i) + { + int nIndex = (S.LockedToolTipFront + i) % MICROPROFILE_TOOLTIP_MAX_LOCKED; + if(S.LockedToolTips[nIndex].ppStrings[0]) + { + uint32_t nToolTipWidth = 0, nToolTipHeight = 0; + MicroProfileFloatWindowSize(S.LockedToolTips[nIndex].ppStrings, S.LockedToolTips[nIndex].nNumStrings, 0, nToolTipWidth, nToolTipHeight, 0); + uint32_t nStartY = nHeight - nToolTipHeight - 2; + if(!bDeleted && S.nMouseY > nStartY && S.nMouseX > nLockedToolTipX && S.nMouseX <= nLockedToolTipX + nToolTipWidth && (S.nMouseLeft || S.nMouseRight) ) + { + bDeleted = true; + int j = i; + for(; j < MICROPROFILE_TOOLTIP_MAX_LOCKED-1; ++j) + { + int nIndex0 = (S.LockedToolTipFront + j) % MICROPROFILE_TOOLTIP_MAX_LOCKED; + int nIndex1 = (S.LockedToolTipFront + j+1) % MICROPROFILE_TOOLTIP_MAX_LOCKED; + MicroProfileStringArrayCopy(&S.LockedToolTips[nIndex0], &S.LockedToolTips[nIndex1]); + } + MicroProfileStringArrayClear(&S.LockedToolTips[(S.LockedToolTipFront + j) % MICROPROFILE_TOOLTIP_MAX_LOCKED]); + } + else + { + MicroProfileDrawFloatWindow(nLockedToolTipX, nHeight-nToolTipHeight-2, &S.LockedToolTips[nIndex].ppStrings[0], S.LockedToolTips[nIndex].nNumStrings, S.nLockedToolTipColor[nIndex]); + 
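// Locked tooltips accumulate along the bottom edge; step the x cursor past the window just drawn so the next pinned tooltip lands beside it. + 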
nLockedToolTipX += nToolTipWidth + 4; + } + } + } + + if(S.nActiveMenu == 7) + { + if(S.nDisplay & MP_DRAW_DETAILED) + { + MicroProfileStringArray DetailedHelp; + MicroProfileStringArrayClear(&DetailedHelp); + MicroProfileStringArrayFormat(&DetailedHelp, "%s", MICROPROFILE_HELP_LEFT); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Toggle Graph"); + MicroProfileStringArrayFormat(&DetailedHelp, "%s", MICROPROFILE_HELP_ALT); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Zoom"); + MicroProfileStringArrayFormat(&DetailedHelp, "%s + %s", MICROPROFILE_HELP_MOD, MICROPROFILE_HELP_LEFT); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Lock Tooltip"); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Drag"); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Pan View"); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Mouse Wheel"); + MicroProfileStringArrayAddLiteral(&DetailedHelp, "Zoom"); + MicroProfileDrawFloatWindow(nWidth, MICROPROFILE_FRAME_HISTORY_HEIGHT+20, DetailedHelp.ppStrings, DetailedHelp.nNumStrings, 0xff777777); + + MicroProfileStringArray DetailedHistoryHelp; + MicroProfileStringArrayClear(&DetailedHistoryHelp); + MicroProfileStringArrayFormat(&DetailedHistoryHelp, "%s", MICROPROFILE_HELP_LEFT); + MicroProfileStringArrayAddLiteral(&DetailedHistoryHelp, "Center View"); + MicroProfileStringArrayFormat(&DetailedHistoryHelp, "%s", MICROPROFILE_HELP_ALT); + MicroProfileStringArrayAddLiteral(&DetailedHistoryHelp, "Zoom to frame"); + MicroProfileDrawFloatWindow(nWidth, 20, DetailedHistoryHelp.ppStrings, DetailedHistoryHelp.nNumStrings, 0xff777777); + + + + } + else if(0 != (S.nDisplay & MP_DRAW_BARS) && S.nBars) + { + MicroProfileStringArray BarHelp; + MicroProfileStringArrayClear(&BarHelp); + MicroProfileStringArrayFormat(&BarHelp, "%s", MICROPROFILE_HELP_LEFT); + MicroProfileStringArrayAddLiteral(&BarHelp, "Toggle Graph"); + MicroProfileStringArrayFormat(&BarHelp, "%s + %s", MICROPROFILE_HELP_MOD, MICROPROFILE_HELP_LEFT); + MicroProfileStringArrayAddLiteral(&BarHelp, "Lock Tooltip"); + MicroProfileStringArrayAddLiteral(&BarHelp, "Drag"); + MicroProfileStringArrayAddLiteral(&BarHelp, "Pan View"); + MicroProfileDrawFloatWindow(nWidth, MICROPROFILE_FRAME_HISTORY_HEIGHT+20, BarHelp.ppStrings, BarHelp.nNumStrings, 0xff777777); + + } + MicroProfileStringArray Debug; + MicroProfileStringArrayClear(&Debug); + MicroProfileStringArrayAddLiteral(&Debug, "Memory Usage"); + MicroProfileStringArrayFormat(&Debug, "%4.2fmb", S.nMemUsage / (1024.f * 1024.f)); + uint32_t nFrameNext = (S.nFrameCurrent+1) % MICROPROFILE_MAX_FRAME_HISTORY; + MicroProfileFrameState* pFrameCurrent = &S.Frames[S.nFrameCurrent]; + MicroProfileFrameState* pFrameNext = &S.Frames[nFrameNext]; + + + MicroProfileStringArrayAddLiteral(&Debug, ""); + MicroProfileStringArrayAddLiteral(&Debug, ""); + MicroProfileStringArrayAddLiteral(&Debug, "Usage"); + MicroProfileStringArrayAddLiteral(&Debug, "markers [frames] "); + +#if MICROPROFILE_CONTEXT_SWITCH_TRACE + MicroProfileStringArrayAddLiteral(&Debug, "Context Switch"); + MicroProfileStringArrayFormat(&Debug, "%9d [%7d]", S.nContextSwitchUsage, MICROPROFILE_CONTEXT_SWITCH_BUFFER_SIZE / S.nContextSwitchUsage ); +#endif + + for(int i = 0; i < MICROPROFILE_MAX_GROUPS; ++i) + { + if(pFrameCurrent->nLogStart[i] && S.Pool[i]) + { + uint32_t nEnd = pFrameNext->nLogStart[i]; + uint32_t nStart = pFrameCurrent->nLogStart[i]; + uint32_t nUsage = nStart < nEnd ? 
(nEnd - nStart) : (nEnd + MICROPROFILE_BUFFER_SIZE - nStart); + uint32_t nFrameSupport = MICROPROFILE_BUFFER_SIZE / nUsage; + MicroProfileStringArrayFormat(&Debug, "%s", &S.Pool[i]->ThreadName[0]); + MicroProfileStringArrayFormat(&Debug, "%9d [%7d]", nUsage, nFrameSupport); + } + } + + MicroProfileDrawFloatWindow(0, nHeight-10, Debug.ppStrings, Debug.nNumStrings, 0xff777777); + } + + + + if(S.nActiveMenu == -1 && !bMouseOverGraph) + { + if(S.nHoverToken != MICROPROFILE_INVALID_TOKEN) + { + MicroProfileDrawFloatTooltip(S.nMouseX, S.nMouseY, S.nHoverToken, S.nHoverTime); + } + else if(S.nContextSwitchHoverThreadAfter != -1 && S.nContextSwitchHoverThreadBefore != -1) + { + float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu()); + MicroProfileStringArray ToolTip; + MicroProfileStringArrayClear(&ToolTip); + MicroProfileStringArrayAddLiteral(&ToolTip, "Context Switch"); + MicroProfileStringArrayFormat(&ToolTip, "%04x", S.nContextSwitchHoverThread); + MicroProfileStringArrayAddLiteral(&ToolTip, "Before"); + MicroProfileStringArrayFormat(&ToolTip, "%04x", S.nContextSwitchHoverThreadBefore); + MicroProfileStringArrayAddLiteral(&ToolTip, "After"); + MicroProfileStringArrayFormat(&ToolTip, "%04x", S.nContextSwitchHoverThreadAfter); + MicroProfileStringArrayAddLiteral(&ToolTip, "Duration"); + int64_t nDifference = MicroProfileLogTickDifference(S.nContextSwitchHoverTickIn, S.nContextSwitchHoverTickOut); + MicroProfileStringArrayFormat(&ToolTip, "%6.2fms", fToMs * nDifference ); + MicroProfileStringArrayAddLiteral(&ToolTip, "CPU"); + MicroProfileStringArrayFormat(&ToolTip, "%d", S.nContextSwitchHoverCpu); + MicroProfileDrawFloatWindow(S.nMouseX, S.nMouseY+20, &ToolTip.ppStrings[0], ToolTip.nNumStrings, -1); + + + } + else if(S.nHoverFrame != -1) + { + uint32_t nNextFrame = (S.nHoverFrame+1)%MICROPROFILE_MAX_FRAME_HISTORY; + int64_t nTick = S.Frames[S.nHoverFrame].nFrameStartCpu; + int64_t nTickNext = S.Frames[nNextFrame].nFrameStartCpu; + int64_t nTickGpu = S.Frames[S.nHoverFrame].nFrameStartGpu; + int64_t nTickNextGpu = S.Frames[nNextFrame].nFrameStartGpu; + + float fToMs = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondCpu()); + float fToMsGpu = MicroProfileTickToMsMultiplier(MicroProfileTicksPerSecondGpu()); + float fMs = fToMs * (nTickNext - nTick); + float fMsGpu = fToMsGpu * (nTickNextGpu - nTickGpu); + MicroProfileStringArray ToolTip; + MicroProfileStringArrayClear(&ToolTip); + MicroProfileStringArrayFormat(&ToolTip, "Frame %d", S.nHoverFrame); + #if MICROPROFILE_DEBUG + MicroProfileStringArrayFormat(&ToolTip, "%p", &S.Frames[S.nHoverFrame]); + #else + MicroProfileStringArrayAddLiteral(&ToolTip, ""); + #endif + MicroProfileStringArrayAddLiteral(&ToolTip, "CPU Time"); + MicroProfileStringArrayFormat(&ToolTip, "%6.2fms", fMs); + MicroProfileStringArrayAddLiteral(&ToolTip, "GPU Time"); + MicroProfileStringArrayFormat(&ToolTip, "%6.2fms", fMsGpu); + #if MICROPROFILE_DEBUG + for(int i = 0; i < MICROPROFILE_MAX_GROUPS; ++i) + { + if(S.Frames[S.nHoverFrame].nLogStart[i]) + { + MicroProfileStringArrayFormat(&ToolTip, "%d", i); + MicroProfileStringArrayFormat(&ToolTip, "%d", S.Frames[S.nHoverFrame].nLogStart[i]); + } + } + #endif + MicroProfileDrawFloatWindow(S.nMouseX, S.nMouseY+20, &ToolTip.ppStrings[0], ToolTip.nNumStrings, -1); + } + if(S.nMouseLeft) + { + if(S.nHoverToken != MICROPROFILE_INVALID_TOKEN) + MicroProfileToggleGraph(S.nHoverToken); + } + } + } +#if MICROPROFILE_DRAWCURSOR + { + float fCursor[8] = + { + MicroProfileMax(0, (int)S.nMouseX-3), S.nMouseY, + 
MicroProfileMin(nWidth, S.nMouseX+3), S.nMouseY, + S.nMouseX, MicroProfileMax((int)S.nMouseY-3, 0), + S.nMouseX, MicroProfileMin(nHeight, S.nMouseY+3), + }; + MicroProfileDrawLine2D(2, &fCursor[0], 0xff00ff00); + MicroProfileDrawLine2D(2, &fCursor[4], 0xff00ff00); + } +#endif + + } + S.nMouseLeft = S.nMouseRight = 0; + S.nMouseLeftMod = S.nMouseRightMod = 0; + S.nMouseWheelDelta = 0; + if(S.nOverflow) + S.nOverflow--; + +} +void MicroProfileMousePosition(uint32_t nX, uint32_t nY, int nWheelDelta) +{ + S.nMouseX = nX; + S.nMouseY = nY; + S.nMouseWheelDelta = nWheelDelta; +} + +void MicroProfileModKey(uint32_t nKeyState) +{ + S.nModDown = nKeyState ? 1 : 0; +} + +void MicroProfileClearGraph() +{ + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + if(S.Graph[i].nToken != 0) + { + S.Graph[i].nToken = MICROPROFILE_INVALID_TOKEN; + } + } +} +void MicroProfileTogglePause() +{ + S.nRunning = !S.nRunning; +} + +void MicroProfileGetState(MicroProfileState* pStateOut) +{ + pStateOut->nDisplay = S.nDisplay; + pStateOut->nMenuAllGroups = S.nMenuAllGroups; + pStateOut->nMenuActiveGroup = S.nMenuActiveGroup; + pStateOut->nMenuAllThreads = S.nMenuAllThreads; + pStateOut->nAggregateFlip = S.nAggregateFlip; + pStateOut->nBars = S.nBars; + pStateOut->fReferenceTime = S.fReferenceTime; +} + +void MicroProfileSetState(MicroProfileState* pStateOut) +{ + MicroProfileScopeLock L(MicroProfileMutex()); + S.nDisplay = pStateOut->nDisplay; + S.nMenuAllGroups = pStateOut->nMenuAllGroups; + S.nMenuActiveGroup = pStateOut->nMenuActiveGroup; + S.nMenuAllThreads = pStateOut->nMenuAllThreads; + S.nAggregateFlip = pStateOut->nAggregateFlip; + S.nBars = pStateOut->nBars; + S.fReferenceTime = pStateOut->fReferenceTime; + S.fRcpReferenceTime = 1.f / S.fReferenceTime; +} + +void MicroProfileToggleGraph(MicroProfileToken nToken) +{ + nToken &= 0xffff; + int32_t nMinSort = 0x7fffffff; + int32_t nFreeIndex = -1; + int32_t nMinIndex = 0; + int32_t nMaxSort = 0x80000000; + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + if(S.Graph[i].nToken == MICROPROFILE_INVALID_TOKEN) + nFreeIndex = i; + if(S.Graph[i].nToken == nToken) + { + S.Graph[i].nToken = MICROPROFILE_INVALID_TOKEN; + return; + } + if(S.Graph[i].nKey < nMinSort) + { + nMinSort = S.Graph[i].nKey; + nMinIndex = i; + } + if(S.Graph[i].nKey > nMaxSort) + { + nMaxSort = S.Graph[i].nKey; + } + } + int nIndex = nFreeIndex > -1 ? 
nFreeIndex : nMinIndex; + S.Graph[nIndex].nToken = nToken; + S.Graph[nIndex].nKey = nMaxSort+1; + memset(&S.Graph[nIndex].nHistory[0], 0, sizeof(S.Graph[nIndex].nHistory)); +} +void MicroProfileMouseButton(uint32_t nLeft, uint32_t nRight) +{ + if(0 == nLeft && S.nMouseDownLeft) + { + if(S.nModDown) + S.nMouseLeftMod = 1; + else + S.nMouseLeft = 1; + } + + if(0 == nRight && S.nMouseDownRight) + { + if(S.nModDown) + S.nMouseRightMod = 1; + else + S.nMouseRight = 1; + } + + S.nMouseDownLeft = nLeft; + S.nMouseDownRight = nRight; + +} + +#include <stdio.h> + +#define MICROPROFILE_PRESET_HEADER_MAGIC 0x28586813 +#define MICROPROFILE_PRESET_HEADER_VERSION 0x00000100 +struct MicroProfilePresetHeader +{ + uint32_t nMagic; + uint32_t nVersion; + //groups, threads, aggregate, reference frame, graphs timers + uint32_t nGroups[MICROPROFILE_MAX_GROUPS]; + uint32_t nThreads[MICROPROFILE_MAX_THREADS]; + uint32_t nGraphName[MICROPROFILE_MAX_GRAPHS]; + uint32_t nGraphGroupName[MICROPROFILE_MAX_GRAPHS]; + uint32_t nMenuAllGroups; + uint32_t nMenuAllThreads; + uint32_t nAggregateFlip; + float fReferenceTime; + uint32_t nBars; + uint32_t nDisplay; + uint32_t nOpacityBackground; + uint32_t nOpacityForeground; +}; + +#ifndef MICROPROFILE_PRESET_FILENAME_FUNC +#define MICROPROFILE_PRESET_FILENAME_FUNC MicroProfilePresetFilename +static const char* MicroProfilePresetFilename(const char* pSuffix) +{ + static char filename[512]; + snprintf(filename, sizeof(filename)-1, ".microprofilepreset.%s", pSuffix); + return filename; +} +#endif + +void MicroProfileSavePreset(const char* pPresetName) +{ + std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex()); + FILE* F = fopen(MICROPROFILE_PRESET_FILENAME_FUNC(pPresetName), "w"); + if(!F) return; + + MicroProfilePresetHeader Header; + memset(&Header, 0, sizeof(Header)); + Header.nAggregateFlip = S.nAggregateFlip; + Header.nBars = S.nBars; + Header.fReferenceTime = S.fReferenceTime; + Header.nMenuAllGroups = S.nMenuAllGroups; + Header.nMenuAllThreads = S.nMenuAllThreads; + Header.nMagic = MICROPROFILE_PRESET_HEADER_MAGIC; + Header.nVersion = MICROPROFILE_PRESET_HEADER_VERSION; + Header.nDisplay = S.nDisplay; + Header.nOpacityBackground = S.nOpacityBackground; + Header.nOpacityForeground = S.nOpacityForeground; + fwrite(&Header, sizeof(Header), 1, F); + uint64_t nMask = 1; + for(uint32_t i = 0; i < MICROPROFILE_MAX_GROUPS; ++i) + { + if(S.nMenuActiveGroup & nMask) + { + uint32_t offset = ftell(F); + const char* pName = S.GroupInfo[i].pName; + int nLen = (int)strlen(pName)+1; + fwrite(pName, nLen, 1, F); + Header.nGroups[i] = offset; + } + nMask <<= 1; + } + for(uint32_t i = 0; i < MICROPROFILE_MAX_THREADS; ++i) + { + MicroProfileThreadLog* pLog = S.Pool[i]; + if(pLog && S.nThreadActive[i]) + { + uint32_t nOffset = ftell(F); + const char* pName = &pLog->ThreadName[0]; + int nLen = (int)strlen(pName)+1; + fwrite(pName, nLen, 1, F); + Header.nThreads[i] = nOffset; + } + } + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + MicroProfileToken nToken = S.Graph[i].nToken; + if(nToken != MICROPROFILE_INVALID_TOKEN) + { + uint32_t nGroupIndex = MicroProfileGetGroupIndex(nToken); + uint32_t nTimerIndex = MicroProfileGetTimerIndex(nToken); + const char* pGroupName = S.GroupInfo[nGroupIndex].pName; + const char* pTimerName = S.TimerInfo[nTimerIndex].pName; + MP_ASSERT(pGroupName); + MP_ASSERT(pTimerName); + int nGroupLen = (int)strlen(pGroupName)+1; + int nTimerLen = (int)strlen(pTimerName)+1; + + uint32_t nOffsetGroup = ftell(F); + fwrite(pGroupName, nGroupLen, 1, F); + uint32_t nOffsetTimer = 
ftell(F); + fwrite(pTimerName, nTimerLen, 1, F); + Header.nGraphName[i] = nOffsetTimer; + Header.nGraphGroupName[i] = nOffsetGroup; + } + } + fseek(F, 0, SEEK_SET); + fwrite(&Header, sizeof(Header), 1, F); + + fclose(F); + +} + + + +void MicroProfileLoadPreset(const char* pSuffix) +{ + std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex()); + FILE* F = fopen(MICROPROFILE_PRESET_FILENAME_FUNC(pSuffix), "r"); + if(!F) + { + return; + } + fseek(F, 0, SEEK_END); + int nSize = ftell(F); + char* const pBuffer = (char*)alloca(nSize); + fseek(F, 0, SEEK_SET); + int nRead = (int)fread(pBuffer, nSize, 1, F); + fclose(F); + if(1 != nRead) + return; + + MicroProfilePresetHeader& Header = *(MicroProfilePresetHeader*)pBuffer; + + if(Header.nMagic != MICROPROFILE_PRESET_HEADER_MAGIC || Header.nVersion != MICROPROFILE_PRESET_HEADER_VERSION) + { + return; + } + + S.nAggregateFlip = Header.nAggregateFlip; + S.nBars = Header.nBars; + S.fReferenceTime = Header.fReferenceTime; + S.fRcpReferenceTime = 1.f / Header.fReferenceTime; + S.nMenuAllGroups = Header.nMenuAllGroups; + S.nMenuAllThreads = Header.nMenuAllThreads; + S.nDisplay = Header.nDisplay; + S.nMenuActiveGroup = 0; + S.nOpacityBackground = Header.nOpacityBackground; + S.nOpacityForeground = Header.nOpacityForeground; + + memset(&S.nThreadActive[0], 0, sizeof(S.nThreadActive)); + + for(uint32_t i = 0; i < MICROPROFILE_MAX_GROUPS; ++i) + { + if(Header.nGroups[i]) + { + const char* pGroupName = pBuffer + Header.nGroups[i]; + for(uint32_t j = 0; j < MICROPROFILE_MAX_GROUPS; ++j) + { + if(S.GroupInfo[j].pName && 0 == MP_STRCASECMP(pGroupName, S.GroupInfo[j].pName)) + { + S.nMenuActiveGroup |= (1ll << j); + } + } + } + } + for(uint32_t i = 0; i < MICROPROFILE_MAX_THREADS; ++i) + { + if(Header.nThreads[i]) + { + const char* pThreadName = pBuffer + Header.nThreads[i]; + for(uint32_t j = 0; j < MICROPROFILE_MAX_THREADS; ++j) + { + MicroProfileThreadLog* pLog = S.Pool[j]; + if(pLog && 0 == MP_STRCASECMP(pThreadName, &pLog->ThreadName[0])) + { + S.nThreadActive[j] = 1; + } + } + } + } + for(uint32_t i = 0; i < MICROPROFILE_MAX_GRAPHS; ++i) + { + MicroProfileToken nPrevToken = S.Graph[i].nToken; + S.Graph[i].nToken = MICROPROFILE_INVALID_TOKEN; + if(Header.nGraphName[i] && Header.nGraphGroupName[i]) + { + const char* pGraphName = pBuffer + Header.nGraphName[i]; + const char* pGraphGroupName = pBuffer + Header.nGraphGroupName[i]; + for(uint32_t j = 0; j < S.nTotalTimers; ++j) + { + uint64_t nGroupIndex = S.TimerInfo[j].nGroupIndex; + if(0 == MP_STRCASECMP(pGraphName, S.TimerInfo[j].pName) && 0 == MP_STRCASECMP(pGraphGroupName, S.GroupInfo[nGroupIndex].pName)) + { + MicroProfileToken nToken = MicroProfileMakeToken(1ll << nGroupIndex, (uint16_t)j); + S.Graph[i].nToken = nToken; + if(nToken != nPrevToken) + { + memset(&S.Graph[i].nHistory, 0, sizeof(S.Graph[i].nHistory)); + } + break; + } + } + } + } +} + +void MicroProfileDrawLineVertical(int nX, int nTop, int nBottom, uint32_t nColor) +{ + MicroProfileDrawBox(nX, nTop, nX + 1, nBottom, nColor); +} + +void MicroProfileDrawLineHorizontal(int nLeft, int nRight, int nY, uint32_t nColor) +{ + MicroProfileDrawBox(nLeft, nY, nRight, nY + 1, nColor); +} + +float MicroProfileGetTime(const char* pGroup, const char* pName) +{ + MicroProfileToken nToken = MicroProfileFindToken(pGroup, pName); + if(nToken == MICROPROFILE_INVALID_TOKEN) + { + return 0.f; + } + uint32_t nTimerIndex = MicroProfileGetTimerIndex(nToken); + uint32_t nGroupIndex = MicroProfileGetGroupIndex(nToken); + float fToMs = 
MicroProfileTickToMsMultiplier(S.GroupInfo[nGroupIndex].Type == MicroProfileTokenTypeGpu ? MicroProfileTicksPerSecondGpu() : MicroProfileTicksPerSecondCpu()); + return S.Frame[nTimerIndex].nTicks * fToMs; +} +void MicroProfileForceEnableGroup(const char* pGroup, MicroProfileTokenType Type) +{ + MicroProfileInit(); + std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex()); + uint16_t nGroup = MicroProfileGetGroup(pGroup, Type); + S.nForceGroup |= (1ll << nGroup); +} + +void MicroProfileForceDisableGroup(const char* pGroup, MicroProfileTokenType Type) +{ + MicroProfileInit(); + std::lock_guard<std::recursive_mutex> Lock(MicroProfileMutex()); + uint16_t nGroup = MicroProfileGetGroup(pGroup, Type); + S.nForceGroup &= ~(1ll << nGroup); +} + + + +#if MICROPROFILE_CONTEXT_SWITCH_TRACE +#ifdef _WIN32 +#define INITGUID +#include <evntrace.h> +#include <evntcons.h> +#include <strsafe.h> + + +static GUID g_MicroProfileThreadClassGuid = { 0x3d6fa8d1, 0xfe05, 0x11d0, 0x9d, 0xda, 0x00, 0xc0, 0x4f, 0xd7, 0xba, 0x7c }; + +struct MicroProfileSCSwitch +{ + uint32_t NewThreadId; + uint32_t OldThreadId; + int8_t NewThreadPriority; + int8_t OldThreadPriority; + uint8_t PreviousCState; + int8_t SpareByte; + int8_t OldThreadWaitReason; + int8_t OldThreadWaitMode; + int8_t OldThreadState; + int8_t OldThreadWaitIdealProcessor; + uint32_t NewThreadWaitTime; + uint32_t Reserved; +}; + + +VOID WINAPI MicroProfileContextSwitchCallback(PEVENT_TRACE pEvent) +{ + if (pEvent->Header.Guid == g_MicroProfileThreadClassGuid) + { + if (pEvent->Header.Class.Type == 36) + { + MicroProfileSCSwitch* pCSwitch = (MicroProfileSCSwitch*) pEvent->MofData; + if ((pCSwitch->NewThreadId != 0) || (pCSwitch->OldThreadId != 0)) + { + MicroProfileContextSwitch Switch; + Switch.nThreadOut = pCSwitch->OldThreadId; + Switch.nThreadIn = pCSwitch->NewThreadId; + Switch.nCpu = pEvent->BufferContext.ProcessorNumber; + Switch.nTicks = pEvent->Header.TimeStamp.QuadPart; + MicroProfileContextSwitchPut(&Switch); + } + } + } +} + +ULONG WINAPI MicroProfileBufferCallback(PEVENT_TRACE_LOGFILE Buffer) +{ + return (S.bContextSwitchStop || !S.bContextSwitchRunning) ? 
FALSE : TRUE; +} + + +struct MicroProfileKernelTraceProperties : public EVENT_TRACE_PROPERTIES +{ + char dummy[sizeof(KERNEL_LOGGER_NAME)]; +}; + + +void MicroProfileTraceThread(int unused) +{ + + { + TRACEHANDLE SessionHandle = 0; + MicroProfileKernelTraceProperties sessionProperties; + + ZeroMemory(&sessionProperties, sizeof(sessionProperties)); + sessionProperties.Wnode.BufferSize = sizeof(sessionProperties); + sessionProperties.Wnode.Flags = WNODE_FLAG_TRACED_GUID; + sessionProperties.Wnode.ClientContext = 1; //QPC clock resolution + sessionProperties.Wnode.Guid = SystemTraceControlGuid; + sessionProperties.BufferSize = 1; + sessionProperties.NumberOfBuffers = 128; + sessionProperties.EnableFlags = EVENT_TRACE_FLAG_CSWITCH; + sessionProperties.LogFileMode = EVENT_TRACE_REAL_TIME_MODE; + sessionProperties.MaximumFileSize = 0; + sessionProperties.LoggerNameOffset = sizeof(EVENT_TRACE_PROPERTIES); + sessionProperties.LogFileNameOffset = 0; + + EVENT_TRACE_LOGFILE log; + ZeroMemory(&log, sizeof(log)); + log.LoggerName = KERNEL_LOGGER_NAME; + log.ProcessTraceMode = 0; + TRACEHANDLE hLog = OpenTrace(&log); + if (hLog) + { + ControlTrace(SessionHandle, KERNEL_LOGGER_NAME, &sessionProperties, EVENT_TRACE_CONTROL_STOP); + } + CloseTrace(hLog); + + + } + ULONG status = ERROR_SUCCESS; + TRACEHANDLE SessionHandle = 0; + MicroProfileKernelTraceProperties sessionProperties; + + ZeroMemory(&sessionProperties, sizeof(sessionProperties)); + sessionProperties.Wnode.BufferSize = sizeof(sessionProperties); + sessionProperties.Wnode.Flags = WNODE_FLAG_TRACED_GUID; + sessionProperties.Wnode.ClientContext = 1; //QPC clock resolution + sessionProperties.Wnode.Guid = SystemTraceControlGuid; + sessionProperties.BufferSize = 1; + sessionProperties.NumberOfBuffers = 128; + sessionProperties.EnableFlags = EVENT_TRACE_FLAG_CSWITCH|EVENT_TRACE_FLAG_PROCESS; + sessionProperties.LogFileMode = EVENT_TRACE_REAL_TIME_MODE; + sessionProperties.MaximumFileSize = 0; + sessionProperties.LoggerNameOffset = sizeof(EVENT_TRACE_PROPERTIES); + sessionProperties.LogFileNameOffset = 0; + + + status = StartTrace((PTRACEHANDLE) &SessionHandle, KERNEL_LOGGER_NAME, &sessionProperties); + + if (ERROR_SUCCESS != status) + { + S.bContextSwitchRunning = false; + return; + } + + EVENT_TRACE_LOGFILE log; + ZeroMemory(&log, sizeof(log)); + + log.LoggerName = KERNEL_LOGGER_NAME; + log.ProcessTraceMode = PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_RAW_TIMESTAMP; + log.EventCallback = MicroProfileContextSwitchCallback; + log.BufferCallback = MicroProfileBufferCallback; + + TRACEHANDLE hLog = OpenTrace(&log); + ProcessTrace(&hLog, 1, 0, 0); + CloseTrace(hLog); + S.bContextSwitchRunning = false; +} + +void MicroProfileStartContextSwitchTrace() +{ + if(!S.bContextSwitchRunning) + { + if(!S.pContextSwitchThread) + S.pContextSwitchThread = new std::thread(); + if(S.pContextSwitchThread->joinable()) + { + S.bContextSwitchStop = true; + S.pContextSwitchThread->join(); + } + S.bContextSwitchRunning = true; + S.bContextSwitchStop = false; + *S.pContextSwitchThread = std::thread(&MicroProfileTraceThread, 0); + } +} + +void MicroProfileStopContextSwitchTrace() +{ + if(S.bContextSwitchRunning && S.pContextSwitchThread) + { + S.bContextSwitchStop = true; + S.pContextSwitchThread->join(); + } +} + +bool MicroProfileIsLocalThread(uint32_t nThreadId) +{ + HANDLE h = OpenThread(THREAD_QUERY_LIMITED_INFORMATION, FALSE, nThreadId); + if(h == NULL) + return false; + DWORD hProcess = GetProcessIdOfThread(h); + CloseHandle(h); + return GetCurrentProcessId() == 
hProcess; +} + +#else +#error "context switch trace not supported/implemented on platform" +#endif +#else + +bool MicroProfileIsLocalThread(uint32_t nThreadId){return false;} +void MicroProfileStopContextSwitchTrace(){} +void MicroProfileStartContextSwitchTrace(){} + +#endif + + +#undef S + +#ifdef _WIN32 +#pragma warning(pop) +#endif +#endif +#endif diff --git a/third_party/xbyak b/third_party/xbyak index 702d6e668..df27af3e4 160000 --- a/third_party/xbyak +++ b/third_party/xbyak @@ -1 +1 @@ -Subproject commit 702d6e6683c322f08a36ea059f6d6f8263b1bd0d +Subproject commit df27af3e4e7ded756bfbb23d0f663df728442935 diff --git a/tools/alloy-sandbox/alloy-sandbox.cc b/tools/alloy-sandbox/alloy-sandbox.cc index da8d1b80e..038580c69 100644 --- a/tools/alloy-sandbox/alloy-sandbox.cc +++ b/tools/alloy-sandbox/alloy-sandbox.cc @@ -24,6 +24,9 @@ using namespace xe::cpu; int alloy_sandbox(int argc, xechar_t** argv) { + Profiler::Initialize(); + xe::Profiler::ThreadEnter("main"); + XenonMemory* memory = new XenonMemory(); ExportResolver* export_resolver = new ExportResolver(); @@ -49,7 +52,7 @@ int alloy_sandbox(int argc, xechar_t** argv) { ctx->lr = 0xBEBEBEBE; ctx->r[5] = 10; ctx->r[25] = 25; - fn->Call(thread_state); + fn->Call(thread_state, ctx->lr); auto result = ctx->r[11]; delete thread_state; @@ -57,6 +60,9 @@ int alloy_sandbox(int argc, xechar_t** argv) { delete runtime; delete memory; + xe::Profiler::Dump(); + xe::Profiler::ThreadExit(); + return 0; } // ehhh diff --git a/tools/xenia-run/xenia-run.cc b/tools/xenia-run/xenia-run.cc index 59bc70a24..9f2151d8c 100644 --- a/tools/xenia-run/xenia-run.cc +++ b/tools/xenia-run/xenia-run.cc @@ -22,6 +22,9 @@ DEFINE_string(target, "", int xenia_run(int argc, xechar_t** argv) { int result_code = 1; + Profiler::Initialize(); + Profiler::ThreadEnter("main"); + Emulator* emulator = NULL; // Grab path from the flag or unnamed argument. 
@@ -89,6 +92,8 @@ XECLEANUP: if (result_code) { XEFATAL("Failed to launch emulator: %d", result_code); } + Profiler::Dump(); + Profiler::Shutdown(); return result_code; } XE_MAIN_WINDOW_THUNK(xenia_run, XETEXT("xenia-run"), "xenia-run some.xex"); diff --git a/xenia.gyp b/xenia.gyp index ddcd45b2a..8933eb750 100644 --- a/xenia.gyp +++ b/xenia.gyp @@ -5,6 +5,7 @@ 'third_party/beaengine.gypi', 'third_party/gflags.gypi', 'third_party/jansson.gypi', + 'third_party/llvm.gypi', 'third_party/sparsehash.gypi', 'third_party/wslay.gypi', ], @@ -23,9 +24,23 @@ 'target_arch%': 'x64', }, + 'conditions': [ + ['OS=="win"', { + 'variables': { + 'move_command%': 'move' + }, + }, { + 'variables': { + 'move_command%': 'mv' + }, + }] + ], + 'target_defaults': { 'include_dirs': [ 'include/', + 'third_party/', + '.', ], 'defines': [ @@ -95,7 +110,7 @@ 'SYMROOT': '<(DEPTH)/build/xenia/', 'ALWAYS_SEARCH_USER_PATHS': 'NO', 'ARCHS': ['x86_64'], - #'CLANG_CXX_LANGUAGE_STANDARD': 'c++0x', + 'CLANG_CXX_LANGUAGE_STANDARD': 'c++11', 'COMBINE_HIDPI_IMAGES': 'YES', 'GCC_C_LANGUAGE_STANDARD': 'gnu99', 'GCC_SYMBOLS_PRIVATE_EXTERN': 'YES', @@ -187,10 +202,28 @@ 'dependencies': [ 'beaengine', 'gflags', + 'llvm', ], + + 'conditions': [ + ['OS == "mac"', { + 'xcode_settings': { + 'OTHER_CFLAGS': [ + '-fno-operator-names', + ], + }, + }], + ['OS == "linux"', { + 'cflags': [ + '-fno-operator-names', + ], + }], + ], + 'export_dependent_settings': [ 'beaengine', 'gflags', + 'llvm', ], 'direct_dependent_settings': { @@ -211,6 +244,7 @@ 'user32', 'ole32', 'ntdll', + 'advapi32', ], }], ['OS == "mac"', { @@ -236,6 +270,7 @@ 'include_dirs': [ '.', 'src/', + '<(INTERMEDIATE_DIR)', ], 'includes': [ @@ -286,6 +321,7 @@ 'xinput', 'xaudio2', 'Shell32', + 'advapi32', ], }], ['OS == "mac"', {