From 7fb4b4cd419b9f4b00ab910840737bc19dcc2c46 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com"
Date: Sat, 15 Apr 2023 16:06:07 -0400
Subject: [PATCH] Attempt to emulate reserved load/store more closely.

Can't do anything about stores of the same value that are done via a non-reserved store to a reserved location.

Uses a bitmap that splits the memory space into 64KiB blocks, one bit per block. It currently keys on the guest virtual address, but it should be using physical addresses instead.

Currently, if a guest reserves one location and then does a reserved store to a totally different location, we trigger a breakpoint. This should never happen.

Also removed the NEGATED_MUL_ADD/SUB operations. They weren't necessary; nothing special is needed for the negated result variants.

Added a log message for when watched physical memory has a race; it would just be nice to know when it happens and in which games.
---
 src/xenia/cpu/backend/x64/x64_backend.cc    | 127 +++++++++++++++++++-
 src/xenia/cpu/backend/x64/x64_backend.h     |  29 ++++-
 src/xenia/cpu/backend/x64/x64_seq_memory.cc |  79 +++++++++++-
 src/xenia/cpu/backend/x64/x64_sequences.cc  | 103 +---------------
 src/xenia/cpu/hir/hir_builder.cc            |  43 +++----
 src/xenia/cpu/hir/hir_builder.h             |   8 +-
 src/xenia/cpu/hir/opcodes.h                 |   7 +-
 src/xenia/cpu/hir/opcodes.inl               |  32 ++---
 src/xenia/cpu/mmio_handler.cc               |   3 +-
 src/xenia/cpu/ppc/ppc_emit_altivec.cc       |   2 +-
 src/xenia/cpu/ppc/ppc_emit_fpu.cc           |  33 +++--
 src/xenia/cpu/ppc/ppc_emit_memory.cc        |  74 +++++++++---
 12 files changed, 353 insertions(+), 187 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_backend.cc b/src/xenia/cpu/backend/x64/x64_backend.cc index b1c1ff40e..e0918f89b 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.cc +++ b/src/xenia/cpu/backend/x64/x64_backend.cc @@ -70,6 +70,9 @@ class X64HelperEmitter : public X64Emitter { void* EmitGuestAndHostSynchronizeStackSizeLoadThunk( void* sync_func, unsigned stack_element_size); + void* EmitTryAcquireReservationHelper(); + void* EmitReservedStoreHelper(bool bit64 = false); + private: void* EmitCurrentForOffsets(const _code_offsets& offsets, size_t stack_size = 0); @@ -226,6 +229,10 @@ bool X64Backend::Initialize(Processor* processor) { thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk( synchronize_guest_and_host_stack_helper_, 4); } + try_acquire_reservation_helper_ = + thunk_emitter.EmitTryAcquireReservationHelper(); + reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false); + reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true); // Set the code cache to use the ResolveFunction thunk for default // indirections.
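For orientation, the reservation bitmap introduced here covers the 4 GiB guest space with one bit per 64 KiB block; the emitted helpers reduce a guest address to a qword slot and a bit position inside ReserveHelper::blocks. A minimal C++ sketch of that indexing math follows (illustrative only; the constants mirror RESERVE_BLOCK_SHIFT and RESERVE_NUM_ENTRIES from the patch, while the struct and function names are hypothetical):

#include <cstdint>

// Mirrors the patch's constants: 4 GiB of guest address space, one bit per
// 64 KiB block (RESERVE_BLOCK_SHIFT == 16).
constexpr uint32_t kReserveBlockShift = 16;
constexpr uint64_t kReserveNumEntries =
    (4ULL * 1024 * 1024 * 1024) >> kReserveBlockShift;       // 65536 bits
constexpr uint64_t kReserveQwords = kReserveNumEntries / 64;  // 1024 qwords
static_assert(kReserveQwords * sizeof(uint64_t) == 8192, "bitmap is 8 KiB");

struct ReserveIndex {
  uint64_t qword;  // which element of ReserveHelper::blocks
  uint32_t bit;    // which bit inside that element
};

// The same shr/and reduction the emitted helpers perform before lock bts/btr.
inline ReserveIndex IndexForGuestAddress(uint32_t guest_addr) {
  uint32_t block = guest_addr >> kReserveBlockShift;  // 64 KiB block number
  return {block >> 6, block & 63};                    // divide by 64, modulo 64
}

Each qword of the bitmap therefore covers 4 MiB of guest address space, and because the key is the block number rather than the exact address, two different addresses inside the same 64 KiB block share one reservation bit.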
@@ -799,7 +806,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper() { inc(ecx); jmp(checkbp, T_NEAR); L(we_good); - //we're popping this return address, so go down by one + // we're popping this return address, so go down by one sub(edx, sizeof(X64BackendStackpoint)); dec(ecx); L(checkbp); @@ -857,6 +864,123 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk( code_offsets.tail = getSize(); return EmitCurrentForOffsets(code_offsets); } +void* X64HelperEmitter::EmitTryAcquireReservationHelper() { + _code_offsets code_offsets = {}; + code_offsets.prolog = getSize(); + + Xbyak::Label already_has_a_reservation; + Xbyak::Label acquire_new_reservation; + + btr(GetBackendFlagsPtr(), 1); + mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_))); + jc(already_has_a_reservation); + + shr(ecx, RESERVE_BLOCK_SHIFT); + xor_(r9d, r9d); + mov(edx, ecx); + shr(edx, 6); // divide by 64 + lea(rdx, ptr[r8 + rdx * 8]); + and_(ecx, 64 - 1); + + lock(); + bts(qword[rdx], rcx); + // DebugBreak(); + // set flag on local backend context for thread to indicate our previous + // attempt to get the reservation succeeded + setnc(r9b); // success = bitmap did not have a set bit at the idx + shl(r9b, 1); + + mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)), + rdx); + mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx); + + or_(GetBackendCtxPtr(offsetof(X64BackendContext, flags)), r9d); + ret(); + L(already_has_a_reservation); + DebugBreak(); + + code_offsets.prolog_stack_alloc = getSize(); + code_offsets.body = getSize(); + code_offsets.epilog = getSize(); + code_offsets.tail = getSize(); + return EmitCurrentForOffsets(code_offsets); +} +// ecx=guest addr +// r9 = host addr +// r8 = value +// if ZF is set and CF is set, we succeeded +void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) { + _code_offsets code_offsets = {}; + code_offsets.prolog = getSize(); + Xbyak::Label done; + Xbyak::Label reservation_isnt_for_our_addr; + // carry must be set + zero flag must be set + + btr(GetBackendFlagsPtr(), 1); + + jnc(done); + + // mov(edx, i.src1.reg().cvt32()); + mov(rax, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_))); + + shr(ecx, RESERVE_BLOCK_SHIFT); + mov(edx, ecx); + shr(edx, 6); // divide by 64 + lea(rdx, ptr[rax + rdx * 8]); + // begin acquiring exclusive access to cacheline containing our bit + prefetchw(ptr[rdx]); + + cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)), + rdx); + jnz(reservation_isnt_for_our_addr); + + mov(rax, + GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_))); + + // we need modulo bitsize, it turns out bittests' modulus behavior for the + // bitoffset only applies for register operands, for memory ones we bug out + // todo: actually, the above note may not be true, double check it + and_(ecx, 64 - 1); + cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx); + jnz(reservation_isnt_for_our_addr); + + // was our memory modified by kernel code or something? + lock(); + if (bit64) { + cmpxchg(ptr[r9], r8); + + } else { + cmpxchg(ptr[r9], r8d); + } + // the ZF flag is unaffected by BTR! 
we exploit this for the retval + + // cancel our lock on the 65k block + lock(); + btr(qword[rdx], rcx); + + // Xbyak::Label check_fucky; + jc(done); + DebugBreak(); + + // L(check_fucky); + + L(done); + + // i don't care that theres a dependency on the prev value of rax atm + // sadly theres no CF&ZF condition code + setz(al); + setc(ah); + cmp(ax, 0x0101); + ret(); + L(reservation_isnt_for_our_addr); + DebugBreak(); + code_offsets.prolog_stack_alloc = getSize(); + code_offsets.body = getSize(); + code_offsets.epilog = getSize(); + code_offsets.tail = getSize(); + return EmitCurrentForOffsets(code_offsets); +} + void X64HelperEmitter::EmitSaveVolatileRegs() { // Save off volatile registers. // mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax); @@ -975,6 +1099,7 @@ void X64Backend::InitializeBackendContext(void* ctx) { // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png bctx->Ox1000 = 0x1000; bctx->guest_tick_count = Clock::GetGuestTickCountPointer(); + bctx->reserve_helper_ = &reserve_helper_; } void X64Backend::DeinitializeBackendContext(void* ctx) { X64BackendContext* bctx = BackendContextForGuestContext(ctx); diff --git a/src/xenia/cpu/backend/x64/x64_backend.h b/src/xenia/cpu/backend/x64/x64_backend.h index 79f635722..55ef7da6d 100644 --- a/src/xenia/cpu/backend/x64/x64_backend.h +++ b/src/xenia/cpu/backend/x64/x64_backend.h @@ -42,6 +42,17 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1); typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1); typedef void (*ResolveFunctionThunk)(); +#define RESERVE_BLOCK_SHIFT 16 + +#define RESERVE_NUM_ENTRIES \ + ((1024ULL * 1024ULL * 1024ULL * 4ULL) >> RESERVE_BLOCK_SHIFT) +// https://codalogic.com/blog/2022/12/06/Exploring-PowerPCs-read-modify-write-operations +struct ReserveHelper { + uint64_t blocks[RESERVE_NUM_ENTRIES / 64]; + + ReserveHelper() { memset(blocks, 0, sizeof(blocks)); } +}; + struct X64BackendStackpoint { uint64_t host_stack_; unsigned guest_stack_; @@ -55,16 +66,21 @@ struct X64BackendStackpoint { // context (somehow placing a global X64BackendCtx prior to membase, so we can // negatively index the membase reg) struct X64BackendContext { + ReserveHelper* reserve_helper_; + uint64_t cached_reserve_value_; // guest_tick_count is used if inline_loadclock is used uint64_t* guest_tick_count; // records mapping of host_stack to guest_stack X64BackendStackpoint* stackpoints; - + uint64_t cached_reserve_offset; + uint32_t cached_reserve_bit; unsigned int current_stackpoint_depth; unsigned int mxcsr_fpu; // currently, the way we implement rounding mode // affects both vmx and the fpu unsigned int mxcsr_vmx; - unsigned int flags; // bit 0 = 0 if mxcsr is fpu, else it is vmx + // bit 0 = 0 if mxcsr is fpu, else it is vmx + // bit 1 = got reserve + unsigned int flags; unsigned int Ox1000; // constant 0x1000 so we can shrink each tail emitted // add of it by... 
2 bytes lol }; @@ -152,9 +168,18 @@ class X64Backend : public Backend { void* synchronize_guest_and_host_stack_helper_size8_ = nullptr; void* synchronize_guest_and_host_stack_helper_size16_ = nullptr; void* synchronize_guest_and_host_stack_helper_size32_ = nullptr; + + public: + void* try_acquire_reservation_helper_ = nullptr; + void* reserved_store_32_helper = nullptr; + void* reserved_store_64_helper = nullptr; + + private: #if XE_X64_PROFILER_AVAILABLE == 1 GuestProfilerData profiler_data_; #endif + + alignas(64) ReserveHelper reserve_helper_; }; } // namespace x64 diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index 2cee66ece..75986b355 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -387,7 +387,6 @@ struct LVL_V128 : Sequence> { }; EMITTER_OPCODE_TABLE(OPCODE_LVL, LVL_V128); - struct LVR_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { Xbyak::Label endpoint{}; @@ -483,6 +482,84 @@ struct STVR_V128 : Sequence> { } }; EMITTER_OPCODE_TABLE(OPCODE_STVR, STVR_V128); + +struct RESERVED_LOAD_INT32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // should use phys addrs, not virtual addrs! + + // try_acquire_reservation_helper_ doesnt spoil rax + e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]); + // begin acquiring exclusive access to the location + // we will do a load first, but we'll need exclusive access once we do our + // atomic op in the store + e.prefetchw(e.ptr[e.rax]); + e.mov(e.ecx, i.src1.reg().cvt32()); + e.call(e.backend()->try_acquire_reservation_helper_); + e.mov(i.dest, e.dword[e.rax]); + + e.mov( + e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)), + i.dest.reg().cvt64()); + } +}; + +struct RESERVED_LOAD_INT64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // try_acquire_reservation_helper_ doesnt spoil rax + e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]); + e.mov(e.ecx, i.src1.reg().cvt32()); + // begin acquiring exclusive access to the location + // we will do a load first, but we'll need exclusive access once we do our + // atomic op in the store + e.prefetchw(e.ptr[e.rax]); + + e.call(e.backend()->try_acquire_reservation_helper_); + e.mov(i.dest, e.qword[ComputeMemoryAddress(e, i.src1)]); + + e.mov( + e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)), + i.dest.reg()); + } +}; + +EMITTER_OPCODE_TABLE(OPCODE_RESERVED_LOAD, RESERVED_LOAD_INT32, + RESERVED_LOAD_INT64); + +// address, value + +struct RESERVED_STORE_INT32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + // edx=guest addr + // r9 = host addr + // r8 = value + // if ZF is set and CF is set, we succeeded + e.mov(e.ecx, i.src1.reg().cvt32()); + e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]); + e.mov(e.r8d, i.src2); + e.call(e.backend()->reserved_store_32_helper); + e.setz(i.dest); + } +}; + +struct RESERVED_STORE_INT64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(e.ecx, i.src1.reg().cvt32()); + e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]); + e.mov(e.r8, i.src2); + e.call(e.backend()->reserved_store_64_helper); + e.setz(i.dest); + } +}; + +EMITTER_OPCODE_TABLE(OPCODE_RESERVED_STORE, RESERVED_STORE_INT32, + RESERVED_STORE_INT64); + // ============================================================================ // OPCODE_ATOMIC_COMPARE_EXCHANGE // 
============================================================================ diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 554906660..5f428ad6c 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -1018,8 +1018,7 @@ struct COMPARE_EQ_F32 e.ChangeMxcsrMode(MXCSRMode::Fpu); if (!HasPrecedingCmpOfSameValues(i.instr)) { EmitCommutativeBinaryXmmOp( - e, i, - [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { + e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { e.vcomiss(src1, src2); }); } @@ -1032,8 +1031,7 @@ struct COMPARE_EQ_F64 e.ChangeMxcsrMode(MXCSRMode::Fpu); if (!HasPrecedingCmpOfSameValues(i.instr)) { EmitCommutativeBinaryXmmOp( - e, i, - [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { + e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { e.vcomisd(src1, src2); }); } @@ -1935,53 +1933,6 @@ struct MUL_ADD_V128 }; EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128); -struct NEGATED_MUL_ADD_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Fpu); - - Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); - Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); - Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); - if (e.IsFeatureEnabled(kX64EmitFMA)) { - // todo: this is garbage - e.vmovapd(e.xmm3, src1); - e.vfmadd213sd(e.xmm3, src2, src3); - e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD)); - } else { - // todo: might need to use x87 in this case... - e.vmulsd(e.xmm3, src1, src2); - e.vaddsd(i.dest, e.xmm3, src3); - e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD)); - } - } -}; -struct NEGATED_MUL_ADD_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Vmx); - - Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); - Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); - Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); - if (e.IsFeatureEnabled(kX64EmitFMA)) { - // todo: this is garbage - e.vmovaps(e.xmm3, src1); - e.vfmadd213ps(e.xmm3, src2, src3); - e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS)); - } else { - // todo: might need to use x87 in this case... - e.vmulps(e.xmm3, src1, src2); - e.vaddps(i.dest, e.xmm3, src3); - e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_ADD, NEGATED_MUL_ADD_F64, - NEGATED_MUL_ADD_V128); - // ============================================================================ // OPCODE_MUL_SUB // ============================================================================ @@ -2038,53 +1989,6 @@ struct MUL_SUB_V128 }; EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F64, MUL_SUB_V128); -struct NEGATED_MUL_SUB_F64 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Fpu); - - Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); - Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); - Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); - if (e.IsFeatureEnabled(kX64EmitFMA)) { - // todo: this is garbage - e.vmovapd(e.xmm3, src1); - e.vfmsub213sd(e.xmm3, src2, src3); - e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD)); - } else { - // todo: might need to use x87 in this case... 
- e.vmulsd(e.xmm3, src1, src2); - e.vsubsd(i.dest, e.xmm3, src3); - e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD)); - } - } -}; -struct NEGATED_MUL_SUB_V128 - : Sequence> { - static void Emit(X64Emitter& e, const EmitArgType& i) { - e.ChangeMxcsrMode(MXCSRMode::Vmx); - - Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0); - Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); - Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2); - if (e.IsFeatureEnabled(kX64EmitFMA)) { - // todo: this is garbage - e.vmovaps(e.xmm3, src1); - e.vfmsub213ps(e.xmm3, src2, src3); - e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS)); - } else { - // todo: might need to use x87 in this case... - e.vmulps(e.xmm3, src1, src2); - e.vsubps(i.dest, e.xmm3, src3); - e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS)); - } - } -}; -EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_SUB, NEGATED_MUL_SUB_F64, - NEGATED_MUL_SUB_V128); - // ============================================================================ // OPCODE_NEG // ============================================================================ @@ -2641,7 +2545,8 @@ void EmitAndNotXX(X64Emitter& e, const ARGS& i) { // src1 constant. // `and` instruction only supports up to 32-bit immediate constants // 64-bit constants will need a temp register - //only possible with 64 bit inputs, andc is the only instruction that generates this + // only possible with 64 bit inputs, andc is the only instruction that + // generates this auto temp = GetTempReg(e); e.mov(temp, i.src1.constant()); diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index 7a5935001..00634a5e6 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -1281,6 +1281,25 @@ Value* HIRBuilder::Load(Value* address, TypeName type, uint32_t load_flags) { return i->dest; } +Value* HIRBuilder::LoadWithReserve(Value* address, TypeName type) { + ASSERT_ADDRESS_TYPE(address); + + Instr* i = AppendInstr(OPCODE_RESERVED_LOAD_info, 0, AllocValue(type)); + i->set_src1(address); + i->src2.value = i->src3.value = NULL; + + return i->dest; +} + +Value* HIRBuilder::StoreWithReserve(Value* address, Value* value, + TypeName type) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_RESERVED_STORE_info, 0, AllocValue(INT8_TYPE)); + i->set_src1(address); + i->set_src2(value); + i->src3.value = NULL; + return i->dest; +} void HIRBuilder::Store(Value* address, Value* value, uint32_t store_flags) { ASSERT_ADDRESS_TYPE(address); Instr* i = AppendInstr(OPCODE_STORE_info, store_flags); @@ -1739,30 +1758,6 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) { return i->dest; } -Value* HIRBuilder::NegatedMulAdd(Value* value1, Value* value2, Value* value3) { - ASSERT_TYPES_EQUAL(value1, value2); - ASSERT_TYPES_EQUAL(value1, value3); - - Instr* i = - AppendInstr(OPCODE_NEGATED_MUL_ADD_info, 0, AllocValue(value1->type)); - i->set_src1(value1); - i->set_src2(value2); - i->set_src3(value3); - return i->dest; -} - -Value* HIRBuilder::NegatedMulSub(Value* value1, Value* value2, Value* value3) { - ASSERT_TYPES_EQUAL(value1, value2); - ASSERT_TYPES_EQUAL(value1, value3); - - Instr* i = - AppendInstr(OPCODE_NEGATED_MUL_SUB_info, 0, AllocValue(value1->type)); - i->set_src1(value1); - i->set_src2(value2); - i->set_src3(value3); - return i->dest; -} - Value* HIRBuilder::Neg(Value* value) { Instr* i = AppendInstr(OPCODE_NEG_info, 0, AllocValue(value->type)); i->set_src1(value); diff --git a/src/xenia/cpu/hir/hir_builder.h 
b/src/xenia/cpu/hir/hir_builder.h index d83806cd9..b33f18aaf 100644 --- a/src/xenia/cpu/hir/hir_builder.h +++ b/src/xenia/cpu/hir/hir_builder.h @@ -189,6 +189,9 @@ class HIRBuilder { uint32_t store_flags = 0); Value* Load(Value* address, TypeName type, uint32_t load_flags = 0); + // create a reserve on an address, + Value* LoadWithReserve(Value* address, TypeName type); + Value* StoreWithReserve(Value* address, Value* value, TypeName type); Value* LoadVectorLeft(Value* address); Value* LoadVectorRight(Value* address); @@ -242,10 +245,7 @@ class HIRBuilder { Value* Div(Value* value1, Value* value2, uint32_t arithmetic_flags = 0); Value* MulAdd(Value* value1, Value* value2, Value* value3); // (1 * 2) + 3 Value* MulSub(Value* value1, Value* value2, Value* value3); // (1 * 2) - 3 - Value* NegatedMulAdd(Value* value1, Value* value2, - Value* value3); // -((1 * 2) + 3) - Value* NegatedMulSub(Value* value1, Value* value2, - Value* value3); // -((1 * 2) - 3) + Value* Neg(Value* value); Value* Abs(Value* value); Value* Sqrt(Value* value); diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 1bd85cae9..5a1bdc53b 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -248,9 +248,7 @@ enum Opcode { OPCODE_MUL_HI, // TODO(benvanik): remove this and add INT128 type. OPCODE_DIV, OPCODE_MUL_ADD, - OPCODE_NEGATED_MUL_ADD, OPCODE_MUL_SUB, - OPCODE_NEGATED_MUL_SUB, OPCODE_NEG, OPCODE_ABS, OPCODE_SQRT, @@ -292,7 +290,10 @@ enum Opcode { // as we already have OPCODE_ROUND. round double to float ( // ppc "single" fpu instruction result rounding behavior ) OPCODE_SET_NJM, - OPCODE_DELAY_EXECUTION, //for db16cyc + OPCODE_DELAY_EXECUTION, // for db16cyc + OPCODE_RESERVED_LOAD, + OPCODE_RESERVED_STORE, + __OPCODE_MAX_VALUE, // Keep at end. 
}; diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl index e27f30b46..c5c089e85 100644 --- a/src/xenia/cpu/hir/opcodes.inl +++ b/src/xenia/cpu/hir/opcodes.inl @@ -218,7 +218,12 @@ DEFINE_OPCODE( "context_barrier", OPCODE_SIG_X, 0) -DEFINE_OPCODE(OPCODE_DELAY_EXECUTION, "delay_execution", OPCODE_SIG_X, 0) + +DEFINE_OPCODE( + OPCODE_DELAY_EXECUTION, + "delay_execution", + OPCODE_SIG_X, + 0) DEFINE_OPCODE( OPCODE_LOAD_MMIO, "load_mmio", @@ -453,19 +458,6 @@ DEFINE_OPCODE( OPCODE_SIG_V_V_V_V, OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING) -DEFINE_OPCODE( - OPCODE_NEGATED_MUL_ADD, - "negated_mul_add", - OPCODE_SIG_V_V_V_V, - OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING) - -DEFINE_OPCODE( - OPCODE_NEGATED_MUL_SUB, - "negated_mul_sub", - OPCODE_SIG_V_V_V_V, - OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING) - - DEFINE_OPCODE( OPCODE_NEG, "neg", @@ -719,3 +711,15 @@ DEFINE_OPCODE( "storev_right", OPCODE_SIG_X_V_V, OPCODE_FLAG_MEMORY) + +DEFINE_OPCODE( + OPCODE_RESERVED_LOAD, + "reserved_load", + OPCODE_SIG_V_V, + OPCODE_FLAG_MEMORY) + +DEFINE_OPCODE( + OPCODE_RESERVED_STORE, + "reserved_store", + OPCODE_SIG_V_V_V, + OPCODE_FLAG_MEMORY) \ No newline at end of file diff --git a/src/xenia/cpu/mmio_handler.cc b/src/xenia/cpu/mmio_handler.cc index b1e2d2964..d676c4ada 100644 --- a/src/xenia/cpu/mmio_handler.cc +++ b/src/xenia/cpu/mmio_handler.cc @@ -185,7 +185,7 @@ bool MMIOHandler::TryDecodeLoadStore(const uint8_t* p, uint8_t rex_b = rex & 0b0001; uint8_t rex_x = rex & 0b0010; uint8_t rex_r = rex & 0b0100; - //uint8_t rex_w = rex & 0b1000; + // uint8_t rex_w = rex & 0b1000; // http://www.sandpile.org/x86/opc_rm.htm // http://www.sandpile.org/x86/opc_sib.htm @@ -448,6 +448,7 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) { if (cur_access != memory::PageAccess::kNoAccess && (!is_write || cur_access != memory::PageAccess::kReadOnly)) { // Another thread has cleared this watch. Abort. 
+ XELOGD("Race condition on watch, was already cleared by another thread!"); return true; } // The address is not found within any range, so either a write watch or an diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index 386abf4bd..0ea2fb4ad 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -1143,7 +1143,7 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb, Value* b = f.VectorDenormFlush(f.LoadVR(vb)); Value* c = f.VectorDenormFlush(f.LoadVR(vc)); - Value* v = f.NegatedMulSub(a, c, b); + Value* v = f.Neg(f.MulSub(a, c, b)); f.StoreVR(vd, v); return 0; } diff --git a/src/xenia/cpu/ppc/ppc_emit_fpu.cc b/src/xenia/cpu/ppc/ppc_emit_fpu.cc index c491ad09a..79c4240e1 100644 --- a/src/xenia/cpu/ppc/ppc_emit_fpu.cc +++ b/src/xenia/cpu/ppc/ppc_emit_fpu.cc @@ -195,8 +195,8 @@ int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] + frB) - Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), - f.LoadFPR(i.A.FRB)); + Value* v = f.Neg( + f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; @@ -204,8 +204,8 @@ int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] + frB) - Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), - f.LoadFPR(i.A.FRB)); + Value* v = f.Neg( + f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); v = f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); @@ -214,8 +214,8 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] - frB) - Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), - f.LoadFPR(i.A.FRB)); + Value* v = f.Neg( + f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); return 0; @@ -223,8 +223,8 @@ int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- -([frA x frC] - frB) - Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), - f.LoadFPR(i.A.FRB)); + Value* v = f.Neg( + f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB))); v = f.ToSingle(v); f.StoreFPR(i.A.FRT, v); f.UpdateFPSCR(v, i.A.Rc); @@ -444,13 +444,12 @@ int InstrEmit_fabsx(PPCHIRBuilder& f, const InstrData& i) { f.StoreFPR(i.X.RT, v); /* The contents of frB with bit 0 cleared are placed into frD. -Note that the fabs instruction treats NaNs just like any other kind of value. That is, the sign -bit of a NaN may be altered by fabs. This instruction does not alter the FPSCR. -Other registers altered: -• Condition Register (CR1 field): +Note that the fabs instruction treats NaNs just like any other kind of value. +That is, the sign bit of a NaN may be altered by fabs. This instruction does not +alter the FPSCR. 
Other registers altered: • Condition Register (CR1 field): Affected: FX, FEX, VX, OX (if Rc = 1) */ - // f.UpdateFPSCR(v, i.X.Rc); + // f.UpdateFPSCR(v, i.X.Rc); if (i.X.Rc) { // todo } @@ -469,9 +468,9 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) { // frD <- !abs(frB) Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB))); f.StoreFPR(i.X.RT, v); - //f.UpdateFPSCR(v, i.X.Rc); + // f.UpdateFPSCR(v, i.X.Rc); if (i.X.Rc) { - //todo + // todo } return 0; } @@ -480,9 +479,9 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) { // frD <- ¬ frB[0] || frB[1-63] Value* v = f.Neg(f.LoadFPR(i.X.RB)); f.StoreFPR(i.X.RT, v); - //f.UpdateFPSCR(v, i.X.Rc); + // f.UpdateFPSCR(v, i.X.Rc); if (i.X.Rc) { - //todo + // todo } return 0; } diff --git a/src/xenia/cpu/ppc/ppc_emit_memory.cc b/src/xenia/cpu/ppc/ppc_emit_memory.cc index 69c7fdf9e..b4bdabb49 100644 --- a/src/xenia/cpu/ppc/ppc_emit_memory.cc +++ b/src/xenia/cpu/ppc/ppc_emit_memory.cc @@ -22,6 +22,12 @@ DEFINE_bool( "instructions were written with the Xbox 360's cache in mind, and modern " "processors do their own automatic prefetching.", "CPU"); + +DEFINE_bool(no_reserved_ops, false, + "For testing whether a game may have races with a broken reserved " + "load/store impl", + "CPU"); + namespace xe { namespace cpu { namespace ppc { @@ -772,12 +778,17 @@ int InstrEmit_ldarx(PPCHIRBuilder& f, const InstrData& i) { // already, but I haven't see anything but interrupt callbacks (which are // always under a global lock) do that yet. // We issue a memory barrier here to make sure that we get good values. - f.MemoryBarrier(); - Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); - Value* rt = f.ByteSwap(f.Load(ea, INT64_TYPE)); - f.StoreReserved(rt); - f.StoreGPR(i.X.RT, rt); + + if (cvars::no_reserved_ops) { + f.StoreGPR(i.X.RT, f.ByteSwap(f.Load(ea, INT64_TYPE))); + + } else { + f.MemoryBarrier(); + + Value* rt = f.ByteSwap(f.LoadWithReserve(ea, INT64_TYPE)); + f.StoreGPR(i.X.RT, rt); + } return 0; } @@ -797,12 +808,19 @@ int InstrEmit_lwarx(PPCHIRBuilder& f, const InstrData& i) { // already, but I haven't see anything but interrupt callbacks (which are // always under a global lock) do that yet. // We issue a memory barrier here to make sure that we get good values. - f.MemoryBarrier(); Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); - Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE); - f.StoreReserved(rt); - f.StoreGPR(i.X.RT, rt); + if (cvars::no_reserved_ops) { + f.StoreGPR(i.X.RT, + f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE)); + + } else { + f.MemoryBarrier(); + + Value* rt = + f.ZeroExtend(f.ByteSwap(f.LoadWithReserve(ea, INT32_TYPE)), INT64_TYPE); + f.StoreGPR(i.X.RT, rt); + } return 0; } @@ -826,17 +844,24 @@ int InstrEmit_stdcx(PPCHIRBuilder& f, const InstrData& i) { Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); Value* rt = f.ByteSwap(f.LoadGPR(i.X.RT)); - Value* res = f.ByteSwap(f.LoadReserved()); - Value* v = f.AtomicCompareExchange(ea, res, rt); - f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v); + + if (cvars::no_reserved_ops) { + f.Store(ea, rt); + + f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1)); + } else { + Value* v = f.StoreWithReserve(ea, rt, INT64_TYPE); + + f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v); + } f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8()); f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8()); // Issue memory barrier for when we go out of lock and want others to see our // updates. 
- - f.MemoryBarrier(); - + if (!cvars::no_reserved_ops) { + f.MemoryBarrier(); + } return 0; } @@ -859,20 +884,29 @@ int InstrEmit_stwcx(PPCHIRBuilder& f, const InstrData& i) { // This will always succeed if under the global lock, however. Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); + Value* rt = f.ByteSwap(f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE)); - Value* res = f.ByteSwap(f.Truncate(f.LoadReserved(), INT32_TYPE)); - Value* v = f.AtomicCompareExchange(ea, res, rt); - f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v); + + if (cvars::no_reserved_ops) { + f.Store(ea, rt); + + f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1)); + } else { + Value* v = f.StoreWithReserve(ea, rt, INT64_TYPE); + f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v); + } + f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8()); f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8()); // Issue memory barrier for when we go out of lock and want others to see our // updates. - f.MemoryBarrier(); + if (!cvars::no_reserved_ops) { + f.MemoryBarrier(); + } return 0; } - // Floating-point load (A-19) int InstrEmit_lfd(PPCHIRBuilder& f, const InstrData& i) {
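Taken together, the ppc_emit_memory changes route lwarx/ldarx through LoadWithReserve and stwcx./stdcx. through StoreWithReserve, and the x64 helpers approximate the reservation with the per-block bit plus a cmpxchg against the value cached at reserve time. Below is a rough C++ model of the store-conditional path with hypothetical names; the real logic is the emitted EmitReservedStoreHelper above, which additionally verifies that the cached reservation matches the address being stored to.

#include <atomic>
#include <cstdint>

// Illustrative model only: succeed when memory still holds the value seen by
// the reserved load, then drop the per-block reservation bit either way.
bool ReservedStore32(std::atomic<uint64_t>& block_bits, uint32_t bit,
                     std::atomic<uint32_t>& host_word, uint32_t cached_value,
                     uint32_t new_value) {
  uint32_t expected = cached_value;
  // lock cmpxchg in the helper: only store if nobody changed the word.
  bool stored = host_word.compare_exchange_strong(expected, new_value);
  // lock btr in the helper: release our claim on the 64 KiB block.
  block_bits.fetch_and(~(1ULL << bit));
  return stored;  // feeds cr0.eq for stwcx./stdcx.
}

This model also shows why the commit message calls out same-value non-reserved stores: success is judged by comparing against the value cached at reserve time rather than by tracking every intervening write, so a store that writes back the identical value between the reserve and the conditional store is indistinguishable from no store at all.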