Attempt to emulate reserved load/store more closely. We still can't do anything about stores of the same value made to a reserved location via a plain, non-reserved store: the store-conditional's compare-exchange sees the value cached at reservation time and never notices them.

Uses a bitmap that divides the 4 GiB guest address space into 64 KiB blocks, one bit per block. It currently indexes by guest virtual address, but it should be using physical addresses instead.

Currently, if a guest reserves one location and then does a reserved store to a totally different location, we trigger a breakpoint; this should never happen.
Also removed the NEGATED_MUL_ADD/NEGATED_MUL_SUB operations. They weren't necessary; nothing special is needed for the negated-result variants, which are now emitted as Neg(MulAdd(...)) / Neg(MulSub(...)).

Added a log message for when watched physical memory has a race; it would be nice to know when that happens and in which games.
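
For reference, the indexing scheme the new helpers use, as a minimal C++ sketch (names follow the RESERVE_BLOCK_SHIFT/ReserveHelper definitions added in this commit; guest_addr is illustrative):

// one bit of ReserveHelper::blocks per 64 KiB (1 << RESERVE_BLOCK_SHIFT) block
uint32_t block = guest_addr >> RESERVE_BLOCK_SHIFT;  // which 64 KiB block
uint32_t word  = block >> 6;                         // which uint64_t in blocks[]
uint32_t bit   = block & 63;                         // which bit within that word
// reserve:           lock bts blocks[word], bit   (CF=1 -> block already reserved)
// conditional store: lock cmpxchg on the data, then lock btr blocks[word], bit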
chss95cs@gmail.com 2023-04-15 16:06:07 -04:00
parent 5e0c67438c
commit 7fb4b4cd41
12 changed files with 353 additions and 187 deletions

View File

@ -70,6 +70,9 @@ class X64HelperEmitter : public X64Emitter {
void* EmitGuestAndHostSynchronizeStackSizeLoadThunk(
void* sync_func, unsigned stack_element_size);
void* EmitTryAcquireReservationHelper();
void* EmitReservedStoreHelper(bool bit64 = false);
private:
void* EmitCurrentForOffsets(const _code_offsets& offsets,
size_t stack_size = 0);
@ -226,6 +229,10 @@ bool X64Backend::Initialize(Processor* processor) {
thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
synchronize_guest_and_host_stack_helper_, 4);
}
try_acquire_reservation_helper_ =
thunk_emitter.EmitTryAcquireReservationHelper();
reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false);
reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true);
// Set the code cache to use the ResolveFunction thunk for default
// indirections.
@ -799,7 +806,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper() {
inc(ecx);
jmp(checkbp, T_NEAR);
L(we_good);
//we're popping this return address, so go down by one
// we're popping this return address, so go down by one
sub(edx, sizeof(X64BackendStackpoint));
dec(ecx);
L(checkbp);
@ -857,6 +864,123 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();
Xbyak::Label already_has_a_reservation;
Xbyak::Label acquire_new_reservation;
btr(GetBackendFlagsPtr(), 1);
mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
jc(already_has_a_reservation);
shr(ecx, RESERVE_BLOCK_SHIFT);
xor_(r9d, r9d);
mov(edx, ecx);
shr(edx, 6); // divide by 64
lea(rdx, ptr[r8 + rdx * 8]);
and_(ecx, 64 - 1);
lock();
bts(qword[rdx], rcx);
// DebugBreak();
// set flag on local backend context for thread to indicate our previous
// attempt to get the reservation succeeded
setnc(r9b); // success = bitmap did not have a set bit at the idx
shl(r9b, 1);
mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
rdx);
mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx);
or_(GetBackendCtxPtr(offsetof(X64BackendContext, flags)), r9d);
ret();
L(already_has_a_reservation);
DebugBreak();
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
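Functionally, the code this helper emits is roughly the following hedged C++ sketch against the X64BackendContext/ReserveHelper definitions added in this commit (std::atomic_ref, from <atomic>/C++20, stands in for the lock bts; guest_addr is what arrives in ecx):
bool TryAcquireReservationSketch(X64BackendContext* ctx, uint32_t guest_addr) {
  // btr flags, 1: clear the "got reserve" bit; it must not already be set
  bool already_held = ctx->flags & (1u << 1);
  ctx->flags &= ~(1u << 1);
  if (already_held) { /* DebugBreak() in the emitted code */ }
  uint32_t block = guest_addr >> RESERVE_BLOCK_SHIFT;  // which 64 KiB block
  uint64_t& word = ctx->reserve_helper_->blocks[block >> 6];
  uint32_t bit = block & 63;
  // lock bts: atomically set our bit; success means it was previously clear
  uint64_t prev = std::atomic_ref<uint64_t>(word).fetch_or(1ull << bit);
  bool success = !(prev & (1ull << bit));
  ctx->cached_reserve_offset = reinterpret_cast<uint64_t>(&word);
  ctx->cached_reserve_bit = bit;
  if (success) ctx->flags |= 1u << 1;  // setnc / shl / or_ above
  return success;
}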
// ecx=guest addr
// r9 = host addr
// r8 = value
// on return, ZF is set iff the store succeeded (i.e. both the cmpxchg's ZF
// and the btr's CF were set)
void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) {
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();
Xbyak::Label done;
Xbyak::Label reservation_isnt_for_our_addr;
// carry must be set + zero flag must be set
btr(GetBackendFlagsPtr(), 1);
jnc(done);
// mov(edx, i.src1.reg().cvt32());
mov(rax, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
shr(ecx, RESERVE_BLOCK_SHIFT);
mov(edx, ecx);
shr(edx, 6); // divide by 64
lea(rdx, ptr[rax + rdx * 8]);
// begin acquiring exclusive access to cacheline containing our bit
prefetchw(ptr[rdx]);
cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
rdx);
jnz(reservation_isnt_for_our_addr);
mov(rax,
GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)));
// we need the bit offset modulo 64: the bit-test instructions' modulo
// behavior only applies to register operands; with a memory operand the full
// offset is used and we'd index past our qword
// todo: actually, the above note may not be true, double check it
and_(ecx, 64 - 1);
cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx);
jnz(reservation_isnt_for_our_addr);
// was our memory modified by kernel code or something?
lock();
if (bit64) {
cmpxchg(ptr[r9], r8);
} else {
cmpxchg(ptr[r9], r8d);
}
// the ZF flag is left alone by BTR here (the manuals list it as undefined,
// but we rely on it being preserved); we exploit this for the retval
// release our claim on the 64 KiB block
lock();
btr(qword[rdx], rcx);
// Xbyak::Label check_fucky;
jc(done);
DebugBreak();
// L(check_fucky);
L(done);
// i don't care that there's a dependency on the prev value of rax atm
// sadly there's no condition code that tests CF and ZF together
setz(al);
setc(ah);
cmp(ax, 0x0101);
ret();
L(reservation_isnt_for_our_addr);
DebugBreak();
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
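The store side, with the same caveats (host_addr is what arrives in r9, value in r8; the 32-bit variant is analogous, and the bool result corresponds to the ZF that the sequences below test with setz):
bool ReservedStoreSketch(X64BackendContext* ctx, uint32_t guest_addr,
                         uint64_t* host_addr, uint64_t value) {
  // btr flags, 1: we must currently hold a reservation
  bool held = ctx->flags & (1u << 1);
  ctx->flags &= ~(1u << 1);
  if (!held) return false;
  uint32_t block = guest_addr >> RESERVE_BLOCK_SHIFT;
  uint64_t& word = ctx->reserve_helper_->blocks[block >> 6];
  uint32_t bit = block & 63;
  // the reservation we hold must be for this exact bitmap word and bit
  if (ctx->cached_reserve_offset != reinterpret_cast<uint64_t>(&word) ||
      ctx->cached_reserve_bit != bit) {
    /* DebugBreak() in the emitted code */ return false;
  }
  // lock cmpxchg: store only if memory still holds the value cached at
  // reservation time (which is why same-value plain stores go unnoticed)
  uint64_t expected = ctx->cached_reserve_value_;
  bool stored = std::atomic_ref<uint64_t>(*host_addr)
                    .compare_exchange_strong(expected, value);
  // lock btr: release the block's bit; it should still have been set
  uint64_t prev = std::atomic_ref<uint64_t>(word).fetch_and(~(1ull << bit));
  bool bit_was_set = prev & (1ull << bit);
  if (!bit_was_set) { /* DebugBreak() */ }
  return stored && bit_was_set;
}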
void X64HelperEmitter::EmitSaveVolatileRegs() {
// Save off volatile registers.
// mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax);
@ -975,6 +1099,7 @@ void X64Backend::InitializeBackendContext(void* ctx) {
// https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
bctx->Ox1000 = 0x1000;
bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
bctx->reserve_helper_ = &reserve_helper_;
}
void X64Backend::DeinitializeBackendContext(void* ctx) {
X64BackendContext* bctx = BackendContextForGuestContext(ctx);

View File

@ -42,6 +42,17 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();
#define RESERVE_BLOCK_SHIFT 16
#define RESERVE_NUM_ENTRIES \
((1024ULL * 1024ULL * 1024ULL * 4ULL) >> RESERVE_BLOCK_SHIFT)
// https://codalogic.com/blog/2022/12/06/Exploring-PowerPCs-read-modify-write-operations
struct ReserveHelper {
uint64_t blocks[RESERVE_NUM_ENTRIES / 64];
ReserveHelper() { memset(blocks, 0, sizeof(blocks)); }
};
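To spell the sizes out: RESERVE_NUM_ENTRIES is 4 GiB >> 16 = 65,536 one-bit entries, packed into 65,536 / 64 = 1,024 uint64_t words, i.e. an 8 KiB bitmap. Purely as an illustration of that arithmetic:
static_assert(RESERVE_NUM_ENTRIES == 65536, "one bit per 64 KiB block of 4 GiB");
static_assert(sizeof(ReserveHelper::blocks) == 8192, "8 KiB bitmap");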
struct X64BackendStackpoint {
uint64_t host_stack_;
unsigned guest_stack_;
@ -55,16 +66,21 @@ struct X64BackendStackpoint {
// context (somehow placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
ReserveHelper* reserve_helper_;
uint64_t cached_reserve_value_;
// guest_tick_count is used if inline_loadclock is used
uint64_t* guest_tick_count;
// records mapping of host_stack to guest_stack
X64BackendStackpoint* stackpoints;
uint64_t cached_reserve_offset;
uint32_t cached_reserve_bit;
unsigned int current_stackpoint_depth;
unsigned int mxcsr_fpu; // currently, the way we implement rounding mode
// affects both vmx and the fpu
unsigned int mxcsr_vmx;
unsigned int flags; // bit 0 = 0 if mxcsr is fpu, else it is vmx
// bit 0 = 0 if mxcsr is fpu, else it is vmx
// bit 1 = got reserve
unsigned int flags;
unsigned int Ox1000; // constant 0x1000 so we can shrink each tail emitted
// add of it by... 2 bytes lol
};
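Illustrative names for the two flag bits described above (the code in this change uses the raw bit numbers):
enum : uint32_t {
  kFlagMxcsrIsVmx = 1u << 0,  // bit 0 = 0 if mxcsr is fpu, else it is vmx
  kFlagGotReserve = 1u << 1,  // bit 1 = set while a reservation is held
};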
@ -152,9 +168,18 @@ class X64Backend : public Backend {
void* synchronize_guest_and_host_stack_helper_size8_ = nullptr;
void* synchronize_guest_and_host_stack_helper_size16_ = nullptr;
void* synchronize_guest_and_host_stack_helper_size32_ = nullptr;
public:
void* try_acquire_reservation_helper_ = nullptr;
void* reserved_store_32_helper = nullptr;
void* reserved_store_64_helper = nullptr;
private:
#if XE_X64_PROFILER_AVAILABLE == 1
GuestProfilerData profiler_data_;
#endif
alignas(64) ReserveHelper reserve_helper_;
};
} // namespace x64

View File

@ -387,7 +387,6 @@ struct LVL_V128 : Sequence<LVL_V128, I<OPCODE_LVL, V128Op, I64Op>> {
};
EMITTER_OPCODE_TABLE(OPCODE_LVL, LVL_V128);
struct LVR_V128 : Sequence<LVR_V128, I<OPCODE_LVR, V128Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
Xbyak::Label endpoint{};
@ -483,6 +482,84 @@ struct STVR_V128 : Sequence<STVR_V128, I<OPCODE_STVR, VoidOp, I64Op, V128Op>> {
}
};
EMITTER_OPCODE_TABLE(OPCODE_STVR, STVR_V128);
struct RESERVED_LOAD_INT32
: Sequence<RESERVED_LOAD_INT32, I<OPCODE_RESERVED_LOAD, I32Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// should use phys addrs, not virtual addrs!
// try_acquire_reservation_helper_ doesn't spoil rax
e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]);
// begin acquiring exclusive access to the location
// we will do a load first, but we'll need exclusive access once we do our
// atomic op in the store
e.prefetchw(e.ptr[e.rax]);
e.mov(e.ecx, i.src1.reg().cvt32());
e.call(e.backend()->try_acquire_reservation_helper_);
e.mov(i.dest, e.dword[e.rax]);
e.mov(
e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)),
i.dest.reg().cvt64());
}
};
struct RESERVED_LOAD_INT64
: Sequence<RESERVED_LOAD_INT64, I<OPCODE_RESERVED_LOAD, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// try_acquire_reservation_helper_ doesn't spoil rax
e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]);
e.mov(e.ecx, i.src1.reg().cvt32());
// begin acquiring exclusive access to the location
// we will do a load first, but we'll need exclusive access once we do our
// atomic op in the store
e.prefetchw(e.ptr[e.rax]);
e.call(e.backend()->try_acquire_reservation_helper_);
e.mov(i.dest, e.qword[ComputeMemoryAddress(e, i.src1)]);
e.mov(
e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)),
i.dest.reg());
}
};
EMITTER_OPCODE_TABLE(OPCODE_RESERVED_LOAD, RESERVED_LOAD_INT32,
RESERVED_LOAD_INT64);
// address, value
struct RESERVED_STORE_INT32
: Sequence<RESERVED_STORE_INT32,
I<OPCODE_RESERVED_STORE, I8Op, I64Op, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
// ecx = guest addr
// r9 = host addr
// r8 = value
// the helper returns with ZF set if the reserved store succeeded
e.mov(e.ecx, i.src1.reg().cvt32());
e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]);
e.mov(e.r8d, i.src2);
e.call(e.backend()->reserved_store_32_helper);
e.setz(i.dest);
}
};
struct RESERVED_STORE_INT64
: Sequence<RESERVED_STORE_INT64,
I<OPCODE_RESERVED_STORE, I8Op, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.mov(e.ecx, i.src1.reg().cvt32());
e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]);
e.mov(e.r8, i.src2);
e.call(e.backend()->reserved_store_64_helper);
e.setz(i.dest);
}
};
EMITTER_OPCODE_TABLE(OPCODE_RESERVED_STORE, RESERVED_STORE_INT32,
RESERVED_STORE_INT64);
// ============================================================================
// OPCODE_ATOMIC_COMPARE_EXCHANGE
// ============================================================================

View File

@ -1018,8 +1018,7 @@ struct COMPARE_EQ_F32
e.ChangeMxcsrMode(MXCSRMode::Fpu);
if (!HasPrecedingCmpOfSameValues(i.instr)) {
EmitCommutativeBinaryXmmOp(
e, i,
[](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
e.vcomiss(src1, src2);
});
}
@ -1032,8 +1031,7 @@ struct COMPARE_EQ_F64
e.ChangeMxcsrMode(MXCSRMode::Fpu);
if (!HasPrecedingCmpOfSameValues(i.instr)) {
EmitCommutativeBinaryXmmOp(
e, i,
[](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
e.vcomisd(src1, src2);
});
}
@ -1935,53 +1933,6 @@ struct MUL_ADD_V128
};
EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);
struct NEGATED_MUL_ADD_F64
: Sequence<NEGATED_MUL_ADD_F64,
I<OPCODE_NEGATED_MUL_ADD, F64Op, F64Op, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Fpu);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovapd(e.xmm3, src1);
e.vfmadd213sd(e.xmm3, src2, src3);
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
} else {
// todo: might need to use x87 in this case...
e.vmulsd(e.xmm3, src1, src2);
e.vaddsd(i.dest, e.xmm3, src3);
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
}
}
};
struct NEGATED_MUL_ADD_V128
: Sequence<NEGATED_MUL_ADD_V128,
I<OPCODE_NEGATED_MUL_ADD, V128Op, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovaps(e.xmm3, src1);
e.vfmadd213ps(e.xmm3, src2, src3);
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
} else {
// todo: might need to use x87 in this case...
e.vmulps(e.xmm3, src1, src2);
e.vaddps(i.dest, e.xmm3, src3);
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_ADD, NEGATED_MUL_ADD_F64,
NEGATED_MUL_ADD_V128);
// ============================================================================
// OPCODE_MUL_SUB
// ============================================================================
@ -2038,53 +1989,6 @@ struct MUL_SUB_V128
};
EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F64, MUL_SUB_V128);
struct NEGATED_MUL_SUB_F64
: Sequence<NEGATED_MUL_SUB_F64,
I<OPCODE_NEGATED_MUL_SUB, F64Op, F64Op, F64Op, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Fpu);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovapd(e.xmm3, src1);
e.vfmsub213sd(e.xmm3, src2, src3);
e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
} else {
// todo: might need to use x87 in this case...
e.vmulsd(e.xmm3, src1, src2);
e.vsubsd(i.dest, e.xmm3, src3);
e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
}
}
};
struct NEGATED_MUL_SUB_V128
: Sequence<NEGATED_MUL_SUB_V128,
I<OPCODE_NEGATED_MUL_SUB, V128Op, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
if (e.IsFeatureEnabled(kX64EmitFMA)) {
// todo: this is garbage
e.vmovaps(e.xmm3, src1);
e.vfmsub213ps(e.xmm3, src2, src3);
e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
} else {
// todo: might need to use x87 in this case...
e.vmulps(e.xmm3, src1, src2);
e.vsubps(i.dest, e.xmm3, src3);
e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
}
}
};
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_SUB, NEGATED_MUL_SUB_F64,
NEGATED_MUL_SUB_V128);
// ============================================================================
// OPCODE_NEG
// ============================================================================
@ -2641,7 +2545,8 @@ void EmitAndNotXX(X64Emitter& e, const ARGS& i) {
// src1 constant.
// `and` instruction only supports up to 32-bit immediate constants
// 64-bit constants will need a temp register
//only possible with 64 bit inputs, andc is the only instruction that generates this
// only possible with 64 bit inputs, andc is the only instruction that
// generates this
auto temp = GetTempReg<typename decltype(i.src1)::reg_type>(e);
e.mov(temp, i.src1.constant());

View File

@ -1281,6 +1281,25 @@ Value* HIRBuilder::Load(Value* address, TypeName type, uint32_t load_flags) {
return i->dest;
}
Value* HIRBuilder::LoadWithReserve(Value* address, TypeName type) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_RESERVED_LOAD_info, 0, AllocValue(type));
i->set_src1(address);
i->src2.value = i->src3.value = NULL;
return i->dest;
}
Value* HIRBuilder::StoreWithReserve(Value* address, Value* value,
TypeName type) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_RESERVED_STORE_info, 0, AllocValue(INT8_TYPE));
i->set_src1(address);
i->set_src2(value);
i->src3.value = NULL;
return i->dest;
}
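A typical use pairs the two calls the way the lwarx/stwcx emitters later in this change do; a hedged sketch (new_value and the surrounding PPC plumbing are illustrative):
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* loaded = f.ByteSwap(f.LoadWithReserve(ea, INT32_TYPE));
// ... guest code computes new_value ...
Value* ok = f.StoreWithReserve(ea, new_value, INT32_TYPE);  // INT8, 1 on success
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), ok);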
void HIRBuilder::Store(Value* address, Value* value, uint32_t store_flags) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_STORE_info, store_flags);
@ -1739,30 +1758,6 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
return i->dest;
}
Value* HIRBuilder::NegatedMulAdd(Value* value1, Value* value2, Value* value3) {
ASSERT_TYPES_EQUAL(value1, value2);
ASSERT_TYPES_EQUAL(value1, value3);
Instr* i =
AppendInstr(OPCODE_NEGATED_MUL_ADD_info, 0, AllocValue(value1->type));
i->set_src1(value1);
i->set_src2(value2);
i->set_src3(value3);
return i->dest;
}
Value* HIRBuilder::NegatedMulSub(Value* value1, Value* value2, Value* value3) {
ASSERT_TYPES_EQUAL(value1, value2);
ASSERT_TYPES_EQUAL(value1, value3);
Instr* i =
AppendInstr(OPCODE_NEGATED_MUL_SUB_info, 0, AllocValue(value1->type));
i->set_src1(value1);
i->set_src2(value2);
i->set_src3(value3);
return i->dest;
}
Value* HIRBuilder::Neg(Value* value) {
Instr* i = AppendInstr(OPCODE_NEG_info, 0, AllocValue(value->type));
i->set_src1(value);

View File

@ -189,6 +189,9 @@ class HIRBuilder {
uint32_t store_flags = 0);
Value* Load(Value* address, TypeName type, uint32_t load_flags = 0);
// create a reservation on an address / conditionally store to a reserved one
Value* LoadWithReserve(Value* address, TypeName type);
Value* StoreWithReserve(Value* address, Value* value, TypeName type);
Value* LoadVectorLeft(Value* address);
Value* LoadVectorRight(Value* address);
@ -242,10 +245,7 @@ class HIRBuilder {
Value* Div(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
Value* MulAdd(Value* value1, Value* value2, Value* value3); // (1 * 2) + 3
Value* MulSub(Value* value1, Value* value2, Value* value3); // (1 * 2) - 3
Value* NegatedMulAdd(Value* value1, Value* value2,
Value* value3); // -((1 * 2) + 3)
Value* NegatedMulSub(Value* value1, Value* value2,
Value* value3); // -((1 * 2) - 3)
Value* Neg(Value* value);
Value* Abs(Value* value);
Value* Sqrt(Value* value);

View File

@ -248,9 +248,7 @@ enum Opcode {
OPCODE_MUL_HI, // TODO(benvanik): remove this and add INT128 type.
OPCODE_DIV,
OPCODE_MUL_ADD,
OPCODE_NEGATED_MUL_ADD,
OPCODE_MUL_SUB,
OPCODE_NEGATED_MUL_SUB,
OPCODE_NEG,
OPCODE_ABS,
OPCODE_SQRT,
@ -292,7 +290,10 @@ enum Opcode {
// as we already have OPCODE_ROUND. round double to float (
// ppc "single" fpu instruction result rounding behavior )
OPCODE_SET_NJM,
OPCODE_DELAY_EXECUTION, //for db16cyc
OPCODE_DELAY_EXECUTION, // for db16cyc
OPCODE_RESERVED_LOAD,
OPCODE_RESERVED_STORE,
__OPCODE_MAX_VALUE, // Keep at end.
};

View File

@ -218,7 +218,12 @@ DEFINE_OPCODE(
"context_barrier",
OPCODE_SIG_X,
0)
DEFINE_OPCODE(OPCODE_DELAY_EXECUTION, "delay_execution", OPCODE_SIG_X, 0)
DEFINE_OPCODE(
OPCODE_DELAY_EXECUTION,
"delay_execution",
OPCODE_SIG_X,
0)
DEFINE_OPCODE(
OPCODE_LOAD_MMIO,
"load_mmio",
@ -453,19 +458,6 @@ DEFINE_OPCODE(
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEGATED_MUL_ADD,
"negated_mul_add",
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEGATED_MUL_SUB,
"negated_mul_sub",
OPCODE_SIG_V_V_V_V,
OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)
DEFINE_OPCODE(
OPCODE_NEG,
"neg",
@ -719,3 +711,15 @@ DEFINE_OPCODE(
"storev_right",
OPCODE_SIG_X_V_V,
OPCODE_FLAG_MEMORY)
DEFINE_OPCODE(
OPCODE_RESERVED_LOAD,
"reserved_load",
OPCODE_SIG_V_V,
OPCODE_FLAG_MEMORY)
DEFINE_OPCODE(
OPCODE_RESERVED_STORE,
"reserved_store",
OPCODE_SIG_V_V_V,
OPCODE_FLAG_MEMORY)

View File

@ -185,7 +185,7 @@ bool MMIOHandler::TryDecodeLoadStore(const uint8_t* p,
uint8_t rex_b = rex & 0b0001;
uint8_t rex_x = rex & 0b0010;
uint8_t rex_r = rex & 0b0100;
//uint8_t rex_w = rex & 0b1000;
// uint8_t rex_w = rex & 0b1000;
// http://www.sandpile.org/x86/opc_rm.htm
// http://www.sandpile.org/x86/opc_sib.htm
@ -448,6 +448,7 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) {
if (cur_access != memory::PageAccess::kNoAccess &&
(!is_write || cur_access != memory::PageAccess::kReadOnly)) {
// Another thread has cleared this watch. Abort.
XELOGD("Race condition on watch, was already cleared by another thread!");
return true;
}
// The address is not found within any range, so either a write watch or an

View File

@ -1143,7 +1143,7 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
Value* b = f.VectorDenormFlush(f.LoadVR(vb));
Value* c = f.VectorDenormFlush(f.LoadVR(vc));
Value* v = f.NegatedMulSub(a, c, b);
Value* v = f.Neg(f.MulSub(a, c, b));
f.StoreVR(vd, v);
return 0;
}

View File

@ -195,8 +195,8 @@ int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] + frB)
Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
f.LoadFPR(i.A.FRB));
Value* v = f.Neg(
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -204,8 +204,8 @@ int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] + frB)
Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
f.LoadFPR(i.A.FRB));
Value* v = f.Neg(
f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
@ -214,8 +214,8 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] - frB)
Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
f.LoadFPR(i.A.FRB));
Value* v = f.Neg(
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
return 0;
@ -223,8 +223,8 @@ int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- -([frA x frC] - frB)
Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
f.LoadFPR(i.A.FRB));
Value* v = f.Neg(
f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
v = f.ToSingle(v);
f.StoreFPR(i.A.FRT, v);
f.UpdateFPSCR(v, i.A.Rc);
@ -444,13 +444,12 @@ int InstrEmit_fabsx(PPCHIRBuilder& f, const InstrData& i) {
f.StoreFPR(i.X.RT, v);
/*
The contents of frB with bit 0 cleared are placed into frD.
Note that the fabs instruction treats NaNs just like any other kind of value. That is, the sign
bit of a NaN may be altered by fabs. This instruction does not alter the FPSCR.
Other registers altered:
Condition Register (CR1 field):
Note that the fabs instruction treats NaNs just like any other kind of value.
That is, the sign bit of a NaN may be altered by fabs. This instruction does not
alter the FPSCR. Other registers altered: Condition Register (CR1 field):
Affected: FX, FEX, VX, OX (if Rc = 1)
*/
// f.UpdateFPSCR(v, i.X.Rc);
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
// todo
}
@ -469,9 +468,9 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- !abs(frB)
Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB)));
f.StoreFPR(i.X.RT, v);
//f.UpdateFPSCR(v, i.X.Rc);
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
//todo
// todo
}
return 0;
}
@ -480,9 +479,9 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- ¬ frB[0] || frB[1-63]
Value* v = f.Neg(f.LoadFPR(i.X.RB));
f.StoreFPR(i.X.RT, v);
//f.UpdateFPSCR(v, i.X.Rc);
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
//todo
// todo
}
return 0;
}

View File

@ -22,6 +22,12 @@ DEFINE_bool(
"instructions were written with the Xbox 360's cache in mind, and modern "
"processors do their own automatic prefetching.",
"CPU");
DEFINE_bool(no_reserved_ops, false,
"For testing whether a game has races when reserved load/store is "
"replaced with plain, non-reserved loads/stores.",
"CPU");
namespace xe {
namespace cpu {
namespace ppc {
@ -772,12 +778,17 @@ int InstrEmit_ldarx(PPCHIRBuilder& f, const InstrData& i) {
// already, but I haven't seen anything but interrupt callbacks (which are
// always under a global lock) do that yet.
// We issue a memory barrier here to make sure that we get good values.
f.MemoryBarrier();
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.Load(ea, INT64_TYPE));
f.StoreReserved(rt);
f.StoreGPR(i.X.RT, rt);
if (cvars::no_reserved_ops) {
f.StoreGPR(i.X.RT, f.ByteSwap(f.Load(ea, INT64_TYPE)));
} else {
f.MemoryBarrier();
Value* rt = f.ByteSwap(f.LoadWithReserve(ea, INT64_TYPE));
f.StoreGPR(i.X.RT, rt);
}
return 0;
}
@ -797,12 +808,19 @@ int InstrEmit_lwarx(PPCHIRBuilder& f, const InstrData& i) {
// already, but I haven't seen anything but interrupt callbacks (which are
// always under a global lock) do that yet.
// We issue a memory barrier here to make sure that we get good values.
f.MemoryBarrier();
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE);
f.StoreReserved(rt);
f.StoreGPR(i.X.RT, rt);
if (cvars::no_reserved_ops) {
f.StoreGPR(i.X.RT,
f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE));
} else {
f.MemoryBarrier();
Value* rt =
f.ZeroExtend(f.ByteSwap(f.LoadWithReserve(ea, INT32_TYPE)), INT64_TYPE);
f.StoreGPR(i.X.RT, rt);
}
return 0;
}
@ -826,17 +844,24 @@ int InstrEmit_stdcx(PPCHIRBuilder& f, const InstrData& i) {
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.LoadGPR(i.X.RT));
Value* res = f.ByteSwap(f.LoadReserved());
Value* v = f.AtomicCompareExchange(ea, res, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
if (cvars::no_reserved_ops) {
f.Store(ea, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
} else {
Value* v = f.StoreWithReserve(ea, rt, INT64_TYPE);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
}
f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());
// Issue memory barrier for when we go out of lock and want others to see our
// updates.
f.MemoryBarrier();
if (!cvars::no_reserved_ops) {
f.MemoryBarrier();
}
return 0;
}
@ -859,20 +884,29 @@ int InstrEmit_stwcx(PPCHIRBuilder& f, const InstrData& i) {
// This will always succeed if under the global lock, however.
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
Value* rt = f.ByteSwap(f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE));
Value* res = f.ByteSwap(f.Truncate(f.LoadReserved(), INT32_TYPE));
Value* v = f.AtomicCompareExchange(ea, res, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
if (cvars::no_reserved_ops) {
f.Store(ea, rt);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
} else {
Value* v = f.StoreWithReserve(ea, rt, INT32_TYPE);
f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
}
f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());
// Issue memory barrier for when we go out of lock and want others to see our
// updates.
f.MemoryBarrier();
if (!cvars::no_reserved_ops) {
f.MemoryBarrier();
}
return 0;
}
// Floating-point load (A-19)
int InstrEmit_lfd(PPCHIRBuilder& f, const InstrData& i) {