Merge pull request #149 from chrisps/canary_experimental
reimplement reserved load/store
Commit 26dc48f695
@@ -70,6 +70,9 @@ class X64HelperEmitter : public X64Emitter {
  void* EmitGuestAndHostSynchronizeStackSizeLoadThunk(
      void* sync_func, unsigned stack_element_size);

  void* EmitTryAcquireReservationHelper();
  void* EmitReservedStoreHelper(bool bit64 = false);

 private:
  void* EmitCurrentForOffsets(const _code_offsets& offsets,
                              size_t stack_size = 0);
@@ -226,6 +229,10 @@ bool X64Backend::Initialize(Processor* processor) {
        thunk_emitter.EmitGuestAndHostSynchronizeStackSizeLoadThunk(
            synchronize_guest_and_host_stack_helper_, 4);
  }
  try_acquire_reservation_helper_ =
      thunk_emitter.EmitTryAcquireReservationHelper();
  reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false);
  reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true);

  // Set the code cache to use the ResolveFunction thunk for default
  // indirections.
@@ -799,7 +806,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackHelper() {
  inc(ecx);
  jmp(checkbp, T_NEAR);
  L(we_good);
  //we're popping this return address, so go down by one
  // we're popping this return address, so go down by one
  sub(edx, sizeof(X64BackendStackpoint));
  dec(ecx);
  L(checkbp);
@@ -857,6 +864,125 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
  code_offsets.tail = getSize();
  return EmitCurrentForOffsets(code_offsets);
}

void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
  _code_offsets code_offsets = {};
  code_offsets.prolog = getSize();

  Xbyak::Label already_has_a_reservation;
  Xbyak::Label acquire_new_reservation;

  btr(GetBackendFlagsPtr(), 1);
  mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
  jc(already_has_a_reservation);

  shr(ecx, RESERVE_BLOCK_SHIFT);
  xor_(r9d, r9d);
  mov(edx, ecx);
  shr(edx, 6);  // divide by 64
  lea(rdx, ptr[r8 + rdx * 8]);
  and_(ecx, 64 - 1);

  lock();
  bts(qword[rdx], rcx);
  // set flag on local backend context for thread to indicate our previous
  // attempt to get the reservation succeeded
  setnc(r9b);  // success = bitmap did not have a set bit at the idx
  shl(r9b, 1);

  mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
      rdx);
  mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx);

  or_(GetBackendCtxPtr(offsetof(X64BackendContext, flags)), r9d);
  ret();
  L(already_has_a_reservation);
  DebugBreak();

  code_offsets.prolog_stack_alloc = getSize();
  code_offsets.body = getSize();
  code_offsets.epilog = getSize();
  code_offsets.tail = getSize();
  return EmitCurrentForOffsets(code_offsets);
}
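As a reading aid, here is a rough plain-C++ model of what the emitted EmitTryAcquireReservationHelper code computes (the function, constant, and parameter names below are illustrative, not from the tree): the guest address is reduced to a 64 KiB block index, that index selects one bit in ReserveHelper::blocks, and a locked bit-test-and-set either claims the block or reports that another reservation already holds it. The real helper additionally caches the chosen bitmap word address and bit index in the backend context for the later reserved store.

#include <atomic>
#include <cstdint>

// Illustrative mirrors of the emitter's constants (see RESERVE_BLOCK_SHIFT below).
constexpr uint32_t kReserveBlockShift = 16;    // 64 KiB reservation granules
constexpr uint32_t kGotReserveFlag = 1u << 1;  // "bit 1 = got reserve" in flags

// Hypothetical sketch of the helper's logic; the real version is emitted x86.
inline bool TryAcquireReservation(std::atomic<uint64_t>* blocks,
                                  uint32_t guest_addr, uint32_t* flags) {
  uint32_t block = guest_addr >> kReserveBlockShift;  // shr ecx, RESERVE_BLOCK_SHIFT
  uint32_t word = block >> 6;                         // shr edx, 6 (divide by 64)
  uint32_t bit = block & (64 - 1);                    // and_ ecx, 64 - 1
  // lock bts: set the bit and remember whether it was already set.
  uint64_t prev = blocks[word].fetch_or(1ull << bit, std::memory_order_acq_rel);
  bool got_it = (prev & (1ull << bit)) == 0;          // setnc r9b
  if (got_it) {
    *flags |= kGotReserveFlag;                        // or_ flags, r9d
  }
  return got_it;
}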
// ecx=guest addr
// r9 = host addr
// r8 = value
// if ZF is set and CF is set, we succeeded
void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) {
  _code_offsets code_offsets = {};
  code_offsets.prolog = getSize();
  Xbyak::Label done;
  Xbyak::Label reservation_isnt_for_our_addr;
  Xbyak::Label somehow_double_cleared;
  // carry must be set + zero flag must be set

  btr(GetBackendFlagsPtr(), 1);

  jnc(done);

  mov(rax, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));

  shr(ecx, RESERVE_BLOCK_SHIFT);
  mov(edx, ecx);
  shr(edx, 6);  // divide by 64
  lea(rdx, ptr[rax + rdx * 8]);
  // begin acquiring exclusive access to cacheline containing our bit
  prefetchw(ptr[rdx]);

  cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
      rdx);
  jnz(reservation_isnt_for_our_addr);

  mov(rax,
      GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)));

  // we need the offset modulo the bit size; it turns out the bit-test
  // instructions' modulus behavior for the bit offset only applies to register
  // operands, for memory operands we bug out
  // todo: actually, the above note may not be true, double check it
  and_(ecx, 64 - 1);
  cmp(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_bit)), ecx);
  jnz(reservation_isnt_for_our_addr);

  // was our memory modified by kernel code or something?
  lock();
  if (bit64) {
    cmpxchg(ptr[r9], r8);
  } else {
    cmpxchg(ptr[r9], r8d);
  }
  // the ZF flag is unaffected by BTR! we exploit this for the retval

  // cancel our lock on the 64K block
  lock();
  btr(qword[rdx], rcx);

  jnc(somehow_double_cleared);

  L(done);
  // we don't care that there's a dependency on the prev value of rax atm
  // sadly there's no CF&ZF condition code
  setz(al);
  setc(ah);
  cmp(ax, 0x0101);
  ret();

  // could be the same label, but otherwise we don't know where we came from
  // when one gets triggered
  L(reservation_isnt_for_our_addr);
  DebugBreak();

  L(somehow_double_cleared);  // somehow, something else cleared our reserve??
  DebugBreak();

  code_offsets.prolog_stack_alloc = getSize();
  code_offsets.body = getSize();
  code_offsets.epilog = getSize();
  code_offsets.tail = getSize();
  return EmitCurrentForOffsets(code_offsets);
}
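And a hedged plain-C++ model of the store helper's contract above (names invented for illustration; the real failure paths hit DebugBreak, and the result travels back in ZF/CF, which is why the sequences below test it with setz): the store is only attempted if this thread still holds a reservation whose cached bitmap word and bit match the store address, the new value is published only if memory still holds the value cached at reserved-load time, and the block bit is released afterwards.

#include <atomic>
#include <cstdint>

// Hypothetical model of EmitReservedStoreHelper; T is uint32_t or uint64_t
// depending on the bit64 variant.
template <typename T>
bool ReservedStoreModel(std::atomic<uint64_t>* blocks, uint64_t cached_word_addr,
                        uint32_t cached_bit, T cached_value,
                        std::atomic<T>* host_ptr, T new_value,
                        uint32_t guest_addr, uint32_t* flags) {
  if (!(*flags & (1u << 1))) return false;  // btr flags, 1; jnc done
  *flags &= ~(1u << 1);                     // the reservation is consumed either way
  uint32_t block = guest_addr >> 16;        // shr ecx, RESERVE_BLOCK_SHIFT
  std::atomic<uint64_t>* word = &blocks[block >> 6];
  uint32_t bit = block & (64 - 1);
  if (reinterpret_cast<uint64_t>(word) != cached_word_addr) return false;
  if (bit != cached_bit) return false;      // reservation was for another address
  // lock cmpxchg: store only if memory still equals the reserved-load value.
  bool stored = host_ptr->compare_exchange_strong(cached_value, new_value);
  word->fetch_and(~(1ull << bit));          // lock btr: release the block bit
  return stored;
}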
void X64HelperEmitter::EmitSaveVolatileRegs() {
  // Save off volatile registers.
  // mov(qword[rsp + offsetof(StackLayout::Thunk, r[0])], rax);

@@ -975,6 +1101,7 @@ void X64Backend::InitializeBackendContext(void* ctx) {
  // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
  bctx->Ox1000 = 0x1000;
  bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
  bctx->reserve_helper_ = &reserve_helper_;
}
void X64Backend::DeinitializeBackendContext(void* ctx) {
  X64BackendContext* bctx = BackendContextForGuestContext(ctx);
@@ -42,6 +42,17 @@ typedef void* (*HostToGuestThunk)(void* target, void* arg0, void* arg1);
typedef void* (*GuestToHostThunk)(void* target, void* arg0, void* arg1);
typedef void (*ResolveFunctionThunk)();

#define RESERVE_BLOCK_SHIFT 16

#define RESERVE_NUM_ENTRIES \
  ((1024ULL * 1024ULL * 1024ULL * 4ULL) >> RESERVE_BLOCK_SHIFT)
// https://codalogic.com/blog/2022/12/06/Exploring-PowerPCs-read-modify-write-operations
struct ReserveHelper {
  uint64_t blocks[RESERVE_NUM_ENTRIES / 64];

  ReserveHelper() { memset(blocks, 0, sizeof(blocks)); }
};

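For scale, RESERVE_BLOCK_SHIFT of 16 gives 64 KiB reservation granules, so the 4 GiB guest address space needs 4 GiB >> 16 = 65536 bits, i.e. 1024 uint64_t words and an 8 KiB bitmap per ReserveHelper. A self-contained compile-time check of that arithmetic (the constants are local copies of the macros above):

#include <cstdint>

constexpr uint64_t kReserveBlockShift = 16;
constexpr uint64_t kReserveNumEntries =
    (1024ull * 1024ull * 1024ull * 4ull) >> kReserveBlockShift;

static_assert(kReserveNumEntries == 65536,
              "one bit per 64 KiB block of the 4 GiB guest space");
static_assert(kReserveNumEntries / 64 == 1024,
              "1024 qwords in ReserveHelper::blocks");
static_assert(sizeof(uint64_t) * (kReserveNumEntries / 64) == 8192,
              "the whole reservation bitmap is 8 KiB");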
struct X64BackendStackpoint {
  uint64_t host_stack_;
  unsigned guest_stack_;
@@ -55,16 +66,21 @@ struct X64BackendStackpoint {
// context (somehow placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
  ReserveHelper* reserve_helper_;
  uint64_t cached_reserve_value_;
  // guest_tick_count is used if inline_loadclock is used
  uint64_t* guest_tick_count;
  // records mapping of host_stack to guest_stack
  X64BackendStackpoint* stackpoints;

  uint64_t cached_reserve_offset;
  uint32_t cached_reserve_bit;
  unsigned int current_stackpoint_depth;
  unsigned int mxcsr_fpu;  // currently, the way we implement rounding mode
                           // affects both vmx and the fpu
  unsigned int mxcsr_vmx;
  unsigned int flags;  // bit 0 = 0 if mxcsr is fpu, else it is vmx
  // bit 0 = 0 if mxcsr is fpu, else it is vmx
  // bit 1 = got reserve
  unsigned int flags;
  unsigned int Ox1000;  // constant 0x1000 so we can shrink each tail emitted
                        // add of it by... 2 bytes lol
};
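The two flag bits called out in the comments are what the emitted helpers poke with btr(..., 1) and or_(..., r9d). If they were given names, the layout would look like this small sketch (the identifiers are illustrative; the source uses raw bit positions):

#include <cstdint>

// Illustrative names for the X64BackendContext::flags bits described above.
constexpr uint32_t kBackendFlagMxcsrIsVmx = 1u << 0;  // bit 0: which unit mxcsr was last set for
constexpr uint32_t kBackendFlagGotReserve = 1u << 1;  // bit 1: this thread holds a reservation

inline bool HasReservation(uint32_t flags) {
  return (flags & kBackendFlagGotReserve) != 0;
}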
@@ -152,9 +168,18 @@ class X64Backend : public Backend {
  void* synchronize_guest_and_host_stack_helper_size8_ = nullptr;
  void* synchronize_guest_and_host_stack_helper_size16_ = nullptr;
  void* synchronize_guest_and_host_stack_helper_size32_ = nullptr;

 public:
  void* try_acquire_reservation_helper_ = nullptr;
  void* reserved_store_32_helper = nullptr;
  void* reserved_store_64_helper = nullptr;

 private:
#if XE_X64_PROFILER_AVAILABLE == 1
  GuestProfilerData profiler_data_;
#endif

  alignas(64) ReserveHelper reserve_helper_;
};

}  // namespace x64
@@ -387,7 +387,6 @@ struct LVL_V128 : Sequence<LVL_V128, I<OPCODE_LVL, V128Op, I64Op>> {
};
EMITTER_OPCODE_TABLE(OPCODE_LVL, LVL_V128);

struct LVR_V128 : Sequence<LVR_V128, I<OPCODE_LVR, V128Op, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    Xbyak::Label endpoint{};

@@ -483,6 +482,84 @@ struct STVR_V128 : Sequence<STVR_V128, I<OPCODE_STVR, VoidOp, I64Op, V128Op>> {
  }
};
EMITTER_OPCODE_TABLE(OPCODE_STVR, STVR_V128);

struct RESERVED_LOAD_INT32
    : Sequence<RESERVED_LOAD_INT32, I<OPCODE_RESERVED_LOAD, I32Op, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    // should use phys addrs, not virtual addrs!

    // try_acquire_reservation_helper_ doesn't clobber rax
    e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]);
    // begin acquiring exclusive access to the location
    // we will do a load first, but we'll need exclusive access once we do our
    // atomic op in the store
    e.prefetchw(e.ptr[e.rax]);
    e.mov(e.ecx, i.src1.reg().cvt32());
    e.call(e.backend()->try_acquire_reservation_helper_);
    e.mov(i.dest, e.dword[e.rax]);

    e.mov(
        e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)),
        i.dest.reg().cvt64());
  }
};

struct RESERVED_LOAD_INT64
    : Sequence<RESERVED_LOAD_INT64, I<OPCODE_RESERVED_LOAD, I64Op, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    // try_acquire_reservation_helper_ doesn't clobber rax
    e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]);
    e.mov(e.ecx, i.src1.reg().cvt32());
    // begin acquiring exclusive access to the location
    // we will do a load first, but we'll need exclusive access once we do our
    // atomic op in the store
    e.prefetchw(e.ptr[e.rax]);

    e.call(e.backend()->try_acquire_reservation_helper_);
    e.mov(i.dest, e.qword[ComputeMemoryAddress(e, i.src1)]);

    e.mov(
        e.GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_value_)),
        i.dest.reg());
  }
};

EMITTER_OPCODE_TABLE(OPCODE_RESERVED_LOAD, RESERVED_LOAD_INT32,
                     RESERVED_LOAD_INT64);

// address, value

struct RESERVED_STORE_INT32
    : Sequence<RESERVED_STORE_INT32,
               I<OPCODE_RESERVED_STORE, I8Op, I64Op, I32Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    // ecx = guest addr
    // r9 = host addr
    // r8 = value
    // if ZF is set and CF is set, we succeeded
    e.mov(e.ecx, i.src1.reg().cvt32());
    e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]);
    e.mov(e.r8d, i.src2);
    e.call(e.backend()->reserved_store_32_helper);
    e.setz(i.dest);
  }
};

struct RESERVED_STORE_INT64
    : Sequence<RESERVED_STORE_INT64,
               I<OPCODE_RESERVED_STORE, I8Op, I64Op, I64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    e.mov(e.ecx, i.src1.reg().cvt32());
    e.lea(e.r9, e.ptr[ComputeMemoryAddress(e, i.src1)]);
    e.mov(e.r8, i.src2);
    e.call(e.backend()->reserved_store_64_helper);
    e.setz(i.dest);
  }
};

EMITTER_OPCODE_TABLE(OPCODE_RESERVED_STORE, RESERVED_STORE_INT32,
                     RESERVED_STORE_INT64);
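To see how a guest load-reserved/store-conditional pair flows through these two opcodes, here is a hedged, self-contained C++ simulation of the retry loop that a lwarx/addi/stwcx./bne- sequence amounts to (everything below is an illustrative stand-in for the HIR opcodes and x64 helpers above, operating on one ordinary host cell):

#include <atomic>
#include <cstdint>

// A toy simulation of the reserved load/store pair on ordinary host memory,
// standing in for OPCODE_RESERVED_LOAD / OPCODE_RESERVED_STORE (names invented).
static std::atomic<uint32_t> g_cell{0};
static uint32_t g_reserved_value = 0;

uint32_t ReservedLoad32() {             // lwarx-like: load and remember the value
  g_reserved_value = g_cell.load();
  return g_reserved_value;
}

bool ReservedStore32(uint32_t value) {  // stwcx.-like: succeed only if unchanged
  uint32_t expected = g_reserved_value;
  return g_cell.compare_exchange_strong(expected, value);
}

// The retry loop a guest "lwarx ; addi ; stwcx. ; bne-" sequence amounts to:
uint32_t GuestAtomicIncrement() {
  uint32_t new_value;
  do {
    new_value = ReservedLoad32() + 1;
  } while (!ReservedStore32(new_value));
  return new_value;
}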
// ============================================================================
// OPCODE_ATOMIC_COMPARE_EXCHANGE
// ============================================================================
@@ -1018,8 +1018,7 @@ struct COMPARE_EQ_F32
    e.ChangeMxcsrMode(MXCSRMode::Fpu);
    if (!HasPrecedingCmpOfSameValues(i.instr)) {
      EmitCommutativeBinaryXmmOp(
          e, i,
          [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
          e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
            e.vcomiss(src1, src2);
          });
    }

@@ -1032,8 +1031,7 @@ struct COMPARE_EQ_F64
    e.ChangeMxcsrMode(MXCSRMode::Fpu);
    if (!HasPrecedingCmpOfSameValues(i.instr)) {
      EmitCommutativeBinaryXmmOp(
          e, i,
          [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
          e, i, [](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) {
            e.vcomisd(src1, src2);
          });
    }
@@ -1935,53 +1933,6 @@ struct MUL_ADD_V128
};
EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128);

struct NEGATED_MUL_ADD_F64
    : Sequence<NEGATED_MUL_ADD_F64,
               I<OPCODE_NEGATED_MUL_ADD, F64Op, F64Op, F64Op, F64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    e.ChangeMxcsrMode(MXCSRMode::Fpu);

    Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
    Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
    Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
    if (e.IsFeatureEnabled(kX64EmitFMA)) {
      // todo: this is garbage
      e.vmovapd(e.xmm3, src1);
      e.vfmadd213sd(e.xmm3, src2, src3);
      e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
    } else {
      // todo: might need to use x87 in this case...
      e.vmulsd(e.xmm3, src1, src2);
      e.vaddsd(i.dest, e.xmm3, src3);
      e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
    }
  }
};
struct NEGATED_MUL_ADD_V128
    : Sequence<NEGATED_MUL_ADD_V128,
               I<OPCODE_NEGATED_MUL_ADD, V128Op, V128Op, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    e.ChangeMxcsrMode(MXCSRMode::Vmx);

    Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
    Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
    Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
    if (e.IsFeatureEnabled(kX64EmitFMA)) {
      // todo: this is garbage
      e.vmovaps(e.xmm3, src1);
      e.vfmadd213ps(e.xmm3, src2, src3);
      e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
    } else {
      // todo: might need to use x87 in this case...
      e.vmulps(e.xmm3, src1, src2);
      e.vaddps(i.dest, e.xmm3, src3);
      e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
    }
  }
};
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_ADD, NEGATED_MUL_ADD_F64,
                     NEGATED_MUL_ADD_V128);

// ============================================================================
// OPCODE_MUL_SUB
// ============================================================================
@@ -2038,53 +1989,6 @@ struct MUL_SUB_V128
};
EMITTER_OPCODE_TABLE(OPCODE_MUL_SUB, MUL_SUB_F64, MUL_SUB_V128);

struct NEGATED_MUL_SUB_F64
    : Sequence<NEGATED_MUL_SUB_F64,
               I<OPCODE_NEGATED_MUL_SUB, F64Op, F64Op, F64Op, F64Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    e.ChangeMxcsrMode(MXCSRMode::Fpu);

    Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
    Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
    Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
    if (e.IsFeatureEnabled(kX64EmitFMA)) {
      // todo: this is garbage
      e.vmovapd(e.xmm3, src1);
      e.vfmsub213sd(e.xmm3, src2, src3);
      e.vxorpd(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPD));
    } else {
      // todo: might need to use x87 in this case...
      e.vmulsd(e.xmm3, src1, src2);
      e.vsubsd(i.dest, e.xmm3, src3);
      e.vxorpd(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPD));
    }
  }
};
struct NEGATED_MUL_SUB_V128
    : Sequence<NEGATED_MUL_SUB_V128,
               I<OPCODE_NEGATED_MUL_SUB, V128Op, V128Op, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    e.ChangeMxcsrMode(MXCSRMode::Vmx);

    Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
    Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
    Xmm src3 = GetInputRegOrConstant(e, i.src3, e.xmm2);
    if (e.IsFeatureEnabled(kX64EmitFMA)) {
      // todo: this is garbage
      e.vmovaps(e.xmm3, src1);
      e.vfmsub213ps(e.xmm3, src2, src3);
      e.vxorps(i.dest, e.xmm3, e.GetXmmConstPtr(XMMSignMaskPS));
    } else {
      // todo: might need to use x87 in this case...
      e.vmulps(e.xmm3, src1, src2);
      e.vsubps(i.dest, e.xmm3, src3);
      e.vxorps(i.dest, i.dest, e.GetXmmConstPtr(XMMSignMaskPS));
    }
  }
};
EMITTER_OPCODE_TABLE(OPCODE_NEGATED_MUL_SUB, NEGATED_MUL_SUB_F64,
                     NEGATED_MUL_SUB_V128);

// ============================================================================
// OPCODE_NEG
// ============================================================================
@@ -2641,7 +2545,8 @@ void EmitAndNotXX(X64Emitter& e, const ARGS& i) {
      // src1 constant.
      // `and` instruction only supports up to 32-bit immediate constants
      // 64-bit constants will need a temp register
      //only possible with 64 bit inputs, andc is the only instruction that generates this
      // only possible with 64 bit inputs, andc is the only instruction that
      // generates this
      auto temp = GetTempReg<typename decltype(i.src1)::reg_type>(e);
      e.mov(temp, i.src1.constant());
@@ -1281,6 +1281,25 @@ Value* HIRBuilder::Load(Value* address, TypeName type, uint32_t load_flags) {
  return i->dest;
}

Value* HIRBuilder::LoadWithReserve(Value* address, TypeName type) {
  ASSERT_ADDRESS_TYPE(address);

  Instr* i = AppendInstr(OPCODE_RESERVED_LOAD_info, 0, AllocValue(type));
  i->set_src1(address);
  i->src2.value = i->src3.value = NULL;

  return i->dest;
}

Value* HIRBuilder::StoreWithReserve(Value* address, Value* value,
                                    TypeName type) {
  ASSERT_ADDRESS_TYPE(address);
  Instr* i = AppendInstr(OPCODE_RESERVED_STORE_info, 0, AllocValue(INT8_TYPE));
  i->set_src1(address);
  i->set_src2(value);
  i->src3.value = NULL;
  return i->dest;
}
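A hedged usage sketch of the two new builder calls, roughly what an lwarx/stwcx. pair lowers to at the HIR level; the include path, the Add/LoadConstantInt32 helpers, and the surrounding function are assumptions for illustration:

#include "xenia/cpu/hir/hir_builder.h"  // assumed include path

using xe::cpu::hir::HIRBuilder;
using xe::cpu::hir::INT32_TYPE;
using xe::cpu::hir::Value;

// Illustrative only: emit a reserved load/store pair through the new API.
Value* EmitReservedIncrementSketch(HIRBuilder& b, Value* ea) {
  Value* old_value = b.LoadWithReserve(ea, INT32_TYPE);  // OPCODE_RESERVED_LOAD
  Value* new_value = b.Add(old_value, b.LoadConstantInt32(1));
  // Returns an INT8 value that is nonzero when the conditional store succeeded.
  return b.StoreWithReserve(ea, new_value, INT32_TYPE);  // OPCODE_RESERVED_STORE
}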
void HIRBuilder::Store(Value* address, Value* value, uint32_t store_flags) {
  ASSERT_ADDRESS_TYPE(address);
  Instr* i = AppendInstr(OPCODE_STORE_info, store_flags);
@@ -1739,30 +1758,6 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
  return i->dest;
}

Value* HIRBuilder::NegatedMulAdd(Value* value1, Value* value2, Value* value3) {
  ASSERT_TYPES_EQUAL(value1, value2);
  ASSERT_TYPES_EQUAL(value1, value3);

  Instr* i =
      AppendInstr(OPCODE_NEGATED_MUL_ADD_info, 0, AllocValue(value1->type));
  i->set_src1(value1);
  i->set_src2(value2);
  i->set_src3(value3);
  return i->dest;
}

Value* HIRBuilder::NegatedMulSub(Value* value1, Value* value2, Value* value3) {
  ASSERT_TYPES_EQUAL(value1, value2);
  ASSERT_TYPES_EQUAL(value1, value3);

  Instr* i =
      AppendInstr(OPCODE_NEGATED_MUL_SUB_info, 0, AllocValue(value1->type));
  i->set_src1(value1);
  i->set_src2(value2);
  i->set_src3(value3);
  return i->dest;
}

Value* HIRBuilder::Neg(Value* value) {
  Instr* i = AppendInstr(OPCODE_NEG_info, 0, AllocValue(value->type));
  i->set_src1(value);
@@ -189,6 +189,9 @@ class HIRBuilder {
                uint32_t store_flags = 0);

  Value* Load(Value* address, TypeName type, uint32_t load_flags = 0);
  // create a reservation on an address
  Value* LoadWithReserve(Value* address, TypeName type);
  Value* StoreWithReserve(Value* address, Value* value, TypeName type);

  Value* LoadVectorLeft(Value* address);
  Value* LoadVectorRight(Value* address);

@@ -242,10 +245,7 @@ class HIRBuilder {
  Value* Div(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
  Value* MulAdd(Value* value1, Value* value2, Value* value3);  // (1 * 2) + 3
  Value* MulSub(Value* value1, Value* value2, Value* value3);  // (1 * 2) - 3
  Value* NegatedMulAdd(Value* value1, Value* value2,
                       Value* value3);  // -((1 * 2) + 3)
  Value* NegatedMulSub(Value* value1, Value* value2,
                       Value* value3);  // -((1 * 2) - 3)

  Value* Neg(Value* value);
  Value* Abs(Value* value);
  Value* Sqrt(Value* value);
@@ -248,9 +248,7 @@ enum Opcode {
  OPCODE_MUL_HI,  // TODO(benvanik): remove this and add INT128 type.
  OPCODE_DIV,
  OPCODE_MUL_ADD,
  OPCODE_NEGATED_MUL_ADD,
  OPCODE_MUL_SUB,
  OPCODE_NEGATED_MUL_SUB,
  OPCODE_NEG,
  OPCODE_ABS,
  OPCODE_SQRT,

@@ -292,7 +290,10 @@ enum Opcode {
  // as we already have OPCODE_ROUND. round double to float (
  // ppc "single" fpu instruction result rounding behavior )
  OPCODE_SET_NJM,
  OPCODE_DELAY_EXECUTION,  //for db16cyc
  OPCODE_DELAY_EXECUTION,  // for db16cyc
  OPCODE_RESERVED_LOAD,
  OPCODE_RESERVED_STORE,

  __OPCODE_MAX_VALUE,  // Keep at end.
};
@@ -218,7 +218,12 @@ DEFINE_OPCODE(
    "context_barrier",
    OPCODE_SIG_X,
    0)
DEFINE_OPCODE(OPCODE_DELAY_EXECUTION, "delay_execution", OPCODE_SIG_X, 0)

DEFINE_OPCODE(
    OPCODE_DELAY_EXECUTION,
    "delay_execution",
    OPCODE_SIG_X,
    0)
DEFINE_OPCODE(
    OPCODE_LOAD_MMIO,
    "load_mmio",

@@ -453,19 +458,6 @@ DEFINE_OPCODE(
    OPCODE_SIG_V_V_V_V,
    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)

DEFINE_OPCODE(
    OPCODE_NEGATED_MUL_ADD,
    "negated_mul_add",
    OPCODE_SIG_V_V_V_V,
    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)

DEFINE_OPCODE(
    OPCODE_NEGATED_MUL_SUB,
    "negated_mul_sub",
    OPCODE_SIG_V_V_V_V,
    OPCODE_FLAG_DISALLOW_CONSTANT_FOLDING)


DEFINE_OPCODE(
    OPCODE_NEG,
    "neg",

@@ -719,3 +711,15 @@ DEFINE_OPCODE(
    "storev_right",
    OPCODE_SIG_X_V_V,
    OPCODE_FLAG_MEMORY)

DEFINE_OPCODE(
    OPCODE_RESERVED_LOAD,
    "reserved_load",
    OPCODE_SIG_V_V,
    OPCODE_FLAG_MEMORY)

DEFINE_OPCODE(
    OPCODE_RESERVED_STORE,
    "reserved_store",
    OPCODE_SIG_V_V_V,
    OPCODE_FLAG_MEMORY)
@@ -185,7 +185,7 @@ bool MMIOHandler::TryDecodeLoadStore(const uint8_t* p,
    uint8_t rex_b = rex & 0b0001;
    uint8_t rex_x = rex & 0b0010;
    uint8_t rex_r = rex & 0b0100;
    //uint8_t rex_w = rex & 0b1000;
    // uint8_t rex_w = rex & 0b1000;

    // http://www.sandpile.org/x86/opc_rm.htm
    // http://www.sandpile.org/x86/opc_sib.htm

@@ -448,6 +448,7 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) {
  if (cur_access != memory::PageAccess::kNoAccess &&
      (!is_write || cur_access != memory::PageAccess::kReadOnly)) {
    // Another thread has cleared this watch. Abort.
    XELOGD("Race condition on watch, was already cleared by another thread!");
    return true;
  }
  // The address is not found within any range, so either a write watch or an
@@ -1143,7 +1143,7 @@ int InstrEmit_vnmsubfp_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb,
  Value* b = f.VectorDenormFlush(f.LoadVR(vb));
  Value* c = f.VectorDenormFlush(f.LoadVR(vc));

  Value* v = f.NegatedMulSub(a, c, b);
  Value* v = f.Neg(f.MulSub(a, c, b));
  f.StoreVR(vd, v);
  return 0;
}
@@ -195,8 +195,8 @@ int InstrEmit_fmsubsx(PPCHIRBuilder& f, const InstrData& i) {

int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- -([frA x frC] + frB)
  Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
                             f.LoadFPR(i.A.FRB));
  Value* v = f.Neg(
      f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;

@@ -204,8 +204,8 @@ int InstrEmit_fnmaddx(PPCHIRBuilder& f, const InstrData& i) {

int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- -([frA x frC] + frB)
  Value* v = f.NegatedMulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
                             f.LoadFPR(i.A.FRB));
  Value* v = f.Neg(
      f.MulAdd(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);

@@ -214,8 +214,8 @@ int InstrEmit_fnmaddsx(PPCHIRBuilder& f, const InstrData& i) {

int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- -([frA x frC] - frB)
  Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
                             f.LoadFPR(i.A.FRB));
  Value* v = f.Neg(
      f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
  return 0;

@@ -223,8 +223,8 @@ int InstrEmit_fnmsubx(PPCHIRBuilder& f, const InstrData& i) {

int InstrEmit_fnmsubsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- -([frA x frC] - frB)
  Value* v = f.NegatedMulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC),
                             f.LoadFPR(i.A.FRB));
  Value* v = f.Neg(
      f.MulSub(f.LoadFPR(i.A.FRA), f.LoadFPR(i.A.FRC), f.LoadFPR(i.A.FRB)));
  v = f.ToSingle(v);
  f.StoreFPR(i.A.FRT, v);
  f.UpdateFPSCR(v, i.A.Rc);
@@ -444,13 +444,12 @@ int InstrEmit_fabsx(PPCHIRBuilder& f, const InstrData& i) {
  f.StoreFPR(i.X.RT, v);
  /*
The contents of frB with bit 0 cleared are placed into frD.
Note that the fabs instruction treats NaNs just like any other kind of value. That is, the sign
bit of a NaN may be altered by fabs. This instruction does not alter the FPSCR.
Other registers altered:
• Condition Register (CR1 field):
Note that the fabs instruction treats NaNs just like any other kind of value.
That is, the sign bit of a NaN may be altered by fabs. This instruction does not
alter the FPSCR. Other registers altered: • Condition Register (CR1 field):
Affected: FX, FEX, VX, OX (if Rc = 1)
  */
  // f.UpdateFPSCR(v, i.X.Rc);
  // f.UpdateFPSCR(v, i.X.Rc);
  if (i.X.Rc) {
    // todo
  }

@@ -469,9 +468,9 @@ int InstrEmit_fnabsx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- !abs(frB)
  Value* v = f.Neg(f.Abs(f.LoadFPR(i.X.RB)));
  f.StoreFPR(i.X.RT, v);
  //f.UpdateFPSCR(v, i.X.Rc);
  // f.UpdateFPSCR(v, i.X.Rc);
  if (i.X.Rc) {
    //todo
    // todo
  }
  return 0;
}

@@ -480,9 +479,9 @@ int InstrEmit_fnegx(PPCHIRBuilder& f, const InstrData& i) {
  // frD <- ¬ frB[0] || frB[1-63]
  Value* v = f.Neg(f.LoadFPR(i.X.RB));
  f.StoreFPR(i.X.RT, v);
  //f.UpdateFPSCR(v, i.X.Rc);
  // f.UpdateFPSCR(v, i.X.Rc);
  if (i.X.Rc) {
    //todo
    // todo
  }
  return 0;
}
@@ -22,6 +22,12 @@ DEFINE_bool(
    "instructions were written with the Xbox 360's cache in mind, and modern "
    "processors do their own automatic prefetching.",
    "CPU");

DEFINE_bool(no_reserved_ops, false,
            "For testing whether a game may have races with a broken reserved "
            "load/store impl",
            "CPU");

namespace xe {
namespace cpu {
namespace ppc {
@@ -772,12 +778,17 @@ int InstrEmit_ldarx(PPCHIRBuilder& f, const InstrData& i) {
  // already, but I haven't seen anything but interrupt callbacks (which are
  // always under a global lock) do that yet.
  // We issue a memory barrier here to make sure that we get good values.
  f.MemoryBarrier();

  Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
  Value* rt = f.ByteSwap(f.Load(ea, INT64_TYPE));
  f.StoreReserved(rt);
  f.StoreGPR(i.X.RT, rt);

  if (cvars::no_reserved_ops) {
    f.StoreGPR(i.X.RT, f.ByteSwap(f.Load(ea, INT64_TYPE)));
  } else {
    f.MemoryBarrier();

    Value* rt = f.ByteSwap(f.LoadWithReserve(ea, INT64_TYPE));
    f.StoreGPR(i.X.RT, rt);
  }
  return 0;
}
@@ -797,12 +808,19 @@ int InstrEmit_lwarx(PPCHIRBuilder& f, const InstrData& i) {
  // already, but I haven't seen anything but interrupt callbacks (which are
  // always under a global lock) do that yet.
  // We issue a memory barrier here to make sure that we get good values.
  f.MemoryBarrier();

  Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
  Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE);
  f.StoreReserved(rt);
  f.StoreGPR(i.X.RT, rt);
  if (cvars::no_reserved_ops) {
    f.StoreGPR(i.X.RT,
               f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE));
  } else {
    f.MemoryBarrier();

    Value* rt =
        f.ZeroExtend(f.ByteSwap(f.LoadWithReserve(ea, INT32_TYPE)), INT64_TYPE);
    f.StoreGPR(i.X.RT, rt);
  }
  return 0;
}
@@ -826,17 +844,24 @@ int InstrEmit_stdcx(PPCHIRBuilder& f, const InstrData& i) {

  Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);
  Value* rt = f.ByteSwap(f.LoadGPR(i.X.RT));
  Value* res = f.ByteSwap(f.LoadReserved());
  Value* v = f.AtomicCompareExchange(ea, res, rt);
  f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);

  if (cvars::no_reserved_ops) {
    f.Store(ea, rt);

    f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
  } else {
    Value* v = f.StoreWithReserve(ea, rt, INT64_TYPE);

    f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
  }
  f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
  f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());

  // Issue memory barrier for when we go out of lock and want others to see our
  // updates.

  f.MemoryBarrier();

  if (!cvars::no_reserved_ops) {
    f.MemoryBarrier();
  }
  return 0;
}
@@ -859,20 +884,29 @@ int InstrEmit_stwcx(PPCHIRBuilder& f, const InstrData& i) {
  // This will always succeed if under the global lock, however.

  Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB);

  Value* rt = f.ByteSwap(f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE));
  Value* res = f.ByteSwap(f.Truncate(f.LoadReserved(), INT32_TYPE));
  Value* v = f.AtomicCompareExchange(ea, res, rt);
  f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);

  if (cvars::no_reserved_ops) {
    f.Store(ea, rt);

    f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1));
  } else {
    Value* v = f.StoreWithReserve(ea, rt, INT64_TYPE);
    f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v);
  }

  f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8());
  f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8());

  // Issue memory barrier for when we go out of lock and want others to see our
  // updates.
  f.MemoryBarrier();
  if (!cvars::no_reserved_ops) {
    f.MemoryBarrier();
  }

  return 0;
}
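For readers coming from the PPC side, here is a hedged C-level model of what the reserved path of stwcx./stdcx. now writes into cr0 (the struct and function are illustrative stand-ins for PPCContext and the emitted stores above):

#include <cstdint>

// Illustrative stand-in for the cr0 field of PPCContext.
struct Cr0Model {
  uint8_t cr0_lt, cr0_gt, cr0_eq;
};

// What the reserved path of stwcx./stdcx. writes into cr0, given the i8
// success value produced by OPCODE_RESERVED_STORE.
inline void StoreConditionalUpdateCr0(Cr0Model* cr0, bool store_succeeded) {
  cr0->cr0_eq = store_succeeded ? 1 : 0;  // eq = 1 only if the store was performed
  cr0->cr0_lt = 0;
  cr0->cr0_gt = 0;
  // a memory barrier follows in the emitted code so other threads see the update
}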
// Floating-point load (A-19)

int InstrEmit_lfd(PPCHIRBuilder& f, const InstrData& i) {