Add constant folding for LVR when the address is 16-byte aligned; clean up the prior commit by removing dead test code for the LVR/LVL/STVL/STVR opcodes and the legacy HIR sequences

Add a delay via _mm_pause in KeAcquireSpinLockAtRaisedIrql_entry; a huge amount of time is spent spinning in Halo 3
chss95cs@gmail.com 2022-09-04 11:44:29 -07:00 committed by illusion0001
parent d372d8d5e3
commit 0c576877c8
4 changed files with 22 additions and 408 deletions

View File

@@ -360,24 +360,6 @@ EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_EXCHANGE, ATOMIC_EXCHANGE_I8,
ATOMIC_EXCHANGE_I16, ATOMIC_EXCHANGE_I32,
ATOMIC_EXCHANGE_I64);
static __m128i callnativesafe_lvl(void* ctx, void* addr) {
uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
uintptr_t bad_offs = uaddr & 0xf;
uaddr &= ~0xfULL;
__m128i tempload = _mm_loadu_si128((const __m128i*)uaddr);
__m128i badhelper =
_mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
__m128i tmpshuf = _mm_add_epi8(badhelper, _mm_set1_epi8((char)bad_offs));
tmpshuf = _mm_or_si128(tmpshuf, _mm_cmpgt_epi8(tmpshuf, _mm_set1_epi8(15)));
return _mm_shuffle_epi8(tempload, tmpshuf);
}
struct LVL_V128 : Sequence<LVL_V128, I<OPCODE_LVL, V128Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.mov(e.edx, 0xf);
@@ -405,25 +387,6 @@ struct LVL_V128 : Sequence<LVL_V128, I<OPCODE_LVL, V128Op, I64Op>> {
};
EMITTER_OPCODE_TABLE(OPCODE_LVL, LVL_V128);
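// (editorial sketch, not part of the commit) Scalar model of what lvlx
// loads: the bytes from EA through the end of its 16-byte line fill the
// high-order end of the result and the rest is zero. Big-endian byte
// numbering; the guest byte swap the emitter performs is ignored here.
static void lvlx_model(const uint8_t* mem, uint64_t ea, uint8_t out[16]) {
uint64_t eb = ea & 0xF;  // misalignment within the 16-byte line
for (uint64_t i = 0; i < 16; ++i) {
out[i] = (i + eb < 16) ? mem[ea + i] : 0;  // never crosses the line
}
}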
static __m128i callnativesafe_lvr(void* ctx, void* addr) {
uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
uintptr_t bad_offs = uaddr & 0xf;
if (!bad_offs) {
return _mm_setzero_si128();
}
uaddr &= ~0xfULL;
__m128i tempload = _mm_loadu_si128((const __m128i*)uaddr);
__m128i badhelper =
_mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
__m128i tmpshuf = _mm_add_epi8(badhelper, _mm_set1_epi8((char)bad_offs));
tmpshuf = _mm_or_si128(tmpshuf, _mm_cmplt_epi8(tmpshuf, _mm_set1_epi8(16)));
return _mm_shuffle_epi8(tempload, tmpshuf);
}
struct LVR_V128 : Sequence<LVR_V128, I<OPCODE_LVR, V128Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
@@ -457,181 +420,8 @@ struct LVR_V128 : Sequence<LVR_V128, I<OPCODE_LVR, V128Op, I64Op>> {
};
EMITTER_OPCODE_TABLE(OPCODE_LVR, LVR_V128);
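// (editorial sketch, not part of the commit) Scalar model of lvrx: the eb
// bytes of the 16-byte line below EA fill the low-order end of the result.
// When EA is 16-byte aligned, eb == 0 and the result is all zeros -- the
// exact case the new constant fold in ConstantPropagationPass exploits.
static void lvrx_model(const uint8_t* mem, uint64_t ea, uint8_t out[16]) {
uint64_t eb = ea & 0xF;
for (uint64_t i = 0; i < 16; ++i) {
out[i] = (i >= 16 - eb) ? mem[(ea & ~0xFull) + i - (16 - eb)] : 0;
}
}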
static __m128i PermuteV128Bytes(__m128i selector, __m128i src1, __m128i src2) {
#if 1
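// The xor with 3 swizzles each byte index within its 32-bit word,
// translating the big-endian VMX-style selector into the host vec128
// byte order before the two pshufb lookups.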
__m128i selector2 = _mm_xor_si128(selector, _mm_set1_epi8(3));
__m128i src1_shuf = _mm_shuffle_epi8(src1, selector2);
__m128i src2_shuf = _mm_shuffle_epi8(src2, selector2);
__m128i src2_selection = _mm_cmpgt_epi8(selector2, _mm_set1_epi8(15));
return _mm_blendv_epi8(src1_shuf, src2_shuf, src2_selection);
#else
// not the issue
unsigned char tmpbuffer[32];
_mm_storeu_si128((__m128i*)tmpbuffer, src1);
_mm_storeu_si128((__m128i*)(&tmpbuffer[16]), src2);
__m128i result;
for (unsigned i = 0; i < 16; ++i) {
result.m128i_u8[i] = tmpbuffer[(selector.m128i_u8[i] ^ 3) & 0x1f];
}
return result;
#endif
}
static __m128i ByteSwap(__m128i input) {
return _mm_shuffle_epi8(input, _mm_setr_epi32(0x00010203u, 0x04050607u,
0x08090A0Bu, 0x0C0D0E0Fu));
}
static __m128i LVSR(char input) {
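// Emulates PPC lvsr: builds the permute control {16 - sh, ..., 31 - sh}
// used to shift a value right across a 16-byte boundary.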
__m128i lvsr_table_base = ByteSwap(_mm_setr_epi8(
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31));
__m128i base_as_vec = _mm_loadu_si128((const __m128i*)&lvsr_table_base);
__m128i shr_for_offset = _mm_sub_epi8(base_as_vec, _mm_set1_epi8(input));
return shr_for_offset;
}
/*
Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
// ea &= ~0xF
ea = f.And(ea, f.LoadConstantUint64(~0xFull));
Value* shrs = f.LoadVectorShr(eb);
Value* zerovec = f.LoadZeroVec128();
// v = (old & ~mask) | ((new >> eb) & mask)
Value* new_value = f.Permute(shrs, zerovec, f.LoadVR(vd), INT8_TYPE);
Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
// mask = FFFF... >> eb
Value* mask = f.Permute(shrs, zerovec, f.Not(zerovec), INT8_TYPE);
Value* v = f.Select(mask, old_value, new_value);
// ea &= ~0xF (handled above)
f.Store(ea, f.ByteSwap(v));
*/
#if 0
static void callnativesafe_stvl(void* ctx, void* addr, __m128i* value) {
uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
uintptr_t bad_offs = uaddr & 0xf;
uaddr &= ~0xfULL;
__m128i tempload = ByteSwap(_mm_loadu_si128((const __m128i*)uaddr));
__m128i our_value_to_store = _mm_loadu_si128(value);
__m128i shr_for_offset = LVSR((char)bad_offs);
__m128i permuted_us =
PermuteV128Bytes(shr_for_offset, _mm_setzero_si128(), our_value_to_store);
//__m128i mask = PermuteV128Bytes(shr_for_offset, _mm_setzero_si128(),
// _mm_set1_epi8((char)0xff));
__m128i mask = _mm_cmpgt_epi8(shr_for_offset, _mm_set1_epi8(15));
__m128i blended_input_and_memory =
_mm_blendv_epi8(tempload, permuted_us, mask);
__m128i swapped_final_result = ByteSwap(blended_input_and_memory);
_mm_storeu_si128((__m128i*)uaddr, swapped_final_result);
}
#else
static void callnativesafe_stvl(void* ctx, void* addr, __m128i* value) {
uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
uintptr_t bad_offs = uaddr & 0xf;
uaddr &= ~0xfULL;
__m128i tempload = _mm_loadu_si128((const __m128i*)uaddr);
__m128i our_value_to_store = _mm_loadu_si128(value);
__m128i shr_for_offset;
{
__m128i lvsr_table_base =
_mm_sub_epi8(_mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31),
_mm_set1_epi8(16));
shr_for_offset =
_mm_sub_epi8(lvsr_table_base, _mm_set1_epi8((char)bad_offs));
}
__m128i permuted_us;
{
__m128i selector2 = _mm_xor_si128(shr_for_offset, _mm_set1_epi8(3));
__m128i src2_shuf = _mm_shuffle_epi8(our_value_to_store, selector2);
permuted_us = src2_shuf;
}
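// Lanes whose selector went negative (the bytes of the line below the
// store offset) were zeroed by the pshufb above; the blend below restores
// them from memory, since blendv takes tempload wherever shr_for_offset's
// sign bit is set.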
__m128i blended_input_and_memory =
_mm_blendv_epi8(permuted_us, tempload, shr_for_offset);
__m128i swapped_final_result = blended_input_and_memory;
_mm_storeu_si128((__m128i*)uaddr, swapped_final_result);
}
static void callnativesafe_stvl_experiment(void* addr, __m128i* value) {
uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
uintptr_t bad_offs = uaddr & 0xf;
uaddr &= ~0xfULL;
__m128i tempload = _mm_loadu_si128((const __m128i*)uaddr);
__m128i our_value_to_store = _mm_loadu_si128(value);
__m128i shr_for_offset;
{
__m128i lvsr_table_base =
_mm_sub_epi8(_mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31),
_mm_set1_epi8(16));
// lvsr_table_base = _mm_xor_si128(lvsr_table_base, _mm_set1_epi8(3));
// lvsr_table_base = ByteSwap(lvsr_table_base);
shr_for_offset =
_mm_sub_epi8(lvsr_table_base, _mm_set1_epi8((char)bad_offs));
}
__m128i permuted_us;
{
shr_for_offset = _mm_xor_si128(shr_for_offset, _mm_set1_epi8(3));
__m128i src2_shuf = _mm_shuffle_epi8(our_value_to_store, shr_for_offset);
permuted_us = src2_shuf;
}
__m128i blended_input_and_memory =
_mm_blendv_epi8(permuted_us, tempload, shr_for_offset);
__m128i swapped_final_result = blended_input_and_memory;
_mm_storeu_si128((__m128i*)uaddr, swapped_final_result);
}
#endif
struct STVL_V128 : Sequence<STVL_V128, I<OPCODE_STVL, VoidOp, I64Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
#if 0
e.lea(e.GetNativeParam(0), e.ptr[ComputeMemoryAddress(e, i.src1)]);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.lea(e.GetNativeParam(1), e.StashXmm(0, src2));
e.CallNativeSafe((void*)callnativesafe_stvl);
#else
e.mov(e.ecx, 15);
e.mov(e.edx, e.ecx);
e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]);
@@ -640,7 +430,6 @@ struct STVL_V128 : Sequence<STVL_V128, I<OPCODE_STVL, VoidOp, I64Op, V128Op>> {
e.not_(e.rdx);
e.and_(e.rax, e.rdx);
e.vmovdqa(e.xmm1, e.GetXmmConstPtr(XMMSTVLShuffle));
// e.vmovdqa(e.xmm2, e.GetXmmConstPtr(XMMSwapWordMask));
if (e.IsFeatureEnabled(kX64EmitAVX2)) {
e.vpbroadcastb(e.xmm3, e.xmm0);
} else {
@@ -650,126 +439,18 @@ struct STVL_V128 : Sequence<STVL_V128, I<OPCODE_STVL, VoidOp, I64Op, V128Op>> {
e.vpxor(e.xmm1, e.xmm0,
e.GetXmmConstPtr(XMMSwapWordMask)); // xmm1 from now on will be our
// selector for blend/shuffle
// we can reuse xmm0, xmm2 and xmm3 now
// e.vmovdqa(e.xmm0, e.ptr[e.rax]);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm0);
e.vpshufb(e.xmm2, src2, e.xmm1);
e.vpblendvb(e.xmm3, e.xmm2, e.ptr[e.rax], e.xmm1);
e.vmovdqa(e.ptr[e.rax], e.xmm3);
#endif
}
};
EMITTER_OPCODE_TABLE(OPCODE_STVL, STVL_V128);
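// (editorial sketch, not part of the commit) Scalar model of stvlx: the
// high-order bytes of the register are stored from EA through the end of
// its 16-byte line; the bytes of the line below EA are left untouched.
static void stvlx_model(uint8_t* mem, uint64_t ea, const uint8_t v[16]) {
uint64_t eb = ea & 0xF;
for (uint64_t i = 0; i + eb < 16; ++i) {
mem[ea + i] = v[i];  // store never crosses the aligned line
}
}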
/*
Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
// Skip if %16=0 (no data to store).
auto skip_label = f.NewLabel();
f.BranchFalse(eb, skip_label);
// ea &= ~0xF
// NOTE: need to recalculate ea and eb because after Branch we start a new
// block and we can't use their previous instantiation in the new block
ea = CalculateEA_0(f, ra, rb);
eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
ea = f.And(ea, f.LoadConstantUint64(~0xFull));
Value* shrs = f.LoadVectorShr(eb);
Value* zerovec = f.LoadZeroVec128();
// v = (old & ~mask) | ((new << eb) & mask)
Value* new_value = f.Permute(shrs, f.LoadVR(vd), zerovec, INT8_TYPE);
Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
// mask = ~FFFF... >> eb
Value* mask = f.Permute(shrs, f.Not(zerovec), zerovec, INT8_TYPE);
Value* v = f.Select(mask, old_value, new_value);
// ea &= ~0xF (handled above)
f.Store(ea, f.ByteSwap(v));
f.MarkLabel(skip_label);
*/
#if 0
static void callnativesafe_stvr(void* ctx, void* addr, __m128i* value) {
uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
uintptr_t bad_offs = uaddr & 0xf;
if (!bad_offs) {
return;
}
uaddr &= ~0xfULL;
__m128i tempload = ByteSwap(_mm_loadu_si128((const __m128i*)uaddr));
__m128i our_value_to_store = _mm_loadu_si128(value);
__m128i shr_for_offset = LVSR((char)bad_offs);
__m128i permuted_us = PermuteV128Bytes(shr_for_offset, our_value_to_store,
_mm_setzero_si128());
__m128i mask = PermuteV128Bytes(shr_for_offset, _mm_set1_epi8((char)0xff),
_mm_setzero_si128());
//__m128i mask = _mm_cmpgt_epi8(shr_for_offset, _mm_set1_epi8(15));
__m128i blended_input_and_memory =
_mm_blendv_epi8(tempload, permuted_us, mask);
__m128i swapped_final_result = ByteSwap(blended_input_and_memory);
_mm_storeu_si128((__m128i*)uaddr, swapped_final_result);
}
#else
static void callnativesafe_stvr(void* ctx, void* addr, __m128i* value) {
uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
uintptr_t bad_offs = uaddr & 0xf;
uaddr &= ~0xfULL;
if (!bad_offs) {
return;
}
__m128i tempload = _mm_loadu_si128((const __m128i*)uaddr);
__m128i our_value_to_store = _mm_loadu_si128(value);
__m128i shr_for_offset;
{
__m128i lvsr_table_base =
_mm_sub_epi8(_mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31),
_mm_set1_epi8(16));
// lvsr_table_base = _mm_xor_si128(lvsr_table_base, _mm_set1_epi8(3));
// lvsr_table_base = ByteSwap(lvsr_table_base);
shr_for_offset =
_mm_sub_epi8(lvsr_table_base, _mm_set1_epi8((char)bad_offs));
}
__m128i permuted_us;
{
shr_for_offset = _mm_xor_si128(shr_for_offset, _mm_set1_epi8((char)0x83));
__m128i src2_shuf = _mm_shuffle_epi8(our_value_to_store, shr_for_offset);
permuted_us = src2_shuf;
}
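// The xor with 0x83 applies the word swizzle (^3) and also flips the sign
// bit, inverting the blend relative to stvl: the lanes below the store
// offset take register bytes and the rest keep the memory contents.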
__m128i blended_input_and_memory =
_mm_blendv_epi8(permuted_us, tempload, shr_for_offset);
__m128i swapped_final_result = blended_input_and_memory;
_mm_storeu_si128((__m128i*)uaddr, swapped_final_result);
}
#endif
struct STVR_V128 : Sequence<STVR_V128, I<OPCODE_STVR, VoidOp, I64Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
#if 0
e.lea(e.GetNativeParam(0), e.ptr[ComputeMemoryAddress(e, i.src1)]);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
e.lea(e.GetNativeParam(1), e.StashXmm(0, src2));
e.CallNativeSafe((void*)callnativesafe_stvr);
#else
Xbyak::Label skipper{};
e.mov(e.ecx, 15);
e.mov(e.edx, e.ecx);
@@ -782,7 +463,7 @@ struct STVR_V128 : Sequence<STVR_V128, I<OPCODE_STVR, VoidOp, I64Op, V128Op>> {
e.vmovdqa(e.xmm1, e.GetXmmConstPtr(XMMSTVLShuffle));
// todo: maybe a table lookup might be a better idea for getting the
// shuffle/blend
// e.vmovdqa(e.xmm2, e.GetXmmConstPtr(XMMSTVRSwapMask));
if (e.IsFeatureEnabled(kX64EmitAVX2)) {
e.vpbroadcastb(e.xmm3, e.xmm0);
} else {
@@ -792,8 +473,6 @@ struct STVR_V128 : Sequence<STVR_V128, I<OPCODE_STVR, VoidOp, I64Op, V128Op>> {
e.vpxor(e.xmm1, e.xmm0,
e.GetXmmConstPtr(XMMSTVRSwapMask)); // xmm1 from now on will be our
// selector for blend/shuffle
// we can reuse xmm0, xmm2 and xmm3 now
// e.vmovdqa(e.xmm0, e.ptr[e.rax]);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm0);
@@ -801,7 +480,6 @@ struct STVR_V128 : Sequence<STVR_V128, I<OPCODE_STVR, VoidOp, I64Op, V128Op>> {
e.vpblendvb(e.xmm3, e.xmm2, e.ptr[e.rax], e.xmm1);
e.vmovdqa(e.ptr[e.rax], e.xmm3);
e.L(skipper);
#endif
}
};
EMITTER_OPCODE_TABLE(OPCODE_STVR, STVR_V128);
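// (editorial sketch, not part of the commit) Scalar model of stvrx: the
// low-order eb bytes of the register land in the eb bytes of the line
// below EA. When EA is 16-byte aligned nothing is stored at all, which is
// why both the old helper and the emitted sequence bail out early.
static void stvrx_model(uint8_t* mem, uint64_t ea, const uint8_t v[16]) {
uint64_t eb = ea & 0xF;
for (uint64_t i = 0; i < eb; ++i) {
mem[(ea & ~0xFull) + i] = v[16 - eb + i];
}
}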

View File

@@ -243,7 +243,16 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
}
break;
case OPCODE_LVR:
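// lvrx with a 16-byte-aligned EA loads zero bytes, so a constant aligned
// address always folds to the zero vector.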
if (i->src1.value->IsConstant()) {
if (!(i->src1.value->AsUint32() & 0xF)) {
v->set_zero(VEC128_TYPE);
i->Remove();
result = true;
break;
}
}
break;
case OPCODE_LOAD:
case OPCODE_LOAD_OFFSET:
if (i->src1.value->IsConstant()) {
@@ -921,6 +930,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
result = true;
}
break;
default:
// Ignored.
break;

View File

@@ -208,20 +208,10 @@ int InstrEmit_stvxl128(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_lvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
uint32_t ra, uint32_t rb) {
Value* ea = CalculateEA_0(f, ra, rb);
#if 0
Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
// ea &= ~0xF
ea = f.And(ea, f.LoadConstantUint64(~0xFull));
// v = (new << eb)
Value* v = f.Permute(f.LoadVectorShl(eb), f.ByteSwap(f.Load(ea, VEC128_TYPE)),
f.LoadZeroVec128(), INT8_TYPE);
f.StoreVR(vd, v);
return 0;
#else
Value* val = f.LoadVectorLeft(ea);
f.StoreVR(vd, val);
return 0;
#endif
}
int InstrEmit_lvlx(PPCHIRBuilder& f, const InstrData& i) {
return InstrEmit_lvlx_(f, i, i.X.RT, i.X.RA, i.X.RB);
@@ -243,32 +233,10 @@ int InstrEmit_lvrx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
// buffer, which sometimes may be nothing and hang off the end of the valid
// page area. We still need to zero the resulting register, though.
Value* ea = CalculateEA_0(f, ra, rb);
#if 0
Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
// Skip if %16=0 (just load zero).
auto load_label = f.NewLabel();
auto end_label = f.NewLabel();
f.BranchTrue(eb, load_label);
f.StoreVR(vd, f.LoadZeroVec128());
f.Branch(end_label);
f.MarkLabel(load_label);
// ea &= ~0xF
// NOTE: need to recalculate ea and eb because after Branch we start a new
// block and we can't use their previous instantiation in the new block
ea = CalculateEA_0(f, ra, rb);
eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
ea = f.And(ea, f.LoadConstantUint64(~0xFull));
// v = (new >> (16 - eb))
Value* v = f.Permute(f.LoadVectorShl(eb), f.LoadZeroVec128(),
f.ByteSwap(f.Load(ea, VEC128_TYPE)), INT8_TYPE);
f.StoreVR(vd, v);
f.MarkLabel(end_label);
return 0;
#else
Value* val = f.LoadVectorRight(ea);
f.StoreVR(vd, val);
return 0;
#endif
}
int InstrEmit_lvrx(PPCHIRBuilder& f, const InstrData& i) {
return InstrEmit_lvrx_(f, i, i.X.RT, i.X.RA, i.X.RB);
@@ -289,34 +257,9 @@ int InstrEmit_stvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
// we could optimize this to prevent the other load/mask, in that case.
Value* ea = CalculateEA_0(f, ra, rb);
#if 0
Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
// ea &= ~0xF
ea = f.And(ea, f.LoadConstantUint64(~0xFull));
Value* shrs = f.LoadVectorShr(eb);
Value* zerovec = f.LoadZeroVec128();
// v = (old & ~mask) | ((new >> eb) & mask)
Value* mask = f.Permute(shrs, zerovec, f.Not(zerovec), INT8_TYPE);
Value* new_value = f.Permute(shrs, zerovec, f.LoadVR(vd), INT8_TYPE);
Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
/*
These permutes need to be looked at closer. Keep in mind Permute is meant to
emulate VMX's shuffles and does not generate particularly good code. The logic
here looks as if it might make more sense as a comparison (
*/
// mask = FFFF... >> eb
Value* v = f.Select(mask, old_value, new_value);
// ea &= ~0xF (handled above)
f.Store(ea, f.ByteSwap(v));
#else
Value* vdr = f.LoadVR(vd);
f.StoreVectorLeft(ea, vdr);
#endif
return 0;
}
int InstrEmit_stvlx(PPCHIRBuilder& f, const InstrData& i) {
@@ -339,32 +282,9 @@ int InstrEmit_stvrx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
// buffer, which sometimes may be nothing and hang off the end of the valid
// page area.
Value* ea = CalculateEA_0(f, ra, rb);
#if 0
Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
// Skip if %16=0 (no data to store).
auto skip_label = f.NewLabel();
f.BranchFalse(eb, skip_label);
// ea &= ~0xF
// NOTE: need to recalculate ea and eb because after Branch we start a new
// block and we can't use their previous instantiation in the new block
ea = CalculateEA_0(f, ra, rb);
eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
ea = f.And(ea, f.LoadConstantUint64(~0xFull));
Value* shrs = f.LoadVectorShr(eb);
Value* zerovec = f.LoadZeroVec128();
// v = (old & ~mask) | ((new << eb) & mask)
Value* new_value = f.Permute(shrs, f.LoadVR(vd), zerovec, INT8_TYPE);
Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
// mask = ~FFFF... >> eb
Value* mask = f.Permute(shrs, f.Not(zerovec), zerovec, INT8_TYPE);
Value* v = f.Select(mask, old_value, new_value);
// ea &= ~0xF (handled above)
f.Store(ea, f.ByteSwap(v));
f.MarkLabel(skip_label);
#else
Value* vdr = f.LoadVR(vd);
f.StoreVectorRight(ea, vdr);
#endif
return 0;
}
int InstrEmit_stvrx(PPCHIRBuilder& f, const InstrData& i) {

View File

@@ -952,11 +952,17 @@ void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql) {
}
DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented,
kHighFrequency);
// todo: this is not accurate
void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
// Lock.
auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
while (!xe::atomic_cas(0, 1, lock)) {
#if XE_ARCH_AMD64 == 1
// todo: this is just a nop if they don't have SMT, which is not great
// either...
_mm_pause();
#endif
// Spin!
// TODO(benvanik): error on deadlock?
}
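// (editorial sketch, not part of the commit) The loop above is the classic
// test-and-set spin with a pause hint: _mm_pause lowers to the x86 `pause`
// instruction, which yields pipeline resources to a sibling hyperthread and
// softens the penalty of leaving the spin when the lock finally frees. The
// same pattern in hypothetical, self-contained form, with std::atomic in
// place of xe::atomic_cas:
//
//   #include <atomic>
//   #include <cstdint>
//   #include <immintrin.h>
//
//   void spin_acquire(std::atomic<uint32_t>& lock) {
//     uint32_t expected = 0;
//     while (!lock.compare_exchange_weak(expected, 1,
//                                        std::memory_order_acquire)) {
//       expected = 0;  // a failed CAS overwrites expected with the seen value
//       _mm_pause();   // spin-wait hint
//     }
//   }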