From 2e5c4937fd6486b8427dd3ff535c9b47bc4da87b Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Sun, 4 Sep 2022 11:44:29 -0700
Subject: [PATCH] Add constant folding for LVR when the address is 16-byte
 aligned; clean up the prior commit by removing dead test code for the
 LVR/LVL/STVL/STVR opcodes and the legacy HIR sequences.

Also delay with _mm_pause while spinning in
KeAcquireSpinLockAtRaisedIrql_entry; a huge amount of time is spent
spinning there in Halo 3.

---
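Note: the LVL/LVR sequences touched below implement the PowerPC lvlx/lvrx
semantics directly in x64. As a scalar reference model of those semantics --
hypothetical helpers written for this description, not code from this patch:

    #include <cstdint>

    struct vec128_t {
      uint8_t bytes[16];  // guest byte order
    };

    // lvlx: load from ea up to the next 16-byte boundary, left-justified;
    // the remaining bytes are zero.
    vec128_t LoadVectorLeftModel(const uint8_t* mem, uint64_t ea) {
      vec128_t v = {};
      uint64_t eb = ea & 0xF;  // misalignment within the 16-byte block
      for (uint64_t i = 0; i < 16 - eb; ++i) {
        v.bytes[i] = mem[ea + i];
      }
      return v;
    }

    // lvrx: load the (ea & 15) bytes *below* ea, right-justified; when ea
    // is 16-byte aligned nothing is loaded and the result is all zeroes.
    vec128_t LoadVectorRightModel(const uint8_t* mem, uint64_t ea) {
      vec128_t v = {};
      uint64_t eb = ea & 0xF;
      for (uint64_t i = 0; i < eb; ++i) {
        v.bytes[16 - eb + i] = mem[(ea & ~0xFull) + i];
      }
      return v;
    }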
 src/xenia/cpu/backend/x64/x64_seq_memory.cc   | 324 +-----------------
 .../passes/constant_propagation_pass.cc       |  12 +-
 src/xenia/cpu/ppc/ppc_emit_altivec.cc         |  86 +----
 .../kernel/xboxkrnl/xboxkrnl_threading.cc     |   8 +-
 4 files changed, 22 insertions(+), 408 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc
index 3a64acc18..2cee66ece 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc
@@ -360,24 +360,6 @@ EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_EXCHANGE, ATOMIC_EXCHANGE_I8,
                      ATOMIC_EXCHANGE_I16, ATOMIC_EXCHANGE_I32,
                      ATOMIC_EXCHANGE_I64);
-static __m128i callnativesafe_lvl(void* ctx, void* addr) {
-  uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
-
-  uintptr_t bad_offs = uaddr & 0xf;
-
-  uaddr &= ~0xfULL;
-
-  __m128i tempload = _mm_loadu_si128((const __m128i*)uaddr);
-
-  __m128i badhelper =
-      _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
-
-  __m128i tmpshuf = _mm_add_epi8(badhelper, _mm_set1_epi8((char)bad_offs));
-
-  tmpshuf = _mm_or_si128(tmpshuf, _mm_cmpgt_epi8(tmpshuf, _mm_set1_epi8(15)));
-  return _mm_shuffle_epi8(tempload, tmpshuf);
-}
-
 struct LVL_V128 : Sequence<LVL_V128, I<OPCODE_LVL, V128Op, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     e.mov(e.edx, 0xf);
@@ -405,25 +387,6 @@ struct LVL_V128 : Sequence<LVL_V128, I<OPCODE_LVL, V128Op, I64Op>> {
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_LVL, LVL_V128);
-static __m128i callnativesafe_lvr(void* ctx, void* addr) {
-  uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
-
-  uintptr_t bad_offs = uaddr & 0xf;
-  if (!bad_offs) {
-    return _mm_setzero_si128();
-  }
-  uaddr &= ~0xfULL;
-
-  __m128i tempload = _mm_loadu_si128((const __m128i*)uaddr);
-
-  __m128i badhelper =
-      _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
-
-  __m128i tmpshuf = _mm_add_epi8(badhelper, _mm_set1_epi8((char)bad_offs));
-
-  tmpshuf = _mm_or_si128(tmpshuf, _mm_cmplt_epi8(tmpshuf, _mm_set1_epi8(16)));
-  return _mm_shuffle_epi8(tempload, tmpshuf);
-}

 struct LVR_V128 : Sequence<LVR_V128, I<OPCODE_LVR, V128Op, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
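The STVL/STVR sequences in the next hunk replace the callnativesafe_* test
helpers being deleted. In the same scalar style as the note above, the
store-left operation they implement is roughly (hypothetical model, not part
of the patch):

    #include <cstdint>

    // stvlx: store the left-justified 16 - (ea & 15) bytes of v at ea;
    // nothing past the next 16-byte boundary is touched.
    void StoreVectorLeftModel(uint8_t* mem, uint64_t ea, const uint8_t v[16]) {
      uint64_t eb = ea & 0xF;
      for (uint64_t i = 0; i < 16 - eb; ++i) {
        mem[ea + i] = v[i];
      }
    }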
@@ -457,181 +420,8 @@ struct LVR_V128 : Sequence<LVR_V128, I<OPCODE_LVR, V128Op, I64Op>> {
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_LVR, LVR_V128);
-static __m128i PermuteV128Bytes(__m128i selector, __m128i src1, __m128i src2) {
-#if 1
-  __m128i selector2 = _mm_xor_si128(selector, _mm_set1_epi8(3));
-
-  __m128i src1_shuf = _mm_shuffle_epi8(src1, selector2);
-  __m128i src2_shuf = _mm_shuffle_epi8(src2, selector2);
-
-  __m128i src2_selection = _mm_cmpgt_epi8(selector2, _mm_set1_epi8(15));
-
-  return _mm_blendv_epi8(src1_shuf, src2_shuf, src2_selection);
-
-#else
-  // not the issue
-  unsigned char tmpbuffer[32];
-
-  _mm_storeu_si128((__m128i*)tmpbuffer, src1);
-  _mm_storeu_si128((__m128i*)(&tmpbuffer[16]), src2);
-
-  __m128i result;
-
-  for (unsigned i = 0; i < 16; ++i) {
-    result.m128i_u8[i] = tmpbuffer[(selector.m128i_u8[i] ^ 3) & 0x1f];
-  }
-  return result;
-
-#endif
-}
-static __m128i ByteSwap(__m128i input) {
-  return _mm_shuffle_epi8(input, _mm_setr_epi32(0x00010203u, 0x04050607u,
-                                                0x08090A0Bu, 0x0C0D0E0Fu));
-}
-static __m128i LVSR(char input) {
-  __m128i lvsr_table_base = ByteSwap(_mm_setr_epi8(
-      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31));
-
-  __m128i base_as_vec = _mm_loadu_si128((const __m128i*)&lvsr_table_base);
-
-  __m128i shr_for_offset = _mm_sub_epi8(base_as_vec, _mm_set1_epi8(input));
-  return shr_for_offset;
-}
-
-/*
-Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
-// ea &= ~0xF
-ea = f.And(ea, f.LoadConstantUint64(~0xFull));
-Value* shrs = f.LoadVectorShr(eb);
-Value* zerovec = f.LoadZeroVec128();
-
-// v = (old & ~mask) | ((new >> eb) & mask)
-Value* new_value = f.Permute(shrs, zerovec, f.LoadVR(vd), INT8_TYPE);
-Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
-
-// mask = FFFF... >> eb
-Value* mask = f.Permute(shrs, zerovec, f.Not(zerovec), INT8_TYPE);
-
-Value* v = f.Select(mask, old_value, new_value);
-// ea &= ~0xF (handled above)
-f.Store(ea, f.ByteSwap(v));
-*/
-#if 0
-
-static void callnativesafe_stvl(void* ctx, void* addr, __m128i* value) {
-  uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
-
-  uintptr_t bad_offs = uaddr & 0xf;
-
-  uaddr &= ~0xfULL;
-
-  __m128i tempload = ByteSwap(_mm_loadu_si128((const __m128i*)uaddr));
-
-  __m128i our_value_to_store = _mm_loadu_si128(value);
-
-  __m128i shr_for_offset = LVSR((char)bad_offs);
-
-  __m128i permuted_us =
-      PermuteV128Bytes(shr_for_offset, _mm_setzero_si128(), our_value_to_store);
-  //__m128i mask = PermuteV128Bytes(shr_for_offset, _mm_setzero_si128(),
-  //                                _mm_set1_epi8((char)0xff));
-
-  __m128i mask = _mm_cmpgt_epi8(shr_for_offset, _mm_set1_epi8(15));
-  __m128i blended_input_and_memory =
-      _mm_blendv_epi8(tempload, permuted_us, mask);
-
-  __m128i swapped_final_result = ByteSwap(blended_input_and_memory);
-
-  _mm_storeu_si128((__m128i*)uaddr, swapped_final_result);
-}
-#else
-static void callnativesafe_stvl(void* ctx, void* addr, __m128i* value) {
-  uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
-
-  uintptr_t bad_offs = uaddr & 0xf;
-
-  uaddr &= ~0xfULL;
-
-  __m128i tempload = _mm_loadu_si128((const __m128i*)uaddr);
-
-  __m128i our_value_to_store = _mm_loadu_si128(value);
-
-  __m128i shr_for_offset;
-  {
-    __m128i lvsr_table_base =
-        _mm_sub_epi8(_mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
-                                   27, 28, 29, 30, 31),
-                     _mm_set1_epi8(16));
-    shr_for_offset =
-        _mm_sub_epi8(lvsr_table_base, _mm_set1_epi8((char)bad_offs));
-  }
-  __m128i permuted_us;
-  {
-    __m128i selector2 = _mm_xor_si128(shr_for_offset, _mm_set1_epi8(3));
-
-    __m128i src2_shuf = _mm_shuffle_epi8(our_value_to_store, selector2);
-
-    permuted_us = src2_shuf;
-  }
-
-  __m128i blended_input_and_memory =
-      _mm_blendv_epi8(permuted_us, tempload, shr_for_offset);
-
-  __m128i swapped_final_result = blended_input_and_memory;
-
-  _mm_storeu_si128((__m128i*)uaddr, swapped_final_result);
-}
-static void callnativesafe_stvl_experiment(void* addr, __m128i* value) {
-  uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
-
-  uintptr_t bad_offs = uaddr & 0xf;
-
-  uaddr &= ~0xfULL;
-
-  __m128i tempload = _mm_loadu_si128((const __m128i*)uaddr);
-
-  __m128i our_value_to_store = _mm_loadu_si128(value);
-
-  __m128i shr_for_offset;
-  {
-    __m128i lvsr_table_base =
-        _mm_sub_epi8(_mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
-                                   27, 28, 29, 30, 31),
-                     _mm_set1_epi8(16));
-
-    // lvsr_table_base = _mm_xor_si128(lvsr_table_base, _mm_set1_epi8(3));
-    // lvsr_table_base = ByteSwap(lvsr_table_base);
-    shr_for_offset =
-        _mm_sub_epi8(lvsr_table_base, _mm_set1_epi8((char)bad_offs));
-  }
-  __m128i permuted_us;
-  {
-    shr_for_offset = _mm_xor_si128(shr_for_offset, _mm_set1_epi8(3));
-
-    __m128i src2_shuf = _mm_shuffle_epi8(our_value_to_store, shr_for_offset);
-
-    permuted_us = src2_shuf;
-  }
-
-  __m128i blended_input_and_memory =
-      _mm_blendv_epi8(permuted_us, tempload, shr_for_offset);
-
-  __m128i swapped_final_result = blended_input_and_memory;
-
-  _mm_storeu_si128((__m128i*)uaddr, swapped_final_result);
-}
-
-#endif
 struct STVL_V128 : Sequence<STVL_V128, I<OPCODE_STVL, VoidOp, I64Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-#if 0
-    e.lea(e.GetNativeParam(0), e.ptr[ComputeMemoryAddress(e, i.src1)]);
-    Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
-
-    e.lea(e.GetNativeParam(1), e.StashXmm(0, src2));
-    e.CallNativeSafe((void*)callnativesafe_stvl);
-
-#else
     e.mov(e.ecx, 15);
     e.mov(e.edx, e.ecx);
     e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]);
@@ -640,7 +430,6 @@ struct STVL_V128 : Sequence<STVL_V128, I<OPCODE_STVL, VoidOp, I64Op, V128Op>> {
     e.not_(e.rdx);
     e.and_(e.rax, e.rdx);
     e.vmovdqa(e.xmm1, e.GetXmmConstPtr(XMMSTVLShuffle));
-    // e.vmovdqa(e.xmm2, e.GetXmmConstPtr(XMMSwapWordMask));
     if (e.IsFeatureEnabled(kX64EmitAVX2)) {
       e.vpbroadcastb(e.xmm3, e.xmm0);
     } else {
@@ -650,126 +439,18 @@ struct STVL_V128 : Sequence<STVL_V128, I<OPCODE_STVL, VoidOp, I64Op, V128Op>> {
     e.vpxor(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMSwapWordMask));
     // xmm1 from now on will be our
     // selector for blend/shuffle
-    // we can reuse xmm0, xmm2 and xmm3 now
-    // e.vmovdqa(e.xmm0, e.ptr[e.rax]);
     Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm0);
     e.vpshufb(e.xmm2, src2, e.xmm1);
     e.vpblendvb(e.xmm3, e.xmm2, e.ptr[e.rax], e.xmm1);
     e.vmovdqa(e.ptr[e.rax], e.xmm3);
-
-#endif
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_STVL, STVL_V128);
-/*
-    Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
-    // Skip if %16=0 (no data to store).
-    auto skip_label = f.NewLabel();
-    f.BranchFalse(eb, skip_label);
-    // ea &= ~0xF
-    // NOTE: need to recalculate ea and eb because after Branch we start a new
-    // block and we can't use their previous instantiation in the new block
-    ea = CalculateEA_0(f, ra, rb);
-    eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
-    ea = f.And(ea, f.LoadConstantUint64(~0xFull));
-    Value* shrs = f.LoadVectorShr(eb);
-    Value* zerovec = f.LoadZeroVec128();
-    // v = (old & ~mask) | ((new << eb) & mask)
-    Value* new_value = f.Permute(shrs, f.LoadVR(vd), zerovec, INT8_TYPE);
-    Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
-    // mask = ~FFFF... >> eb
-    Value* mask = f.Permute(shrs, f.Not(zerovec), zerovec, INT8_TYPE);
-    Value* v = f.Select(mask, old_value, new_value);
-    // ea &= ~0xF (handled above)
-    f.Store(ea, f.ByteSwap(v));
-    f.MarkLabel(skip_label);
-*/
-#if 0
-static void callnativesafe_stvr(void* ctx, void* addr, __m128i* value) {
-  uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
-
-  uintptr_t bad_offs = uaddr & 0xf;
-  if (!bad_offs) {
-    return;
-  }
-  uaddr &= ~0xfULL;
-
-  __m128i tempload = ByteSwap(_mm_loadu_si128((const __m128i*)uaddr));
-
-  __m128i our_value_to_store = _mm_loadu_si128(value);
-
-  __m128i shr_for_offset = LVSR((char)bad_offs);
-
-  __m128i permuted_us = PermuteV128Bytes(
-      shr_for_offset, our_value_to_store, _mm_setzero_si128());
-  __m128i mask = PermuteV128Bytes(
-      shr_for_offset, _mm_set1_epi8((char)0xff), _mm_setzero_si128());
-
-  //__m128i mask = _mm_cmpgt_epi8(shr_for_offset, _mm_set1_epi8(15));
-  __m128i blended_input_and_memory =
-      _mm_blendv_epi8(tempload, permuted_us, mask);
-
-  __m128i swapped_final_result = ByteSwap(blended_input_and_memory);
-
-  _mm_storeu_si128((__m128i*)uaddr, swapped_final_result);
-}
-#else
-static void callnativesafe_stvr(void* ctx, void* addr, __m128i* value) {
-  uintptr_t uaddr = reinterpret_cast<uintptr_t>(addr);
-
-  uintptr_t bad_offs = uaddr & 0xf;
-
-  uaddr &= ~0xfULL;
-  if (!bad_offs) {
-    return;
-  }
-  __m128i tempload = _mm_loadu_si128((const __m128i*)uaddr);
-
-  __m128i our_value_to_store = _mm_loadu_si128(value);
-
-  __m128i shr_for_offset;
-  {
-    __m128i lvsr_table_base =
-        _mm_sub_epi8(_mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
-                                   27, 28, 29, 30, 31),
-                     _mm_set1_epi8(16));
-
-    // lvsr_table_base = _mm_xor_si128(lvsr_table_base, _mm_set1_epi8(3));
-    // lvsr_table_base = ByteSwap(lvsr_table_base);
-    shr_for_offset =
-        _mm_sub_epi8(lvsr_table_base, _mm_set1_epi8((char)bad_offs));
-  }
-  __m128i permuted_us;
-  {
-    shr_for_offset = _mm_xor_si128(shr_for_offset, _mm_set1_epi8((char)0x83));
-
-    __m128i src2_shuf = _mm_shuffle_epi8(our_value_to_store, shr_for_offset);
-
-    permuted_us = src2_shuf;
-  }
-
-  __m128i blended_input_and_memory =
-      _mm_blendv_epi8(permuted_us, tempload, shr_for_offset);
-
-  __m128i swapped_final_result = blended_input_and_memory;
-
-  _mm_storeu_si128((__m128i*)uaddr, swapped_final_result);
-}
-#endif
 struct STVR_V128 : Sequence<STVR_V128, I<OPCODE_STVR, VoidOp, I64Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-#if 0
-    e.lea(e.GetNativeParam(0), e.ptr[ComputeMemoryAddress(e, i.src1)]);
-    Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
-
-    e.lea(e.GetNativeParam(1), e.StashXmm(0, src2));
-    e.CallNativeSafe((void*)callnativesafe_stvr);
-
-#else
     Xbyak::Label skipper{};
     e.mov(e.ecx, 15);
    e.mov(e.edx, e.ecx);
@@ -782,7 +463,7 @@ struct STVR_V128 : Sequence<STVR_V128, I<OPCODE_STVR, VoidOp, I64Op, V128Op>> {
     e.vmovdqa(e.xmm1, e.GetXmmConstPtr(XMMSTVLShuffle));
     // todo: maybe a table lookup might be a better idea for getting the
     // shuffle/blend
-    // e.vmovdqa(e.xmm2, e.GetXmmConstPtr(XMMSTVRSwapMask));
+
     if (e.IsFeatureEnabled(kX64EmitAVX2)) {
       e.vpbroadcastb(e.xmm3, e.xmm0);
     } else {
@@ -792,8 +473,6 @@ struct STVR_V128 : Sequence<STVR_V128, I<OPCODE_STVR, VoidOp, I64Op, V128Op>> {
     e.vpxor(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMSTVRSwapMask));
     // xmm1 from now on will be our
     // selector for blend/shuffle
-    // we can reuse xmm0, xmm2 and xmm3 now
-    // e.vmovdqa(e.xmm0, e.ptr[e.rax]);

     Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm0);
@@ -801,7 +480,6 @@ struct STVR_V128 : Sequence<STVR_V128, I<OPCODE_STVR, VoidOp, I64Op, V128Op>> {
     e.vpshufb(e.xmm2, src2, e.xmm1);
     e.vpblendvb(e.xmm3, e.xmm2, e.ptr[e.rax], e.xmm1);
     e.vmovdqa(e.ptr[e.rax], e.xmm3);
     e.L(skipper);
-#endif
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_STVR, STVR_V128);
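Store-right is the mirror image, and it is where both the skipper label in
STVR_V128 and the new constant fold come from: with a 16-byte-aligned address
there are zero bytes to store (or, for LVR, to load). A scalar model
(hypothetical, not part of the patch):

    #include <cstdint>

    // stvrx: store the rightmost (ea & 15) bytes of v just below ea; when
    // ea is 16-byte aligned the loop runs zero times, matching the branch
    // to 'skipper' in the emitted code.
    void StoreVectorRightModel(uint8_t* mem, uint64_t ea, const uint8_t v[16]) {
      uint64_t eb = ea & 0xF;
      for (uint64_t i = 0; i < eb; ++i) {
        mem[(ea & ~0xFull) + i] = v[16 - eb + i];
      }
    }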
diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
index 2d1654030..f7d91267b 100644
--- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
+++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
@@ -243,7 +243,16 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
             result = true;
           }
           break;
-
+        case OPCODE_LVR:
+          if (i->src1.value->IsConstant()) {
+            if (!(i->src1.value->AsUint32() & 0xF)) {
+              v->set_zero(VEC128_TYPE);
+              i->Remove();
+              result = true;
+              break;
+            }
+          }
+          break;
         case OPCODE_LOAD:
         case OPCODE_LOAD_OFFSET:
           if (i->src1.value->IsConstant()) {
@@ -921,6 +930,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
             result = true;
          }
           break;
+
         default:
           // Ignored.
           break;
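The fold above in miniature: an LVR whose address is a known constant with
(ea & 0xF) == 0 can never pull any bytes in, so its destination is known to
be zero at compile time and the load can be dropped entirely. A hypothetical
standalone check, not xenia's API:

    #include <cassert>
    #include <cstdint>

    bool LvrFoldsToZero(uint64_t constant_ea) {
      return (constant_ea & 0xF) == 0;  // eb == 0 -> nothing to load
    }

    int main() {
      assert(LvrFoldsToZero(0x1000));   // aligned: folds to a zero vector
      assert(!LvrFoldsToZero(0x1003));  // misaligned: real load still needed
    }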
diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc
index 9f2ede47f..9ff0f45c6 100644
--- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc
@@ -208,20 +208,10 @@ int InstrEmit_stvxl128(PPCHIRBuilder& f, const InstrData& i) {
 int InstrEmit_lvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
                     uint32_t ra, uint32_t rb) {
   Value* ea = CalculateEA_0(f, ra, rb);
-#if 0
-  Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
-  // ea &= ~0xF
-  ea = f.And(ea, f.LoadConstantUint64(~0xFull));
-  // v = (new << eb)
-  Value* v = f.Permute(f.LoadVectorShl(eb), f.ByteSwap(f.Load(ea, VEC128_TYPE)),
-                       f.LoadZeroVec128(), INT8_TYPE);
-  f.StoreVR(vd, v);
-  return 0;
-#else
+
   Value* val = f.LoadVectorLeft(ea);
   f.StoreVR(vd, val);
   return 0;
-#endif
 }
 int InstrEmit_lvlx(PPCHIRBuilder& f, const InstrData& i) {
   return InstrEmit_lvlx_(f, i, i.X.RT, i.X.RA, i.X.RB);
@@ -243,32 +233,10 @@ int InstrEmit_lvrx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
   // buffer, which sometimes may be nothing and hang off the end of the valid
   // page area. We still need to zero the resulting register, though.
   Value* ea = CalculateEA_0(f, ra, rb);
-#if 0
-  Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
-  // Skip if %16=0 (just load zero).
-  auto load_label = f.NewLabel();
-  auto end_label = f.NewLabel();
-  f.BranchTrue(eb, load_label);
-  f.StoreVR(vd, f.LoadZeroVec128());
-  f.Branch(end_label);
-  f.MarkLabel(load_label);
-  // ea &= ~0xF
-  // NOTE: need to recalculate ea and eb because after Branch we start a new
-  // block and we can't use their previous instantiation in the new block
-  ea = CalculateEA_0(f, ra, rb);
-  eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
-  ea = f.And(ea, f.LoadConstantUint64(~0xFull));
-  // v = (new >> (16 - eb))
-  Value* v = f.Permute(f.LoadVectorShl(eb), f.LoadZeroVec128(),
-                       f.ByteSwap(f.Load(ea, VEC128_TYPE)), INT8_TYPE);
-  f.StoreVR(vd, v);
-  f.MarkLabel(end_label);
-  return 0;
-#else
+
   Value* val = f.LoadVectorRight(ea);
   f.StoreVR(vd, val);
   return 0;
-#endif
 }
 int InstrEmit_lvrx(PPCHIRBuilder& f, const InstrData& i) {
   return InstrEmit_lvrx_(f, i, i.X.RT, i.X.RA, i.X.RB);
@@ -289,34 +257,9 @@ int InstrEmit_stvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
   // we could optimize this to prevent the other load/mask, in that case.
   Value* ea = CalculateEA_0(f, ra, rb);
-#if 0
-  Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
-  // ea &= ~0xF
-  ea = f.And(ea, f.LoadConstantUint64(~0xFull));
-  Value* shrs = f.LoadVectorShr(eb);
-  Value* zerovec = f.LoadZeroVec128();
-
-  // v = (old & ~mask) | ((new >> eb) & mask)
-
-  Value* mask = f.Permute(shrs, zerovec, f.Not(zerovec), INT8_TYPE);
-  Value* new_value = f.Permute(shrs, zerovec, f.LoadVR(vd), INT8_TYPE);
-  Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
-  /*
-  these permutes need to be looked at closer. keep in mind Permute is meant to
-  emulate vmx's shuffles and does not generate particularly good code. The logic
-  here looks as if it might make more sense as a comparison (
-  */
-  // mask = FFFF... >> eb
-
-
-  Value* v = f.Select(mask, old_value, new_value);
-  // ea &= ~0xF (handled above)
-  f.Store(ea, f.ByteSwap(v));
-#else
   Value* vdr = f.LoadVR(vd);
   f.StoreVectorLeft(ea, vdr);
-#endif
   return 0;
 }
 int InstrEmit_stvlx(PPCHIRBuilder& f, const InstrData& i) {
   return InstrEmit_stvlx_(f, i, i.X.RT, i.X.RA, i.X.RB);
@@ -339,32 +282,9 @@ int InstrEmit_stvrx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd,
   // buffer, which sometimes may be nothing and hang off the end of the valid
   // page area.
   Value* ea = CalculateEA_0(f, ra, rb);
-#if 0
-  Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
-  // Skip if %16=0 (no data to store).
-  auto skip_label = f.NewLabel();
-  f.BranchFalse(eb, skip_label);
-  // ea &= ~0xF
-  // NOTE: need to recalculate ea and eb because after Branch we start a new
-  // block and we can't use their previous instantiation in the new block
-  ea = CalculateEA_0(f, ra, rb);
-  eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF));
-  ea = f.And(ea, f.LoadConstantUint64(~0xFull));
-  Value* shrs = f.LoadVectorShr(eb);
-  Value* zerovec = f.LoadZeroVec128();
-  // v = (old & ~mask) | ((new << eb) & mask)
-  Value* new_value = f.Permute(shrs, f.LoadVR(vd), zerovec, INT8_TYPE);
-  Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE));
-  // mask = ~FFFF... >> eb
-  Value* mask = f.Permute(shrs, f.Not(zerovec), zerovec, INT8_TYPE);
-  Value* v = f.Select(mask, old_value, new_value);
-  // ea &= ~0xF (handled above)
-  f.Store(ea, f.ByteSwap(v));
-  f.MarkLabel(skip_label);
-#else
+
   Value* vdr = f.LoadVR(vd);
   f.StoreVectorRight(ea, vdr);
-#endif
   return 0;
 }
 int InstrEmit_stvrx(PPCHIRBuilder& f, const InstrData& i) {
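For context on why these opcodes exist at all: guest code pairs them to
assemble one unaligned 16-byte load -- lvlx takes the head, lvrx at ea + 16
takes the tail, and OR-ing the two yields the full value. This is also why
lvrx must produce zero, not garbage, at an aligned address, which keeps the
constant fold safe. Reusing the hypothetical models from the earlier note
(illustrative only):

    // head.bytes[i] and tail.bytes[i] cover disjoint index ranges, so OR
    // merges them into the 16 unaligned bytes starting at ea.
    void UnalignedLoadIdiom(const uint8_t* mem, uint64_t ea, uint8_t out[16]) {
      vec128_t head = LoadVectorLeftModel(mem, ea);
      vec128_t tail = LoadVectorRightModel(mem, ea + 16);
      for (int i = 0; i < 16; ++i) {
        out[i] = head.bytes[i] | tail.bytes[i];
      }
    }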
diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc
index 9da95a84d..8e66ac683 100644
--- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc
+++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc
@@ -952,11 +952,17 @@ void KfReleaseSpinLock_entry(lpdword_t lock_ptr, dword_t old_irql) {
 }
 DECLARE_XBOXKRNL_EXPORT2(KfReleaseSpinLock, kThreading, kImplemented,
                          kHighFrequency);
-
+// todo: this is not accurate
 void KeAcquireSpinLockAtRaisedIrql_entry(lpdword_t lock_ptr) {
   // Lock.
   auto lock = reinterpret_cast<uint32_t*>(lock_ptr.host_address());
   while (!xe::atomic_cas(0, 1, lock)) {
+#if XE_ARCH_AMD64 == 1
+    // todo: this is just a nop if they don't have SMT, which is not great
+    // either...
+
+    _mm_pause();
+#endif
     // Spin!
     // TODO(benvanik): error on deadlock?
   }
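The _mm_pause above is the standard spin-wait hint: it keeps the spinning
hardware thread from starving its SMT sibling (the likely holder of the lock
on the 3-core/6-thread Xenon being emulated) and reduces power in the loop,
though, as the todo notes, it degrades to little more than a nop on parts
without SMT. The usual shape of the idiom as a minimal standalone sketch --
not the kernel export itself:

    #include <atomic>
    #include <cstdint>
    #if defined(__x86_64__) || defined(_M_X64)
    #include <immintrin.h>
    #endif

    void SpinAcquire(std::atomic<uint32_t>& lock) {
      uint32_t expected = 0;
      while (!lock.compare_exchange_weak(expected, 1,
                                         std::memory_order_acquire)) {
        expected = 0;  // compare_exchange_weak overwrites it on failure
    #if defined(__x86_64__) || defined(_M_X64)
        _mm_pause();   // spin-wait hint; roughly a nop without SMT
    #endif
      }
    }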