implement bit-perfect vrsqrtefp

This commit is contained in:
disjtqz 2023-09-30 14:59:56 -04:00 committed by Radosław Gliński
parent cfecdcbeab
commit 79465708aa
9 changed files with 540 additions and 28 deletions

View File

@ -73,6 +73,9 @@ class X64HelperEmitter : public X64Emitter {
void* EmitTryAcquireReservationHelper();
void* EmitReservedStoreHelper(bool bit64 = false);
void* EmitScalarVRsqrteHelper();
void* EmitVectorVRsqrteHelper(void* scalar_helper);
private:
void* EmitCurrentForOffsets(const _code_offsets& offsets,
size_t stack_size = 0);
@ -207,6 +210,8 @@ bool X64Backend::Initialize(Processor* processor) {
if (!code_cache_->Initialize()) {
return false;
}
// Allocate emitter constant data.
emitter_data_ = X64Emitter::PlaceConstData();
// Generate thunks used to transition between jitted code and host code.
XbyakAllocator allocator;
@ -233,7 +238,8 @@ bool X64Backend::Initialize(Processor* processor) {
thunk_emitter.EmitTryAcquireReservationHelper();
reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false);
reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true);
vrsqrtefp_scalar_helper = thunk_emitter.EmitScalarVRsqrteHelper();
vrsqrtefp_vector_helper = thunk_emitter.EmitVectorVRsqrteHelper(vrsqrtefp_scalar_helper);
// Set the code cache to use the ResolveFunction thunk for default
// indirections.
assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull);
@ -243,9 +249,6 @@ bool X64Backend::Initialize(Processor* processor) {
// Allocate some special indirections.
code_cache_->CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF);
// Allocate emitter constant data.
emitter_data_ = X64Emitter::PlaceConstData();
// Setup exception callback
ExceptionHandler::Install(&ExceptionCallbackThunk, this);
if (cvars::record_mmio_access_exceptions) {
@ -844,7 +847,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();
pop(r8); // return address
switch (stack_element_size) {
case 4:
mov(r11d, ptr[r8]);
@ -865,6 +868,300 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
return EmitCurrentForOffsets(code_offsets);
}
void* X64HelperEmitter::EmitScalarVRsqrteHelper() {
_code_offsets code_offsets = {};
Xbyak::Label L18, L2, L35, L4, L9, L8, L10, L11, L12, L13, L1;
Xbyak::Label LC1, _LCPI3_1;
Xbyak::Label handle_denormal_input;
Xbyak::Label specialcheck_1, convert_to_signed_inf_and_ret, handle_oddball_denormal;
auto emulate_lzcnt_helper_unary_reg = [this](auto& reg, auto& scratch_reg) {
inLocalLabel();
Xbyak::Label end_lzcnt;
bsr(scratch_reg, reg);
mov(reg, 0x20);
jz(end_lzcnt);
xor_(scratch_reg, 0x1F);
mov(reg, scratch_reg);
L(end_lzcnt);
outLocalLabel();
};
vmovd(r8d, xmm0);
vmovaps(xmm1, xmm0);
mov(ecx, r8d);
//extract mantissa
and_(ecx, 0x7fffff);
mov(edx, ecx);
cmp(r8d, 0xff800000);
jz(specialcheck_1, CodeGenerator::T_NEAR);
//is exponent zero?
test(r8d, 0x7f800000);
jne(L18);
test(ecx, ecx);
jne(L2);
L(L18);
//extract biased exponent and unbias
mov(r9d, r8d);
shr(r9d, 23);
movzx(r9d, r9b);
lea(eax, ptr[r9 - 127]);
cmp(r9d, 255);
jne(L4);
jmp(L35);
L(L2);
bt(GetBackendFlagsPtr(), kX64BackendNJMOn);
jnc(handle_denormal_input, CodeGenerator::T_NEAR);
// handle denormal input with NJM on
// denorms get converted to zero w/ input sign, jump to our label
// that handles inputs of 0 for this
jmp(convert_to_signed_inf_and_ret);
L(L35);
vxorps(xmm0, xmm0, xmm0);
mov(eax, 128);
vcomiss(xmm1, xmm0);
jb(L4);
test(ecx, ecx);
jne(L8);
ret();
L(L4);
cmp(eax, 128);
jne(L9);
vxorps(xmm0, xmm0, xmm0);
vcomiss(xmm0, xmm1);
jbe(L9);
vmovss(xmm2, ptr[rip+LC1]);
vandps(xmm1, GetXmmConstPtr(XMMSignMaskF32));
test(edx, edx);
jne(L8);
vorps(xmm0, xmm2, xmm2);
ret();
L(L9);
test(edx, edx);
je(L10);
cmp(eax, 128);
jne(L11);
L(L8);
or_(r8d, 0x400000);
vmovd(xmm0, r8d);
ret();
L(L10);
test(r9d, r9d);
jne(L11);
L(convert_to_signed_inf_and_ret);
not_(r8d);
shr(r8d, 31);
lea(rdx, ptr[rip + _LCPI3_1]);
shl(r8d, 2);
vmovss(xmm0, ptr[r8 + rdx]);
ret();
L(L11);
vxorps(xmm2, xmm2, xmm2);
vmovss(xmm0, ptr[rip+LC1]);
vcomiss(xmm2, xmm1);
ja(L1, CodeGenerator::T_NEAR);
mov(ecx, 127);
sal(eax, 4);
sub(ecx, r9d);
mov(r9d, edx);
and_(eax, 16);
shr(edx, 9);
shr(r9d, 19);
and_(edx, 1023);
sar(ecx, 1);
or_(eax, r9d);
xor_(eax, 16);
mov(r9d, ptr[backend()->LookupXMMConstantAddress32(XMMVRsqrteTableStart) +
rax * 4]);
mov(eax, r9d);
shr(r9d, 16);
imul(edx, r9d);
sal(eax, 10);
and_(eax, 0x3fffc00);
sub(eax, edx);
bt(eax, 25);
jc(L12);
mov(edx, eax);
add(ecx, 6);
and_(edx, 0x1ffffff);
if (IsFeatureEnabled(kX64EmitLZCNT)) {
lzcnt(edx, edx);
} else {
emulate_lzcnt_helper_unary_reg(edx, r9d);
}
lea(r9d, ptr[rdx - 6]);
sub(ecx, edx);
if (IsFeatureEnabled(kX64EmitBMI2)) {
shlx(eax, eax, r9d);
} else {
xchg(ecx, r9d);
shl(eax, cl);
xchg(ecx, r9d);
}
L(L12);
test(al, 5);
je(L13);
test(al, 2);
je(L13);
add(eax, 4);
L(L13);
sal(ecx, 23);
and_(r8d, 0x80000000);
shr(eax, 2);
add(ecx, 0x3f800000);
and_(eax, 0x7fffff);
vxorps(xmm1, xmm1);
or_(ecx, r8d);
or_(ecx, eax);
vmovd(xmm0, ecx);
vaddss(xmm0, xmm1);//apply DAZ behavior to output
L(L1);
ret();
L(handle_denormal_input);
mov(r9d, r8d);
and_(r9d, 0x7FFFFFFF);
cmp(r9d, 0x400000);
jz(handle_oddball_denormal);
if (IsFeatureEnabled(kX64EmitLZCNT)) {
lzcnt(ecx, ecx);
} else {
emulate_lzcnt_helper_unary_reg(ecx, r9d);
}
mov(r9d, 9);
mov(eax, -118);
lea(edx, ptr[rcx - 8]);
sub(r9d, ecx);
sub(eax, ecx);
if (IsFeatureEnabled(kX64EmitBMI2)) {
shlx(edx, r8d, edx);
} else {
xchg(ecx, edx);
// r8d still holds xmm0's low word, so we can restore it from there after spoiling it
shl(r8d, cl);
mov(ecx, edx);  // restore ecx; don't xchg because we're going to spoil edx anyway
mov(edx, r8d);
vmovd(r8d, xmm0);
}
and_(edx, 0x7ffffe);
jmp(L4);
L(specialcheck_1);
//should be extremely rare
vmovss(xmm0, ptr[rip+LC1]);
ret();
L(handle_oddball_denormal);
not_(r8d);
lea(r9, ptr[rip + LC1]);
shr(r8d, 31);
movss(xmm0, ptr[r9 + r8 * 4]);
ret();
L(_LCPI3_1);
dd(0xFF800000);
dd(0x7F800000);
L(LC1);
//the position of 0x7FC00000 here matters; this address will be indexed in handle_oddball_denormal
dd(0x7FC00000);
dd(0x5F34FD00);
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.prolog = getSize();
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
return EmitCurrentForOffsets(code_offsets);
}
void* X64HelperEmitter::EmitVectorVRsqrteHelper(void* scalar_helper) {
_code_offsets code_offsets = {};
Xbyak::Label check_scalar_operation_in_vmx, actual_vector_version;
auto result_ptr =
GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_xmms[0]));
auto counter_ptr = GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_u64s[2]));
counter_ptr.setBit(64);
//shuffle and xor to check whether all lanes are equal
//sadly has to leave the float pipeline for the vptest, which is moderate yikes
vmovhlps(xmm2, xmm0, xmm0);
vmovsldup(xmm1, xmm0);
vxorps(xmm1, xmm1, xmm0);
vxorps(xmm2, xmm2, xmm0);
vorps(xmm2, xmm1, xmm2);
vptest(xmm2, xmm2);
jnz(check_scalar_operation_in_vmx);
//jmp(scalar_helper, CodeGenerator::T_NEAR);
call(scalar_helper);
vshufps(xmm0, xmm0, xmm0, 0);
ret();
L(check_scalar_operation_in_vmx);
vptest(xmm0, ptr[backend()->LookupXMMConstantAddress(XMMThreeFloatMask)]);
jnz(actual_vector_version);
vshufps(xmm0, xmm0,xmm0, _MM_SHUFFLE(3, 3, 3, 3));
call(scalar_helper);
// this->DebugBreak();
vinsertps(xmm0, xmm0, (3 << 4) | (0 << 6));
vblendps(xmm0, xmm0, ptr[backend()->LookupXMMConstantAddress(XMMFloatInf)],
0b0111);
ret();
L(actual_vector_version);
xor_(ecx, ecx);
vmovaps(result_ptr, xmm0);
mov(counter_ptr, rcx);
Xbyak::Label loop;
L(loop);
lea(rax, result_ptr);
vmovss(xmm0, ptr[rax+rcx*4]);
call(scalar_helper);
mov(rcx, counter_ptr);
lea(rax, result_ptr);
vmovss(ptr[rax+rcx*4], xmm0);
inc(ecx);
cmp(ecx, 4);
mov(counter_ptr, rcx);
jl(loop);
vmovaps(xmm0, result_ptr);
ret();
code_offsets.prolog_stack_alloc = getSize();
code_offsets.body = getSize();
code_offsets.epilog = getSize();
code_offsets.tail = getSize();
code_offsets.prolog = getSize();
return EmitCurrentForOffsets(code_offsets);
}
void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
_code_offsets code_offsets = {};
code_offsets.prolog = getSize();
@ -872,7 +1169,7 @@ void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
Xbyak::Label already_has_a_reservation;
Xbyak::Label acquire_new_reservation;
btr(GetBackendFlagsPtr(), 1);
btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit);
mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
jc(already_has_a_reservation);
@ -888,7 +1185,7 @@ void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
// set flag on local backend context for thread to indicate our previous
// attempt to get the reservation succeeded
setnc(r9b); // success = bitmap did not have a set bit at the idx
shl(r9b, 1);
shl(r9b, kX64BackendHasReserveBit);
mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
rdx);
@ -917,7 +1214,7 @@ void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) {
Xbyak::Label somehow_double_cleared;
// carry must be set + zero flag must be set
btr(GetBackendFlagsPtr(), 1);
btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit);
jnc(done);
@ -1097,7 +1394,7 @@ void X64Backend::InitializeBackendContext(void* ctx) {
: nullptr;
bctx->current_stackpoint_depth = 0;
bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR;
bctx->flags = 0;
bctx->flags = (1U << kX64BackendNJMOn); // NJM on by default
// https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
bctx->Ox1000 = 0x1000;
bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
@ -1128,7 +1425,9 @@ void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) {
uint32_t control = mode & 7;
_mm_setcsr(mxcsr_table[control]);
bctx->mxcsr_fpu = mxcsr_table[control];
((ppc::PPCContext*)ctx)->fpscr.bits.rn = control;
auto ppc_context = ((ppc::PPCContext*)ctx);
ppc_context->fpscr.bits.rn = control;
ppc_context->fpscr.bits.ni = control >> 2;
}
bool X64Backend::PopulatePseudoStacktrace(GuestPseudoStackTrace* st) {
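Aside: the emulate_lzcnt_helper_unary_reg lambda in EmitScalarVRsqrteHelper above stands in for LZCNT on CPUs without it. A minimal C++ sketch of what it computes, assuming a GCC/Clang-style __builtin_clz (illustrative only, not part of the commit):

#include <cstdint>

// Equivalent of the bsr/xor sequence: bsr finds the index of the highest set
// bit (and leaves ZF set for a zero input); xor with 0x1F converts that index
// into a leading-zero count.
static uint32_t emulated_lzcnt32(uint32_t x) {
  if (x == 0) return 0x20;  // jz(end_lzcnt) path: result is 32
  uint32_t msb_index = 31u - static_cast<uint32_t>(__builtin_clz(x));  // what bsr computes
  return msb_index ^ 0x1F;  // == 31 - msb_index == leading-zero count
}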

View File

@ -61,11 +61,22 @@ struct X64BackendStackpoint {
// use
unsigned guest_return_address_;
};
enum : uint32_t {
kX64BackendMXCSRModeBit = 0,
kX64BackendHasReserveBit = 1,
kX64BackendNJMOn = 2,  // non-Java mode bit is currently set; for use in software fp routines
kX64BackendNonIEEEMode = 3,  // non-IEEE mode is currently enabled for the scalar fpu
};
// located prior to the ctx register
// some things it would be nice to have be per-emulator instance instead of per
// context (somehow placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
union {
__m128 helper_scratch_xmms[4];
uint64_t helper_scratch_u64s[8];
uint32_t helper_scratch_u32s[16];
};
ReserveHelper* reserve_helper_;
uint64_t cached_reserve_value_;
// guest_tick_count is used if inline_loadclock is used
@ -147,6 +158,13 @@ class X64Backend : public Backend {
virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
virtual bool PopulatePseudoStacktrace(GuestPseudoStackTrace* st) override;
void RecordMMIOExceptionForGuestInstruction(void* host_address);
uint32_t LookupXMMConstantAddress32(unsigned index) {
return static_cast<uint32_t>(emitter_data() + sizeof(vec128_t) * index);
}
void* LookupXMMConstantAddress(unsigned index) {
return reinterpret_cast<void*>(emitter_data() + sizeof(vec128_t) * index);
}
#if XE_X64_PROFILER_AVAILABLE == 1
uint64_t* GetProfilerRecordForFunction(uint32_t guest_address);
#endif
@ -173,7 +191,8 @@ class X64Backend : public Backend {
void* try_acquire_reservation_helper_ = nullptr;
void* reserved_store_32_helper = nullptr;
void* reserved_store_64_helper = nullptr;
void* vrsqrtefp_vector_helper = nullptr;
void* vrsqrtefp_scalar_helper = nullptr;
private:
#if XE_X64_PROFILER_AVAILABLE == 1
GuestProfilerData profiler_data_;
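A brief usage sketch for the two new constant-address helpers (hypothetical caller, not from the commit): both resolve emitter_data() + sizeof(vec128_t) * index, and the 32-bit variant is lossless only because PlaceConstData asserts the constant pool sits below 2 GiB.

#include <cassert>
#include <cstdint>

// Hypothetical caller: the emitted vrsqrte helpers address the lookup table
// through these, so table loads need only 32-bit displacements.
void DemoConstantLookup(X64Backend* backend) {
  void* host_ptr = backend->LookupXMMConstantAddress(XMMVRsqrteTableStart);
  uint32_t addr32 = backend->LookupXMMConstantAddress32(XMMVRsqrteTableStart);
  assert(reinterpret_cast<uintptr_t>(host_ptr) == addr32);  // truncation is lossless
}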

View File

@ -982,6 +982,16 @@ static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,
return result;
}
static inline vec128_t v128_setr_words(uint32_t v0, uint32_t v1, uint32_t v2,
uint32_t v3) {
vec128_t result;
result.u32[0] = v0;
result.u32[1] = v1;
result.u32[2] = v2;
result.u32[3] = v3;
return result;
}
static const vec128_t xmm_consts[] = {
/* XMMZero */ vec128f(0.0f),
/* XMMByteSwapMask */
@ -1151,7 +1161,19 @@ static const vec128_t xmm_consts[] = {
vec128b((uint8_t)0x83), /*XMMVSRShlByteshuf*/
v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
// XMMVSRMask
vec128b(1)};
vec128b(1),
//XMMVRsqrteTableStart
v128_setr_words(0x568B4FD, 0x4F3AF97, 0x48DAAA5, 0x435A618),
v128_setr_words(0x3E7A1E4, 0x3A29DFE, 0x3659A5C, 0x32E96F8),
v128_setr_words(0x2FC93CA, 0x2D090CE, 0x2A88DFE, 0x2838B57),
v128_setr_words(0x26188D4, 0x2438673, 0x2268431, 0x20B820B),
v128_setr_words(0x3D27FFA, 0x3807C29, 0x33878AA, 0x2F97572),
v128_setr_words(0x2C27279, 0x2926FB7, 0x2666D26, 0x23F6AC0),
v128_setr_words(0x21D6881, 0x1FD6665, 0x1E16468, 0x1C76287),
v128_setr_words(0x1AF60C1, 0x1995F12, 0x1855D79, 0x1735BF4),
//XMMVRsqrteTableBase
vec128i(0) //filled in later
};
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
for (auto& vec : xmm_consts) {
@ -1223,7 +1245,17 @@ uintptr_t X64Emitter::PlaceConstData() {
// The pointer must not be greater than 31 bits.
assert_zero(reinterpret_cast<uintptr_t>(mem) & ~0x7FFFFFFF);
std::memcpy(mem, xmm_consts, sizeof(xmm_consts));
/*
set each 32-bit element of the constant XMMVRsqrteTableBase to the address of
the start of the constant XMMVRsqrteTableStart
*/
vec128_t* deferred_constants = reinterpret_cast<vec128_t*>(mem);
vec128_t* vrsqrte_table_base = &deferred_constants[XMMVRsqrteTableBase];
uint32_t ptr_to_vrsqrte_table32 = static_cast<uint32_t>(reinterpret_cast<uintptr_t>(&deferred_constants[XMMVRsqrteTableStart]));
*vrsqrte_table_base = vec128i(ptr_to_vrsqrte_table32);
memory::Protect(mem, kConstDataSize, memory::PageAccess::kReadOnly, nullptr);
return reinterpret_cast<uintptr_t>(mem);
@ -1237,8 +1269,9 @@ void X64Emitter::FreeConstData(uintptr_t data) {
Xbyak::Address X64Emitter::GetXmmConstPtr(XmmConst id) {
// Load through fixed constant table setup by PlaceConstData.
// It's important that the pointer is not signed, as it will be sign-extended.
return ptr[reinterpret_cast<void*>(backend_->emitter_data() +
sizeof(vec128_t) * id)];
void* emitter_data_ptr = backend_->LookupXMMConstantAddress(static_cast<unsigned>(id));
xenia_assert(reinterpret_cast<uintptr_t>(emitter_data_ptr) < (1ULL << 31));//must not have signbit set
return ptr[emitter_data_ptr];
}
// Implies possible StashXmm(0, ...)!
void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
@ -1634,9 +1667,9 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
} else { // even if already set, we still need to update flags to reflect
// our mode
if (new_mode == MXCSRMode::Fpu) {
btr(GetBackendFlagsPtr(), 0);
btr(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
} else if (new_mode == MXCSRMode::Vmx) {
bts(GetBackendFlagsPtr(), 0);
bts(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
} else {
assert_unhandled_case(new_mode);
}
@ -1646,11 +1679,11 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
if (!already_set) {
if (new_mode == MXCSRMode::Fpu) {
LoadFpuMxcsrDirect();
btr(GetBackendFlagsPtr(), 0);
btr(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
return true;
} else if (new_mode == MXCSRMode::Vmx) {
LoadVmxMxcsrDirect();
bts(GetBackendFlagsPtr(), 0);
bts(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
return true;
} else {
assert_unhandled_case(new_mode);
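To make the deferred-constant patch concrete, a small verification sketch; it assumes vec128i(x) broadcasts x to all four lanes, as the "filled in later" placeholder suggests (hypothetical helper, not in the commit):

#include <cassert>
#include <cstdint>

// Hypothetical post-PlaceConstData check: every lane of XMMVRsqrteTableBase
// should now hold the 32-bit host address of the first table entry.
void CheckVRsqrteTablePatch(const vec128_t* deferred_constants) {
  uint32_t expected = static_cast<uint32_t>(
      reinterpret_cast<uintptr_t>(&deferred_constants[XMMVRsqrteTableStart]));
  for (int lane = 0; lane < 4; ++lane) {
    assert(deferred_constants[XMMVRsqrteTableBase].u32[lane] == expected);
  }
}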

View File

@ -174,7 +174,9 @@ enum XmmConst {
XMMSTVLShuffle,
XMMSTVRSwapMask, // swapwordmask with bit 7 set
XMMVSRShlByteshuf,
XMMVSRMask
XMMVSRMask,
XMMVRsqrteTableStart,
XMMVRsqrteTableBase = XMMVRsqrteTableStart + (32 / 4), //32 4-byte elements in table, 4 4-byte elements fit in each xmm
};
using amdfx::xopcompare_e;
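As a sketch, the layout relationship encoded by the (32 / 4) above can also be stated as a compile-time check (not part of the commit):

// 32 dword entries at 4 dwords per 16-byte constant slot -> 8 slots, so the
// base-address constant lands 8 entries after the table start.
static_assert(XMMVRsqrteTableBase == XMMVRsqrteTableStart + 8,
              "vrsqrte table occupies eight 16-byte constant slots");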
@ -308,7 +310,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
size_t stack_size() const { return stack_size_; }
SimdDomain DeduceSimdDomain(const hir::Value* for_value);
void ForgetMxcsrMode() { mxcsr_mode_ = MXCSRMode::Unknown; }
/*
returns true if had to load mxcsr. DOT_PRODUCT can use this to skip

View File

@ -3376,17 +3376,28 @@ struct SET_NJM_I8 : Sequence<SET_NJM_I8, I<OPCODE_SET_NJM, VoidOp, I8Op>> {
auto addr_vmx = e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_vmx));
addr_vmx.setBit(32);
auto flags_ptr = e.GetBackendFlagsPtr();
if (i.src1.is_constant) {
if (i.src1.constant() == 0) {
// turn off daz/flush2z
e.mov(addr_vmx, _MM_MASK_MASK);
e.btr(flags_ptr, kX64BackendNJMOn);
} else {
e.mov(addr_vmx, DEFAULT_VMX_MXCSR);
e.bts(flags_ptr, kX64BackendNJMOn);
}
} else {
e.mov(e.eax, flags_ptr);
e.mov(e.edx, 1U << kX64BackendNJMOn);
e.mov(e.ecx, e.edx);
e.not_(e.ecx);
e.and_(e.ecx, e.eax);
e.or_(e.edx, e.eax);
e.test(i.src1, i.src1);
e.cmove(e.edx, e.ecx);
e.mov(flags_ptr, e.edx);
e.mov(e.edx, DEFAULT_VMX_MXCSR);
e.mov(e.eax, _MM_MASK_MASK);
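The non-constant path above flips the NJM backend flag without branching; a hedged C++ sketch of the equivalent logic (names are illustrative):

#include <cstdint>

// Illustrative only: the cmove-based update the non-constant path emits.
uint32_t UpdateNJMFlag(uint32_t flags, uint8_t src1) {
  uint32_t bit = 1u << kX64BackendNJMOn;
  uint32_t flags_set = flags | bit;       // or_(edx, eax)
  uint32_t flags_clear = flags & ~bit;    // not_(ecx); and_(ecx, eax)
  return src1 ? flags_set : flags_clear;  // test(src1, src1) + cmove(edx, ecx)
}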

View File

@ -2123,12 +2123,19 @@ struct RSQRT_V128 : Sequence<RSQRT_V128, I<OPCODE_RSQRT, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
e.ChangeMxcsrMode(MXCSRMode::Vmx);
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3);
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
e.vrsqrt14ps(i.dest, src1);
/*
the vast majority of inputs to vrsqrte come from vmsum3 or vmsum4 as part
of a vector normalization sequence. in fact, it's difficult to find uses of
vrsqrte in titles whose inputs do not come from vmsum.
*/
if (i.src1.value && i.src1.value->AllFloatVectorLanesSameValue()) {
e.vmovss(e.xmm0, src1);
e.call(e.backend()->vrsqrtefp_scalar_helper);
e.vshufps(i.dest, e.xmm0, e.xmm0, 0);
} else {
e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMOne));
e.vsqrtps(e.xmm1, src1);
e.vdivps(i.dest, e.xmm0, e.xmm1);
e.vmovaps(e.xmm0, src1);
e.call(e.backend()->vrsqrtefp_vector_helper);
e.vmovaps(i.dest, e.xmm0);
}
}
};
@ -3183,16 +3190,37 @@ struct SET_ROUNDING_MODE_I32
// removed the AND with 7 and hoisted that AND into the InstrEmit_'s that
// generate OPCODE_SET_ROUNDING_MODE so that it can be constant folded and
// backends don't have to worry about it
auto flags_ptr = e.GetBackendFlagsPtr();
if (i.src1.is_constant) {
e.mov(e.eax, mxcsr_table[i.src1.constant()]);
unsigned constant_value = i.src1.constant();
e.mov(e.eax, mxcsr_table[constant_value]);
if (constant_value & 4) {
e.or_(flags_ptr, 1U << kX64BackendNonIEEEMode);
} else {
e.btr(flags_ptr, kX64BackendNonIEEEMode);
}
e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH], e.eax);
e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.eax);
e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH]);
} else {
e.mov(e.ecx, i.src1);
//could use andnot here, but this is a very infrequently used opcode
e.mov(e.eax, 1U << kX64BackendNonIEEEMode);
e.mov(e.edx, e.eax);
e.not_(e.edx);
e.mov(e.ecx, flags_ptr);
//edx = flags w/ non ieee cleared
e.and_(e.edx, e.ecx);
//eax = flags w/ non ieee set
e.or_(e.eax, e.ecx);
e.bt(i.src1, 2);
e.mov(e.ecx, i.src1);
e.cmovc(e.edx, e.eax);
e.mov(e.rax, uintptr_t(mxcsr_table));
e.mov(flags_ptr, e.edx);
e.mov(e.edx, e.ptr[e.rax + e.rcx * 4]);
// this store was previously missing; it keeps mxcsr_fpu in sync in the non-constant path
e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.edx);
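For orientation, a hedged C++ sketch of the runtime dispatch that RSQRT_V128 plus EmitVectorVRsqrteHelper amount to; scalar_vrsqrte is a stand-in name for the scalar helper, and the real vector helper additionally fast-paths vectors whose low three lanes are +0 by filling them with +infinity:

// Stand-in for the scalar helper's bit-exact result (name is illustrative).
float scalar_vrsqrte(float x);

// Sketch of the per-lane semantics of the emitted vrsqrtefp path.
vec128_t vrsqrtefp_sketch(const vec128_t& v) {
  vec128_t r;
  if (v.u32[0] == v.u32[1] && v.u32[1] == v.u32[2] && v.u32[2] == v.u32[3]) {
    // all lanes identical (e.g. fed by vmsum): one scalar call, then broadcast
    float s = scalar_vrsqrte(v.f32[0]);
    for (int i = 0; i < 4; ++i) r.f32[i] = s;
  } else {
    // general case: the vector helper loops over the four lanes with the scalar helper
    for (int i = 0; i < 4; ++i) r.f32[i] = scalar_vrsqrte(v.f32[i]);
  }
  return r;
}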

View File

@ -1370,6 +1370,38 @@ bool SimplificationPass::SimplifyVectorOps(hir::Instr* i,
}
}
}
/*
splatting a 32-bit value extracted from a vector whose four 32-bit lanes are all
the same should be eliminated; instead use the vector it was extracted from,
which will be identical.
we have seen this happen: some games vmsum and then splat the low float to all
four floats, even though it is already there
*/
if (opc == OPCODE_SPLAT) {
if (i->dest->type == VEC128_TYPE) {
auto splatted_value = i->src1.value;
auto splat_type = splatted_value->type;
if (splat_type == FLOAT32_TYPE || splat_type == INT32_TYPE) {
//it's a splat of a four-byte value, check the definition
auto splat_input_definition = splatted_value->GetDefSkipAssigns();
if (splat_input_definition) {
auto defining_opcode = splat_input_definition->GetOpcodeNum();
if (defining_opcode == OPCODE_EXTRACT) {
auto value_extracted_from = splat_input_definition->src1.value;
if (value_extracted_from->type == VEC128_TYPE) {
xenia_assert(splat_input_definition->dest->type == splat_type);
if (value_extracted_from->AllFloatVectorLanesSameValue()) {
i->Replace(&OPCODE_ASSIGN_info,0);
i->set_src1(value_extracted_from);
return true;
}
}
}
}
}
}
}
return false;
}
bool SimplificationPass::SimplifyVectorOps(hir::HIRBuilder* builder) {
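The identity the new SPLAT rewrite relies on, shown as plain C++ rather than HIR (a sketch using the codebase's vec128_t, not part of the pass):

#include <cstring>

// When every 32-bit lane of v is already equal, splat(extract(v, 0)) rebuilds
// exactly v, so the splat can be replaced by an assign of v itself.
bool SplatOfExtractIsRedundant(const vec128_t& v) {
  vec128_t rebuilt;
  for (int i = 0; i < 4; ++i) rebuilt.u32[i] = v.u32[0];  // splat of lane 0
  return std::memcmp(&rebuilt, &v, sizeof(v)) == 0;       // true when lanes match
}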

View File

@ -1805,6 +1805,86 @@ bool Value::AllUsesByOneInsn() const {
}
return true;
}
bool Value::AllFloatVectorLanesSameValue(const hir::Value* for_value,
uint32_t current_depth) {
// limit recursion, otherwise this function will slow down emission
if (current_depth == 16) {
return false;
}
using namespace hir;
hir::Instr* definition;
Opcode definition_opcode_number;
re_enter:
definition = for_value->def;
if (!definition) {
xenia_assert(for_value->IsConstant());
auto&& constant_value = for_value->constant.v128;
for (unsigned constant_lane_index = 1; constant_lane_index < 4; ++constant_lane_index) {
if (constant_value.u32[0] != constant_value.u32[constant_lane_index]) {
return false;
}
}
return true;
}
definition_opcode_number = definition->GetOpcodeNum();
if (definition_opcode_number == OPCODE_ASSIGN) {
for_value = definition->src1.value;
goto re_enter;
}
if (definition_opcode_number == OPCODE_VECTOR_DENORMFLUSH) {
for_value = definition->src1.value;
goto re_enter;
}
/*
vmsum propagates its result to every lane
*/
if (definition_opcode_number == OPCODE_DOT_PRODUCT_4 ||
definition_opcode_number == OPCODE_DOT_PRODUCT_3) {
return true;
}
//if it is a splat of a 32-bit value type, return true
//technically a splat of int16 or int8 would also produce the same "float" in all lanes,
//but I think it's best to keep this function focused specifically on float data
if (definition_opcode_number == OPCODE_SPLAT) {
if (definition->dest->type == VEC128_TYPE) {
auto splat_src_value_type = definition->src1.value->type;
if (splat_src_value_type == INT32_TYPE ||
splat_src_value_type == FLOAT32_TYPE) {
return true;
}
}
}
switch (definition_opcode_number) {
//all of these opcodes produce the same value for the same input
case OPCODE_RSQRT:
case OPCODE_RECIP:
case OPCODE_POW2:
case OPCODE_LOG2:
for_value = definition->src1.value;
goto re_enter;
//binary opcodes
case OPCODE_ADD:
case OPCODE_SUB:
case OPCODE_MUL:
if (!AllFloatVectorLanesSameValue(definition->src1.value,
current_depth + 1)) {
return false;
}
for_value = definition->src2.value;
goto re_enter;
default:
break;
}
return false;
}
} // namespace hir
} // namespace cpu
} // namespace xe
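Why the ADD/SUB/MUL cases only require both operands to have the property: an element-wise op on two all-equal-lane vectors yields another all-equal-lane vector. A small illustrative sketch (not from the commit):

// Multiply two vectors whose lanes are each all-equal: every output lane gets
// the same a * b, so the all-lanes-equal property is preserved.
vec128_t MulOfAllEqualLanes(float a, float b) {
  vec128_t x, y, r;
  for (int i = 0; i < 4; ++i) {
    x.f32[i] = a;
    y.f32[i] = b;
    r.f32[i] = x.f32[i] * y.f32[i];
  }
  return r;
}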

View File

@ -618,8 +618,16 @@ class Value {
bool MaybeFloaty() const {
return type == FLOAT32_TYPE || type == FLOAT64_TYPE || type == VEC128_TYPE;
}
bool AllFloatVectorLanesSameValue() const {
return Value::AllFloatVectorLanesSameValue(this);
}
private:
/*
returns true if for_value (which must be VEC128_TYPE) has the same value in
every float lane
*/
static bool AllFloatVectorLanesSameValue(const hir::Value* for_value,
uint32_t current_depth = 0);
static bool CompareInt8(Opcode opcode, Value* a, Value* b);
static bool CompareInt16(Opcode opcode, Value* a, Value* b);
static bool CompareInt32(Opcode opcode, Value* a, Value* b);