implement bit-perfect vrsqrtefp
Parent: cfecdcbeab
Commit: 79465708aa
@@ -73,6 +73,9 @@ class X64HelperEmitter : public X64Emitter {
  void* EmitTryAcquireReservationHelper();
  void* EmitReservedStoreHelper(bool bit64 = false);

  void* EmitScalarVRsqrteHelper();
  void* EmitVectorVRsqrteHelper(void* scalar_helper);

 private:
  void* EmitCurrentForOffsets(const _code_offsets& offsets,
                              size_t stack_size = 0);
@@ -207,6 +210,8 @@ bool X64Backend::Initialize(Processor* processor) {
  if (!code_cache_->Initialize()) {
    return false;
  }
  // Allocate emitter constant data.
  emitter_data_ = X64Emitter::PlaceConstData();

  // Generate thunks used to transition between jitted code and host code.
  XbyakAllocator allocator;
@@ -233,7 +238,8 @@ bool X64Backend::Initialize(Processor* processor) {
      thunk_emitter.EmitTryAcquireReservationHelper();
  reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false);
  reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true);

  vrsqrtefp_scalar_helper = thunk_emitter.EmitScalarVRsqrteHelper();
  vrsqrtefp_vector_helper =
      thunk_emitter.EmitVectorVRsqrteHelper(vrsqrtefp_scalar_helper);
  // Set the code cache to use the ResolveFunction thunk for default
  // indirections.
  assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull);
@@ -243,9 +249,6 @@ bool X64Backend::Initialize(Processor* processor) {
  // Allocate some special indirections.
  code_cache_->CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF);

  // Allocate emitter constant data.
  emitter_data_ = X64Emitter::PlaceConstData();

  // Setup exception callback
  ExceptionHandler::Install(&ExceptionCallbackThunk, this);
  if (cvars::record_mmio_access_exceptions) {
@@ -844,7 +847,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
  _code_offsets code_offsets = {};
  code_offsets.prolog = getSize();
  pop(r8);  // return address

  switch (stack_element_size) {
    case 4:
      mov(r11d, ptr[r8]);
@@ -865,6 +868,300 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
  return EmitCurrentForOffsets(code_offsets);
}

void* X64HelperEmitter::EmitScalarVRsqrteHelper() {
  _code_offsets code_offsets = {};

  Xbyak::Label L18, L2, L35, L4, L9, L8, L10, L11, L12, L13, L1;
  Xbyak::Label LC1, _LCPI3_1;
  Xbyak::Label handle_denormal_input;
  Xbyak::Label specialcheck_1, convert_to_signed_inf_and_ret,
      handle_oddball_denormal;

  auto emulate_lzcnt_helper_unary_reg = [this](auto& reg, auto& scratch_reg) {
    inLocalLabel();
    Xbyak::Label end_lzcnt;
    bsr(scratch_reg, reg);
    mov(reg, 0x20);
    jz(end_lzcnt);
    xor_(scratch_reg, 0x1F);
    mov(reg, scratch_reg);
    L(end_lzcnt);
    outLocalLabel();
  };
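  // Editor's note (not part of the original commit): the lambda above emulates
  // LZCNT with BSR. A plain C sketch of the same mapping, using a hypothetical
  // helper name:
  //
  //   uint32_t lzcnt32_via_bsr(uint32_t v) {
  //     if (v == 0) return 32;            // BSR sets ZF for 0, keep the 0x20 default
  //     uint32_t idx = 31;
  //     while (!((v >> idx) & 1)) --idx;  // idx = highest set bit, what BSR returns
  //     return idx ^ 31;                  // same as xor_(scratch_reg, 0x1F)
  //   }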
  vmovd(r8d, xmm0);
  vmovaps(xmm1, xmm0);
  mov(ecx, r8d);
  // extract mantissa
  and_(ecx, 0x7fffff);
  mov(edx, ecx);
  cmp(r8d, 0xff800000);
  jz(specialcheck_1, CodeGenerator::T_NEAR);
  // is exponent zero?
  test(r8d, 0x7f800000);
  jne(L18);
  test(ecx, ecx);
  jne(L2);

  L(L18);
  // extract biased exponent and unbias
  mov(r9d, r8d);
  shr(r9d, 23);
  movzx(r9d, r9b);
  lea(eax, ptr[r9 - 127]);
  cmp(r9d, 255);
  jne(L4);
  jmp(L35);

  L(L2);

  bt(GetBackendFlagsPtr(), kX64BackendNJMOn);
  jnc(handle_denormal_input, CodeGenerator::T_NEAR);

  // handle denormal input with NJM on
  // denorms get converted to zero w/ input sign, jump to our label
  // that handles inputs of 0 for this

  jmp(convert_to_signed_inf_and_ret);
  L(L35);

  vxorps(xmm0, xmm0, xmm0);
  mov(eax, 128);
  vcomiss(xmm1, xmm0);
  jb(L4);
  test(ecx, ecx);
  jne(L8);
  ret();

  L(L4);
  cmp(eax, 128);
  jne(L9);
  vxorps(xmm0, xmm0, xmm0);
  vcomiss(xmm0, xmm1);
  jbe(L9);
  vmovss(xmm2, ptr[rip + LC1]);
  vandps(xmm1, GetXmmConstPtr(XMMSignMaskF32));

  test(edx, edx);
  jne(L8);
  vorps(xmm0, xmm2, xmm2);
  ret();

  L(L9);
  test(edx, edx);
  je(L10);
  cmp(eax, 128);
  jne(L11);
  L(L8);
  or_(r8d, 0x400000);
  vmovd(xmm0, r8d);
  ret();
  L(L10);
  test(r9d, r9d);
  jne(L11);
  L(convert_to_signed_inf_and_ret);
  not_(r8d);
  shr(r8d, 31);

  lea(rdx, ptr[rip + _LCPI3_1]);
  shl(r8d, 2);
  vmovss(xmm0, ptr[r8 + rdx]);
  ret();

  L(L11);
  vxorps(xmm2, xmm2, xmm2);
  vmovss(xmm0, ptr[rip + LC1]);
  vcomiss(xmm2, xmm1);
  ja(L1, CodeGenerator::T_NEAR);
  mov(ecx, 127);
  sal(eax, 4);
  sub(ecx, r9d);
  mov(r9d, edx);
  and_(eax, 16);
  shr(edx, 9);
  shr(r9d, 19);
  and_(edx, 1023);
  sar(ecx, 1);
  or_(eax, r9d);
  xor_(eax, 16);
  mov(r9d, ptr[backend()->LookupXMMConstantAddress32(XMMVRsqrteTableStart) +
               rax * 4]);
  mov(eax, r9d);
  shr(r9d, 16);
  imul(edx, r9d);
  sal(eax, 10);
  and_(eax, 0x3fffc00);
  sub(eax, edx);
  bt(eax, 25);
  jc(L12);
  mov(edx, eax);
  add(ecx, 6);
  and_(edx, 0x1ffffff);

  if (IsFeatureEnabled(kX64EmitLZCNT)) {
    lzcnt(edx, edx);
  } else {
    emulate_lzcnt_helper_unary_reg(edx, r9d);
  }

  lea(r9d, ptr[rdx - 6]);
  sub(ecx, edx);
  if (IsFeatureEnabled(kX64EmitBMI2)) {
    shlx(eax, eax, r9d);
  } else {
    xchg(ecx, r9d);
    shl(eax, cl);
    xchg(ecx, r9d);
  }

  L(L12);
  test(al, 5);
  je(L13);
  test(al, 2);
  je(L13);
  add(eax, 4);

  L(L13);
  sal(ecx, 23);
  and_(r8d, 0x80000000);
  shr(eax, 2);
  add(ecx, 0x3f800000);
  and_(eax, 0x7fffff);
  vxorps(xmm1, xmm1);
  or_(ecx, r8d);
  or_(ecx, eax);
  vmovd(xmm0, ecx);
  vaddss(xmm0, xmm1);  // apply DAZ behavior to output

  L(L1);
  ret();

  L(handle_denormal_input);
  mov(r9d, r8d);
  and_(r9d, 0x7FFFFFFF);
  cmp(r9d, 0x400000);
  jz(handle_oddball_denormal);
  if (IsFeatureEnabled(kX64EmitLZCNT)) {
    lzcnt(ecx, ecx);
  } else {
    emulate_lzcnt_helper_unary_reg(ecx, r9d);
  }

  mov(r9d, 9);
  mov(eax, -118);
  lea(edx, ptr[rcx - 8]);
  sub(r9d, ecx);
  sub(eax, ecx);
  if (IsFeatureEnabled(kX64EmitBMI2)) {
    shlx(edx, r8d, edx);
  } else {
    xchg(ecx, edx);
    // r8d is just the value of xmm0's low word, so we can restore it from there
    shl(r8d, cl);
    mov(ecx, edx);  // restore ecx; don't xchg because we're going to spoil edx anyway
    mov(edx, r8d);
    vmovd(r8d, xmm0);
  }
  and_(edx, 0x7ffffe);
  jmp(L4);

  L(specialcheck_1);
  // should be extremely rare
  vmovss(xmm0, ptr[rip + LC1]);
  ret();

  L(handle_oddball_denormal);
  not_(r8d);
  lea(r9, ptr[rip + LC1]);

  shr(r8d, 31);
  movss(xmm0, ptr[r9 + r8 * 4]);
  ret();

  L(_LCPI3_1);
  dd(0xFF800000);
  dd(0x7F800000);
  L(LC1);
  // the position of 7FC00000 here matters, this address will be indexed in
  // handle_oddball_denormal
  dd(0x7FC00000);
  dd(0x5F34FD00);

  code_offsets.prolog_stack_alloc = getSize();
  code_offsets.body = getSize();
  code_offsets.prolog = getSize();
  code_offsets.epilog = getSize();
  code_offsets.tail = getSize();
  return EmitCurrentForOffsets(code_offsets);
}
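Editor's note, not part of the commit: the L13 tail above rebuilds the result
float from a sign bit, a rebiased exponent in ecx, and the fixed-point estimate
in eax, then adds +0.0f so that DAZ/FTZ in the current MXCSR flushes a denormal
result. A self-contained C++ sketch of that packing, with hypothetical names:

  #include <cstdint>
  #include <cstring>

  // sign_bits: the original input bits (only bit 31 is used)
  // exp_field: the value the helper has accumulated in ecx before sal(ecx, 23)
  // mant_q:    the estimate held in eax before shr(eax, 2)
  float pack_rsqrte_result(uint32_t sign_bits, int32_t exp_field, uint32_t mant_q) {
    uint32_t bits = (static_cast<uint32_t>(exp_field) << 23) + 0x3f800000u;
    bits |= sign_bits & 0x80000000u;    // and_(r8d, 0x80000000); or_(ecx, r8d)
    bits |= (mant_q >> 2) & 0x7fffffu;  // shr(eax, 2); and_(eax, 0x7fffff)
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f + 0.0f;  // vaddss with zero: under DAZ/FTZ a denormal output becomes +/-0
  }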

void* X64HelperEmitter::EmitVectorVRsqrteHelper(void* scalar_helper) {
  _code_offsets code_offsets = {};
  Xbyak::Label check_scalar_operation_in_vmx, actual_vector_version;
  auto result_ptr =
      GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_xmms[0]));
  auto counter_ptr =
      GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_u64s[2]));
  counter_ptr.setBit(64);

  // shuffle and xor to check whether all lanes are equal
  // sadly has to leave the float pipeline for the vptest, which is moderate yikes
  vmovhlps(xmm2, xmm0, xmm0);
  vmovsldup(xmm1, xmm0);
  vxorps(xmm1, xmm1, xmm0);
  vxorps(xmm2, xmm2, xmm0);
  vorps(xmm2, xmm1, xmm2);
  vptest(xmm2, xmm2);
  jnz(check_scalar_operation_in_vmx);
  // jmp(scalar_helper, CodeGenerator::T_NEAR);
  call(scalar_helper);
  vshufps(xmm0, xmm0, xmm0, 0);
  ret();

  L(check_scalar_operation_in_vmx);

  vptest(xmm0, ptr[backend()->LookupXMMConstantAddress(XMMThreeFloatMask)]);
  jnz(actual_vector_version);
  vshufps(xmm0, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
  call(scalar_helper);
  // this->DebugBreak();
  vinsertps(xmm0, xmm0, (3 << 4) | (0 << 6));

  vblendps(xmm0, xmm0, ptr[backend()->LookupXMMConstantAddress(XMMFloatInf)],
           0b0111);

  ret();

  L(actual_vector_version);

  xor_(ecx, ecx);
  vmovaps(result_ptr, xmm0);

  mov(counter_ptr, rcx);
  Xbyak::Label loop;

  L(loop);
  lea(rax, result_ptr);
  vmovss(xmm0, ptr[rax + rcx * 4]);
  call(scalar_helper);
  mov(rcx, counter_ptr);
  lea(rax, result_ptr);
  vmovss(ptr[rax + rcx * 4], xmm0);
  inc(ecx);
  cmp(ecx, 4);
  mov(counter_ptr, rcx);
  jl(loop);
  vmovaps(xmm0, result_ptr);
  ret();
  code_offsets.prolog_stack_alloc = getSize();
  code_offsets.body = getSize();
  code_offsets.epilog = getSize();
  code_offsets.tail = getSize();
  code_offsets.prolog = getSize();
  return EmitCurrentForOffsets(code_offsets);
}
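Editor's note, not part of the commit: the vmovhlps/vmovsldup/vxorps/vptest
prologue above is a bitwise "all four lanes equal" test. An equivalent
standalone check with SSE intrinsics (hypothetical helper name):

  #include <immintrin.h>

  bool all_lanes_bitwise_equal(__m128 v) {
    __m128 hi = _mm_movehl_ps(v, v);   // {v2, v3, v2, v3}        (vmovhlps)
    __m128 dup = _mm_moveldup_ps(v);   // {v0, v0, v2, v2}        (vmovsldup)
    __m128 d0 = _mm_xor_ps(dup, v);    // {0, v0^v1, 0, v2^v3}
    __m128 d1 = _mm_xor_ps(hi, v);     // {v0^v2, v1^v3, 0, 0}
    __m128 any = _mm_or_ps(d0, d1);    // all zero iff every lane matches
    __m128i ai = _mm_castps_si128(any);
    return _mm_testz_si128(ai, ai) != 0;  // vptest: ZF set means no lane differed
  }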

void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
  _code_offsets code_offsets = {};
  code_offsets.prolog = getSize();
@@ -872,7 +1169,7 @@ void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
  Xbyak::Label already_has_a_reservation;
  Xbyak::Label acquire_new_reservation;

  btr(GetBackendFlagsPtr(), 1);
  btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit);
  mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
  jc(already_has_a_reservation);

@@ -888,7 +1185,7 @@ void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
  // set flag on local backend context for thread to indicate our previous
  // attempt to get the reservation succeeded
  setnc(r9b);  // success = bitmap did not have a set bit at the idx
  shl(r9b, 1);
  shl(r9b, kX64BackendHasReserveBit);

  mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
      rdx);
@@ -917,7 +1214,7 @@ void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) {
  Xbyak::Label somehow_double_cleared;
  // carry must be set + zero flag must be set

  btr(GetBackendFlagsPtr(), 1);
  btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit);

  jnc(done);

@@ -1097,7 +1394,7 @@ void X64Backend::InitializeBackendContext(void* ctx) {
          : nullptr;
  bctx->current_stackpoint_depth = 0;
  bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR;
  bctx->flags = 0;
  bctx->flags = (1U << kX64BackendNJMOn);  // NJM on by default
  // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
  bctx->Ox1000 = 0x1000;
  bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
@@ -1128,7 +1425,9 @@ void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) {
  uint32_t control = mode & 7;
  _mm_setcsr(mxcsr_table[control]);
  bctx->mxcsr_fpu = mxcsr_table[control];
  ((ppc::PPCContext*)ctx)->fpscr.bits.rn = control;
  auto ppc_context = ((ppc::PPCContext*)ctx);
  ppc_context->fpscr.bits.rn = control;
  ppc_context->fpscr.bits.ni = control >> 2;
}
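Editor's note, not part of the commit: the mode value here is masked with 7, so
bits 0-1 appear to carry the rounding control (RN) while bit 2 carries the
non-IEEE flag; control >> 2 therefore mirrors that flag into fpscr.bits.ni, and
the SET_ROUNDING_MODE_I32 sequence further down keys kX64BackendNonIEEEMode off
the same bit with constant_value & 4.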

bool X64Backend::PopulatePseudoStacktrace(GuestPseudoStackTrace* st) {

@@ -61,11 +61,22 @@ struct X64BackendStackpoint {
  // use
  unsigned guest_return_address_;
};
enum : uint32_t {
  kX64BackendMXCSRModeBit = 0,
  kX64BackendHasReserveBit = 1,
  kX64BackendNJMOn = 2,  // Non-Java mode bit is currently set; for use in software FP routines
  kX64BackendNonIEEEMode = 3,  // Non-IEEE mode is currently enabled for the scalar FPU.
};
// located prior to the ctx register
// some things it would be nice to have be per-emulator instance instead of per
// context (somehow placing a global X64BackendCtx prior to membase, so we can
// negatively index the membase reg)
struct X64BackendContext {
  union {
    __m128 helper_scratch_xmms[4];
    uint64_t helper_scratch_u64s[8];
    uint32_t helper_scratch_u32s[16];
  };
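  // Editor's note (not in the commit): the three views alias one 64-byte block,
  // so helper_scratch_u64s[2] is the low half of helper_scratch_xmms[1].
  // EmitVectorVRsqrteHelper keeps its in-flight result in helper_scratch_xmms[0]
  // and its lane counter in helper_scratch_u64s[2], so the two never overlap.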
  ReserveHelper* reserve_helper_;
  uint64_t cached_reserve_value_;
  // guest_tick_count is used if inline_loadclock is used

@@ -147,6 +158,13 @@ class X64Backend : public Backend {
  virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
  virtual bool PopulatePseudoStacktrace(GuestPseudoStackTrace* st) override;
  void RecordMMIOExceptionForGuestInstruction(void* host_address);

  uint32_t LookupXMMConstantAddress32(unsigned index) {
    return static_cast<uint32_t>(emitter_data() + sizeof(vec128_t) * index);
  }
  void* LookupXMMConstantAddress(unsigned index) {
    return reinterpret_cast<void*>(emitter_data() + sizeof(vec128_t) * index);
  }
#if XE_X64_PROFILER_AVAILABLE == 1
  uint64_t* GetProfilerRecordForFunction(uint32_t guest_address);
#endif
@@ -173,7 +191,8 @@ class X64Backend : public Backend {
  void* try_acquire_reservation_helper_ = nullptr;
  void* reserved_store_32_helper = nullptr;
  void* reserved_store_64_helper = nullptr;

  void* vrsqrtefp_vector_helper = nullptr;
  void* vrsqrtefp_scalar_helper = nullptr;

 private:
#if XE_X64_PROFILER_AVAILABLE == 1
  GuestProfilerData profiler_data_;

@@ -982,6 +982,16 @@ static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,
  return result;
}

static inline vec128_t v128_setr_words(uint32_t v0, uint32_t v1, uint32_t v2,
                                       uint32_t v3) {
  vec128_t result;
  result.u32[0] = v0;
  result.u32[1] = v1;
  result.u32[2] = v2;
  result.u32[3] = v3;
  return result;
}

static const vec128_t xmm_consts[] = {
    /* XMMZero */ vec128f(0.0f),
    /* XMMByteSwapMask */
@@ -1151,7 +1161,19 @@ static const vec128_t xmm_consts[] = {
    vec128b((uint8_t)0x83), /*XMMVSRShlByteshuf*/
    v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
    // XMMVSRMask
    vec128b(1)};
    vec128b(1),
    // XMMVRsqrteTableStart
    v128_setr_words(0x568B4FD, 0x4F3AF97, 0x48DAAA5, 0x435A618),
    v128_setr_words(0x3E7A1E4, 0x3A29DFE, 0x3659A5C, 0x32E96F8),
    v128_setr_words(0x2FC93CA, 0x2D090CE, 0x2A88DFE, 0x2838B57),
    v128_setr_words(0x26188D4, 0x2438673, 0x2268431, 0x20B820B),
    v128_setr_words(0x3D27FFA, 0x3807C29, 0x33878AA, 0x2F97572),
    v128_setr_words(0x2C27279, 0x2926FB7, 0x2666D26, 0x23F6AC0),
    v128_setr_words(0x21D6881, 0x1FD6665, 0x1E16468, 0x1C76287),
    v128_setr_words(0x1AF60C1, 0x1995F12, 0x1855D79, 0x1735BF4),
    // XMMVRsqrteTableBase
    vec128i(0)  // filled in later
};

void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
  for (auto& vec : xmm_consts) {
@@ -1223,7 +1245,17 @@ uintptr_t X64Emitter::PlaceConstData() {

  // The pointer must not be greater than 31 bits.
  assert_zero(reinterpret_cast<uintptr_t>(mem) & ~0x7FFFFFFF);

  std::memcpy(mem, xmm_consts, sizeof(xmm_consts));
  /*
    Set each 32-bit element of the constant XMMVRsqrteTableBase to be the
    address of the start of the constant XMMVRsqrteTableStart.
  */
  vec128_t* deferred_constants = reinterpret_cast<vec128_t*>(mem);
  vec128_t* vrsqrte_table_base = &deferred_constants[XMMVRsqrteTableBase];
  uint32_t ptr_to_vrsqrte_table32 = static_cast<uint32_t>(
      reinterpret_cast<uintptr_t>(&deferred_constants[XMMVRsqrteTableStart]));
  *vrsqrte_table_base = vec128i(ptr_to_vrsqrte_table32);

  memory::Protect(mem, kConstDataSize, memory::PageAccess::kReadOnly, nullptr);

  return reinterpret_cast<uintptr_t>(mem);
@@ -1237,8 +1269,9 @@ void X64Emitter::FreeConstData(uintptr_t data) {
Xbyak::Address X64Emitter::GetXmmConstPtr(XmmConst id) {
  // Load through fixed constant table setup by PlaceConstData.
  // It's important that the pointer is not signed, as it will be sign-extended.
  return ptr[reinterpret_cast<void*>(backend_->emitter_data() +
                                     sizeof(vec128_t) * id)];
  void* emitter_data_ptr =
      backend_->LookupXMMConstantAddress(static_cast<unsigned>(id));
  xenia_assert(reinterpret_cast<uintptr_t>(emitter_data_ptr) <
               (1ULL << 31));  // must not have signbit set
  return ptr[emitter_data_ptr];
}
// Implies possible StashXmm(0, ...)!
void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
@@ -1634,9 +1667,9 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
  } else {  // even if already set, we still need to update flags to reflect
            // our mode
    if (new_mode == MXCSRMode::Fpu) {
      btr(GetBackendFlagsPtr(), 0);
      btr(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
    } else if (new_mode == MXCSRMode::Vmx) {
      bts(GetBackendFlagsPtr(), 0);
      bts(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
    } else {
      assert_unhandled_case(new_mode);
    }
@@ -1646,11 +1679,11 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
  if (!already_set) {
    if (new_mode == MXCSRMode::Fpu) {
      LoadFpuMxcsrDirect();
      btr(GetBackendFlagsPtr(), 0);
      btr(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
      return true;
    } else if (new_mode == MXCSRMode::Vmx) {
      LoadVmxMxcsrDirect();
      bts(GetBackendFlagsPtr(), 0);
      bts(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
      return true;
    } else {
      assert_unhandled_case(new_mode);

@@ -174,7 +174,9 @@ enum XmmConst {
  XMMSTVLShuffle,
  XMMSTVRSwapMask,  // swapwordmask with bit 7 set
  XMMVSRShlByteshuf,
  XMMVSRMask
  XMMVSRMask,
  XMMVRsqrteTableStart,
  // 32 4-byte elements in the table; 4 4-byte elements fit in each xmm
  XMMVRsqrteTableBase = XMMVRsqrteTableStart + (32 / 4),

};
using amdfx::xopcompare_e;
@@ -308,7 +310,7 @@ class X64Emitter : public Xbyak::CodeGenerator {

  size_t stack_size() const { return stack_size_; }
  SimdDomain DeduceSimdDomain(const hir::Value* for_value);

  void ForgetMxcsrMode() { mxcsr_mode_ = MXCSRMode::Unknown; }
  /*
    returns true if had to load mxcsr. DOT_PRODUCT can use this to skip
@@ -3376,17 +3376,28 @@ struct SET_NJM_I8 : Sequence<SET_NJM_I8, I<OPCODE_SET_NJM, VoidOp, I8Op>> {
    auto addr_vmx = e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_vmx));

    addr_vmx.setBit(32);
    auto flags_ptr = e.GetBackendFlagsPtr();
    if (i.src1.is_constant) {
      if (i.src1.constant() == 0) {
        // turn off daz/flush2z
        e.mov(addr_vmx, _MM_MASK_MASK);
        e.btr(flags_ptr, kX64BackendNJMOn);

      } else {
        e.mov(addr_vmx, DEFAULT_VMX_MXCSR);
        e.bts(flags_ptr, kX64BackendNJMOn);
      }

    } else {
      e.mov(e.eax, flags_ptr);
      e.mov(e.edx, 1U << kX64BackendNJMOn);
      e.mov(e.ecx, e.edx);
      e.not_(e.ecx);
      e.and_(e.ecx, e.eax);
      e.or_(e.edx, e.eax);
      e.test(i.src1, i.src1);
      e.cmove(e.edx, e.ecx);
      e.mov(flags_ptr, e.edx);
      e.mov(e.edx, DEFAULT_VMX_MXCSR);
      e.mov(e.eax, _MM_MASK_MASK);

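Editor's note, not part of the commit: the non-constant path above computes
both candidate flag words and lets cmove pick one, so there is no branch on the
guest value. The same idea in plain C++, with hypothetical names:

  #include <cstdint>

  uint32_t select_njm_flags(uint32_t flags, uint32_t bit_mask, uint8_t guest_value) {
    uint32_t with_bit = flags | bit_mask;      // e.or_(e.edx, e.eax)
    uint32_t without_bit = flags & ~bit_mask;  // e.not_(e.ecx); e.and_(e.ecx, e.eax)
    // e.test(...) + e.cmove(...): take the cleared copy when the guest value is zero
    return guest_value ? with_bit : without_bit;
  }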
@@ -2123,12 +2123,19 @@ struct RSQRT_V128 : Sequence<RSQRT_V128, I<OPCODE_RSQRT, V128Op, V128Op>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    e.ChangeMxcsrMode(MXCSRMode::Vmx);
    Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3);
    if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
      e.vrsqrt14ps(i.dest, src1);
    /*
      the vast majority of inputs to vrsqrte come from vmsum3 or vmsum4 as part
      of a vector normalization sequence. in fact, it's difficult to find uses
      of vrsqrte in titles that have inputs which do not come from vmsum.
    */
    if (i.src1.value && i.src1.value->AllFloatVectorLanesSameValue()) {
      e.vmovss(e.xmm0, src1);
      e.call(e.backend()->vrsqrtefp_scalar_helper);
      e.vshufps(i.dest, e.xmm0, e.xmm0, 0);
    } else {
      e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMOne));
      e.vsqrtps(e.xmm1, src1);
      e.vdivps(i.dest, e.xmm0, e.xmm1);
      e.vmovaps(e.xmm0, src1);
      e.call(e.backend()->vrsqrtefp_vector_helper);
      e.vmovaps(i.dest, e.xmm0);
    }
  }
};
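Editor's note, not part of the commit: judging by the hunk counts, the
vrsqrt14ps branch and the XMMOne/vsqrtps/vdivps fallback read as the removed
side of this hunk, and the new emission reduces to roughly this dispatch
(pseudocode, names as above):

  // if src1 provably has the same float in all four lanes:
  //   xmm0 = low lane of src1; call vrsqrtefp_scalar_helper; dest = splat(xmm0)
  // else:
  //   xmm0 = src1;             call vrsqrtefp_vector_helper; dest = xmm0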

@@ -3183,16 +3190,37 @@ struct SET_ROUNDING_MODE_I32
    // removed the And with 7 and hoisted that and into the InstrEmit_'s that
    // generate OPCODE_SET_ROUNDING_MODE so that it can be constant folded and
    // backends don't have to worry about it
    auto flags_ptr = e.GetBackendFlagsPtr();
    if (i.src1.is_constant) {
      e.mov(e.eax, mxcsr_table[i.src1.constant()]);
      unsigned constant_value = i.src1.constant();
      e.mov(e.eax, mxcsr_table[constant_value]);

      if (constant_value & 4) {
        e.or_(flags_ptr, 1U << kX64BackendNonIEEEMode);
      } else {
        e.btr(flags_ptr, kX64BackendNonIEEEMode);
      }
      e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH], e.eax);
      e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.eax);
      e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH]);

    } else {
      e.mov(e.ecx, i.src1);
      // can andnot, but this is a very infrequently used opcode
      e.mov(e.eax, 1U << kX64BackendNonIEEEMode);
      e.mov(e.edx, e.eax);
      e.not_(e.edx);
      e.mov(e.ecx, flags_ptr);
      // edx = flags w/ non ieee cleared
      e.and_(e.edx, e.ecx);
      // eax = flags w/ non ieee set
      e.or_(e.eax, e.ecx);
      e.bt(i.src1, 2);

      e.mov(e.ecx, i.src1);
      e.cmovc(e.edx, e.eax);
      e.mov(e.rax, uintptr_t(mxcsr_table));
      e.mov(flags_ptr, e.edx);
      e.mov(e.edx, e.ptr[e.rax + e.rcx * 4]);
      // this was not here
      e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.edx);

@@ -1370,6 +1370,38 @@ bool SimplificationPass::SimplifyVectorOps(hir::Instr* i,
      }
    }
  }

  /*
    Splatting a 32-bit value extracted from a vector where all four 32-bit
    lanes are the same should be eliminated; use the vector it was extracted
    from instead, which will be identical. We have seen this happen: some
    games vmsum and then splat the low float to all four floats, even though
    it is already there.
  */
  if (opc == OPCODE_SPLAT) {
    if (i->dest->type == VEC128_TYPE) {
      auto splatted_value = i->src1.value;
      auto splat_type = splatted_value->type;
      if (splat_type == FLOAT32_TYPE || splat_type == INT32_TYPE) {
        // it's a splat of a four-byte value, check the definition
        auto splat_input_definition = splatted_value->GetDefSkipAssigns();
        if (splat_input_definition) {
          auto defining_opcode = splat_input_definition->GetOpcodeNum();
          if (defining_opcode == OPCODE_EXTRACT) {
            auto value_extracted_from = splat_input_definition->src1.value;
            if (value_extracted_from->type == VEC128_TYPE) {

              xenia_assert(splat_input_definition->dest->type == splat_type);

              if (value_extracted_from->AllFloatVectorLanesSameValue()) {
                i->Replace(&OPCODE_ASSIGN_info, 0);
                i->set_src1(value_extracted_from);
                return true;
              }
            }
          }
        }
      }
    }
  }
  return false;
}
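Editor's illustration, not from the commit (hypothetical HIR value names): the
pattern this new check rewrites looks like

  // before:
  //   v10 = dot_product_3 v1, v2     ; vmsum3 already broadcasts to every lane
  //   v11 = extract v10, 0, float32
  //   v12 = splat v11                ; vec128
  // after:
  //   v12 = assign v10               ; the splat of the extracted lane is redundant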
bool SimplificationPass::SimplifyVectorOps(hir::HIRBuilder* builder) {

@@ -1805,6 +1805,86 @@ bool Value::AllUsesByOneInsn() const {
    }
  }
  return true;
}
bool Value::AllFloatVectorLanesSameValue(const hir::Value* for_value,
                                         uint32_t current_depth) {
  // limit recursion, otherwise this function will slow down emission
  if (current_depth == 16) {
    return false;
  }
  using namespace hir;
  hir::Instr* definition;
  Opcode definition_opcode_number;
re_enter:
  definition = for_value->def;
  if (!definition) {
    xenia_assert(for_value->IsConstant());

    auto&& constant_value = for_value->constant.v128;
    for (unsigned constant_lane_index = 1; constant_lane_index < 4;
         ++constant_lane_index) {
      if (constant_value.u32[0] != constant_value.u32[constant_lane_index]) {
        return false;
      }
    }
    return true;
  }
  definition_opcode_number = definition->GetOpcodeNum();

  if (definition_opcode_number == OPCODE_ASSIGN) {
    for_value = definition->src1.value;
    goto re_enter;
  }

  if (definition_opcode_number == OPCODE_VECTOR_DENORMFLUSH) {
    for_value = definition->src1.value;
    goto re_enter;
  }
  /*
    vmsum propagates its result to every lane
  */
  if (definition_opcode_number == OPCODE_DOT_PRODUCT_4 ||
      definition_opcode_number == OPCODE_DOT_PRODUCT_3) {
    return true;
  }
  // if it is a splat of a 32-bit value type, return true
  // technically a splat of int16 or int8 would also produce the same "float"
  // in all lanes, but I think it's best to keep this function focused
  // specifically on float data
  if (definition_opcode_number == OPCODE_SPLAT) {
    if (definition->dest->type == VEC128_TYPE) {
      auto splat_src_value_type = definition->src1.value->type;
      if (splat_src_value_type == INT32_TYPE ||
          splat_src_value_type == FLOAT32_TYPE) {
        return true;
      }
    }
  }

  switch (definition_opcode_number) {
    // all of these opcodes produce the same value for the same input
    case OPCODE_RSQRT:
    case OPCODE_RECIP:
    case OPCODE_POW2:
    case OPCODE_LOG2:
      for_value = definition->src1.value;
      goto re_enter;

    // binary opcodes
    case OPCODE_ADD:
    case OPCODE_SUB:
    case OPCODE_MUL:
      if (!AllFloatVectorLanesSameValue(definition->src1.value,
                                        current_depth + 1)) {
        return false;
      }
      for_value = definition->src2.value;
      goto re_enter;
    default:
      break;
  }

  return false;
}
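Editor's note, not part of the commit: a quick summary of what the recursion
above accepts, matching its branches in order (hypothetical inputs):

  // vec128 constant {2.0f, 2.0f, 2.0f, 2.0f}        -> true (identical u32 lanes)
  // result of OPCODE_DOT_PRODUCT_3 / _4             -> true (vmsum broadcasts)
  // OPCODE_SPLAT of an int32 or float32             -> true
  // OPCODE_ADD/SUB/MUL of two qualifying vectors    -> true (checked recursively)
  // anything else, e.g. a plain vector load         -> false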

}  // namespace hir
}  // namespace cpu
}  // namespace xe

@@ -618,8 +618,16 @@ class Value {
  bool MaybeFloaty() const {
    return type == FLOAT32_TYPE || type == FLOAT64_TYPE || type == VEC128_TYPE;
  }

  bool AllFloatVectorLanesSameValue() const {
    return Value::AllFloatVectorLanesSameValue(this);
  }

 private:
  /*
    returns true if for_value (which must be VEC128_TYPE) has the same value in
    every float lane
  */
  static bool AllFloatVectorLanesSameValue(const hir::Value* for_value,
                                           uint32_t current_depth = 0);
  static bool CompareInt8(Opcode opcode, Value* a, Value* b);
  static bool CompareInt16(Opcode opcode, Value* a, Value* b);
  static bool CompareInt32(Opcode opcode, Value* a, Value* b);