implement bit-perfect vrsqrtefp

parent cfecdcbeab
commit 79465708aa
@@ -73,6 +73,9 @@ class X64HelperEmitter : public X64Emitter {
   void* EmitTryAcquireReservationHelper();
   void* EmitReservedStoreHelper(bool bit64 = false);

+  void* EmitScalarVRsqrteHelper();
+  void* EmitVectorVRsqrteHelper(void* scalar_helper);
+
  private:
   void* EmitCurrentForOffsets(const _code_offsets& offsets,
                               size_t stack_size = 0);
@@ -207,6 +210,8 @@ bool X64Backend::Initialize(Processor* processor) {
   if (!code_cache_->Initialize()) {
     return false;
   }
+  // Allocate emitter constant data.
+  emitter_data_ = X64Emitter::PlaceConstData();

   // Generate thunks used to transition between jitted code and host code.
   XbyakAllocator allocator;
@@ -233,7 +238,8 @@ bool X64Backend::Initialize(Processor* processor) {
       thunk_emitter.EmitTryAcquireReservationHelper();
   reserved_store_32_helper = thunk_emitter.EmitReservedStoreHelper(false);
   reserved_store_64_helper = thunk_emitter.EmitReservedStoreHelper(true);
+  vrsqrtefp_scalar_helper = thunk_emitter.EmitScalarVRsqrteHelper();
+  vrsqrtefp_vector_helper = thunk_emitter.EmitVectorVRsqrteHelper(vrsqrtefp_scalar_helper);
   // Set the code cache to use the ResolveFunction thunk for default
   // indirections.
   assert_zero(uint64_t(resolve_function_thunk_) & 0xFFFFFFFF00000000ull);
@@ -243,9 +249,6 @@ bool X64Backend::Initialize(Processor* processor) {
   // Allocate some special indirections.
   code_cache_->CommitExecutableRange(0x9FFF0000, 0x9FFFFFFF);

-  // Allocate emitter constant data.
-  emitter_data_ = X64Emitter::PlaceConstData();
-
   // Setup exception callback
   ExceptionHandler::Install(&ExceptionCallbackThunk, this);
   if (cvars::record_mmio_access_exceptions) {
@@ -844,7 +847,7 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
   _code_offsets code_offsets = {};
   code_offsets.prolog = getSize();
   pop(r8);  // return address

   switch (stack_element_size) {
     case 4:
       mov(r11d, ptr[r8]);
@@ -865,6 +868,300 @@ void* X64HelperEmitter::EmitGuestAndHostSynchronizeStackSizeLoadThunk(
   return EmitCurrentForOffsets(code_offsets);
 }

+void* X64HelperEmitter::EmitScalarVRsqrteHelper() {
+  _code_offsets code_offsets = {};
+
+  Xbyak::Label L18, L2, L35, L4, L9, L8, L10, L11, L12, L13, L1;
+  Xbyak::Label LC1, _LCPI3_1;
+  Xbyak::Label handle_denormal_input;
+  Xbyak::Label specialcheck_1, convert_to_signed_inf_and_ret, handle_oddball_denormal;
+
+  auto emulate_lzcnt_helper_unary_reg = [this](auto& reg, auto& scratch_reg) {
+    inLocalLabel();
+    Xbyak::Label end_lzcnt;
+    bsr(scratch_reg, reg);
+    mov(reg, 0x20);
+    jz(end_lzcnt);
+    xor_(scratch_reg, 0x1F);
+    mov(reg, scratch_reg);
+    L(end_lzcnt);
+    outLocalLabel();
+  };
+
+  vmovd(r8d, xmm0);
+  vmovaps(xmm1, xmm0);
+  mov(ecx, r8d);
+  // extract mantissa
+  and_(ecx, 0x7fffff);
+  mov(edx, ecx);
+  cmp(r8d, 0xff800000);
+  jz(specialcheck_1, CodeGenerator::T_NEAR);
+  // is exponent zero?
+  test(r8d, 0x7f800000);
+  jne(L18);
+  test(ecx, ecx);
+  jne(L2);
+
+  L(L18);
+  // extract biased exponent and unbias
+  mov(r9d, r8d);
+  shr(r9d, 23);
+  movzx(r9d, r9b);
+  lea(eax, ptr[r9 - 127]);
+  cmp(r9d, 255);
+  jne(L4);
+  jmp(L35);
+
+  L(L2);
+  bt(GetBackendFlagsPtr(), kX64BackendNJMOn);
+  jnc(handle_denormal_input, CodeGenerator::T_NEAR);
+
+  // handle denormal input with NJM on
+  // denorms get converted to zero w/ input sign, jump to our label
+  // that handles inputs of 0 for this
+  jmp(convert_to_signed_inf_and_ret);
+
+  L(L35);
+  vxorps(xmm0, xmm0, xmm0);
+  mov(eax, 128);
+  vcomiss(xmm1, xmm0);
+  jb(L4);
+  test(ecx, ecx);
+  jne(L8);
+  ret();
+
+  L(L4);
+  cmp(eax, 128);
+  jne(L9);
+  vxorps(xmm0, xmm0, xmm0);
+  vcomiss(xmm0, xmm1);
+  jbe(L9);
+  vmovss(xmm2, ptr[rip + LC1]);
+  vandps(xmm1, GetXmmConstPtr(XMMSignMaskF32));
+
+  test(edx, edx);
+  jne(L8);
+  vorps(xmm0, xmm2, xmm2);
+  ret();
+
+  L(L9);
+  test(edx, edx);
+  je(L10);
+  cmp(eax, 128);
+  jne(L11);
+  L(L8);
+  or_(r8d, 0x400000);
+  vmovd(xmm0, r8d);
+  ret();
+  L(L10);
+  test(r9d, r9d);
+  jne(L11);
+  L(convert_to_signed_inf_and_ret);
+  not_(r8d);
+  shr(r8d, 31);
+
+  lea(rdx, ptr[rip + _LCPI3_1]);
+  shl(r8d, 2);
+  vmovss(xmm0, ptr[r8 + rdx]);
+  ret();
+
+  L(L11);
+  vxorps(xmm2, xmm2, xmm2);
+  vmovss(xmm0, ptr[rip + LC1]);
+  vcomiss(xmm2, xmm1);
+  ja(L1, CodeGenerator::T_NEAR);
+  mov(ecx, 127);
+  sal(eax, 4);
+  sub(ecx, r9d);
+  mov(r9d, edx);
+  and_(eax, 16);
+  shr(edx, 9);
+  shr(r9d, 19);
+  and_(edx, 1023);
+  sar(ecx, 1);
+  or_(eax, r9d);
+  xor_(eax, 16);
+  mov(r9d, ptr[backend()->LookupXMMConstantAddress32(XMMVRsqrteTableStart) +
+               rax * 4]);
+  mov(eax, r9d);
+  shr(r9d, 16);
+  imul(edx, r9d);
+  sal(eax, 10);
+  and_(eax, 0x3fffc00);
+  sub(eax, edx);
+  bt(eax, 25);
+  jc(L12);
+  mov(edx, eax);
+  add(ecx, 6);
+  and_(edx, 0x1ffffff);
+
+  if (IsFeatureEnabled(kX64EmitLZCNT)) {
+    lzcnt(edx, edx);
+  } else {
+    emulate_lzcnt_helper_unary_reg(edx, r9d);
+  }
+
+  lea(r9d, ptr[rdx - 6]);
+  sub(ecx, edx);
+  if (IsFeatureEnabled(kX64EmitBMI2)) {
+    shlx(eax, eax, r9d);
+  } else {
+    xchg(ecx, r9d);
+    shl(eax, cl);
+    xchg(ecx, r9d);
+  }
+
+  L(L12);
+  test(al, 5);
+  je(L13);
+  test(al, 2);
+  je(L13);
+  add(eax, 4);
+
+  L(L13);
+  sal(ecx, 23);
+  and_(r8d, 0x80000000);
+  shr(eax, 2);
+  add(ecx, 0x3f800000);
+  and_(eax, 0x7fffff);
+  vxorps(xmm1, xmm1);
+  or_(ecx, r8d);
+  or_(ecx, eax);
+  vmovd(xmm0, ecx);
+  vaddss(xmm0, xmm1);  // apply DAZ behavior to output
+
+  L(L1);
+  ret();
+
+  L(handle_denormal_input);
+  mov(r9d, r8d);
+  and_(r9d, 0x7FFFFFFF);
+  cmp(r9d, 0x400000);
+  jz(handle_oddball_denormal);
+  if (IsFeatureEnabled(kX64EmitLZCNT)) {
+    lzcnt(ecx, ecx);
+  } else {
+    emulate_lzcnt_helper_unary_reg(ecx, r9d);
+  }
+
+  mov(r9d, 9);
+  mov(eax, -118);
+  lea(edx, ptr[rcx - 8]);
+  sub(r9d, ecx);
+  sub(eax, ecx);
+  if (IsFeatureEnabled(kX64EmitBMI2)) {
+    shlx(edx, r8d, edx);
+  } else {
+    xchg(ecx, edx);
+    // esi is just the value of xmm0's low word, so we can restore it from there
+    shl(r8d, cl);
+    mov(ecx, edx);  // restore ecx, dont xchg because we're going to spoil edx anyway
+    mov(edx, r8d);
+    vmovd(r8d, xmm0);
+  }
+  and_(edx, 0x7ffffe);
+  jmp(L4);
+
+  L(specialcheck_1);
+  // should be extremely rare
+  vmovss(xmm0, ptr[rip + LC1]);
+  ret();
+
+  L(handle_oddball_denormal);
+  not_(r8d);
+  lea(r9, ptr[rip + LC1]);
+
+  shr(r8d, 31);
+  movss(xmm0, ptr[r9 + r8 * 4]);
+  ret();
+
+  L(_LCPI3_1);
+  dd(0xFF800000);
+  dd(0x7F800000);
+  L(LC1);
+  // the position of 7FC00000 here matters, this address will be indexed in
+  // handle_oddball_denormal
+  dd(0x7FC00000);
+  dd(0x5F34FD00);
+
+  code_offsets.prolog_stack_alloc = getSize();
+  code_offsets.body = getSize();
+  code_offsets.prolog = getSize();
+  code_offsets.epilog = getSize();
+  code_offsets.tail = getSize();
+  return EmitCurrentForOffsets(code_offsets);
+}
+
+void* X64HelperEmitter::EmitVectorVRsqrteHelper(void* scalar_helper) {
+  _code_offsets code_offsets = {};
+  Xbyak::Label check_scalar_operation_in_vmx, actual_vector_version;
+  auto result_ptr =
+      GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_xmms[0]));
+  auto counter_ptr = GetBackendCtxPtr(offsetof(X64BackendContext, helper_scratch_u64s[2]));
+  counter_ptr.setBit(64);
+
+  // shuffle and xor to check whether all lanes are equal
+  // sadly has to leave the float pipeline for the vptest, which is moderate yikes
+  vmovhlps(xmm2, xmm0, xmm0);
+  vmovsldup(xmm1, xmm0);
+  vxorps(xmm1, xmm1, xmm0);
+  vxorps(xmm2, xmm2, xmm0);
+  vorps(xmm2, xmm1, xmm2);
+  vptest(xmm2, xmm2);
+  jnz(check_scalar_operation_in_vmx);
+  // jmp(scalar_helper, CodeGenerator::T_NEAR);
+  call(scalar_helper);
+  vshufps(xmm0, xmm0, xmm0, 0);
+  ret();
+
+  L(check_scalar_operation_in_vmx);
+  vptest(xmm0, ptr[backend()->LookupXMMConstantAddress(XMMThreeFloatMask)]);
+  jnz(actual_vector_version);
+  vshufps(xmm0, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
+  call(scalar_helper);
+  // this->DebugBreak();
+  vinsertps(xmm0, xmm0, (3 << 4) | (0 << 6));
+
+  vblendps(xmm0, xmm0, ptr[backend()->LookupXMMConstantAddress(XMMFloatInf)],
+           0b0111);
+
+  ret();
+
+  L(actual_vector_version);
+
+  xor_(ecx, ecx);
+  vmovaps(result_ptr, xmm0);
+
+  mov(counter_ptr, rcx);
+  Xbyak::Label loop;
+
+  L(loop);
+  lea(rax, result_ptr);
+  vmovss(xmm0, ptr[rax + rcx * 4]);
+  call(scalar_helper);
+  mov(rcx, counter_ptr);
+  lea(rax, result_ptr);
+  vmovss(ptr[rax + rcx * 4], xmm0);
+  inc(ecx);
+  cmp(ecx, 4);
+  mov(counter_ptr, rcx);
+  jl(loop);
+  vmovaps(xmm0, result_ptr);
+  ret();
+  code_offsets.prolog_stack_alloc = getSize();
+  code_offsets.body = getSize();
+  code_offsets.epilog = getSize();
+  code_offsets.tail = getSize();
+  code_offsets.prolog = getSize();
+  return EmitCurrentForOffsets(code_offsets);
+}
+
 void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
   _code_offsets code_offsets = {};
   code_offsets.prolog = getSize();
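Editor's note: the two sketches below restate what the helpers above compute, in plain C++. They are illustrative models only, not code from this commit; all names are invented for the sketch, the table parameter stands in for XMMVRsqrteTableStart, and only the main (positive, normal input) path of the scalar helper is transliterated, with NaN, infinity, zero, negative, and denormal inputs left to the branches shown in the hunk.

// Sketch: C++ transliteration of the scalar helper's table-lookup path.
// Register-level correspondences are noted in comments.
#include <bit>
#include <cstdint>

static uint32_t VRsqrteMainPathModel(uint32_t bits, const uint32_t table[32]) {
  uint32_t sign = bits & 0x80000000u;                      // and_(r8d, 0x80000000)
  uint32_t mantissa = bits & 0x7fffffu;                    // and_(ecx, 0x7fffff)
  uint32_t biased_exp = (bits >> 23) & 0xffu;              // shr/movzx on r9d
  int32_t result_exp = (127 - (int32_t)biased_exp) >> 1;   // sar(ecx, 1)

  // Table index: bit 4 is the inverted parity of the unbiased exponent,
  // bits 0-3 are the top four mantissa bits.
  uint32_t index = ((((biased_exp - 127) << 4) & 16) | (mantissa >> 19)) ^ 16;
  uint32_t entry = table[index];

  // Linear interpolation: low 16 bits of the entry are the base (scaled up by
  // 1024), high 16 bits are the slope applied to the next ten mantissa bits.
  uint32_t frac = (mantissa >> 9) & 1023;
  uint32_t est = ((entry << 10) & 0x3fffc00u) - (entry >> 16) * frac;

  if (!(est & (1u << 25))) {  // bt(eax, 25); jc(L12)
    // Renormalize when the estimate lost its leading bit (est assumed nonzero).
    int32_t lz = std::countl_zero(est & 0x1ffffffu);
    result_exp += 6 - lz;
    est <<= lz - 6;
  }
  if ((est & 5) && (est & 2)) {  // rounding tweak between L12 and L13
    est += 4;
  }
  // Reassemble the float; the emitted code additionally adds +0.0f at the end
  // so the current MXCSR's DAZ setting is applied to the result.
  return sign | (uint32_t)(((uint32_t)result_exp << 23) + 0x3f800000u) |
         ((est >> 2) & 0x7fffffu);
}

The vector helper dispatches between three cases; the following mirrors that order, with scalar_vrsqrte standing in for a call into the scalar helper.

// Sketch: dispatch order of the vector helper, in plain C++.
#include <array>
#include <cstdint>
#include <cstring>
#include <limits>

using vec4f = std::array<float, 4>;

static vec4f VectorVRsqrteModel(vec4f v, float (*scalar_vrsqrte)(float)) {
  auto bits = [](float f) {
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    return u;
  };
  // 1. All four lanes bitwise identical: one scalar call, result splatted.
  if (bits(v[0]) == bits(v[1]) && bits(v[0]) == bits(v[2]) &&
      bits(v[0]) == bits(v[3])) {
    float r = scalar_vrsqrte(v[0]);
    return {r, r, r, r};
  }
  // 2. Lanes 0-2 are all-zero bits ("scalar operation in VMX"): only lane 3 is
  //    meaningful, and since vrsqrte(+0) = +inf the other lanes come from
  //    XMMFloatInf.
  if (bits(v[0]) == 0 && bits(v[1]) == 0 && bits(v[2]) == 0) {
    float inf = std::numeric_limits<float>::infinity();
    return {inf, inf, inf, scalar_vrsqrte(v[3])};
  }
  // 3. General case: spill to scratch and run each lane through the scalar
  //    helper in a loop.
  for (float& lane : v) {
    lane = scalar_vrsqrte(lane);
  }
  return v;
}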
@@ -872,7 +1169,7 @@ void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
   Xbyak::Label already_has_a_reservation;
   Xbyak::Label acquire_new_reservation;

-  btr(GetBackendFlagsPtr(), 1);
+  btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit);
   mov(r8, GetBackendCtxPtr(offsetof(X64BackendContext, reserve_helper_)));
   jc(already_has_a_reservation);

@@ -888,7 +1185,7 @@ void* X64HelperEmitter::EmitTryAcquireReservationHelper() {
   // set flag on local backend context for thread to indicate our previous
   // attempt to get the reservation succeeded
   setnc(r9b);  // success = bitmap did not have a set bit at the idx
-  shl(r9b, 1);
+  shl(r9b, kX64BackendHasReserveBit);

   mov(GetBackendCtxPtr(offsetof(X64BackendContext, cached_reserve_offset)),
       rdx);
@@ -917,7 +1214,7 @@ void* X64HelperEmitter::EmitReservedStoreHelper(bool bit64) {
   Xbyak::Label somehow_double_cleared;
   // carry must be set + zero flag must be set

-  btr(GetBackendFlagsPtr(), 1);
+  btr(GetBackendFlagsPtr(), kX64BackendHasReserveBit);

   jnc(done);

@@ -1097,7 +1394,7 @@ void X64Backend::InitializeBackendContext(void* ctx) {
                       : nullptr;
   bctx->current_stackpoint_depth = 0;
   bctx->mxcsr_vmx = DEFAULT_VMX_MXCSR;
-  bctx->flags = 0;
+  bctx->flags = (1U << kX64BackendNJMOn);  // NJM on by default
   // https://media.discordapp.net/attachments/440280035056943104/1000765256643125308/unknown.png
   bctx->Ox1000 = 0x1000;
   bctx->guest_tick_count = Clock::GetGuestTickCountPointer();
@@ -1128,7 +1425,9 @@ void X64Backend::SetGuestRoundingMode(void* ctx, unsigned int mode) {
   uint32_t control = mode & 7;
   _mm_setcsr(mxcsr_table[control]);
   bctx->mxcsr_fpu = mxcsr_table[control];
-  ((ppc::PPCContext*)ctx)->fpscr.bits.rn = control;
+  auto ppc_context = ((ppc::PPCContext*)ctx);
+  ppc_context->fpscr.bits.rn = control;
+  ppc_context->fpscr.bits.ni = control >> 2;
 }

 bool X64Backend::PopulatePseudoStacktrace(GuestPseudoStackTrace* st) {
@@ -61,11 +61,22 @@ struct X64BackendStackpoint {
   // use
   unsigned guest_return_address_;
 };
+enum : uint32_t {
+  kX64BackendMXCSRModeBit = 0,
+  kX64BackendHasReserveBit = 1,
+  kX64BackendNJMOn = 2,  // non-java mode bit is currently set. for use in software fp routines
+  kX64BackendNonIEEEMode = 3,  // non-ieee mode is currently enabled for scalar fpu.
+};
 // located prior to the ctx register
 // some things it would be nice to have be per-emulator instance instead of per
 // context (somehow placing a global X64BackendCtx prior to membase, so we can
 // negatively index the membase reg)
 struct X64BackendContext {
+  union {
+    __m128 helper_scratch_xmms[4];
+    uint64_t helper_scratch_u64s[8];
+    uint32_t helper_scratch_u32s[16];
+  };
   ReserveHelper* reserve_helper_;
   uint64_t cached_reserve_value_;
   // guest_tick_count is used if inline_loadclock is used
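Editor's note: the scratch union added above is what the vector vrsqrte helper uses for spills. Written out standalone (sketch only, not code from the commit), the aliasing it relies on looks like this: the result vector lives in helper_scratch_xmms[0] (bytes 0-15) while the loop counter lives in helper_scratch_u64s[2] (bytes 16-23), i.e. inside the second xmm slot, so the two never overlap.

// Sketch of the scratch union's layout and the non-overlap the helper assumes.
#include <cstdint>
#include <xmmintrin.h>

union HelperScratch {
  __m128 xmms[4];     // 4 x 16 bytes
  uint64_t u64s[8];   // 8 x 8 bytes
  uint32_t u32s[16];  // 16 x 4 bytes
};
static_assert(sizeof(HelperScratch) == 64,
              "all three views cover the same 64 bytes");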
@@ -147,6 +158,13 @@ class X64Backend : public Backend {
   virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
   virtual bool PopulatePseudoStacktrace(GuestPseudoStackTrace* st) override;
   void RecordMMIOExceptionForGuestInstruction(void* host_address);

+  uint32_t LookupXMMConstantAddress32(unsigned index) {
+    return static_cast<uint32_t>(emitter_data() + sizeof(vec128_t) * index);
+  }
+  void* LookupXMMConstantAddress(unsigned index) {
+    return reinterpret_cast<void*>(emitter_data() + sizeof(vec128_t) * index);
+  }
 #if XE_X64_PROFILER_AVAILABLE == 1
   uint64_t* GetProfilerRecordForFunction(uint32_t guest_address);
 #endif
@@ -173,7 +191,8 @@ class X64Backend : public Backend {
   void* try_acquire_reservation_helper_ = nullptr;
   void* reserved_store_32_helper = nullptr;
   void* reserved_store_64_helper = nullptr;
+  void* vrsqrtefp_vector_helper = nullptr;
+  void* vrsqrtefp_scalar_helper = nullptr;
  private:
 #if XE_X64_PROFILER_AVAILABLE == 1
   GuestProfilerData profiler_data_;
@@ -982,6 +982,16 @@ static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,
   return result;
 }

+static inline vec128_t v128_setr_words(uint32_t v0, uint32_t v1, uint32_t v2,
+                                       uint32_t v3) {
+  vec128_t result;
+  result.u32[0] = v0;
+  result.u32[1] = v1;
+  result.u32[2] = v2;
+  result.u32[3] = v3;
+  return result;
+}
+
 static const vec128_t xmm_consts[] = {
     /* XMMZero */ vec128f(0.0f),
     /* XMMByteSwapMask */
@@ -1151,7 +1161,19 @@ static const vec128_t xmm_consts[] = {
     vec128b((uint8_t)0x83), /*XMMVSRShlByteshuf*/
     v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
     // XMMVSRMask
-    vec128b(1)};
+    vec128b(1),
+    // XMMVRsqrteTableStart
+    v128_setr_words(0x568B4FD, 0x4F3AF97, 0x48DAAA5, 0x435A618),
+    v128_setr_words(0x3E7A1E4, 0x3A29DFE, 0x3659A5C, 0x32E96F8),
+    v128_setr_words(0x2FC93CA, 0x2D090CE, 0x2A88DFE, 0x2838B57),
+    v128_setr_words(0x26188D4, 0x2438673, 0x2268431, 0x20B820B),
+    v128_setr_words(0x3D27FFA, 0x3807C29, 0x33878AA, 0x2F97572),
+    v128_setr_words(0x2C27279, 0x2926FB7, 0x2666D26, 0x23F6AC0),
+    v128_setr_words(0x21D6881, 0x1FD6665, 0x1E16468, 0x1C76287),
+    v128_setr_words(0x1AF60C1, 0x1995F12, 0x1855D79, 0x1735BF4),
+    // XMMVRsqrteTableBase
+    vec128i(0)  // filled in later
+};

 void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
   for (auto& vec : xmm_consts) {
@@ -1223,7 +1245,17 @@ uintptr_t X64Emitter::PlaceConstData() {

   // The pointer must not be greater than 31 bits.
   assert_zero(reinterpret_cast<uintptr_t>(mem) & ~0x7FFFFFFF);

   std::memcpy(mem, xmm_consts, sizeof(xmm_consts));
+  /*
+    set each 32-bit element of the constant XMMVRsqrteTableBase to be the
+    address of the start of the constant XMMVRsqrteTableStart; this has to be
+    patched in at runtime because the table's address is only known once the
+    constant block has been placed
+  */
+  vec128_t* deferred_constants = reinterpret_cast<vec128_t*>(mem);
+  vec128_t* vrsqrte_table_base = &deferred_constants[XMMVRsqrteTableBase];
+  uint32_t ptr_to_vrsqrte_table32 = static_cast<uint32_t>(
+      reinterpret_cast<uintptr_t>(&deferred_constants[XMMVRsqrteTableStart]));
+  *vrsqrte_table_base = vec128i(ptr_to_vrsqrte_table32);

   memory::Protect(mem, kConstDataSize, memory::PageAccess::kReadOnly, nullptr);

   return reinterpret_cast<uintptr_t>(mem);
@@ -1237,8 +1269,9 @@ void X64Emitter::FreeConstData(uintptr_t data) {
 Xbyak::Address X64Emitter::GetXmmConstPtr(XmmConst id) {
   // Load through fixed constant table setup by PlaceConstData.
   // It's important that the pointer is not signed, as it will be sign-extended.
-  return ptr[reinterpret_cast<void*>(backend_->emitter_data() +
-                                     sizeof(vec128_t) * id)];
+  void* emitter_data_ptr = backend_->LookupXMMConstantAddress(static_cast<unsigned>(id));
+  xenia_assert(reinterpret_cast<uintptr_t>(emitter_data_ptr) < (1ULL << 31));  // must not have signbit set
+  return ptr[emitter_data_ptr];
 }
 // Implies possible StashXmm(0, ...)!
 void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
@@ -1634,9 +1667,9 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
   } else {  // even if already set, we still need to update flags to reflect
             // our mode
     if (new_mode == MXCSRMode::Fpu) {
-      btr(GetBackendFlagsPtr(), 0);
+      btr(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
     } else if (new_mode == MXCSRMode::Vmx) {
-      bts(GetBackendFlagsPtr(), 0);
+      bts(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
     } else {
       assert_unhandled_case(new_mode);
     }
@@ -1646,11 +1679,11 @@ bool X64Emitter::ChangeMxcsrMode(MXCSRMode new_mode, bool already_set) {
   if (!already_set) {
     if (new_mode == MXCSRMode::Fpu) {
       LoadFpuMxcsrDirect();
-      btr(GetBackendFlagsPtr(), 0);
+      btr(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
       return true;
     } else if (new_mode == MXCSRMode::Vmx) {
       LoadVmxMxcsrDirect();
-      bts(GetBackendFlagsPtr(), 0);
+      bts(GetBackendFlagsPtr(), kX64BackendMXCSRModeBit);
       return true;
     } else {
       assert_unhandled_case(new_mode);
@@ -174,7 +174,9 @@ enum XmmConst {
   XMMSTVLShuffle,
   XMMSTVRSwapMask,  // swapwordmask with bit 7 set
   XMMVSRShlByteshuf,
-  XMMVSRMask
+  XMMVSRMask,
+  XMMVRsqrteTableStart,
+  XMMVRsqrteTableBase = XMMVRsqrteTableStart + (32 / 4),  // 32 4-byte elements in table, 4 4-byte elements fit in each xmm
 };
 using amdfx::xopcompare_e;
@@ -308,7 +310,7 @@ class X64Emitter : public Xbyak::CodeGenerator {

   size_t stack_size() const { return stack_size_; }
   SimdDomain DeduceSimdDomain(const hir::Value* for_value);

   void ForgetMxcsrMode() { mxcsr_mode_ = MXCSRMode::Unknown; }
   /*
     returns true if had to load mxcsr. DOT_PRODUCT can use this to skip
@@ -3376,17 +3376,28 @@ struct SET_NJM_I8 : Sequence<SET_NJM_I8, I<OPCODE_SET_NJM, VoidOp, I8Op>> {
     auto addr_vmx = e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_vmx));

     addr_vmx.setBit(32);
+    auto flags_ptr = e.GetBackendFlagsPtr();
     if (i.src1.is_constant) {
       if (i.src1.constant() == 0) {
         // turn off daz/flush2z
         e.mov(addr_vmx, _MM_MASK_MASK);
+        e.btr(flags_ptr, kX64BackendNJMOn);
+
       } else {
         e.mov(addr_vmx, DEFAULT_VMX_MXCSR);
+        e.bts(flags_ptr, kX64BackendNJMOn);
       }

     } else {
+      e.mov(e.eax, flags_ptr);
+      e.mov(e.edx, 1U << kX64BackendNJMOn);
+      e.mov(e.ecx, e.edx);
+      e.not_(e.ecx);
+      e.and_(e.ecx, e.eax);
+      e.or_(e.edx, e.eax);
       e.test(i.src1, i.src1);
+      e.cmove(e.edx, e.ecx);
+      e.mov(flags_ptr, e.edx);
       e.mov(e.edx, DEFAULT_VMX_MXCSR);
       e.mov(e.eax, _MM_MASK_MASK);
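Editor's note: the non-constant path above updates the NJM flag without branching. As plain C++ it is a select between the set and cleared flag words; a sketch of the equivalent logic (names invented for illustration):

// Sketch of the branchless flag update: compute both candidate flag words,
// then pick one based on src1 (the emitted code selects with test + cmove).
#include <cstdint>

inline uint32_t UpdateNjmFlag(uint32_t flags, uint8_t src1,
                              uint32_t njm_bit /* kX64BackendNJMOn */) {
  uint32_t with_njm = flags | (1u << njm_bit);      // or_(edx, eax)
  uint32_t without_njm = flags & ~(1u << njm_bit);  // not_ + and_ into ecx
  return src1 ? with_njm : without_njm;             // test + cmove(edx, ecx)
}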
@@ -2123,12 +2123,19 @@ struct RSQRT_V128 : Sequence<RSQRT_V128, I<OPCODE_RSQRT, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     e.ChangeMxcsrMode(MXCSRMode::Vmx);
     Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm3);
-    if (e.IsFeatureEnabled(kX64EmitAVX512Ortho)) {
-      e.vrsqrt14ps(i.dest, src1);
+    /*
+      the vast majority of inputs to vrsqrte come from vmsum3 or vmsum4 as part
+      of a vector normalization sequence. in fact, its difficult to find uses
+      of vrsqrte in titles that have inputs which do not come from vmsum.
+    */
+    if (i.src1.value && i.src1.value->AllFloatVectorLanesSameValue()) {
+      e.vmovss(e.xmm0, src1);
+      e.call(e.backend()->vrsqrtefp_scalar_helper);
+      e.vshufps(i.dest, e.xmm0, e.xmm0, 0);
     } else {
-      e.vmovaps(e.xmm0, e.GetXmmConstPtr(XMMOne));
-      e.vsqrtps(e.xmm1, src1);
-      e.vdivps(i.dest, e.xmm0, e.xmm1);
+      e.vmovaps(e.xmm0, src1);
+      e.call(e.backend()->vrsqrtefp_vector_helper);
+      e.vmovaps(i.dest, e.xmm0);
     }
   }
 };
@@ -3183,16 +3190,37 @@ struct SET_ROUNDING_MODE_I32
     // removed the And with 7 and hoisted that and into the InstrEmit_'s that
     // generate OPCODE_SET_ROUNDING_MODE so that it can be constant folded and
     // backends dont have to worry about it
+    auto flags_ptr = e.GetBackendFlagsPtr();
     if (i.src1.is_constant) {
-      e.mov(e.eax, mxcsr_table[i.src1.constant()]);
+      unsigned constant_value = i.src1.constant();
+      e.mov(e.eax, mxcsr_table[constant_value]);
+
+      if (constant_value & 4) {
+        e.or_(flags_ptr, 1U << kX64BackendNonIEEEMode);
+      } else {
+        e.btr(flags_ptr, kX64BackendNonIEEEMode);
+      }
       e.mov(e.dword[e.rsp + StackLayout::GUEST_SCRATCH], e.eax);
       e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.eax);
       e.vldmxcsr(e.dword[e.rsp + StackLayout::GUEST_SCRATCH]);

     } else {
-      e.mov(e.ecx, i.src1);
+      // can andnot, but this is a very infrequently used opcode
+      e.mov(e.eax, 1U << kX64BackendNonIEEEMode);
+      e.mov(e.edx, e.eax);
+      e.not_(e.edx);
+      e.mov(e.ecx, flags_ptr);
+      // edx = flags w/ non ieee cleared
+      e.and_(e.edx, e.ecx);
+      // eax = flags w/ non ieee set
+      e.or_(e.eax, e.ecx);
+      e.bt(i.src1, 2);
+
+      e.mov(e.ecx, i.src1);
+      e.cmovc(e.edx, e.eax);
       e.mov(e.rax, uintptr_t(mxcsr_table));
+      e.mov(flags_ptr, e.edx);
       e.mov(e.edx, e.ptr[e.rax + e.rcx * 4]);
       // this was not here
       e.mov(e.GetBackendCtxPtr(offsetof(X64BackendContext, mxcsr_fpu)), e.edx);
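Editor's note: the bit being tracked here is bit 2 of the 3-bit guest rounding control, i.e. the FPSCR non-IEEE (NI) bit; SetGuestRoundingMode now writes the same bit into fpscr.bits.ni. A sketch of that mapping, with invented names and an assumed two-bit RN field:

// Sketch: decoding the 3-bit rounding control and mirroring bit 2 into the
// backend's non-IEEE flag (done with bt + cmovc in the emitted code).
#include <cstdint>

struct GuestRoundingState {
  uint32_t rn;  // FPSCR.RN, rounding mode (low two bits of the control)
  uint32_t ni;  // FPSCR.NI, non-IEEE / flush-to-zero mode (bit 2)
};

inline GuestRoundingState DecodeRoundingControl(uint32_t control) {
  control &= 7;
  return {control & 3u, control >> 2};
}

inline uint32_t UpdateNonIeeeFlag(uint32_t flags, uint32_t control,
                                  uint32_t non_ieee_bit) {
  uint32_t set = flags | (1u << non_ieee_bit);
  uint32_t cleared = flags & ~(1u << non_ieee_bit);
  return (control & 4) ? set : cleared;
}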
@@ -1370,6 +1370,38 @@ bool SimplificationPass::SimplifyVectorOps(hir::Instr* i,
       }
     }
   }

+  /*
+    splatting a 32-bit value extracted from a vector where all 4 32-bit values
+    are the same should be eliminated and instead use the vector extracted
+    from, which will be identical. have seen this happen, some games vmsum and
+    then splat the low float to all 4 floats, even though it already is there
+  */
+  if (opc == OPCODE_SPLAT) {
+    if (i->dest->type == VEC128_TYPE) {
+      auto splatted_value = i->src1.value;
+      auto splat_type = splatted_value->type;
+      if (splat_type == FLOAT32_TYPE || splat_type == INT32_TYPE) {
+        // its a splat of a fourbyte value, check the definition
+        auto splat_input_definition = splatted_value->GetDefSkipAssigns();
+        if (splat_input_definition) {
+          auto defining_opcode = splat_input_definition->GetOpcodeNum();
+          if (defining_opcode == OPCODE_EXTRACT) {
+            auto value_extracted_from = splat_input_definition->src1.value;
+            if (value_extracted_from->type == VEC128_TYPE) {
+              xenia_assert(splat_input_definition->dest->type == splat_type);
+
+              if (value_extracted_from->AllFloatVectorLanesSameValue()) {
+                i->Replace(&OPCODE_ASSIGN_info, 0);
+                i->set_src1(value_extracted_from);
+                return true;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
   return false;
 }
 bool SimplificationPass::SimplifyVectorOps(hir::HIRBuilder* builder) {
@@ -1805,6 +1805,86 @@ bool Value::AllUsesByOneInsn() const {
   }
   return true;
 }
+bool Value::AllFloatVectorLanesSameValue(const hir::Value* for_value,
+                                         uint32_t current_depth) {
+  // limit recursion, otherwise this function will slow down emission
+  if (current_depth == 16) {
+    return false;
+  }
+  using namespace hir;
+  hir::Instr* definition;
+  Opcode definition_opcode_number;
+re_enter:
+  definition = for_value->def;
+  if (!definition) {
+    xenia_assert(for_value->IsConstant());
+
+    auto&& constant_value = for_value->constant.v128;
+    for (unsigned constant_lane_index = 1; constant_lane_index < 4;
+         ++constant_lane_index) {
+      if (constant_value.u32[0] != constant_value.u32[constant_lane_index]) {
+        return false;
+      }
+    }
+    return true;
+  }
+  definition_opcode_number = definition->GetOpcodeNum();
+
+  if (definition_opcode_number == OPCODE_ASSIGN) {
+    for_value = definition->src1.value;
+    goto re_enter;
+  }
+
+  if (definition_opcode_number == OPCODE_VECTOR_DENORMFLUSH) {
+    for_value = definition->src1.value;
+    goto re_enter;
+  }
+  /*
+    vmsum propagates its result to every lane
+  */
+  if (definition_opcode_number == OPCODE_DOT_PRODUCT_4 ||
+      definition_opcode_number == OPCODE_DOT_PRODUCT_3) {
+    return true;
+  }
+  // if splat of 32-bit value type, return true
+  // technically a splat of int16 or int8 would also produce the same "float"
+  // in all lanes, but i think its best to keep this function focused on
+  // specifically float data
+  if (definition_opcode_number == OPCODE_SPLAT) {
+    if (definition->dest->type == VEC128_TYPE) {
+      auto splat_src_value_type = definition->src1.value->type;
+      if (splat_src_value_type == INT32_TYPE ||
+          splat_src_value_type == FLOAT32_TYPE) {
+        return true;
+      }
+    }
+  }
+
+  switch (definition_opcode_number) {
+    // all of these opcodes produce the same value for the same input
+    case OPCODE_RSQRT:
+    case OPCODE_RECIP:
+    case OPCODE_POW2:
+    case OPCODE_LOG2:
+      for_value = definition->src1.value;
+      goto re_enter;
+
+    // binary opcodes
+    case OPCODE_ADD:
+    case OPCODE_SUB:
+    case OPCODE_MUL:
+      if (!AllFloatVectorLanesSameValue(definition->src1.value,
+                                        current_depth + 1)) {
+        return false;
+      }
+      for_value = definition->src2.value;
+      goto re_enter;
+    default:
+      break;
+  }
+
+  return false;
+}
+
 }  // namespace hir
 }  // namespace cpu
 }  // namespace xe
@@ -618,8 +618,16 @@ class Value {
   bool MaybeFloaty() const {
     return type == FLOAT32_TYPE || type == FLOAT64_TYPE || type == VEC128_TYPE;
   }
+  bool AllFloatVectorLanesSameValue() const {
+    return Value::AllFloatVectorLanesSameValue(this);
+  }

  private:
+  /*
+    returns true if for_value (which must be VEC128_TYPE) has the same value in
+    every float
+  */
+  static bool AllFloatVectorLanesSameValue(const hir::Value* for_value,
+                                           uint32_t current_depth = 0);
   static bool CompareInt8(Opcode opcode, Value* a, Value* b);
   static bool CompareInt16(Opcode opcode, Value* a, Value* b);
   static bool CompareInt32(Opcode opcode, Value* a, Value* b);