Fix some more constant folding

fabsx does NOT set fpscr
turns out that our vector unsigned compare instructions are a bit weird?
This commit is contained in:
chss95cs@gmail.com 2022-08-21 10:27:54 -07:00
parent 0ebc109d4d
commit b26c6ee1b8
5 changed files with 75 additions and 81 deletions

View File

@ -143,6 +143,12 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
feature_flags_ |= kX64EmitTBM;
}
}
if (amd_flags & (1U << 11)) {
if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
feature_flags_ |= kX64EmitXOP;
XELOGCPU("Cpu support XOP!\n\n");
}
}
if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
bool is_zennish = cpu_.displayFamily >= 0x17;
/*

View File

@ -143,6 +143,7 @@ struct VECTOR_DENORMFLUSH
e.vandps(e.xmm0, i.src1,
e.GetXmmConstPtr(XMMSingleDenormalMask)); // 0.25 P0123
e.vcmpneqps(e.xmm2, e.xmm0, e.xmm1); // 0.5 P01
// todo: xop vpcmov here
e.vandps(e.xmm1, i.src1,
e.GetXmmConstPtr(XMMSignMaskF32)); // 0.5 P0123 take signs, zeros
// must keep their signs
@ -457,68 +458,52 @@ struct VECTOR_COMPARE_UGT_V128
: Sequence<VECTOR_COMPARE_UGT_V128,
I<OPCODE_VECTOR_COMPARE_UGT, V128Op, V128Op, V128Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->flags != FLOAT32_TYPE && e.IsFeatureEnabled(kX64EmitXOP)) {
Xmm src1 = GetInputRegOrConstant(e, i.src1, e.xmm0);
Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1);
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcomub(i.dest, src1, src2, xopcompare_e::GT);
break;
case INT16_TYPE:
e.vpcomuw(i.dest, src1, src2, xopcompare_e::GT);
break;
case INT32_TYPE:
e.vpcomud(i.dest, src1, src2, xopcompare_e::GT);
break;
}
Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy
switch (i.instr->flags) {
case INT8_TYPE:
sign_addr = e.GetXmmConstPtr(XMMSignMaskI8);
break;
case INT16_TYPE:
sign_addr = e.GetXmmConstPtr(XMMSignMaskI16);
break;
case INT32_TYPE:
sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
break;
case FLOAT32_TYPE:
e.ChangeMxcsrMode(MXCSRMode::Vmx);
sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
break;
default:
assert_always();
break;
}
if (i.src1.is_constant) {
// TODO(benvanik): make this constant.
e.LoadConstantXmm(e.xmm0, i.src1.constant());
e.vpxor(e.xmm0, sign_addr);
} else {
Xbyak::Address sign_addr = e.ptr[e.rax]; // dummy
switch (i.instr->flags) {
case INT8_TYPE:
sign_addr = e.GetXmmConstPtr(XMMSignMaskI8);
break;
case INT16_TYPE:
sign_addr = e.GetXmmConstPtr(XMMSignMaskI16);
break;
case INT32_TYPE:
sign_addr = e.GetXmmConstPtr(XMMSignMaskI32);
break;
case FLOAT32_TYPE:
e.ChangeMxcsrMode(MXCSRMode::Vmx);
sign_addr = e.GetXmmConstPtr(XMMSignMaskF32);
break;
default:
assert_always();
break;
}
if (i.src1.is_constant) {
// TODO(benvanik): make this constant.
e.LoadConstantXmm(e.xmm0, i.src1.constant());
e.vpxor(e.xmm0, sign_addr);
} else {
e.vpxor(e.xmm0, i.src1, sign_addr);
}
if (i.src2.is_constant) {
// TODO(benvanik): make this constant.
e.LoadConstantXmm(e.xmm1, i.src2.constant());
e.vpxor(e.xmm1, sign_addr);
} else {
e.vpxor(e.xmm1, i.src2, sign_addr);
}
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcmpgtb(i.dest, e.xmm0, e.xmm1);
break;
case INT16_TYPE:
e.vpcmpgtw(i.dest, e.xmm0, e.xmm1);
break;
case INT32_TYPE:
e.vpcmpgtd(i.dest, e.xmm0, e.xmm1);
break;
case FLOAT32_TYPE:
e.vcmpgtps(i.dest, e.xmm0, e.xmm1);
break;
}
e.vpxor(e.xmm0, i.src1, sign_addr);
}
if (i.src2.is_constant) {
// TODO(benvanik): make this constant.
e.LoadConstantXmm(e.xmm1, i.src2.constant());
e.vpxor(e.xmm1, sign_addr);
} else {
e.vpxor(e.xmm1, i.src2, sign_addr);
}
switch (i.instr->flags) {
case INT8_TYPE:
e.vpcmpgtb(i.dest, e.xmm0, e.xmm1);
break;
case INT16_TYPE:
e.vpcmpgtw(i.dest, e.xmm0, e.xmm1);
break;
case INT32_TYPE:
e.vpcmpgtd(i.dest, e.xmm0, e.xmm1);
break;
case FLOAT32_TYPE:
e.vcmpgtps(i.dest, e.xmm0, e.xmm1);
break;
}
}
};
@ -634,6 +619,7 @@ struct VECTOR_ADD
// overflowed (only need to check one input)
// if (src1 > res) then overflowed
// http://locklessinc.com/articles/sat_arithmetic/
// chrispy: todo - add xop stuff here
e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32));
e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32));
e.vpcmpgtd(e.xmm0, e.xmm2, e.xmm0);

View File

@ -781,11 +781,12 @@ struct SELECT_V128_V128
} else if (mayblend == PermittedBlend::Ps) {
e.vblendvps(i.dest, src2, src3, src1);
} else {
if (e.IsFeatureEnabled(kX64EmitXOP)) {
if (1 && e.IsFeatureEnabled(kX64EmitXOP)) {
XELOGCPU("Doing vpcmov!!");
e.vpcmov(i.dest, src2, src3, src1);
e.vpcmov(i.dest, src3, src2, src1);
} else {
// src1 ? src2 : src3;
e.vpandn(e.xmm3, src1, src2);
e.vpand(i.dest, src1, src3);
e.vpor(i.dest, i.dest, e.xmm3);

View File

@ -1023,13 +1023,6 @@ Value* HIRBuilder::Truncate(Value* value, TypeName target_type) {
Value* HIRBuilder::Convert(Value* value, TypeName target_type,
RoundMode round_mode) {
if (value->type == target_type) {
return value;
} else if (value->IsConstant()) {
Value* dest = CloneValue(value);
dest->Convert(target_type, round_mode);
return dest;
}
Instr* i =
AppendInstr(OPCODE_CONVERT_info, round_mode, AllocValue(target_type));
@ -1041,11 +1034,6 @@ Value* HIRBuilder::Convert(Value* value, TypeName target_type,
Value* HIRBuilder::Round(Value* value, RoundMode round_mode) {
ASSERT_FLOAT_OR_VECTOR_TYPE(value);
if (value->IsConstant()) {
Value* dest = CloneValue(value);
dest->Round(round_mode);
return dest;
}
Instr* i =
AppendInstr(OPCODE_ROUND_info, round_mode, AllocValue(value->type));
@ -1295,7 +1283,7 @@ void HIRBuilder::SetNJM(Value* value) {
Value* HIRBuilder::Max(Value* value1, Value* value2) {
ASSERT_TYPES_EQUAL(value1, value2);
if (value1->type != VEC128_TYPE && value1->IsConstant() &&
if (IsScalarIntegralType( value1->type) && value1->IsConstant() &&
value2->IsConstant()) {
return value1->Compare(OPCODE_COMPARE_SLT, value2) ? value2 : value1;
}
@ -1323,7 +1311,7 @@ Value* HIRBuilder::VectorMax(Value* value1, Value* value2, TypeName part_type,
Value* HIRBuilder::Min(Value* value1, Value* value2) {
ASSERT_TYPES_EQUAL(value1, value2);
if (value1->type != VEC128_TYPE && value1->IsConstant() &&
if (IsScalarIntegralType(value1->type) && value1->IsConstant() &&
value2->IsConstant()) {
return value1->Compare(OPCODE_COMPARE_SLT, value2) ? value1 : value2;
}
@ -1351,8 +1339,9 @@ Value* HIRBuilder::VectorMin(Value* value1, Value* value2, TypeName part_type,
Value* HIRBuilder::Select(Value* cond, Value* value1, Value* value2) {
assert_true(cond->type == INT8_TYPE || cond->type == VEC128_TYPE); // for now
ASSERT_TYPES_EQUAL(value1, value2);
if (cond->IsConstant()) {
// chrispy: this was being done with V128, which was breaking stuff obviously
// because that should be an element by element select
if (cond->IsConstant() && IsScalarIntegralType(cond->type)) {
return cond->IsConstantTrue() ? value1 : value2;
}
@ -1518,7 +1507,8 @@ Value* HIRBuilder::Add(Value* value1, Value* value2,
ASSERT_TYPES_EQUAL(value1, value2);
// TODO(benvanik): optimize when flags set.
if (!arithmetic_flags) {
if (!arithmetic_flags && IsScalarIntegralType(value1->type)) {
if (value1->IsConstantZero()) {
return value2;
} else if (value2->IsConstantZero()) {

View File

@ -442,7 +442,18 @@ int InstrEmit_fabsx(PPCHIRBuilder& f, const InstrData& i) {
// frD <- abs(frB)
Value* v = f.Abs(f.LoadFPR(i.X.RB));
f.StoreFPR(i.X.RT, v);
f.UpdateFPSCR(v, i.X.Rc);
/*
The contents of frB with bit 0 cleared are placed into frD.
Note that the fabs instruction treats NaNs just like any other kind of value. That is, the sign
bit of a NaN may be altered by fabs. This instruction does not alter the FPSCR.
Other registers altered:
Condition Register (CR1 field):
Affected: FX, FEX, VX, OX (if Rc = 1)
*/
// f.UpdateFPSCR(v, i.X.Rc);
if (i.X.Rc) {
}
return 0;
}