From 1fb575fbe43730ede76ccb53d20b4532fe42e914 Mon Sep 17 00:00:00 2001 From: "chss95cs@gmail.com" Date: Fri, 24 Jan 2020 20:18:26 -0800 Subject: [PATCH] Fix low precision in RECIP_F64/RSQRT_F64, emit imul for multiplication by constants --- src/xenia/cpu/backend/x64/x64_emitter.cc | 1 + src/xenia/cpu/backend/x64/x64_emitter.h | 1 + src/xenia/cpu/backend/x64/x64_sequences.cc | 74 ++++++++++++++++++++-- 3 files changed, 70 insertions(+), 6 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 98d4b830c..d143bc760 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -738,6 +738,7 @@ static const vec128_t xmm_consts[] = { /* XMMIntMaxPD */ vec128d(INT_MAX), /* XMMPosIntMinPS */ vec128f((float)0x80000000u), /* XMMQNaN */ vec128i(0x7FC00000u), + /*XMMOneDouble*/ vec128d(1.0) }; // First location to try and place constants. diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 4f661a331..a81f5e2b2 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -114,6 +114,7 @@ enum XmmConst { XMMIntMaxPD, XMMPosIntMinPS, XMMQNaN, + XMMOneDouble }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. 
diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 72401c5f9..33f53d2e4 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -175,7 +175,7 @@ struct ZERO_EXTEND_I32_I8 struct ZERO_EXTEND_I64_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.movzx(i.dest, i.src1); + e.movzx(i.dest.reg().cvt32(), i.src1); } }; struct ZERO_EXTEND_I32_I16 @@ -187,7 +187,7 @@ struct ZERO_EXTEND_I32_I16 struct ZERO_EXTEND_I64_I16 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.movzx(i.dest, i.src1); + e.movzx(i.dest.reg().cvt32(), i.src1); } }; struct ZERO_EXTEND_I64_I32 @@ -1323,6 +1323,19 @@ EMITTER_OPCODE_TABLE(OPCODE_SUB, SUB_I8, SUB_I16, SUB_I32, SUB_I64, SUB_F32, // We exploit mulx here to avoid creating too much register pressure. struct MUL_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + if(i.src1.is_constant || i.src2.is_constant ) { + + + uint64_t cval =i.src1.is_constant ? i.src1.constant() : i.src2.constant(); + + if(cval < (1ull<<32)) { + + auto& whichevs = i.src1.is_constant ? i.src2 : i.src1; + + e.imul(i.dest, whichevs, (int)cval); + return; + } + } if (e.IsFeatureEnabled(kX64EmitBMI2)) { // mulx: $1:$2 = EDX * $3 @@ -1364,6 +1377,19 @@ struct MUL_I8 : Sequence> { }; struct MUL_I16 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + if(i.src1.is_constant || i.src2.is_constant ) { + + + uint64_t cval =i.src1.is_constant ? i.src1.constant() : i.src2.constant(); + + if(cval < (1ull<<32)) { + + auto& whichevs = i.src1.is_constant ? i.src2 : i.src1; + + e.imul(i.dest, whichevs, (int)cval); + return; + } + } if (e.IsFeatureEnabled(kX64EmitBMI2)) { // mulx: $1:$2 = EDX * $3 @@ -1412,6 +1438,20 @@ struct MUL_I32 : Sequence> { return; } } + + if(i.src1.is_constant || i.src2.is_constant ) { + + + uint64_t cval =i.src1.is_constant ? 
i.src1.constant() : i.src2.constant(); + + if(cval < (1ull<<32)) { + + auto& whichevs = i.src1.is_constant ? i.src2 : i.src1; + + e.imul(i.dest, whichevs, (int)cval); + return; + } + } if (e.IsFeatureEnabled(kX64EmitBMI2)) { // mulx: $1:$2 = EDX * $3 @@ -1462,6 +1502,21 @@ struct MUL_I64 : Sequence> { return; } } + + if(i.src1.is_constant || i.src2.is_constant ) { + + + uint64_t cval =i.src1.is_constant ? i.src1.constant() : i.src2.constant(); + + if(cval < (1ull<<32)) { + + auto& whichevs = i.src1.is_constant ? i.src2 : i.src1; + + e.imul(i.dest, whichevs, (int)cval); + return; + } + } + if (e.IsFeatureEnabled(kX64EmitBMI2)) { // mulx: $1:$2 = RDX * $3 @@ -2470,9 +2525,14 @@ struct RSQRT_F32 : Sequence> { }; struct RSQRT_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vcvtsd2ss(i.dest, i.src1); + /*e.vcvtsd2ss(i.dest, i.src1); e.vrsqrtss(i.dest, i.dest); - e.vcvtss2sd(i.dest, i.dest); + e.vcvtss2sd(i.dest, i.dest);*/ + + e.vmovsd(e.xmm0, e.GetXmmConstPtr(XmmConst::XMMOneDouble)); + e.vsqrtsd(i.dest, i.src1); + e.vdivsd(i.dest, e.xmm0, i.dest); + } }; struct RSQRT_V128 : Sequence> { @@ -2492,9 +2552,11 @@ struct RECIP_F32 : Sequence> { }; struct RECIP_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vcvtsd2ss(i.dest, i.src1); + /*e.vcvtsd2ss(i.dest, i.src1); e.vrcpss(i.dest, i.dest); - e.vcvtss2sd(i.dest, i.dest); + e.vcvtss2sd(i.dest, i.dest);*/ + e.vmovsd(e.xmm0, e.GetXmmConstPtr(XmmConst::XMMOneDouble)); + e.vdivsd(i.dest, e.xmm0, i.src1); } }; struct RECIP_V128 : Sequence> {