mirror of https://github.com/xemu-project/xemu.git
target/sparc: Fix FMULD8*X16
Not only do these instructions have f32 inputs, they also do not perform rounding. Since these are relatively simple, implement them properly inline. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Message-Id: <20240502165528.244004-6-richard.henderson@linaro.org> Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
This commit is contained in:
parent
a859602c74
commit
be8998e046
|
@ -99,8 +99,6 @@ DEF_HELPER_FLAGS_2(fmul8x16, TCG_CALL_NO_RWG_SE, i64, i32, i64)
|
|||
DEF_HELPER_FLAGS_2(fmul8x16a, TCG_CALL_NO_RWG_SE, i64, i32, s32)
|
||||
DEF_HELPER_FLAGS_2(fmul8sux16, TCG_CALL_NO_RWG_SE, i64, i64, i64)
|
||||
DEF_HELPER_FLAGS_2(fmul8ulx16, TCG_CALL_NO_RWG_SE, i64, i64, i64)
|
||||
DEF_HELPER_FLAGS_2(fmuld8sux16, TCG_CALL_NO_RWG_SE, i64, i64, i64)
|
||||
DEF_HELPER_FLAGS_2(fmuld8ulx16, TCG_CALL_NO_RWG_SE, i64, i64, i64)
|
||||
DEF_HELPER_FLAGS_1(fexpand, TCG_CALL_NO_RWG_SE, i64, i32)
|
||||
DEF_HELPER_FLAGS_3(pdist, TCG_CALL_NO_RWG_SE, i64, i64, i64, i64)
|
||||
DEF_HELPER_FLAGS_2(fpack16, TCG_CALL_NO_RWG_SE, i32, i64, i64)
|
||||
|
|
|
@ -74,8 +74,6 @@
|
|||
# define gen_helper_fmul8sux16 ({ qemu_build_not_reached(); NULL; })
|
||||
# define gen_helper_fmul8ulx16 ({ qemu_build_not_reached(); NULL; })
|
||||
# define gen_helper_fmul8x16 ({ qemu_build_not_reached(); NULL; })
|
||||
# define gen_helper_fmuld8sux16 ({ qemu_build_not_reached(); NULL; })
|
||||
# define gen_helper_fmuld8ulx16 ({ qemu_build_not_reached(); NULL; })
|
||||
# define gen_helper_fpmerge ({ qemu_build_not_reached(); NULL; })
|
||||
# define gen_helper_fqtox ({ qemu_build_not_reached(); NULL; })
|
||||
# define gen_helper_fstox ({ qemu_build_not_reached(); NULL; })
|
||||
|
@ -730,6 +728,48 @@ static void gen_op_fmul8x16au(TCGv_i64 dst, TCGv_i32 src1, TCGv_i32 src2)
|
|||
gen_helper_fmul8x16a(dst, src1, src2);
|
||||
}
|
||||
|
||||
/*
 * Emit inline code for VIS FMULD8ULx16: for each of the two 16-bit
 * lanes of the 32-bit inputs, multiply the unsigned low byte of the
 * src1 lane by the signed 16-bit src2 lane, producing a full 32-bit
 * product per lane.  The two products are packed into the 64-bit dst
 * (lane 0 in the low word, lane 1 in the high word).  No rounding is
 * performed (see commit message: the insn does not round).
 */
static void gen_op_fmuld8ulx16(TCGv_i64 dst, TCGv_i32 src1, TCGv_i32 src2)
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();

    /* Low lane: unsigned byte 0 of src1 * signed 16-bit lane 0 of src2. */
    tcg_gen_ext8u_i32(t0, src1);
    tcg_gen_ext16s_i32(t1, src2);
    tcg_gen_mul_i32(t0, t0, t1);

    /* High lane: unsigned byte at bits [23:16] of src1 * signed lane 1. */
    tcg_gen_extract_i32(t1, src1, 16, 8);
    tcg_gen_sextract_i32(t2, src2, 16, 16);
    tcg_gen_mul_i32(t1, t1, t2);

    /* Pack: t0 becomes the low 32 bits of dst, t1 the high 32 bits. */
    tcg_gen_concat_i32_i64(dst, t0, t1);
}
|
||||
|
||||
/*
 * Emit inline code for VIS FMULD8SUx16: for each of the two 16-bit
 * lanes of the 32-bit inputs, multiply the signed high byte of the
 * src1 lane by the signed 16-bit src2 lane, with the product shifted
 * left by 8 bits (the shift is folded into the operand -- see the
 * comment below).  The two 32-bit results are packed into dst,
 * lane 0 in the low word, lane 1 in the high word.  No rounding.
 */
static void gen_op_fmuld8sux16(TCGv_i64 dst, TCGv_i32 src1, TCGv_i32 src2)
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();

    /*
     * The insn description talks about extracting the upper 8 bits
     * of the signed 16-bit input rs1, performing the multiply, then
     * shifting left by 8 bits. Instead, zap the lower 8 bits of
     * the rs1 input, which avoids the need for two shifts.
     */
    /* Low lane: (sign-extended src1 lane 0 & ~0xff) * signed src2 lane 0. */
    tcg_gen_ext16s_i32(t0, src1);
    tcg_gen_andi_i32(t0, t0, ~0xff);
    tcg_gen_ext16s_i32(t1, src2);
    tcg_gen_mul_i32(t0, t0, t1);

    /* High lane: same trick on the lanes at bits [31:16]. */
    tcg_gen_sextract_i32(t1, src1, 16, 16);
    tcg_gen_andi_i32(t1, t1, ~0xff);
    tcg_gen_sextract_i32(t2, src2, 16, 16);
    tcg_gen_mul_i32(t1, t1, t2);

    /* Pack: t0 becomes the low 32 bits of dst, t1 the high 32 bits. */
    tcg_gen_concat_i32_i64(dst, t0, t1);
}
|
||||
|
||||
static void finishing_insn(DisasContext *dc)
|
||||
{
|
||||
/*
|
||||
|
@ -4614,6 +4654,8 @@ static bool do_dff(DisasContext *dc, arg_r_r_r *a,
|
|||
|
||||
TRANS(FMUL8x16AU, VIS1, do_dff, a, gen_op_fmul8x16au)
|
||||
TRANS(FMUL8x16AL, VIS1, do_dff, a, gen_op_fmul8x16al)
|
||||
TRANS(FMULD8SUx16, VIS1, do_dff, a, gen_op_fmuld8sux16)
|
||||
TRANS(FMULD8ULx16, VIS1, do_dff, a, gen_op_fmuld8ulx16)
|
||||
|
||||
static bool do_dfd(DisasContext *dc, arg_r_r_r *a,
|
||||
void (*func)(TCGv_i64, TCGv_i32, TCGv_i64))
|
||||
|
@ -4654,8 +4696,6 @@ static bool do_ddd(DisasContext *dc, arg_r_r_r *a,
|
|||
|
||||
TRANS(FMUL8SUx16, VIS1, do_ddd, a, gen_helper_fmul8sux16)
|
||||
TRANS(FMUL8ULx16, VIS1, do_ddd, a, gen_helper_fmul8ulx16)
|
||||
TRANS(FMULD8SUx16, VIS1, do_ddd, a, gen_helper_fmuld8sux16)
|
||||
TRANS(FMULD8ULx16, VIS1, do_ddd, a, gen_helper_fmuld8ulx16)
|
||||
TRANS(FPMERGE, VIS1, do_ddd, a, gen_helper_fpmerge)
|
||||
|
||||
TRANS(FPADD16, VIS1, do_ddd, a, tcg_gen_vec_add16_i64)
|
||||
|
|
|
@ -194,52 +194,6 @@ uint64_t helper_fmul8ulx16(uint64_t src1, uint64_t src2)
|
|||
return d.ll;
|
||||
}
|
||||
|
||||
/*
 * VIS FMULD8SUx16 helper.
 *
 * For each of the two 16-bit lanes held in the low 32 bits of the
 * inputs (lane i at bits [16*i+15 : 16*i]):
 *   - take the signed high byte of the src1 lane (arithmetic >> 8),
 *   - multiply it by the signed 16-bit src2 lane,
 *   - add 0x100 to the 32-bit product when its low byte exceeds 0x7f
 *     (the low byte itself is retained in the result),
 * and store the full 32-bit result as word i of the returned value.
 * The high 32 bits of both inputs are ignored.
 */
uint64_t helper_fmuld8sux16(uint64_t src1, uint64_t src2)
{
    uint32_t res[2];
    int i;

    for (i = 0; i < 2; i++) {
        int32_t mplier = (int16_t)(src1 >> (16 * i)) >> 8;   /* high byte, signed */
        int32_t mcand = (int16_t)(src2 >> (16 * i));         /* 16-bit lane, signed */
        uint32_t prod = (uint32_t)(mcand * mplier);

        /* Round up by one unit of 0x100 when the fraction byte is > 0x7f. */
        if ((prod & 0xff) > 0x7f) {
            prod += 0x100;
        }
        res[i] = prod;
    }

    return ((uint64_t)res[1] << 32) | res[0];
}
|
||||
|
||||
/*
 * VIS FMULD8ULx16 helper.
 *
 * For each of the two 16-bit lanes held in the low 32 bits of the
 * inputs (lane i at bits [16*i+15 : 16*i]):
 *   - take the unsigned low byte of the src1 lane,
 *   - multiply it by the signed 16-bit src2 lane,
 *   - add 0x100 to the 32-bit product when its low byte exceeds 0x7f
 *     (the low byte itself is retained in the result),
 * and store the full 32-bit result as word i of the returned value.
 * The high 32 bits of both inputs are ignored.
 */
uint64_t helper_fmuld8ulx16(uint64_t src1, uint64_t src2)
{
    uint32_t res[2];
    int i;

    for (i = 0; i < 2; i++) {
        uint32_t mplier = (uint32_t)(src1 >> (16 * i)) & 0xff;  /* low byte, unsigned */
        int32_t mcand = (int16_t)(src2 >> (16 * i));            /* 16-bit lane, signed */
        uint32_t prod = (uint32_t)mcand * mplier;

        /* Round up by one unit of 0x100 when the fraction byte is > 0x7f. */
        if ((prod & 0xff) > 0x7f) {
            prod += 0x100;
        }
        res[i] = prod;
    }

    return ((uint64_t)res[1] << 32) | res[0];
}
|
||||
|
||||
uint64_t helper_fexpand(uint32_t src2)
|
||||
{
|
||||
VIS32 s;
|
||||
|
|
Loading…
Reference in New Issue