target/sparc: Fix FMULD8*X16

Not only do these instructions have f32 inputs, they also do not
perform rounding.  Since these are relatively simple, implement
them properly inline.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-Id: <20240502165528.244004-6-richard.henderson@linaro.org>
Signed-off-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk>
This commit is contained in:
Richard Henderson 2024-05-02 09:55:26 -07:00 committed by Mark Cave-Ayland
parent a859602c74
commit be8998e046
3 changed files with 44 additions and 52 deletions

View File

@ -99,8 +99,6 @@ DEF_HELPER_FLAGS_2(fmul8x16, TCG_CALL_NO_RWG_SE, i64, i32, i64)
/*
 * Helper declarations for fmul8x16a through fpack16.
 *
 * NOTE(review): DEF_HELPER_FLAGS_<n> is defined outside this view.  From the
 * entries here, each one appears to list the helper name, the TCG call flags,
 * the return type, and then <n> argument types -- confirm against the macro
 * definition.  Every entry in this run uses TCG_CALL_NO_RWG_SE.
 */
DEF_HELPER_FLAGS_2(fmul8x16a, TCG_CALL_NO_RWG_SE, i64, i32, s32)
DEF_HELPER_FLAGS_2(fmul8sux16, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(fmul8ulx16, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(fmuld8sux16, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(fmuld8ulx16, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_1(fexpand, TCG_CALL_NO_RWG_SE, i64, i32)
DEF_HELPER_FLAGS_3(pdist, TCG_CALL_NO_RWG_SE, i64, i64, i64, i64)
DEF_HELPER_FLAGS_2(fpack16, TCG_CALL_NO_RWG_SE, i32, i64, i64)

View File

@ -74,8 +74,6 @@
/*
 * Stub definitions for helper generators.  Each name expands to a statement
 * expression that invokes qemu_build_not_reached() and evaluates to NULL.
 *
 * NOTE(review): the preprocessor condition enclosing these definitions is
 * outside this view; presumably they apply to configurations where the real
 * helpers are not declared -- confirm.
 */
# define gen_helper_fmul8sux16 ({ qemu_build_not_reached(); NULL; })
# define gen_helper_fmul8ulx16 ({ qemu_build_not_reached(); NULL; })
# define gen_helper_fmul8x16 ({ qemu_build_not_reached(); NULL; })
# define gen_helper_fmuld8sux16 ({ qemu_build_not_reached(); NULL; })
# define gen_helper_fmuld8ulx16 ({ qemu_build_not_reached(); NULL; })
# define gen_helper_fpmerge ({ qemu_build_not_reached(); NULL; })
# define gen_helper_fqtox ({ qemu_build_not_reached(); NULL; })
# define gen_helper_fstox ({ qemu_build_not_reached(); NULL; })
@ -730,6 +728,48 @@ static void gen_op_fmul8x16au(TCGv_i64 dst, TCGv_i32 src1, TCGv_i32 src2)
gen_helper_fmul8x16a(dst, src1, src2);
}
/*
 * Emit inline TCG ops for FMULD8ULx16.
 *
 * For each 16-bit half of the 32-bit inputs, the unsigned low 8 bits of the
 * src1 half are multiplied by the sign-extended 16-bit src2 half.  No
 * rounding is applied.  The product of the low halves becomes the low
 * 32 bits of dst, the product of the high halves the high 32 bits.
 */
static void gen_op_fmuld8ulx16(TCGv_i64 dst, TCGv_i32 src1, TCGv_i32 src2)
{
    TCGv_i32 prod_lo = tcg_temp_new_i32();
    TCGv_i32 prod_hi = tcg_temp_new_i32();
    TCGv_i32 rhs_hi = tcg_temp_new_i32();

    /* Low lane: src1[7:0] (unsigned) * src2[15:0] (signed). */
    tcg_gen_ext8u_i32(prod_lo, src1);
    tcg_gen_ext16s_i32(prod_hi, src2);    /* prod_hi is only scratch here */
    tcg_gen_mul_i32(prod_lo, prod_lo, prod_hi);

    /* High lane: src1[23:16] (unsigned) * src2[31:16] (signed). */
    tcg_gen_extract_i32(prod_hi, src1, 16, 8);
    tcg_gen_sextract_i32(rhs_hi, src2, 16, 16);
    tcg_gen_mul_i32(prod_hi, prod_hi, rhs_hi);

    /* Pack the two 32-bit products: prod_lo in bits 31:0, prod_hi above. */
    tcg_gen_concat_i32_i64(dst, prod_lo, prod_hi);
}
/*
 * Emit inline TCG ops for FMULD8SUx16.
 *
 * The insn description is phrased as: take the upper 8 bits of each signed
 * 16-bit rs1 element, multiply by the signed 16-bit rs2 element, then shift
 * the product left by 8.  Clearing the low 8 bits of the sign-extended rs1
 * element before multiplying yields the same value and needs no shifts.
 *
 * The product of the low 16-bit halves becomes the low 32 bits of dst, the
 * product of the high halves the high 32 bits.
 */
static void gen_op_fmuld8sux16(TCGv_i64 dst, TCGv_i32 src1, TCGv_i32 src2)
{
    TCGv_i32 prod_lo = tcg_temp_new_i32();
    TCGv_i32 prod_hi = tcg_temp_new_i32();
    TCGv_i32 rhs_hi = tcg_temp_new_i32();

    /* Low lane: (sext(src1[15:0]) & ~0xff) * sext(src2[15:0]). */
    tcg_gen_ext16s_i32(prod_lo, src1);
    tcg_gen_andi_i32(prod_lo, prod_lo, ~0xff);
    tcg_gen_ext16s_i32(prod_hi, src2);    /* prod_hi is only scratch here */
    tcg_gen_mul_i32(prod_lo, prod_lo, prod_hi);

    /* High lane: (sext(src1[31:16]) & ~0xff) * sext(src2[31:16]). */
    tcg_gen_sextract_i32(prod_hi, src1, 16, 16);
    tcg_gen_andi_i32(prod_hi, prod_hi, ~0xff);
    tcg_gen_sextract_i32(rhs_hi, src2, 16, 16);
    tcg_gen_mul_i32(prod_hi, prod_hi, rhs_hi);

    /* Pack the two 32-bit products: prod_lo in bits 31:0, prod_hi above. */
    tcg_gen_concat_i32_i64(dst, prod_lo, prod_hi);
}
static void finishing_insn(DisasContext *dc)
{
/*
@ -4614,6 +4654,8 @@ static bool do_dff(DisasContext *dc, arg_r_r_r *a,
/*
 * Instruction entries routed through do_dff.  The generators named here take
 * a 64-bit destination and two 32-bit sources (see gen_op_fmuld8ulx16 and
 * gen_op_fmuld8sux16).
 *
 * NOTE(review): TRANS is defined outside this view; presumably its arguments
 * are the insn name, the required CPU feature, the translator function and
 * the arguments forwarded to it -- confirm.
 */
TRANS(FMUL8x16AU, VIS1, do_dff, a, gen_op_fmul8x16au)
TRANS(FMUL8x16AL, VIS1, do_dff, a, gen_op_fmul8x16al)
TRANS(FMULD8SUx16, VIS1, do_dff, a, gen_op_fmuld8sux16)
TRANS(FMULD8ULx16, VIS1, do_dff, a, gen_op_fmuld8ulx16)
static bool do_dfd(DisasContext *dc, arg_r_r_r *a,
void (*func)(TCGv_i64, TCGv_i32, TCGv_i64))
@ -4654,8 +4696,6 @@ static bool do_ddd(DisasContext *dc, arg_r_r_r *a,
/*
 * Instruction entries routed through do_ddd, each paired with the generator
 * or helper that implements it.
 *
 * NOTE(review): FMULD8SUx16 and FMULD8ULx16 are also listed with do_dff and
 * the inline gen_op_fmuld8* generators elsewhere in this view.  The commit
 * message says these insns take f32 inputs and are implemented inline, so
 * the two gen_helper_fmuld8* entries here look like the superseded ones --
 * confirm that only one registration per insn remains.
 */
TRANS(FMUL8SUx16, VIS1, do_ddd, a, gen_helper_fmul8sux16)
TRANS(FMUL8ULx16, VIS1, do_ddd, a, gen_helper_fmul8ulx16)
TRANS(FMULD8SUx16, VIS1, do_ddd, a, gen_helper_fmuld8sux16)
TRANS(FMULD8ULx16, VIS1, do_ddd, a, gen_helper_fmuld8ulx16)
TRANS(FPMERGE, VIS1, do_ddd, a, gen_helper_fpmerge)
TRANS(FPADD16, VIS1, do_ddd, a, tcg_gen_vec_add16_i64)

View File

@ -194,52 +194,6 @@ uint64_t helper_fmul8ulx16(uint64_t src1, uint64_t src2)
return d.ll;
}
/*
 * Out-of-line helper for FMULD8SUx16.
 *
 * src1 is loaded into s and src2 into d.  For lanes r = 1 and then r = 0,
 * d.VIS_SW64(r) is multiplied by s.VIS_SW64(r) shifted right by 8, the
 * product is rounded (0x100 is added when its low byte exceeds 0x7f), and
 * the result is stored to d.VIS_L64(r).  Returns d.ll.
 *
 * VIS64 and the VIS_SW64/VIS_L64 lane accessors are defined outside this
 * view; presumably they select signed 16-bit and 32-bit lanes of the 64-bit
 * value -- confirm.
 *
 * NOTE(review): the commit message accompanying this code states that the
 * FMULD8*X16 instructions take f32 inputs and do not perform rounding.  This
 * helper takes 64-bit inputs and rounds, so it does not match that
 * description.
 */
uint64_t helper_fmuld8sux16(uint64_t src1, uint64_t src2)
{
VIS64 s, d;
uint32_t tmp;
s.ll = src1;
d.ll = src2;
/* One lane: multiply, round on the low byte, store the 32-bit result. */
#define PMUL(r) \
tmp = (int32_t)d.VIS_SW64(r) * ((int32_t)s.VIS_SW64(r) >> 8); \
if ((tmp & 0xff) > 0x7f) { \
tmp += 0x100; \
} \
d.VIS_L64(r) = tmp;
/* Reverse calculation order to handle overlap */
/*
 * d is both read (VIS_SW64) and written (VIS_L64); presumably a store to
 * one lane can cover an input lane that has not been read yet, hence
 * lane 1 is processed before lane 0.
 */
PMUL(1);
PMUL(0);
#undef PMUL
return d.ll;
}
/*
 * Out-of-line helper for FMULD8ULx16.
 *
 * src1 is loaded into s and src2 into d.  For lanes r = 1 and then r = 0,
 * d.VIS_SW64(r) is multiplied by the byte s.VIS_B64(r * 2) taken as an
 * unsigned value, the product is rounded (0x100 is added when its low byte
 * exceeds 0x7f), and the result is stored to d.VIS_L64(r).  Returns d.ll.
 *
 * VIS64 and the VIS_SW64/VIS_B64/VIS_L64 lane accessors are defined outside
 * this view; presumably they select signed 16-bit, 8-bit and 32-bit lanes of
 * the 64-bit value -- confirm.
 *
 * NOTE(review): the commit message accompanying this code states that the
 * FMULD8*X16 instructions take f32 inputs and do not perform rounding.  This
 * helper takes 64-bit inputs and rounds, so it does not match that
 * description.
 */
uint64_t helper_fmuld8ulx16(uint64_t src1, uint64_t src2)
{
VIS64 s, d;
uint32_t tmp;
s.ll = src1;
d.ll = src2;
/* One lane: multiply, round on the low byte, store the 32-bit result. */
#define PMUL(r) \
tmp = (int32_t)d.VIS_SW64(r) * ((uint32_t)s.VIS_B64(r * 2)); \
if ((tmp & 0xff) > 0x7f) { \
tmp += 0x100; \
} \
d.VIS_L64(r) = tmp;
/* Reverse calculation order to handle overlap */
/*
 * d is both read (VIS_SW64) and written (VIS_L64); presumably a store to
 * one lane can cover an input lane that has not been read yet, hence
 * lane 1 is processed before lane 0.
 */
PMUL(1);
PMUL(0);
#undef PMUL
return d.ll;
}
uint64_t helper_fexpand(uint32_t src2)
{
VIS32 s;