mirror of https://github.com/xemu-project/xemu.git
target/arm: Promote consecutive memory ops for aa64
For a sequence of loads or stores from a single register, little-endian operations can be promoted to an 8-byte op. This can reduce the number of operations by a factor of 8.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20181011205206.3552-5-richard.henderson@linaro.org
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
parent
10e0b33c67
commit
87f9a7f0c8
|
@ -1201,25 +1201,23 @@ static void write_vec_element_i32(DisasContext *s, TCGv_i32 tcg_src,
|
||||||
|
|
||||||
/* Store from vector register to memory */
|
/* Store from vector register to memory */
|
||||||
static void do_vec_st(DisasContext *s, int srcidx, int element,
|
static void do_vec_st(DisasContext *s, int srcidx, int element,
|
||||||
TCGv_i64 tcg_addr, int size)
|
TCGv_i64 tcg_addr, int size, TCGMemOp endian)
|
||||||
{
|
{
|
||||||
TCGMemOp memop = s->be_data + size;
|
|
||||||
TCGv_i64 tcg_tmp = tcg_temp_new_i64();
|
TCGv_i64 tcg_tmp = tcg_temp_new_i64();
|
||||||
|
|
||||||
read_vec_element(s, tcg_tmp, srcidx, element, size);
|
read_vec_element(s, tcg_tmp, srcidx, element, size);
|
||||||
tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
|
tcg_gen_qemu_st_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
|
||||||
|
|
||||||
tcg_temp_free_i64(tcg_tmp);
|
tcg_temp_free_i64(tcg_tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Load from memory to vector register */
|
/* Load from memory to vector register */
|
||||||
static void do_vec_ld(DisasContext *s, int destidx, int element,
|
static void do_vec_ld(DisasContext *s, int destidx, int element,
|
||||||
TCGv_i64 tcg_addr, int size)
|
TCGv_i64 tcg_addr, int size, TCGMemOp endian)
|
||||||
{
|
{
|
||||||
TCGMemOp memop = s->be_data + size;
|
|
||||||
TCGv_i64 tcg_tmp = tcg_temp_new_i64();
|
TCGv_i64 tcg_tmp = tcg_temp_new_i64();
|
||||||
|
|
||||||
tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), memop);
|
tcg_gen_qemu_ld_i64(tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
|
||||||
write_vec_element(s, tcg_tmp, destidx, element, size);
|
write_vec_element(s, tcg_tmp, destidx, element, size);
|
||||||
|
|
||||||
tcg_temp_free_i64(tcg_tmp);
|
tcg_temp_free_i64(tcg_tmp);
|
||||||
|
@ -3019,9 +3017,10 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
|
||||||
bool is_postidx = extract32(insn, 23, 1);
|
bool is_postidx = extract32(insn, 23, 1);
|
||||||
bool is_q = extract32(insn, 30, 1);
|
bool is_q = extract32(insn, 30, 1);
|
||||||
TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
|
TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
|
||||||
|
TCGMemOp endian = s->be_data;
|
||||||
|
|
||||||
int ebytes = 1 << size;
|
int ebytes; /* bytes per element */
|
||||||
int elements = (is_q ? 128 : 64) / (8 << size);
|
int elements; /* elements per vector */
|
||||||
int rpt; /* num iterations */
|
int rpt; /* num iterations */
|
||||||
int selem; /* structure elements */
|
int selem; /* structure elements */
|
||||||
int r;
|
int r;
|
||||||
|
@ -3080,6 +3079,20 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
|
||||||
gen_check_sp_alignment(s);
|
gen_check_sp_alignment(s);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* For our purposes, bytes are always little-endian. */
|
||||||
|
if (size == 0) {
|
||||||
|
endian = MO_LE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Consecutive little-endian elements from a single register
|
||||||
|
* can be promoted to a larger little-endian operation.
|
||||||
|
*/
|
||||||
|
if (selem == 1 && endian == MO_LE) {
|
||||||
|
size = 3;
|
||||||
|
}
|
||||||
|
ebytes = 1 << size;
|
||||||
|
elements = (is_q ? 16 : 8) / ebytes;
|
||||||
|
|
||||||
tcg_rn = cpu_reg_sp(s, rn);
|
tcg_rn = cpu_reg_sp(s, rn);
|
||||||
tcg_addr = tcg_temp_new_i64();
|
tcg_addr = tcg_temp_new_i64();
|
||||||
tcg_gen_mov_i64(tcg_addr, tcg_rn);
|
tcg_gen_mov_i64(tcg_addr, tcg_rn);
|
||||||
|
@ -3088,31 +3101,32 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
|
||||||
for (r = 0; r < rpt; r++) {
|
for (r = 0; r < rpt; r++) {
|
||||||
int e;
|
int e;
|
||||||
for (e = 0; e < elements; e++) {
|
for (e = 0; e < elements; e++) {
|
||||||
int tt = (rt + r) % 32;
|
|
||||||
int xs;
|
int xs;
|
||||||
for (xs = 0; xs < selem; xs++) {
|
for (xs = 0; xs < selem; xs++) {
|
||||||
|
int tt = (rt + r + xs) % 32;
|
||||||
if (is_store) {
|
if (is_store) {
|
||||||
do_vec_st(s, tt, e, tcg_addr, size);
|
do_vec_st(s, tt, e, tcg_addr, size, endian);
|
||||||
} else {
|
} else {
|
||||||
do_vec_ld(s, tt, e, tcg_addr, size);
|
do_vec_ld(s, tt, e, tcg_addr, size, endian);
|
||||||
|
}
|
||||||
|
tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!is_store) {
|
||||||
/* For non-quad operations, setting a slice of the low
|
/* For non-quad operations, setting a slice of the low
|
||||||
* 64 bits of the register clears the high 64 bits (in
|
* 64 bits of the register clears the high 64 bits (in
|
||||||
* the ARM ARM pseudocode this is implicit in the fact
|
* the ARM ARM pseudocode this is implicit in the fact
|
||||||
* that 'rval' is a 64 bit wide variable).
|
* that 'rval' is a 64 bit wide variable).
|
||||||
* For quad operations, we might still need to zero the
|
* For quad operations, we might still need to zero the
|
||||||
* high bits of SVE. We optimize by noticing that we only
|
* high bits of SVE.
|
||||||
* need to do this the first time we touch a register.
|
|
||||||
*/
|
*/
|
||||||
if (e == 0 && (r == 0 || xs == selem - 1)) {
|
for (r = 0; r < rpt * selem; r++) {
|
||||||
|
int tt = (rt + r) % 32;
|
||||||
clear_vec_high(s, is_q, tt);
|
clear_vec_high(s, is_q, tt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
|
|
||||||
tt = (tt + 1) % 32;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (is_postidx) {
|
if (is_postidx) {
|
||||||
int rm = extract32(insn, 16, 5);
|
int rm = extract32(insn, 16, 5);
|
||||||
|
@ -3234,9 +3248,9 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
|
||||||
} else {
|
} else {
|
||||||
/* Load/store one element per register */
|
/* Load/store one element per register */
|
||||||
if (is_load) {
|
if (is_load) {
|
||||||
do_vec_ld(s, rt, index, tcg_addr, scale);
|
do_vec_ld(s, rt, index, tcg_addr, scale, s->be_data);
|
||||||
} else {
|
} else {
|
||||||
do_vec_st(s, rt, index, tcg_addr, scale);
|
do_vec_st(s, rt, index, tcg_addr, scale, s->be_data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
|
tcg_gen_add_i64(tcg_addr, tcg_addr, tcg_ebytes);
|
||||||
|
|
Loading…
Reference in New Issue