separated the T and S bits of the sh4 status register into their own context members, sr_t and sr_s, reducing codegen by around 15%

Anthony Pesch 2016-12-30 15:41:29 -08:00
parent d658bb2bfe
commit b2fd497cda
6 changed files with 143 additions and 95 deletions

View File

@@ -63,11 +63,12 @@ void sh4_sr_updated(void *data, uint32_t old_sr) {
   prof_counter_add(COUNTER_sh4_sr_updates, 1);

-  if ((ctx->sr & RB) != (old_sr & RB)) {
+  if ((ctx->sr & RB_MASK) != (old_sr & RB_MASK)) {
     sh4_swap_gpr_bank(sh4);
   }

-  if ((ctx->sr & I) != (old_sr & I) || (ctx->sr & BL) != (old_sr & BL)) {
+  if ((ctx->sr & I_MASK) != (old_sr & I_MASK) ||
+      (ctx->sr & BL_MASK) != (old_sr & BL_MASK)) {
     sh4_intc_update_pending(sh4);
   }
 }
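sh4_swap_gpr_bank is what keeps the RB check above cheap: rather than indirecting through a bank pointer on every register access, the inactive bank is physically exchanged whenever RB flips. A minimal sketch of that idea (the actual implementation may differ):

static void swap_gpr_bank_sketch(struct sh4_ctx *ctx) {
  /* exchange the active bank's r0-r7 with the inactive copies in ralt */
  for (int i = 0; i < 8; i++) {
    uint32_t tmp = ctx->r[i];
    ctx->r[i] = ctx->ralt[i];
    ctx->ralt[i] = tmp;
  }
}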
@@ -205,6 +206,16 @@ static void sh4_translate(void *data, uint32_t addr, struct ir *ir, int fastmem,
   *size = as.size;
 }

+void sh4_implode_sr(struct sh4 *sh4) {
+  sh4->ctx.sr &= ~(S_MASK | T_MASK);
+  sh4->ctx.sr |= (sh4->ctx.sr_s << S_BIT) | (sh4->ctx.sr_t << T_BIT);
+}
+
+void sh4_explode_sr(struct sh4 *sh4) {
+  sh4->ctx.sr_t = (sh4->ctx.sr & T_MASK) >> T_BIT;
+  sh4->ctx.sr_s = (sh4->ctx.sr & S_MASK) >> S_BIT;
+}
+
 void sh4_clear_interrupt(struct sh4 *sh4, enum sh4_interrupt intr) {
   sh4->requested_interrupts &= ~sh4->sort_id[intr];

   sh4_intc_update_pending(sh4);
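sh4_implode_sr and sh4_explode_sr are inverses over the T and S fields: implode packs sr_t/sr_s back into sr, explode pulls them out again. A hypothetical round-trip check (not part of the commit) makes the invariant concrete:

#include <assert.h>

static void check_sr_round_trip(struct sh4 *sh4) {
  sh4->ctx.sr_t = 1;
  sh4->ctx.sr_s = 0;
  sh4_implode_sr(sh4);
  assert((sh4->ctx.sr & T_MASK) != 0 && (sh4->ctx.sr & S_MASK) == 0);
  sh4_explode_sr(sh4);
  assert(sh4->ctx.sr_t == 1 && sh4->ctx.sr_s == 0);
}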

View File

@@ -83,6 +83,8 @@ void sh4_destroy(struct sh4 *sh);
 void sh4_reset(struct sh4 *sh4, uint32_t pc);
 void sh4_raise_interrupt(struct sh4 *sh, enum sh4_interrupt intr);
 void sh4_clear_interrupt(struct sh4 *sh, enum sh4_interrupt intr);
+void sh4_explode_sr(struct sh4 *sh4);
+void sh4_implode_sr(struct sh4 *sh4);
 void sh4_sr_updated(void *data, uint32_t old_sr);
 void sh4_fpscr_updated(void *data, uint32_t old_fpscr);

View File

@@ -12,9 +12,9 @@ static struct sh4_interrupt_info sh4_interrupts[NUM_SH_INTERRUPTS] = {
 };

 void sh4_intc_update_pending(struct sh4 *sh4) {
-  int min_priority = (sh4->ctx.sr & I) >> 4;
+  int min_priority = (sh4->ctx.sr & I_MASK) >> I_BIT;
   uint64_t priority_mask =
-      (sh4->ctx.sr & BL) ? 0 : ~sh4->priority_mask[min_priority];
+      (sh4->ctx.sr & BL_MASK) ? 0 : ~sh4->priority_mask[min_priority];
   sh4->ctx.pending_interrupts = sh4->requested_interrupts & priority_mask;
 }
@@ -28,11 +28,14 @@ int sh4_intc_check_pending(struct sh4 *sh4) {
   enum sh4_interrupt intr = sh4->sorted_interrupts[n];
   struct sh4_interrupt_info *int_info = &sh4_interrupts[intr];

+  /* ensure sr is up to date */
+  sh4_implode_sr(sh4);
+
   *sh4->INTEVT = int_info->intevt;
   sh4->ctx.ssr = sh4->ctx.sr;
   sh4->ctx.spc = sh4->ctx.pc;
   sh4->ctx.sgr = sh4->ctx.r[15];
-  sh4->ctx.sr |= (BL | MD | RB);
+  sh4->ctx.sr |= (BL_MASK | MD_MASK | RB_MASK);
   sh4->ctx.pc = sh4->ctx.vbr + 0x600;

   sh4_sr_updated(sh4, sh4->ctx.ssr);

View File

@@ -4,29 +4,35 @@
 #include <stdint.h>

 /* SR bits */
-enum {
 /* true / false condition or carry/borrow bit */
-  T = 0x00000001,
+#define T_BIT 0
 /* specifies a saturation operation for a MAC instruction */
-  S = 0x00000002,
+#define S_BIT 1
 /* interrupt mask level */
-  I = 0x000000f0,
+#define I_BIT 4
 /* used by the DIV0S, DIV0U, and DIV1 instructions */
-  Q = 0x00000100,
-  /* used by the DIV0S, DIV0U, and DIV1 instructions */
-  M = 0x00000200,
+#define Q_BIT 8
+#define M_BIT 9
 /* an FPU instr causes a general FPU disable exception */
-  FD = 0x00008000,
+#define FD_BIT 15
 /* interrupt requests are masked */
-  BL = 0x10000000,
-  /*
-   * general register bank specifier in privileged mode (set
-   * to 1 by a reset, exception, or interrupt)
-   */
-  RB = 0x20000000,
+#define BL_BIT 28
+/* general register bank specifier in privileged mode (set
+   to 1 by a reset, exception, or interrupt) */
+#define RB_BIT 29
 /* processor mode (0 is user mode, 1 is privileged mode) */
-  MD = 0x40000000
-};
+#define MD_BIT 30
+
+#define T_MASK (1u << T_BIT)
+#define S_MASK (1u << S_BIT)
+#define I_MASK 0xf0
+#define Q_MASK (1u << Q_BIT)
+#define M_MASK (1u << M_BIT)
+#define FD_MASK (1u << FD_BIT)
+#define BL_MASK (1u << BL_BIT)
+#define RB_MASK (1u << RB_BIT)
+#define MD_MASK (1u << MD_BIT)

 /* FPSCR bits */
 enum {
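The SR mask defines above are bit-for-bit identical to the enum constants they replace; a hypothetical set of C11 compile-time checks (not in the commit) would confirm the mapping:

_Static_assert(T_MASK == 0x00000001, "T mask changed");
_Static_assert(S_MASK == 0x00000002, "S mask changed");
_Static_assert(I_MASK == 0x000000f0, "I mask changed");
_Static_assert(Q_MASK == 0x00000100, "Q mask changed");
_Static_assert(M_MASK == 0x00000200, "M mask changed");
_Static_assert(FD_MASK == 0x00008000, "FD mask changed");
_Static_assert(BL_MASK == 0x10000000, "BL mask changed");
_Static_assert(RB_MASK == 0x20000000, "RB mask changed");
_Static_assert(MD_MASK == 0x40000000, "MD mask changed");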
@@ -38,41 +44,37 @@ enum {
 };

 struct sh4_ctx {
-  /*
-   * there are 24 32-bit general registers, r0_bank0-r7_bank0, r0_bank1-r7_bank1
-   * and r8-r15. r contains the active bank's r0-r7 as well as r8-r15. ralt
-   * contains the inactive bank's r0-r7 and is swapped in when the processor
-   * mode changes
-   */
+  /* there are 24 32-bit general registers, r0_bank0-r7_bank0, r0_bank1-r7_bank1
+     and r8-r15. r contains the active bank's r0-r7 as well as r8-r15. ralt
+     contains the inactive bank's r0-r7 and is swapped in when the processor
+     mode changes */
   uint32_t r[16], ralt[8];

-  /*
-   * there are 32 32-bit floating point registers, fr0-fr15 and xf0-xf15. these
-   * registers are banked, and swapped with eachother when the bank bit of
-   * FPSCR changes. in addition, fr0-fr15 can be used as the eight registers
-   * dr0/2/4/6/8/10/12/14 (double-precision, or pair registers) or the four
-   * registers fv0/4/8/12 (vector registers). while xf0-xf15 can be used as
-   * the eight registers xd0/2/4/6/8/10/12/14 (pair registers) or register
-   * matrix XMTRX
-   *
-   * note, the sh4 does not support endian conversion for 64-bit data.
-   * therefore, if 64-bit floating point access is performed in little endian
-   * mode, the upper and lower 32 bits will be reversed. for example, dr2
-   * aliases fr2 and fr3, but fr3 is actually the low-order word
-   *
-   * in order to avoid swapping the words in every double-precision opcode, the
-   * mapping for each pair of single-precision registers is instead swapped by
-   * XOR'ing the actual index with 1. for example, fr2 becomes fr[3] and fr3
-   * becomes fr[2], enabling dr2 to perfectly alias fr[2]
-   *
-   * note note, this incorrectly causes fv registers to be swizzled. fv0 should
-   * be loaded as {fr0, fr1, fr2, fr3} but it's actually loaded as
-   * {fr1, fr0, fr3, fr2}. however, due to the way the FV registers are
-   * used (FIPR and FTRV) this doesn't actually affect the results
-   */
+  /* there are 32 32-bit floating point registers, fr0-fr15 and xf0-xf15. these
+     registers are banked, and swapped with each other when the bank bit of
+     FPSCR changes. in addition, fr0-fr15 can be used as the eight registers
+     dr0/2/4/6/8/10/12/14 (double-precision, or pair registers) or the four
+     registers fv0/4/8/12 (vector registers). while xf0-xf15 can be used as
+     the eight registers xd0/2/4/6/8/10/12/14 (pair registers) or register
+     matrix XMTRX
+
+     note, the sh4 does not support endian conversion for 64-bit data.
+     therefore, if 64-bit floating point access is performed in little endian
+     mode, the upper and lower 32 bits will be reversed. for example, dr2
+     aliases fr2 and fr3, but fr3 is actually the low-order word
+
+     in order to avoid swapping the words in every double-precision opcode, the
+     mapping for each pair of single-precision registers is instead swapped by
+     XOR'ing the actual index with 1. for example, fr2 becomes fr[3] and fr3
+     becomes fr[2], enabling dr2 to perfectly alias fr[2]
+
+     also note, this incorrectly causes fv registers to be swizzled. fv0 should
+     be loaded as {fr0, fr1, fr2, fr3} but it's actually loaded as
+     {fr1, fr0, fr3, fr2}. however, due to the way the FV registers are
+     used (FIPR and FTRV) this doesn't actually affect the results */
   uint32_t fr[16], xf[16];
-  uint32_t pc, pr, sr, sr_qm, fpscr;
+  uint32_t pc, pr, sr, sr_t, sr_s, sr_qm, fpscr;
   uint32_t dbr, gbr, vbr;
   uint32_t fpul, mach, macl;
   uint32_t sgr, spc, ssr;
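The XOR'd index mapping described in the comment can be made concrete with a pair of hypothetical accessors (illustrative only, not part of the commit):

#include <string.h>

/* fr2 lives at fr[3] and fr3 at fr[2], so the 64-bit dr2 aliases
   &fr[2] directly in memory order */
static uint32_t read_fr(struct sh4_ctx *ctx, int n) {
  return ctx->fr[n ^ 1];
}

static double read_dr(struct sh4_ctx *ctx, int n) {
  double d; /* n is even; dr{n} spans fr[n] and fr[n + 1] */
  memcpy(&d, &ctx->fr[n], sizeof(d));
  return d;
}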

View File

@@ -53,9 +53,11 @@ static emit_cb emit_callbacks[NUM_SH4_OPS] = {
 #define load_xfr(n, type) ir_load_xfr(ir, n, type)
 #define store_xfr(n, v) ir_store_xfr(ir, n, v)
 #define load_sr() ir_load_sr(ir)
-#define store_sr(v, update) ir_store_sr(frontend, ir, v, update)
+#define store_sr(v) ir_store_sr(frontend, ir, v)
 #define load_t() ir_load_t(ir)
 #define store_t(v) ir_store_t(frontend, ir, v)
+#define load_s() ir_load_s(ir)
+#define store_s(v) ir_store_s(frontend, ir, v)
 #define load_gbr() ir_load_gbr(ir)
 #define store_gbr(v) ir_store_gbr(ir, v)
 #define load_fpscr() ir_load_fpscr(ir)
@@ -120,41 +122,63 @@ static void ir_store_xfr(struct ir *ir, int n, struct ir_value *v) {
 }

 static struct ir_value *ir_load_sr(struct ir *ir) {
-  return ir_load_context(ir, offsetof(struct sh4_ctx, sr), VALUE_I32);
+  struct ir_value *sr =
+      ir_load_context(ir, offsetof(struct sh4_ctx, sr), VALUE_I32);
+
+  /* inlined version of sh4_implode_sr */
+  struct ir_value *sr_t =
+      ir_load_context(ir, offsetof(struct sh4_ctx, sr_t), VALUE_I32);
+  struct ir_value *sr_s =
+      ir_load_context(ir, offsetof(struct sh4_ctx, sr_s), VALUE_I32);
+  sr = ir_and(ir, sr, ir_alloc_i32(ir, ~(S_MASK | T_MASK)));
+  sr = ir_or(ir, sr, sr_t);
+  sr = ir_or(ir, sr, ir_shli(ir, sr_s, S_BIT));
+
+  return sr;
 }

 static void ir_store_sr(struct sh4_frontend *frontend, struct ir *ir,
-                        struct ir_value *v, int update) {
-  CHECK_EQ(v->type, VALUE_I32);
+                        struct ir_value *sr) {
+  CHECK_EQ(sr->type, VALUE_I32);

-  struct ir_value *sr_updated = NULL;
-  struct ir_value *data = NULL;
-  struct ir_value *old_sr = NULL;
-
-  if (update) {
-    sr_updated = ir_alloc_ptr(ir, frontend->sr_updated);
-    data = ir_alloc_ptr(ir, frontend->data);
-    old_sr = load_sr();
-  }
+  struct ir_value *sr_updated = ir_alloc_ptr(ir, frontend->sr_updated);
+  struct ir_value *data = ir_alloc_ptr(ir, frontend->data);
+  struct ir_value *old_sr = load_sr();

-  ir_store_context(ir, offsetof(struct sh4_ctx, sr), v);
+  ir_store_context(ir, offsetof(struct sh4_ctx, sr), sr);

-  if (update) {
-    ir_call_2(ir, sr_updated, data, old_sr);
-  }
+  /* inline version of sh4_explode_sr */
+  struct ir_value *sr_t = ir_and(ir, sr, ir_alloc_i32(ir, T_MASK));
+  struct ir_value *sr_s =
+      ir_lshri(ir, ir_and(ir, sr, ir_alloc_i32(ir, S_MASK)), S_BIT);
+  ir_store_context(ir, offsetof(struct sh4_ctx, sr_t), sr_t);
+  ir_store_context(ir, offsetof(struct sh4_ctx, sr_s), sr_s);
+
+  /* TODO inline the check to see if RB, I or BL bits changed */
+  ir_call_2(ir, sr_updated, data, old_sr);
 }

 static struct ir_value *ir_load_t(struct ir *ir) {
-  return ir_and(ir, load_sr(), ir_alloc_i32(ir, T));
+  return ir_load_context(ir, offsetof(struct sh4_ctx, sr_t), VALUE_I32);
 }

 static void ir_store_t(struct sh4_frontend *frontend, struct ir *ir,
                        struct ir_value *v) {
-  struct ir_value *sr = load_sr();
-  struct ir_value *sr_t = ir_or(ir, sr, ir_alloc_i32(ir, T));
-  struct ir_value *sr_not = ir_and(ir, sr, ir_alloc_i32(ir, ~T));
-  struct ir_value *res = ir_select(ir, v, sr_t, sr_not);
-  store_sr(res, 0);
+  /* zext the results of ir_cmp_* */
+  if (v->type != VALUE_I32) {
+    v = ir_zext(ir, v, VALUE_I32);
+  }
+
+  ir_store_context(ir, offsetof(struct sh4_ctx, sr_t), v);
 }
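With T held as a plain 0/1 context member, comparison emitters can hand their flag straight to store_t, and the zext above normalizes the narrow value an ir_cmp_* helper produces. An equality compare would look something like this (a sketch; ir_cmp_eq and the exact CMP/EQ emitter are assumptions, as neither appears in this diff):

EMITTER(CMPEQ) {
  struct ir_value *rm = load_gpr(i->Rm, VALUE_I32);
  struct ir_value *rn = load_gpr(i->Rn, VALUE_I32);
  /* the cmp result is narrower than i32; ir_store_t zexts it */
  store_t(ir_cmp_eq(ir, rn, rm));
}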
+static struct ir_value *ir_load_s(struct ir *ir) {
+  return ir_load_context(ir, offsetof(struct sh4_ctx, sr_s), VALUE_I32);
+}
+
+static void ir_store_s(struct sh4_frontend *frontend, struct ir *ir,
+                       struct ir_value *v) {
+  CHECK_EQ(v->type, VALUE_I32);
+
+  ir_store_context(ir, offsetof(struct sh4_ctx, sr_s), v);
+}
+
 static struct ir_value *ir_load_gbr(struct ir *ir) {
@@ -609,6 +633,7 @@ EMITTER(ADDC) {
   struct ir_value *not_v = ir_not(ir, v);
   struct ir_value *carry = ir_and(ir, or_rnrm, not_v);
   carry = ir_or(ir, and_rnrm, carry);
+  carry = ir_lshri(ir, carry, 31);
   store_t(carry);
 }
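The added shift makes the flag an explicit 0/1 for the new sr_t member. The carry chain itself is the standard bit-twiddling identity for the carry-out of rn + rm + t; in plain C (illustrative, with hypothetical scalar variables):

uint32_t sum = rn + rm + t;
uint32_t carry_out = ((rn & rm) | ((rn | rm) & ~sum)) >> 31;

Bit 31 of (rn & rm) | ((rn | rm) & ~sum) is set exactly when the addition carries out of the word, which is what ADDC records in T.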
@@ -624,7 +649,8 @@ EMITTER(ADDV) {
   /* compute overflow flag, taken from Hacker's Delight */
   struct ir_value *xor_vrn = ir_xor(ir, v, rn);
   struct ir_value *xor_vrm = ir_xor(ir, v, rm);
-  struct ir_value *overflow = ir_lshri(ir, ir_and(ir, xor_vrn, xor_vrm), 31);
+  struct ir_value *overflow = ir_and(ir, xor_vrn, xor_vrm);
+  overflow = ir_lshri(ir, overflow, 31);
   store_t(overflow);
 }
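ADDV's flag is the Hacker's Delight signed-overflow test: overflow occurred iff the result's sign differs from the signs of both operands. In plain C (illustrative):

uint32_t sum = rn + rm;
uint32_t overflow = ((sum ^ rn) & (sum ^ rm)) >> 31;

SUBV further down uses the subtraction variant, ((rn ^ rm) & (diff ^ rn)) >> 31, where diff = rn - rm.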
@@ -731,7 +757,8 @@ EMITTER(DIV0S) {
   ir_store_context(ir, offsetof(struct sh4_ctx, sr_qm), ir_not(ir, qm));

   /* msb of Q ^ M -> T */
-  store_t(ir_lshri(ir, qm, 31));
+  struct ir_value *t = ir_lshri(ir, qm, 31);
+  store_t(t);
 }

 // code cycles t-bit
@@ -740,8 +767,7 @@
 EMITTER(DIV0U) {
   ir_store_context(ir, offsetof(struct sh4_ctx, sr_qm),
                    ir_alloc_i32(ir, 0x80000000));
-  store_sr(ir_and(ir, load_sr(), ir_alloc_i32(ir, ~T)), 0);
+  store_t(ir_alloc_i32(ir, 0));
 }

 // code cycles t-bit
@@ -780,7 +806,8 @@ EMITTER(DIV1) {
   ir_store_context(ir, offsetof(struct sh4_ctx, sr_qm), qm);

   /* set T to output bit (which happens to be Q == M) */
-  store_t(ir_lshri(ir, qm, 31));
+  struct ir_value *t = ir_lshri(ir, qm, 31);
+  store_t(t);
 }

 // DMULS.L Rm,Rn
@@ -893,6 +920,7 @@ EMITTER(NEGC) {
   struct ir_value *v = ir_sub(ir, ir_neg(ir, rm), t);
   store_gpr(i->Rn, v);
   struct ir_value *carry = ir_or(ir, t, rm);
+  carry = ir_lshri(ir, carry, 31);
   store_t(carry);
 }
@@ -916,6 +944,7 @@ EMITTER(SUBC) {
   struct ir_value *l = ir_and(ir, ir_not(ir, rn), rm);
   struct ir_value *r = ir_and(ir, ir_or(ir, ir_not(ir, rn), rm), v);
   struct ir_value *carry = ir_or(ir, l, r);
+  carry = ir_lshri(ir, carry, 31);
   store_t(carry);
 }
@@ -929,7 +958,8 @@ EMITTER(SUBV) {
   // compute overflow flag, taken from Hacker's Delight
   struct ir_value *xor_rnrm = ir_xor(ir, rn, rm);
   struct ir_value *xor_vrn = ir_xor(ir, v, rn);
-  struct ir_value *overflow = ir_lshri(ir, ir_and(ir, xor_rnrm, xor_vrn), 31);
+  struct ir_value *overflow = ir_and(ir, xor_rnrm, xor_vrn);
+  overflow = ir_lshri(ir, overflow, 31);
   store_t(overflow);
 }
@@ -1319,9 +1349,7 @@ EMITTER(CLRMAC) {
 }

 EMITTER(CLRS) {
-  struct ir_value *sr = load_sr();
-  sr = ir_and(ir, sr, ir_alloc_i32(ir, ~S));
-  store_sr(sr, 1);
+  store_s(ir_alloc_i32(ir, 0));
 }

 // code cycles t-bit
@@ -1334,7 +1362,7 @@ EMITTER(CLRT) {

 // LDC Rm,SR
 EMITTER(LDCSR) {
   struct ir_value *rm = load_gpr(i->Rm, VALUE_I32);
-  store_sr(rm, 1);
+  store_sr(rm);
 }

 // LDC Rm,GBR
@@ -1378,7 +1406,7 @@ EMITTER(LDCRBANK) {
 EMITTER(LDCMSR) {
   struct ir_value *addr = load_gpr(i->Rm, VALUE_I32);
   struct ir_value *v = load_guest(addr, VALUE_I32);
-  store_sr(v, 1);
+  store_sr(v);

   /* reload Rm, sr store could have swapped banks */
   addr = load_gpr(i->Rm, VALUE_I32);
   addr = ir_add(ir, addr, ir_alloc_i32(ir, 4));
@@ -1524,15 +1552,14 @@ EMITTER(RTE) {
       ir_load_context(ir, offsetof(struct sh4_ctx, spc), VALUE_I32);
   struct ir_value *ssr =
       ir_load_context(ir, offsetof(struct sh4_ctx, ssr), VALUE_I32);
-  store_sr(ssr, 1);
+  store_sr(ssr);
   emit_delay_instr();
   branch(spc);
 }

 // SETS
 EMITTER(SETS) {
-  struct ir_value *sr = ir_or(ir, load_sr(), ir_alloc_i32(ir, S));
-  store_sr(sr, 1);
+  store_s(ir_alloc_i32(ir, 1));
 }

 // SETT

View File

@@ -107,6 +107,9 @@ static void run_sh4_test(struct dreamcast *dc, const struct sh4_test *test) {
     dc_tick(dc, 1);
   }

+  /* ensure sh4 sr is up to date before testing against it */
+  sh4_implode_sr(dc->sh4);
+
   /* validate out registers */
   for (int i = 0; i < sh4_num_test_regs; i++) {
     struct sh4_test_reg *reg = &sh4_test_regs[i];
@@ -138,7 +141,7 @@ TEST(sh4_x64) {
     {0}, \
     {fr1, fr0, fr3, fr2, fr5, fr4, fr7, fr6, fr9, fr8, fr11, fr10, fr13, fr12, fr15, fr14}, \
     {xf1, xf0, xf3, xf2, xf5, xf4, xf7, xf6, xf9, xf8, xf11, xf10, xf13, xf12, xf15, xf14}, \
-    0, 0, 0, 0, fpscr, \
+    0, 0, 0, 0, 0, 0, fpscr, \
     0, 0, 0, \
     0, 0, 0, \
     0, 0, 0, \