separated the T and S bits of the sh4 status register into their own context members, sr_t and sr_s, reducing codegen by around 15%

Anthony Pesch 2016-12-30 15:41:29 -08:00
parent d658bb2bfe
commit b2fd497cda
6 changed files with 143 additions and 95 deletions

View File

@@ -63,11 +63,12 @@ void sh4_sr_updated(void *data, uint32_t old_sr) {
   prof_counter_add(COUNTER_sh4_sr_updates, 1);

-  if ((ctx->sr & RB) != (old_sr & RB)) {
+  if ((ctx->sr & RB_MASK) != (old_sr & RB_MASK)) {
     sh4_swap_gpr_bank(sh4);
   }

-  if ((ctx->sr & I) != (old_sr & I) || (ctx->sr & BL) != (old_sr & BL)) {
+  if ((ctx->sr & I_MASK) != (old_sr & I_MASK) ||
+      (ctx->sr & BL_MASK) != (old_sr & BL_MASK)) {
     sh4_intc_update_pending(sh4);
   }
 }
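sh4_swap_gpr_bank is what keeps the RB check above cheap: rather than indirecting through a bank pointer on every register access, the inactive bank is physically exchanged whenever RB flips. A minimal sketch of that idea (the actual implementation may differ):

static void swap_gpr_bank_sketch(struct sh4_ctx *ctx) {
  /* exchange the active bank's r0-r7 with the inactive copies in ralt */
  for (int i = 0; i < 8; i++) {
    uint32_t tmp = ctx->r[i];
    ctx->r[i] = ctx->ralt[i];
    ctx->ralt[i] = tmp;
  }
}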
@@ -205,6 +206,16 @@ static void sh4_translate(void *data, uint32_t addr, struct ir *ir, int fastmem,
   *size = as.size;
 }

+void sh4_implode_sr(struct sh4 *sh4) {
+  sh4->ctx.sr &= ~(S_MASK | T_MASK);
+  sh4->ctx.sr |= (sh4->ctx.sr_s << S_BIT) | (sh4->ctx.sr_t << T_BIT);
+}
+
+void sh4_explode_sr(struct sh4 *sh4) {
+  sh4->ctx.sr_t = (sh4->ctx.sr & T_MASK) >> T_BIT;
+  sh4->ctx.sr_s = (sh4->ctx.sr & S_MASK) >> S_BIT;
+}
+
 void sh4_clear_interrupt(struct sh4 *sh4, enum sh4_interrupt intr) {
   sh4->requested_interrupts &= ~sh4->sort_id[intr];

   sh4_intc_update_pending(sh4);
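sh4_implode_sr and sh4_explode_sr are inverses over the T and S fields: implode packs sr_t/sr_s back into sr, explode pulls them out again. A hypothetical round-trip check (not part of the commit) makes the invariant concrete:

#include <assert.h>

static void check_sr_round_trip(struct sh4 *sh4) {
  sh4->ctx.sr_t = 1;
  sh4->ctx.sr_s = 0;
  sh4_implode_sr(sh4);
  assert((sh4->ctx.sr & T_MASK) != 0 && (sh4->ctx.sr & S_MASK) == 0);
  sh4_explode_sr(sh4);
  assert(sh4->ctx.sr_t == 1 && sh4->ctx.sr_s == 0);
}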

View File

@@ -83,6 +83,8 @@ void sh4_destroy(struct sh4 *sh);
 void sh4_reset(struct sh4 *sh4, uint32_t pc);
 void sh4_raise_interrupt(struct sh4 *sh, enum sh4_interrupt intr);
 void sh4_clear_interrupt(struct sh4 *sh, enum sh4_interrupt intr);
+void sh4_explode_sr(struct sh4 *sh4);
+void sh4_implode_sr(struct sh4 *sh4);
 void sh4_sr_updated(void *data, uint32_t old_sr);
 void sh4_fpscr_updated(void *data, uint32_t old_fpscr);

View File

@@ -12,9 +12,9 @@ static struct sh4_interrupt_info sh4_interrupts[NUM_SH_INTERRUPTS] = {
 };

 void sh4_intc_update_pending(struct sh4 *sh4) {
-  int min_priority = (sh4->ctx.sr & I) >> 4;
+  int min_priority = (sh4->ctx.sr & I_MASK) >> I_BIT;
   uint64_t priority_mask =
-      (sh4->ctx.sr & BL) ? 0 : ~sh4->priority_mask[min_priority];
+      (sh4->ctx.sr & BL_MASK) ? 0 : ~sh4->priority_mask[min_priority];
   sh4->ctx.pending_interrupts = sh4->requested_interrupts & priority_mask;
 }
@@ -28,11 +28,14 @@ int sh4_intc_check_pending(struct sh4 *sh4) {
   enum sh4_interrupt intr = sh4->sorted_interrupts[n];
   struct sh4_interrupt_info *int_info = &sh4_interrupts[intr];

+  /* ensure sr is up to date */
+  sh4_implode_sr(sh4);
+
   *sh4->INTEVT = int_info->intevt;
   sh4->ctx.ssr = sh4->ctx.sr;
   sh4->ctx.spc = sh4->ctx.pc;
   sh4->ctx.sgr = sh4->ctx.r[15];
-  sh4->ctx.sr |= (BL | MD | RB);
+  sh4->ctx.sr |= (BL_MASK | MD_MASK | RB_MASK);
   sh4->ctx.pc = sh4->ctx.vbr + 0x600;

   sh4_sr_updated(sh4, sh4->ctx.ssr);

View File

@@ -4,29 +4,35 @@
 #include <stdint.h>

 /* SR bits */
-enum {
 /* true / false condition or carry/borrow bit */
-  T = 0x00000001,
+#define T_BIT 0
 /* specifies a saturation operation for a MAC instruction */
-  S = 0x00000002,
+#define S_BIT 1
 /* interrupt mask level */
-  I = 0x000000f0,
+#define I_BIT 4
 /* used by the DIV0S, DIV0U, and DIV1 instructions */
-  Q = 0x00000100,
-  /* used by the DIV0S, DIV0U, and DIV1 instructions */
-  M = 0x00000200,
+#define Q_BIT 8
+#define M_BIT 9
 /* an FPU instr causes a general FPU disable exception */
-  FD = 0x00008000,
+#define FD_BIT 15
 /* interrupt requests are masked */
-  BL = 0x10000000,
-  /*
-   * general register bank specifier in privileged mode (set
-   * to 1 by a reset, exception, or interrupt)
-   */
-  RB = 0x20000000,
+#define BL_BIT 28
+/* general register bank specifier in privileged mode (set
+   to 1 by a reset, exception, or interrupt) */
+#define RB_BIT 29
 /* processor mode (0 is user mode, 1 is privileged mode) */
-  MD = 0x40000000
-};
+#define MD_BIT 30
+
+#define T_MASK (1u << T_BIT)
+#define S_MASK (1u << S_BIT)
+#define I_MASK 0xf0
+#define Q_MASK (1u << Q_BIT)
+#define M_MASK (1u << M_BIT)
+#define FD_MASK (1u << FD_BIT)
+#define BL_MASK (1u << BL_BIT)
+#define RB_MASK (1u << RB_BIT)
+#define MD_MASK (1u << MD_BIT)

 /* FPSCR bits */
 enum {
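The SR mask defines above are bit-for-bit identical to the enum constants they replace; a hypothetical set of C11 compile-time checks (not in the commit) would confirm the mapping:

_Static_assert(T_MASK == 0x00000001, "T mask changed");
_Static_assert(S_MASK == 0x00000002, "S mask changed");
_Static_assert(I_MASK == 0x000000f0, "I mask changed");
_Static_assert(Q_MASK == 0x00000100, "Q mask changed");
_Static_assert(M_MASK == 0x00000200, "M mask changed");
_Static_assert(FD_MASK == 0x00008000, "FD mask changed");
_Static_assert(BL_MASK == 0x10000000, "BL mask changed");
_Static_assert(RB_MASK == 0x20000000, "RB mask changed");
_Static_assert(MD_MASK == 0x40000000, "MD mask changed");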
@@ -38,41 +44,37 @@ enum {
 };

 struct sh4_ctx {
-  /*
-   * there are 24 32-bit general registers, r0_bank0-r7_bank0, r0_bank1-r7_bank1
-   * and r8-r15. r contains the active bank's r0-r7 as well as r8-r15. ralt
-   * contains the inactive bank's r0-r7 and is swapped in when the processor
-   * mode changes
-   */
+  /* there are 24 32-bit general registers, r0_bank0-r7_bank0, r0_bank1-r7_bank1
+     and r8-r15. r contains the active bank's r0-r7 as well as r8-r15. ralt
+     contains the inactive bank's r0-r7 and is swapped in when the processor
+     mode changes */
   uint32_t r[16], ralt[8];

-  /*
-   * there are 32 32-bit floating point registers, fr0-fr15 and xf0-xf15. these
-   * registers are banked, and swapped with eachother when the bank bit of
-   * FPSCR changes. in addition, fr0-fr15 can be used as the eight registers
-   * dr0/2/4/6/8/10/12/14 (double-precision, or pair registers) or the four
-   * registers fv0/4/8/12 (vector registers). while xf0-xf15 can be used as
-   * the eight registers xd0/2/4/6/8/10/12/14 (pair registers) or register
-   * matrix XMTRX
-   *
-   * note, the sh4 does not support endian conversion for 64-bit data.
-   * therefore, if 64-bit floating point access is performed in little endian
-   * mode, the upper and lower 32 bits will be reversed. for example, dr2
-   * aliases fr2 and fr3, but fr3 is actually the low-order word
-   *
-   * in order to avoid swapping the words in every double-precision opcode, the
-   * mapping for each pair of single-precision registers is instead swapped by
-   * XOR'ing the actual index with 1. for example, fr2 becomes fr[3] and fr3
-   * becomes fr[2], enabling dr2 to perfectly alias fr[2]
-   *
-   * note note, this incorrectly causes fv registers to be swizzled. fv0 should
-   * be loaded as {fr0, fr1, fr2, fr3} but it's actually loaded as
-   * {fr1, fr0, fr3, fr2}. however, due to the way the FV registers are
-   * used (FIPR and FTRV) this doesn't actually affect the results
-   */
+  /* there are 32 32-bit floating point registers, fr0-fr15 and xf0-xf15. these
+     registers are banked, and swapped with each other when the bank bit of
+     FPSCR changes. in addition, fr0-fr15 can be used as the eight registers
+     dr0/2/4/6/8/10/12/14 (double-precision, or pair registers) or the four
+     registers fv0/4/8/12 (vector registers). while xf0-xf15 can be used as
+     the eight registers xd0/2/4/6/8/10/12/14 (pair registers) or register
+     matrix XMTRX
+
+     note, the sh4 does not support endian conversion for 64-bit data.
+     therefore, if 64-bit floating point access is performed in little endian
+     mode, the upper and lower 32 bits will be reversed. for example, dr2
+     aliases fr2 and fr3, but fr3 is actually the low-order word
+
+     in order to avoid swapping the words in every double-precision opcode, the
+     mapping for each pair of single-precision registers is instead swapped by
+     XOR'ing the actual index with 1. for example, fr2 becomes fr[3] and fr3
+     becomes fr[2], enabling dr2 to perfectly alias fr[2]
+
+     also note, this incorrectly causes fv registers to be swizzled. fv0 should
+     be loaded as {fr0, fr1, fr2, fr3} but it's actually loaded as
+     {fr1, fr0, fr3, fr2}. however, due to the way the FV registers are
+     used (FIPR and FTRV) this doesn't actually affect the results */
   uint32_t fr[16], xf[16];
-  uint32_t pc, pr, sr, sr_qm, fpscr;
+  uint32_t pc, pr, sr, sr_t, sr_s, sr_qm, fpscr;
   uint32_t dbr, gbr, vbr;
   uint32_t fpul, mach, macl;
   uint32_t sgr, spc, ssr;
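The XOR'd index mapping described in the comment can be made concrete with a pair of hypothetical accessors (illustrative only, not part of the commit):

#include <string.h>

/* fr2 lives at fr[3] and fr3 at fr[2], so the 64-bit dr2 aliases
   &fr[2] directly in memory order */
static uint32_t read_fr(struct sh4_ctx *ctx, int n) {
  return ctx->fr[n ^ 1];
}

static double read_dr(struct sh4_ctx *ctx, int n) {
  double d; /* n is even; dr{n} spans fr[n] and fr[n + 1] */
  memcpy(&d, &ctx->fr[n], sizeof(d));
  return d;
}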

View File

@@ -53,9 +53,11 @@ static emit_cb emit_callbacks[NUM_SH4_OPS] = {
 #define load_xfr(n, type) ir_load_xfr(ir, n, type)
 #define store_xfr(n, v) ir_store_xfr(ir, n, v)
 #define load_sr() ir_load_sr(ir)
-#define store_sr(v, update) ir_store_sr(frontend, ir, v, update)
+#define store_sr(v) ir_store_sr(frontend, ir, v)
 #define load_t() ir_load_t(ir)
 #define store_t(v) ir_store_t(frontend, ir, v)
+#define load_s() ir_load_s(ir)
+#define store_s(v) ir_store_s(frontend, ir, v)
 #define load_gbr() ir_load_gbr(ir)
 #define store_gbr(v) ir_store_gbr(ir, v)
 #define load_fpscr() ir_load_fpscr(ir)
@@ -120,41 +122,63 @@ static void ir_store_xfr(struct ir *ir, int n, struct ir_value *v) {
 }

 static struct ir_value *ir_load_sr(struct ir *ir) {
-  return ir_load_context(ir, offsetof(struct sh4_ctx, sr), VALUE_I32);
+  struct ir_value *sr =
+      ir_load_context(ir, offsetof(struct sh4_ctx, sr), VALUE_I32);
+
+  /* inlined version of sh4_implode_sr */
+  struct ir_value *sr_t =
+      ir_load_context(ir, offsetof(struct sh4_ctx, sr_t), VALUE_I32);
+  struct ir_value *sr_s =
+      ir_load_context(ir, offsetof(struct sh4_ctx, sr_s), VALUE_I32);
+  sr = ir_and(ir, sr, ir_alloc_i32(ir, ~(S_MASK | T_MASK)));
+  sr = ir_or(ir, sr, sr_t);
+  sr = ir_or(ir, sr, ir_shli(ir, sr_s, S_BIT));
+
+  return sr;
 }

 static void ir_store_sr(struct sh4_frontend *frontend, struct ir *ir,
-                        struct ir_value *v, int update) {
-  CHECK_EQ(v->type, VALUE_I32);
+                        struct ir_value *sr) {
+  CHECK_EQ(sr->type, VALUE_I32);

-  struct ir_value *sr_updated = NULL;
-  struct ir_value *data = NULL;
-  struct ir_value *old_sr = NULL;
-
-  if (update) {
-    sr_updated = ir_alloc_ptr(ir, frontend->sr_updated);
-    data = ir_alloc_ptr(ir, frontend->data);
-    old_sr = load_sr();
-  }
+  struct ir_value *sr_updated = ir_alloc_ptr(ir, frontend->sr_updated);
+  struct ir_value *data = ir_alloc_ptr(ir, frontend->data);
+  struct ir_value *old_sr = load_sr();

-  ir_store_context(ir, offsetof(struct sh4_ctx, sr), v);
+  ir_store_context(ir, offsetof(struct sh4_ctx, sr), sr);

-  if (update) {
-    ir_call_2(ir, sr_updated, data, old_sr);
-  }
+  /* inline version of sh4_explode_sr */
+  struct ir_value *sr_t = ir_and(ir, sr, ir_alloc_i32(ir, T_MASK));
+  struct ir_value *sr_s =
+      ir_lshri(ir, ir_and(ir, sr, ir_alloc_i32(ir, S_MASK)), S_BIT);
+  ir_store_context(ir, offsetof(struct sh4_ctx, sr_t), sr_t);
+  ir_store_context(ir, offsetof(struct sh4_ctx, sr_s), sr_s);
+
+  /* TODO inline the check to see if RB, I or BL bits changed */
+  ir_call_2(ir, sr_updated, data, old_sr);
 }

 static struct ir_value *ir_load_t(struct ir *ir) {
-  return ir_and(ir, load_sr(), ir_alloc_i32(ir, T));
+  return ir_load_context(ir, offsetof(struct sh4_ctx, sr_t), VALUE_I32);
 }

 static void ir_store_t(struct sh4_frontend *frontend, struct ir *ir,
                        struct ir_value *v) {
-  struct ir_value *sr = load_sr();
-  struct ir_value *sr_t = ir_or(ir, sr, ir_alloc_i32(ir, T));
-  struct ir_value *sr_not = ir_and(ir, sr, ir_alloc_i32(ir, ~T));
-  struct ir_value *res = ir_select(ir, v, sr_t, sr_not);
-  store_sr(res, 0);
+  /* zext the results of ir_cmp_* */
+  if (v->type != VALUE_I32) {
+    v = ir_zext(ir, v, VALUE_I32);
+  }
+
+  ir_store_context(ir, offsetof(struct sh4_ctx, sr_t), v);
 }
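With T held as a plain 0/1 context member, comparison emitters can hand their flag straight to store_t, and the zext above normalizes the narrow value an ir_cmp_* helper produces. An equality compare would look something like this (a sketch; ir_cmp_eq and the exact CMP/EQ emitter are assumptions, as neither appears in this diff):

EMITTER(CMPEQ) {
  struct ir_value *rm = load_gpr(i->Rm, VALUE_I32);
  struct ir_value *rn = load_gpr(i->Rn, VALUE_I32);
  /* the cmp result is narrower than i32; ir_store_t zexts it */
  store_t(ir_cmp_eq(ir, rn, rm));
}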
+static struct ir_value *ir_load_s(struct ir *ir) {
+  return ir_load_context(ir, offsetof(struct sh4_ctx, sr_s), VALUE_I32);
+}
+
+static void ir_store_s(struct sh4_frontend *frontend, struct ir *ir,
+                       struct ir_value *v) {
+  CHECK_EQ(v->type, VALUE_I32);
+
+  ir_store_context(ir, offsetof(struct sh4_ctx, sr_s), v);
+}
+
 static struct ir_value *ir_load_gbr(struct ir *ir) {
@@ -609,6 +633,7 @@ EMITTER(ADDC) {
   struct ir_value *not_v = ir_not(ir, v);
   struct ir_value *carry = ir_and(ir, or_rnrm, not_v);
   carry = ir_or(ir, and_rnrm, carry);
+  carry = ir_lshri(ir, carry, 31);
   store_t(carry);
 }
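The added shift makes the flag an explicit 0/1 for the new sr_t member. The carry chain itself is the standard bit-twiddling identity for the carry-out of rn + rm + t; in plain C (illustrative, with hypothetical scalar variables):

uint32_t sum = rn + rm + t;
uint32_t carry_out = ((rn & rm) | ((rn | rm) & ~sum)) >> 31;

Bit 31 of (rn & rm) | ((rn | rm) & ~sum) is set exactly when the addition carries out of the word, which is what ADDC records in T.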
@@ -624,7 +649,8 @@ EMITTER(ADDV) {
   /* compute overflow flag, taken from Hacker's Delight */
   struct ir_value *xor_vrn = ir_xor(ir, v, rn);
   struct ir_value *xor_vrm = ir_xor(ir, v, rm);
-  struct ir_value *overflow = ir_lshri(ir, ir_and(ir, xor_vrn, xor_vrm), 31);
+  struct ir_value *overflow = ir_and(ir, xor_vrn, xor_vrm);
+  overflow = ir_lshri(ir, overflow, 31);
   store_t(overflow);
 }
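ADDV's flag is the Hacker's Delight signed-overflow test: overflow occurred iff the result's sign differs from the signs of both operands. In plain C (illustrative):

uint32_t sum = rn + rm;
uint32_t overflow = ((sum ^ rn) & (sum ^ rm)) >> 31;

SUBV further down uses the subtraction variant, ((rn ^ rm) & (diff ^ rn)) >> 31, where diff = rn - rm.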
@@ -731,7 +757,8 @@ EMITTER(DIV0S) {
   ir_store_context(ir, offsetof(struct sh4_ctx, sr_qm), ir_not(ir, qm));

   /* msb of Q ^ M -> T */
-  store_t(ir_lshri(ir, qm, 31));
+  struct ir_value *t = ir_lshri(ir, qm, 31);
+  store_t(t);
 }

 // code cycles t-bit
@@ -740,8 +767,7 @@
 EMITTER(DIV0U) {
   ir_store_context(ir, offsetof(struct sh4_ctx, sr_qm),
                    ir_alloc_i32(ir, 0x80000000));
-  store_sr(ir_and(ir, load_sr(), ir_alloc_i32(ir, ~T)), 0);
+  store_t(ir_alloc_i32(ir, 0));
 }

 // code cycles t-bit
@@ -780,7 +806,8 @@ EMITTER(DIV1) {
   ir_store_context(ir, offsetof(struct sh4_ctx, sr_qm), qm);

   /* set T to output bit (which happens to be Q == M) */
-  store_t(ir_lshri(ir, qm, 31));
+  struct ir_value *t = ir_lshri(ir, qm, 31);
+  store_t(t);
 }

 // DMULS.L Rm,Rn
@@ -893,6 +920,7 @@ EMITTER(NEGC) {
   struct ir_value *v = ir_sub(ir, ir_neg(ir, rm), t);
   store_gpr(i->Rn, v);
   struct ir_value *carry = ir_or(ir, t, rm);
+  carry = ir_lshri(ir, carry, 31);
   store_t(carry);
 }
@@ -916,6 +944,7 @@ EMITTER(SUBC) {
   struct ir_value *l = ir_and(ir, ir_not(ir, rn), rm);
   struct ir_value *r = ir_and(ir, ir_or(ir, ir_not(ir, rn), rm), v);
   struct ir_value *carry = ir_or(ir, l, r);
+  carry = ir_lshri(ir, carry, 31);
   store_t(carry);
 }
@@ -929,7 +958,8 @@ EMITTER(SUBV) {
   // compute overflow flag, taken from Hacker's Delight
   struct ir_value *xor_rnrm = ir_xor(ir, rn, rm);
   struct ir_value *xor_vrn = ir_xor(ir, v, rn);
-  struct ir_value *overflow = ir_lshri(ir, ir_and(ir, xor_rnrm, xor_vrn), 31);
+  struct ir_value *overflow = ir_and(ir, xor_rnrm, xor_vrn);
+  overflow = ir_lshri(ir, overflow, 31);
   store_t(overflow);
 }
@@ -1319,9 +1349,7 @@ EMITTER(CLRMAC) {
 }

 EMITTER(CLRS) {
-  struct ir_value *sr = load_sr();
-  sr = ir_and(ir, sr, ir_alloc_i32(ir, ~S));
-  store_sr(sr, 1);
+  store_s(ir_alloc_i32(ir, 0));
 }

 // code cycles t-bit
@@ -1334,7 +1362,7 @@ EMITTER(CLRT) {

 // LDC Rm,SR
 EMITTER(LDCSR) {
   struct ir_value *rm = load_gpr(i->Rm, VALUE_I32);
-  store_sr(rm, 1);
+  store_sr(rm);
 }

 // LDC Rm,GBR
@@ -1378,7 +1406,7 @@ EMITTER(LDCRBANK) {
 EMITTER(LDCMSR) {
   struct ir_value *addr = load_gpr(i->Rm, VALUE_I32);
   struct ir_value *v = load_guest(addr, VALUE_I32);
-  store_sr(v, 1);
+  store_sr(v);

   /* reload Rm, sr store could have swapped banks */
   addr = load_gpr(i->Rm, VALUE_I32);
   addr = ir_add(ir, addr, ir_alloc_i32(ir, 4));
@@ -1524,15 +1552,14 @@ EMITTER(RTE) {
       ir_load_context(ir, offsetof(struct sh4_ctx, spc), VALUE_I32);
   struct ir_value *ssr =
       ir_load_context(ir, offsetof(struct sh4_ctx, ssr), VALUE_I32);
-  store_sr(ssr, 1);
+  store_sr(ssr);
   emit_delay_instr();
   branch(spc);
 }

 // SETS
 EMITTER(SETS) {
-  struct ir_value *sr = ir_or(ir, load_sr(), ir_alloc_i32(ir, S));
-  store_sr(sr, 1);
+  store_s(ir_alloc_i32(ir, 1));
 }

 // SETT

View File

@@ -107,6 +107,9 @@ static void run_sh4_test(struct dreamcast *dc, const struct sh4_test *test) {
     dc_tick(dc, 1);
   }

+  /* ensure sh4 sr is up to date before testing against it */
+  sh4_implode_sr(dc->sh4);
+
   /* validate out registers */
   for (int i = 0; i < sh4_num_test_regs; i++) {
     struct sh4_test_reg *reg = &sh4_test_regs[i];
@@ -138,7 +141,7 @@ TEST(sh4_x64) {
     {0}, \
     {fr1, fr0, fr3, fr2, fr5, fr4, fr7, fr6, fr9, fr8, fr11, fr10, fr13, fr12, fr15, fr14}, \
     {xf1, xf0, xf3, xf2, xf5, xf4, xf7, xf6, xf9, xf8, xf11, xf10, xf13, xf12, xf15, xf14}, \
-    0, 0, 0, 0, fpscr, \
+    0, 0, 0, 0, 0, 0, fpscr, \
     0, 0, 0, \
     0, 0, 0, \
     0, 0, 0, \