bsnes/higan/processor/arm7tdmi/instructions-arm.cpp

//performs the data-processing operation selected by mode
//  mode   - operation selector; the mnemonic for each value is noted beside its case
//  target - index of the register that receives the result
//  source - index of the register supplying the first operand
//  data   - value of the second operand
//modes 8-11 (TST, TEQ, CMP, CMN) evaluate the operation but discard the value it returns;
//every other mode stores that value into r(target)
auto ARM7TDMI::armALU(uint4 mode, uint4 target, uint4 source, uint32 data) -> void {
  //the first operand is sampled once, ahead of any write to r(target)
  uint32 rn = r(source);

  switch(mode) {
  case  0: r(target) = BIT(rn & data);          break;  //AND
  case  1: r(target) = BIT(rn ^ data);          break;  //EOR
  case  2: r(target) = SUB(rn, data, 1);        break;  //SUB
  case  3: r(target) = SUB(data, rn, 1);        break;  //RSB
  case  4: r(target) = ADD(rn, data, 0);        break;  //ADD
  case  5: r(target) = ADD(rn, data, cpsr().c); break;  //ADC
  case  6: r(target) = SUB(rn, data, cpsr().c); break;  //SBC
  case  7: r(target) = SUB(data, rn, cpsr().c); break;  //RSC
  case  8:             BIT(rn & data);          break;  //TST
  case  9:             BIT(rn ^ data);          break;  //TEQ
  case 10:             SUB(rn, data, 1);        break;  //CMP
  case 11:             ADD(rn, data, 0);        break;  //CMN
  case 12: r(target) = BIT(rn | data);          break;  //ORR
  case 13: r(target) = BIT(data);               break;  //MOV
  case 14: r(target) = BIT(rn & ~data);         break;  //BIC
  case 15: r(target) = BIT(~data);              break;  //MVN
  }

  //when exception() holds, the destination is r15 and opcode bit 20 is set,
  //the saved status register replaces the current one
  //NOTE(review): bit 20 is presumably the instruction's "set flags" bit -- confirm against the decoder
  if(!exception()) return;
  if(target == 15 && opcode.bit(20)) cpsr() = spsr();
}

//copies selected fields of data into a status register
//  field - bit 0 enables the m/t/f/i update; bit 3 enables the v/c/z/n update
//  mode  - 0 targets cpsr(), 1 targets spsr()
//  data  - source value: bits(0,4) -> m, bit 5 -> t, bit 6 -> f, bit 7 -> i,
//          bit 28 -> v, bit 29 -> c, bit 30 -> z, bit 31 -> n
//nothing is written when spsr() is requested while cpsr().m is PSR::USR or PSR::SYS
auto ARM7TDMI::armMoveToStatus(uint4 field, uint1 mode, uint32 data) -> void {
  if(mode) {
    if(cpsr().m == PSR::USR || cpsr().m == PSR::SYS) return;
  }
  PSR& status = mode ? spsr() : cpsr();

  //the low fields of cpsr() are only written when privileged(); spsr() has no such check
  if(field.bit(0) && (mode || privileged())) {
    status.m = 0x10 | data.bits(0,4);  //0x10 is always OR-ed into the stored mode value
    status.t = data.bit(5);
    status.f = data.bit(6);
    status.i = data.bit(7);
  }

  //the condition flags are written for either register without a privilege check
  if(field.bit(3)) {
    status.v = data.bit(28);
    status.c = data.bit(29);
    status.z = data.bit(30);
    status.n = data.bit(31);
  }
}

//

//branches to the address held in r(m), taking the new cpsr().t value from bit 0 of that address
//  m - index of the register holding the destination address
auto ARM7TDMI::armInstructionBranchExchangeRegister
(uint4 m) -> void {
  uint32 destination = r(m);
  //cpsr().t is updated first, then r15 is written
  cpsr().t = destination.bit(0);
  r(15) = destination;
}

//loads a byte or halfword into r(d), addressed by r(n) and an immediate offset
//  immediate - offset magnitude
//  half      - 1 selects the Half access size, 0 selects Byte
//  d         - index of the destination register
//  n         - index of the base register
//  writeback - with pre == 1, requests that the offset address be stored back into r(n)
//  up        - 1 adds the offset to the base, 0 subtracts it
//  pre       - 1 applies the offset before the access, 0 after it
//the access is issued with the Nonsequential and Signed flags
auto ARM7TDMI::armInstructionLoadImmediate
(uint8 immediate, uint1 half, uint4 d, uint4 n, uint1 writeback, uint1 up, uint1 pre) -> void {
  uint32 base = r(n);
  uint32 rd = r(d);  //replaced by the loaded value below before it is used
  uint32 indexed = up ? base + immediate : base - immediate;

  //pre-indexing accesses the offset address; post-indexing accesses the unmodified base
  rd = load((half ? Half : Byte) | Nonsequential | Signed, pre ? indexed : base);

  //post-indexed accesses always update the base register
  if(!pre || writeback) r(n) = indexed;
  //written last, so the loaded value is what remains when d == n
  r(d) = rd;
}

//loads a byte or halfword into r(d), addressed by r(n) and an offset taken from r(m)
//  m         - index of the register holding the offset magnitude
//  half      - 1 selects the Half access size, 0 selects Byte
//  d         - index of the destination register
//  n         - index of the base register
//  writeback - with pre == 1, requests that the offset address be stored back into r(n)
//  up        - 1 adds the offset to the base, 0 subtracts it
//  pre       - 1 applies the offset before the access, 0 after it
//the access is issued with the Nonsequential and Signed flags
auto ARM7TDMI::armInstructionLoadRegister
(uint4 m, uint1 half, uint4 d, uint4 n, uint1 writeback, uint1 up, uint1 pre) -> void {
  uint32 base = r(n);
  uint32 offset = r(m);
  uint32 rd = r(d);  //replaced by the loaded value below before it is used
  uint32 indexed = up ? base + offset : base - offset;

  //pre-indexing accesses the offset address; post-indexing accesses the unmodified base
  rd = load((half ? Half : Byte) | Nonsequential | Signed, pre ? indexed : base);

  //post-indexed accesses always update the base register
  if(!pre || writeback) r(n) = indexed;
  //written last, so the loaded value is what remains when d == n
  r(d) = rd;
}

//exchanges a register with memory: reads the value at address r(n), stores r(m) to that
//same address, then places the value that was read into r(d)
//  m    - index of the register whose value is stored
//  d    - index of the register receiving the value that was read
//  n    - index of the register holding the address
//  byte - 1 selects the Byte access size, 0 selects Word
auto ARM7TDMI::armInstructionMemorySwap
(uint4 m, uint4 d, uint4 n, uint1 byte) -> void {
  //both halves of the swap use the same size and are flagged Nonsequential
  auto access = (byte ? Byte : Word) | Nonsequential;

  uint32 previous = load(access, r(n));
  store(access, r(n), r(m));
  //r(d) is only written once the store has been issued
  r(d) = previous;
}

//halfword load or store between r(d) and memory, addressed by r(n) and an immediate offset
//  immediate - offset magnitude
//  d         - index of the data register (destination of a load, source of a store)
//  n         - index of the base register
//  mode      - 1 performs a load, 0 performs a store
//  writeback - with pre == 1, requests that the offset address be stored back into r(n)
//  up        - 1 adds the offset to the base, 0 subtracts it
//  pre       - 1 applies the offset before the access, 0 after it
auto ARM7TDMI::armInstructionMoveHalfImmediate
(uint8 immediate, uint4 d, uint4 n, uint1 mode, uint1 writeback, uint1 up, uint1 pre) -> void {
  uint32 base = r(n);
  uint32 rd = r(d);
  uint32 indexed = up ? base + immediate : base - immediate;

  //pre-indexing accesses the offset address; post-indexing accesses the unmodified base
  uint32 address = pre ? indexed : base;
  if(mode) {
    rd = load(Half | Nonsequential, address);
  } else {
    store(Half | Nonsequential, address, rd);
  }

  //post-indexed accesses always update the base register
  if(!pre || writeback) r(n) = indexed;
  //only loads write the data register; it is written after the base register
  if(mode) r(d) = rd;
}

//halfword load or store between r(d) and memory, addressed by r(n) and an offset from r(m)
//  m         - index of the register holding the offset magnitude
//  d         - index of the data register (destination of a load, source of a store)
//  n         - index of the base register
//  mode      - 1 performs a load, 0 performs a store
//  writeback - with pre == 1, requests that the offset address be stored back into r(n)
//  up        - 1 adds the offset to the base, 0 subtracts it
//  pre       - 1 applies the offset before the access, 0 after it
auto ARM7TDMI::armInstructionMoveHalfRegister
(uint4 m, uint4 d, uint4 n, uint1 mode, uint1 writeback, uint1 up, uint1 pre) -> void {
  uint32 base = r(n);
  uint32 offset = r(m);
  uint32 rd = r(d);
  uint32 indexed = up ? base + offset : base - offset;

  //pre-indexing accesses the offset address; post-indexing accesses the unmodified base
  uint32 address = pre ? indexed : base;
  if(mode) {
    rd = load(Half | Nonsequential, address);
  } else {
    store(Half | Nonsequential, address, rd);
  }

  //post-indexed accesses always update the base register
  if(!pre || writeback) r(n) = indexed;
  //only loads write the data register; it is written after the base register
  if(mode) r(d) = rd;
}

//copies a status register into r(d)
//  d    - index of the destination register
//  mode - 0 reads cpsr(), 1 reads spsr()
//r(d) is left unchanged when spsr() is requested while cpsr().m is PSR::USR or PSR::SYS
auto ARM7TDMI::armInstructionMoveToRegisterFromStatus
(uint4 d, uint1 mode) -> void {
  if(!mode) {
    r(d) = cpsr();
    return;
  }

  if(cpsr().m == PSR::USR || cpsr().m == PSR::SYS) return;
  r(d) = spsr();
}

//builds an operand from a rotated 8-bit immediate and forwards it to armMoveToStatus
//  immediate - 8-bit base value
//  rotate    - half of the rotation amount; ROR is applied with rotate << 1
//  field     - field selection, passed through to armMoveToStatus
//  mode      - status register selection, passed through to armMoveToStatus
auto ARM7TDMI::armInstructionMoveToStatusFromImmediate
(uint8 immediate, uint4 rotate, uint4 field, uint1 mode) -> void {
  uint32 operand = immediate;
  //ROR is not invoked at all for a rotation of zero
  if(rotate != 0) operand = ROR(operand, rotate << 1);
  armMoveToStatus(field, mode, operand);
}

//forwards the value of r(m) to armMoveToStatus
//  m     - index of the register supplying the value
//  field - field selection, passed through to armMoveToStatus
//  mode  - status register selection, passed through to armMoveToStatus
auto ARM7TDMI::armInstructionMoveToStatusFromRegister
(uint4 m, uint4 field, uint1 mode) -> void {
  uint32 operand = r(m);
  armMoveToStatus(field, mode, operand);
}

//stores MUL(addend, r(m), r(s)) into r(d), where addend is r(n) when accumulating and 0 otherwise
//  m, s       - indices of the registers passed to MUL as its second and third arguments
//  n          - index of the register supplying the addend when accumulate is set
//  d          - index of the destination register
//  save       - not referenced in this body
//               NOTE(review): presumably MUL handles flag updates itself -- confirm
//  accumulate - 1 inserts an idle() cycle and uses r(n) as the addend
auto ARM7TDMI::armInstructionMultiply
(uint4 m, uint4 s, uint4 n, uint4 d, uint1 save, uint1 accumulate) -> void {
  uint32 addend = 0;
  if(accumulate) {
    idle();
    addend = r(n);
  }

  r(d) = MUL(addend, r(m), r(s));
}

//64-bit multiply of r(m) by r(s); the result is split across r(h) (upper 32 bits) and
//r(l) (lower 32 bits)
//  m, s       - indices of the registers holding the two factors
//  l, h       - indices of the registers receiving the lower and upper result halves
//  save       - 1 updates cpsr().z and cpsr().n from the 64-bit result
//  accumulate - 1 adds the previous r(h):r(l) value to the product
//  sign       - 1 sign-extends both factors from 32 bits before multiplying
auto ARM7TDMI::armInstructionMultiplyLong
(uint4 m, uint4 s, uint4 l, uint4 h, uint1 save, uint1 accumulate, uint1 sign) -> void {
  uint64 multiplicand = r(m);
  uint64 multiplier = r(s);

  //two unconditional idle cycles, plus one more when accumulating
  idle();
  idle();
  if(accumulate) idle();

  //one further idle cycle for each upper portion of the multiplier (above bits 7, 15, 23)
  //that is not all zeroes -- or, for signed multiplies, not all ones either
  for(unsigned shift = 8; shift <= 24; shift += 8) {
    auto upper = multiplier >> shift;
    bool skippable = upper == 0 || (sign && upper == 0xffffffffu >> shift);
    if(!skippable) idle();
  }

  //the checks above inspect the raw 32-bit multiplier, so sign extension happens afterwards
  if(sign) {
    multiplicand = (int32)multiplicand;
    multiplier = (int32)multiplier;
  }

  uint64 result = multiplicand * multiplier;
  if(accumulate) result += (uint64)r(h) << 32 | (uint64)r(l) << 0;

  //upper half is written first, then the lower half
  r(h) = result >> 32;
  r(l) = result >>  0;

  if(!save) return;
  cpsr().z = result == 0;
  cpsr().n = result.bit(63);
}
Update to v103r28 release. byuu says: Changelog: - processor/arm7tdmi: implemented 10 of 19 ARM instructions - processor/arm7tdmi: implemented 1 of 22 THUMB instructions Today's WIP was 6 hours of work, and yesterday's was 5 hours. Half of today was just trying to come up with the design to use a lambda-based dispatcher to map both instructions and disassembly, similar to the 68K core. The problem is that the ARM core has 28 unique bits, which is just far too many bits to have a full lookup table like the 16-bit 68K core. The thing I wanted more than anything else was to perform the opcode bitfield decoding once, and have it decoded for both instructions and the disassembler. It took three hours to come up with a design that worked for the ARM half ... relying on #defines being able to pull in other #defines that were declared and changed later after the first one. But, I'm happy with it. The decoding is in the table building, as it is with the 68K core. The decoding does happen at run-time on each instruction invocation, but it has to be done. As to the THUMB core, I can create a 64K-entry lambda table to cover all possible encodings, and ... even though it's a cache killer, I've decided to go for it, given the outstanding performance it obtained in the M68K core, as well as considering that THUMB mode is far more common in GBA games. As to both cores ... I'm a little torn between two extremes: On the one hand, I can condense the number of ARM/THUMB instructions further to eliminate more redundant code. On the other, I can split them apart to reduce the number of conditional tests needed to execute each instruction. It's really the disassembler that makes me not want to split them up further ... as I have to split the disassembler functions up equally to the instruction functions. But it may be worth it if it's a speed improvement. 2017-08-07 12:20:35 +00:00			`auto ARM7TDMI::armALU(uint4 mode, uint4 target, uint4 source, uint32 data) -> void {`
Update to v103r27 release. byuu says: Changelog: - hiro/windows: set dpiAware=false, fixes icarus window sizes relative to higan window sizes - higan, icarus, hiro, ruby: add support for high resolution displays on macOS [ncbncb] - processor/lr35902-legacy: removed - processor/arm7tdmi: new processor core started; intended to one day be a replacement for processor/arm It will probably take several WIPs to get the new ARM core up and running. It's the last processor rewrite. After this, all processor cores will be up to date with all my current programming conventions. 2017-08-06 13:36:26 +00:00			`switch(mode) {`
Update to v103r28 release. byuu says: Changelog: - processor/arm7tdmi: implemented 10 of 19 ARM instructions - processor/arm7tdmi: implemented 1 of 22 THUMB instructions Today's WIP was 6 hours of work, and yesterday's was 5 hours. Half of today was just trying to come up with the design to use a lambda-based dispatcher to map both instructions and disassembly, similar to the 68K core. The problem is that the ARM core has 28 unique bits, which is just far too many bits to have a full lookup table like the 16-bit 68K core. The thing I wanted more than anything else was to perform the opcode bitfield decoding once, and have it decoded for both instructions and the disassembler. It took three hours to come up with a design that worked for the ARM half ... relying on #defines being able to pull in other #defines that were declared and changed later after the first one. But, I'm happy with it. The decoding is in the table building, as it is with the 68K core. The decoding does happen at run-time on each instruction invocation, but it has to be done. As to the THUMB core, I can create a 64K-entry lambda table to cover all possible encodings, and ... even though it's a cache killer, I've decided to go for it, given the outstanding performance it obtained in the M68K core, as well as considering that THUMB mode is far more common in GBA games. As to both cores ... I'm a little torn between two extremes: On the one hand, I can condense the number of ARM/THUMB instructions further to eliminate more redundant code. On the other, I can split them apart to reduce the number of conditional tests needed to execute each instruction. It's really the disassembler that makes me not want to split them up further ... as I have to split the disassembler functions up equally to the instruction functions. But it may be worth it if it's a speed improvement. 2017-08-07 12:20:35 +00:00			`case 0: r(target) = BIT(r(source) & data); break; //AND`
			`case 1: r(target) = BIT(r(source) ^ data); break; //EOR`
			`case 2: r(target) = SUB(r(source), data, 1); break; //SUB`
			`case 3: r(target) = SUB(data, r(source), 1); break; //RSB`
			`case 4: r(target) = ADD(r(source), data, 0); break; //ADD`
			`case 5: r(target) = ADD(r(source), data, cpsr().c); break; //ADC`
			`case 6: r(target) = SUB(r(source), data, cpsr().c); break; //SBC`
			`case 7: r(target) = SUB(data, r(source), cpsr().c); break; //RSC`
			`case 8: BIT(r(source) & data); break; //TST`
			`case 9: BIT(r(source) ^ data); break; //TEQ`
			`case 10: SUB(r(source), data, 1); break; //CMP`
			`case 11: ADD(r(source), data, 0); break; //CMN`
			`case 12: r(target) = BIT(r(source) \| data); break; //ORR`
			`case 13: r(target) = BIT(data); break; //MOV`
			`case 14: r(target) = BIT(r(source) & ~data); break; //BIC`
			`case 15: r(target) = BIT(~data); break; //MVN`
Update to v103r27 release. byuu says: Changelog: - hiro/windows: set dpiAware=false, fixes icarus window sizes relative to higan window sizes - higan, icarus, hiro, ruby: add support for high resolution displays on macOS [ncbncb] - processor/lr35902-legacy: removed - processor/arm7tdmi: new processor core started; intended to one day be a replacement for processor/arm It will probably take several WIPs to get the new ARM core up and running. It's the last processor rewrite. After this, all processor cores will be up to date with all my current programming conventions. 2017-08-06 13:36:26 +00:00			`}`

Update to v103r28 release. byuu says: Changelog: - processor/arm7tdmi: implemented 10 of 19 ARM instructions - processor/arm7tdmi: implemented 1 of 22 THUMB instructions Today's WIP was 6 hours of work, and yesterday's was 5 hours. Half of today was just trying to come up with the design to use a lambda-based dispatcher to map both instructions and disassembly, similar to the 68K core. The problem is that the ARM core has 28 unique bits, which is just far too many bits to have a full lookup table like the 16-bit 68K core. The thing I wanted more than anything else was to perform the opcode bitfield decoding once, and have it decoded for both instructions and the disassembler. It took three hours to come up with a design that worked for the ARM half ... relying on #defines being able to pull in other #defines that were declared and changed later after the first one. But, I'm happy with it. The decoding is in the table building, as it is with the 68K core. The decoding does happen at run-time on each instruction invocation, but it has to be done. As to the THUMB core, I can create a 64K-entry lambda table to cover all possible encodings, and ... even though it's a cache killer, I've decided to go for it, given the outstanding performance it obtained in the M68K core, as well as considering that THUMB mode is far more common in GBA games. As to both cores ... I'm a little torn between two extremes: On the one hand, I can condense the number of ARM/THUMB instructions further to eliminate more redundant code. On the other, I can split them apart to reduce the number of conditional tests needed to execute each instruction. It's really the disassembler that makes me not want to split them up further ... as I have to split the disassembler functions up equally to the instruction functions. But it may be worth it if it's a speed improvement. 2017-08-07 12:20:35 +00:00			`if(exception() && target == 15 && opcode.bit(20)) {`
Update to v103r27 release. byuu says: Changelog: - hiro/windows: set dpiAware=false, fixes icarus window sizes relative to higan window sizes - higan, icarus, hiro, ruby: add support for high resolution displays on macOS [ncbncb] - processor/lr35902-legacy: removed - processor/arm7tdmi: new processor core started; intended to one day be a replacement for processor/arm It will probably take several WIPs to get the new ARM core up and running. It's the last processor rewrite. After this, all processor cores will be up to date with all my current programming conventions. 2017-08-06 13:36:26 +00:00			`cpsr() = spsr();`
			`}`
			`}`
Update to v103r28 release. byuu says: Changelog: - processor/arm7tdmi: implemented 10 of 19 ARM instructions - processor/arm7tdmi: implemented 1 of 22 THUMB instructions Today's WIP was 6 hours of work, and yesterday's was 5 hours. Half of today was just trying to come up with the design to use a lambda-based dispatcher to map both instructions and disassembly, similar to the 68K core. The problem is that the ARM core has 28 unique bits, which is just far too many bits to have a full lookup table like the 16-bit 68K core. The thing I wanted more than anything else was to perform the opcode bitfield decoding once, and have it decoded for both instructions and the disassembler. It took three hours to come up with a design that worked for the ARM half ... relying on #defines being able to pull in other #defines that were declared and changed later after the first one. But, I'm happy with it. The decoding is in the table building, as it is with the 68K core. The decoding does happen at run-time on each instruction invocation, but it has to be done. As to the THUMB core, I can create a 64K-entry lambda table to cover all possible encodings, and ... even though it's a cache killer, I've decided to go for it, given the outstanding performance it obtained in the M68K core, as well as considering that THUMB mode is far more common in GBA games. As to both cores ... I'm a little torn between two extremes: On the one hand, I can condense the number of ARM/THUMB instructions further to eliminate more redundant code. On the other, I can split them apart to reduce the number of conditional tests needed to execute each instruction. It's really the disassembler that makes me not want to split them up further ... as I have to split the disassembler functions up equally to the instruction functions. But it may be worth it if it's a speed improvement. 2017-08-07 12:20:35 +00:00
			`auto ARM7TDMI::armMoveToStatus(uint4 field, uint1 mode, uint32 data) -> void {`
			`if(mode && (cpsr().m == PSR::USR \|\| cpsr().m == PSR::SYS)) return;`
			`PSR& psr = mode ? spsr() : cpsr();`

			`if(field.bit(0)) {`
			`if(mode \|\| privileged()) {`
			`psr.m = 0x10 \| data.bits(0,4);`
			`psr.t = data.bit (5);`
			`psr.f = data.bit (6);`
			`psr.i = data.bit (7);`
			`}`
			`}`

			`if(field.bit(3)) {`
			`psr.v = data.bit(28);`
			`psr.c = data.bit(29);`
			`psr.z = data.bit(30);`
			`psr.n = data.bit(31);`
			`}`
			`}`

			`//`

			`auto ARM7TDMI::armInstructionBranchExchangeRegister`
			`(uint4 m) -> void {`
			`uint32 address = r(m);`
			`cpsr().t = address.bit(0);`
			`r(15) = address;`
			`}`

			`auto ARM7TDMI::armInstructionLoadImmediate`
			`(uint8 immediate, uint1 half, uint4 d, uint4 n, uint1 writeback, uint1 up, uint1 pre) -> void {`
			`uint32 rn = r(n);`
			`uint32 rd = r(d);`

			`if(pre == 1) rn = up ? rn + immediate : rn - immediate;`
			`rd = load((half ? Half : Byte) \| Nonsequential \| Signed, rn);`
			`if(pre == 0) rn = up ? rn + immediate : rn - immediate;`

			`if(pre == 0 \|\| writeback) r(n) = rn;`
			`r(d) = rd;`
			`}`

			`auto ARM7TDMI::armInstructionLoadRegister`
			`(uint4 m, uint1 half, uint4 d, uint4 n, uint1 writeback, uint1 up, uint1 pre) -> void {`
			`uint32 rn = r(n);`
			`uint32 rm = r(m);`
			`uint32 rd = r(d);`

			`if(pre == 1) rn = up ? rn + rm : rn - rm;`
			`rd = load((half ? Half : Byte) \| Nonsequential \| Signed, rn);`
			`if(pre == 0) rn = up ? rn + rm : rn - rm;`

			`if(pre == 0 \|\| writeback) r(n) = rn;`
			`r(d) = rd;`
			`}`

			`auto ARM7TDMI::armInstructionMemorySwap`
			`(uint4 m, uint4 d, uint4 n, uint1 byte) -> void {`
			`uint32 word = load((byte ? Byte : Word) \| Nonsequential, r(n));`
			`store((byte ? Byte : Word) \| Nonsequential, r(n), r(m));`
			`r(d) = word;`
			`}`

			`auto ARM7TDMI::armInstructionMoveHalfImmediate`
			`(uint8 immediate, uint4 d, uint4 n, uint1 mode, uint1 writeback, uint1 up, uint1 pre) -> void {`
			`uint32 rn = r(n);`
			`uint32 rd = r(d);`

			`if(pre == 1) rn = up ? rn + immediate : rn - immediate;`
			`if(mode == 1) rd = load(Half \| Nonsequential, rn);`
			`if(mode == 0) store(Half \| Nonsequential, rn, rd);`
			`if(pre == 0) rn = up ? rn + immediate : rn - immediate;`

			`if(pre == 0 \|\| writeback) r(n) = rn;`
			`if(mode == 1) r(d) = rd;`
			`}`

			`auto ARM7TDMI::armInstructionMoveHalfRegister`
			`(uint4 m, uint4 d, uint4 n, uint1 mode, uint1 writeback, uint1 up, uint1 pre) -> void {`
			`uint32 rn = r(n);`
			`uint32 rm = r(m);`
			`uint32 rd = r(d);`

			`if(pre == 1) rn = up ? rn + rm : rn - rm;`
			`if(mode == 1) rd = load(Half \| Nonsequential, rn);`
			`if(mode == 0) store(Half \| Nonsequential, rn, rd);`
			`if(pre == 0) rn = up ? rn + rm : rn - rm;`

			`if(pre == 0 \|\| writeback) r(n) = rn;`
			`if(mode == 1) r(d) = rd;`
			`}`

			`auto ARM7TDMI::armInstructionMoveToRegisterFromStatus`
			`(uint4 d, uint1 mode) -> void {`
			`if(mode && (cpsr().m == PSR::USR \|\| cpsr().m == PSR::SYS)) return;`
			`r(d) = mode ? spsr() : cpsr();`
			`}`

			`auto ARM7TDMI::armInstructionMoveToStatusFromImmediate`
			`(uint8 immediate, uint4 rotate, uint4 field, uint1 mode) -> void {`
			`uint32 data = immediate;`
			`if(rotate) data = ROR(data, rotate << 1);`
			`armMoveToStatus(field, mode, data);`
			`}`

			`auto ARM7TDMI::armInstructionMoveToStatusFromRegister`
			`(uint4 m, uint4 field, uint1 mode) -> void {`
			`armMoveToStatus(field, mode, r(m));`
			`}`

			`auto ARM7TDMI::armInstructionMultiply`
			`(uint4 m, uint4 s, uint4 n, uint4 d, uint1 save, uint1 accumulate) -> void {`
			`if(accumulate) idle();`
			`r(d) = MUL(accumulate ? r(n) : 0, r(m), r(s));`
			`}`

			`auto ARM7TDMI::armInstructionMultiplyLong`
			`(uint4 m, uint4 s, uint4 l, uint4 h, uint1 save, uint1 accumulate, uint1 sign) -> void {`
			`uint64 rm = r(m);`
			`uint64 rs = r(s);`

			`idle();`
			`idle();`
			`if(accumulate) idle();`

			`if(sign) {`
			`if(rs >> 8 && rs >> 8 != 0xffffff) idle();`
			`if(rs >> 16 && rs >> 16 != 0xffff) idle();`
			`if(rs >> 24 && rs >> 24 != 0xff) idle();`
			`rm = (int32)rm;`
			`rs = (int32)rs;`
			`} else {`
			`if(rs >> 8) idle();`
			`if(rs >> 16) idle();`
			`if(rs >> 24) idle();`
			`}`

			`uint64 rd = rm * rs;`
			`if(accumulate) rd += (uint64)r(h) << 32 \| (uint64)r(l) << 0;`

			`r(h) = rd >> 32;`
			`r(l) = rd >> 0;`

			`if(save) {`
			`cpsr().z = rd == 0;`
			`cpsr().n = rd.bit(63);`
			`}`
			`}`