diff --git a/core/hw/sh4/dyna/decoder.cpp b/core/hw/sh4/dyna/decoder.cpp index 4977b92f5..457996ee3 100644 --- a/core/hw/sh4/dyna/decoder.cpp +++ b/core/hw/sh4/dyna/decoder.cpp @@ -508,6 +508,8 @@ static void dec_param(DecParam p,shil_param& r1,shil_param& r2, u32 op) u32 shft=p-PRM_RN_D4_x1; r1=mk_regi(reg_r0+GetN(op)); r2=mk_imm(GetImm4(op)<guest_opcodes++; dec_updateBlockCycles(blk, op); - if (OpDesc[op]->IsFloatingPoint()) + if (!blk->has_fpu_op && OpDesc[op]->IsFloatingPoint()) { if (sr.FD == 1) { diff --git a/core/hw/sh4/dyna/shil_canonical.h b/core/hw/sh4/dyna/shil_canonical.h index 74fe6a489..4c499c5e6 100644 --- a/core/hw/sh4/dyna/shil_canonical.h +++ b/core/hw/sh4/dyna/shil_canonical.h @@ -135,6 +135,28 @@ shil_compile( \ die("This opcode requires native dynarec implementation"); \ ) +#if SHIL_MODE==1 + +template +static inline float innerProduct(const float *f1, const float *f2) +{ +#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64 || HOST_CPU == CPU_ARM64 + const double f = (double)f1[0] * f2[Stride * 0] + + (double)f1[1] * f2[Stride * 1] + + (double)f1[2] * f2[Stride * 2] + + (double)f1[3] * f2[Stride * 3]; + return fixNaN((float)f); +#else + const float f = f1[0] * f2[Stride * 0] + + f1[1] * f2[Stride * 1] + + f1[2] * f2[Stride * 2] + + f1[3] * f2[Stride * 3]; + return fixNaN(f); +#endif +} + +#endif + #else #define BIN_OP_I(z) @@ -415,24 +437,6 @@ shil_compile shil_opc_end() - -//shop_swap -- swap all bytes in word -shil_opc(swap) -shil_canonical -( -u32,f1,(u32 r1), - return (r1 >>24) | ((r1 >>16)&0xFF00) |((r1&0xFF00)<<8) | (r1<<24); -) - -shil_compile -( - shil_cf_arg_u32(rs1); - shil_cf(f1); - shil_cf_rv_u32(rd); -) - -shil_opc_end() - //shop_shld shil_opc(shld) shil_canonical @@ -909,31 +913,12 @@ shil_opc_end() //shop_fipr shil_opc(fipr) -#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64 shil_canonical ( f32,f1,(const float* fn, const float* fm), - double idp = (double)fn[0] * fm[0]; - idp += (double)fn[1] * fm[1]; - idp += (double)fn[2] * fm[2]; - idp += (double)fn[3] * fm[3]; - - return fixNaN((float)idp); + return innerProduct(fn, fm); ) -#else -shil_canonical -( -f32,f1,(float* fn, float* fm), - - float idp = fn[0] * fm[0]; - idp+=fn[1]*fm[1]; - idp+=fn[2]*fm[2]; - idp+=fn[3]*fm[3]; - - return fixNaN(idp); -) -#endif shil_compile ( @@ -942,74 +927,24 @@ shil_compile shil_cf(f1); shil_cf_rv_f32(rd); ) - shil_opc_end() - - //shop_ftrv shil_opc(ftrv) -#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64 shil_canonical ( -void,f1,(float* fd, const float* fn, const float* fm), +void,f1,(float *fd, const float *fn, const float *fm), - double v1 = (double)fm[0] * fn[0] + - (double)fm[4] * fn[1] + - (double)fm[8] * fn[2] + - (double)fm[12] * fn[3]; - - double v2 = (double)fm[1] * fn[0] + - (double)fm[5] * fn[1] + - (double)fm[9] * fn[2] + - (double)fm[13] * fn[3]; - - double v3 = (double)fm[2] * fn[0] + - (double)fm[6] * fn[1] + - (double)fm[10] * fn[2] + - (double)fm[14] * fn[3]; - - double v4 = (double)fm[3] * fn[0] + - (double)fm[7] * fn[1] + - (double)fm[11] * fn[2] + - (double)fm[15] * fn[3]; - - fd[0] = fixNaN((float)v1); - fd[1] = fixNaN((float)v2); - fd[2] = fixNaN((float)v3); - fd[3] = fixNaN((float)v4); + float v1 = innerProduct<4>(fn, fm); + float v2 = innerProduct<4>(fn, fm + 1); + float v3 = innerProduct<4>(fn, fm + 2); + float v4 = innerProduct<4>(fn, fm + 3); + fd[0] = v1; + fd[1] = v2; + fd[2] = v3; + fd[3] = v4; ) -#else -shil_canonical -( -void,f1,(float* fd,float* fn, float* fm), - float v1 = fm[0] * fn[0] + - fm[4] * fn[1] + - fm[8] * fn[2] + - fm[12] * fn[3]; - - float v2 = fm[1] * fn[0] + - fm[5] * fn[1] + - fm[9] * fn[2] + - fm[13] * fn[3]; - - float v3 = fm[2] * fn[0] + - fm[6] * fn[1] + - fm[10] * fn[2] + - fm[14] * fn[3]; - - float v4 = fm[3] * fn[0] + - fm[7] * fn[1] + - fm[11] * fn[2] + - fm[15] * fn[3]; - - fd[0] = fixNaN(v1); - fd[1] = fixNaN(v2); - fd[2] = fixNaN(v3); - fd[3] = fixNaN(v4); -) -#endif shil_compile ( shil_cf_arg_ptr(rs2); @@ -1024,7 +959,7 @@ shil_opc(fmac) shil_canonical ( f32,f1,(float fn, float f0,float fm), - return fixNaN(fn + f0 * fm); + return fixNaN(std::fma(f0, fm, fn)); ) shil_compile ( @@ -1038,7 +973,18 @@ shil_opc_end() //shop_fsrra shil_opc(fsrra) -UN_OP_F(1/sqrtf) +shil_canonical +( +f32,f1,(float fn), + + return std::sqrt(1.f / fn); +) +shil_compile +( + shil_cf_arg_f32(rs1); + shil_cf(f1); + shil_cf_rv_f32(rd); +) shil_opc_end() diff --git a/core/hw/sh4/dyna/ssa.cpp b/core/hw/sh4/dyna/ssa.cpp index 66e4e911c..11b641bf3 100644 --- a/core/hw/sh4/dyna/ssa.cpp +++ b/core/hw/sh4/dyna/ssa.cpp @@ -167,9 +167,6 @@ bool SSAOptimizer::ExecuteConstOp(shil_opcode* op) case shop_swaplb: rd = shil_opcl_swaplb::f1::impl(rs1); break; - case shop_swap: - rd = shil_opcl_swap::f1::impl(rs1); - break; case shop_seteq: rd = shil_opcl_seteq::f1::impl(rs1, rs2); break; diff --git a/core/hw/sh4/dyna/ssa.h b/core/hw/sh4/dyna/ssa.h index 68f84d558..369c31a10 100644 --- a/core/hw/sh4/dyna/ssa.h +++ b/core/hw/sh4/dyna/ssa.h @@ -70,6 +70,7 @@ public: for (shil_opcode& op : block->oplist) { + // FIXME shop_ifb should be assumed to increase versions too? (increment all reg_versions[]) AddVersionToOperand(op.rs1, false); AddVersionToOperand(op.rs2, false); AddVersionToOperand(op.rs3, false); @@ -212,26 +213,18 @@ private: } else if (op.op == shop_readm || op.op == shop_writem) { - if (op.rs1.is_imm()) + if (op.rs1.is_imm() && !op.rs3.is_reg()) { - if (op.rs3.is_imm()) - { - // Merge base addr and offset + // Merge base addr and offset + if (op.rs3.is_imm()) { op.rs1._imm += op.rs3.imm_value(); op.rs3.type = FMT_NULL; } - else if (op.rs3.is_reg()) - { - // Swap rs1 and rs3 so that rs1 is never an immediate operand - shil_param t = op.rs1; - op.rs1 = op.rs3; - op.rs3 = t; - } // If we know the address to read and it's in the same memory page(s) as the block // and if those pages are read-only, then we can directly read the memory at compile time // and propagate the read value as a constant. - if (op.rs1.is_imm() && op.op == shop_readm && block->read_only + if (op.op == shop_readm && block->read_only && (op.rs1._imm >> 12) >= (block->vaddr >> 12) && (op.rs1._imm >> 12) <= ((block->vaddr + block->sh4_code_size - 1) >> 12) && op.size <= 4) @@ -263,6 +256,15 @@ private: } } } + else + { + if (op.rs1.is_imm() && op.rs3.is_reg()) + // Swap rs1 and rs3 so that rs1 is never an immediate operand + std::swap(op.rs1, op.rs3); + if (op.rs3.is_imm() && op.rs3.imm_value() == 0) + // 0 displacement has no effect + op.rs3.type = FMT_NULL; + } } else if (ExecuteConstOp(&op)) { @@ -440,9 +442,9 @@ private: for (size_t opnum = 0; opnum < block->oplist.size(); opnum++) { shil_opcode& op = block->oplist[opnum]; - if (op.rs2.is_imm()) + if (op.rs2.is_imm() || op.rs2.is_null()) { - if (op.rs2.imm_value() == 0) + if (op.rs2.is_null() || op.rs2.imm_value() == 0) { // a & 0 == 0 // a * 0 == 0 @@ -590,10 +592,15 @@ private: defnum = opnum; // find alias redef - if (DefinesHigherVersion(op->rd, alias.second) && aliasdef == (size_t)-1) - aliasdef = opnum; - else if (DefinesHigherVersion(op->rd2, alias.second) && aliasdef == (size_t)-1) - aliasdef = opnum; + if (aliasdef == (size_t)-1) + { + if (DefinesHigherVersion(op->rd, alias.second)) + aliasdef = opnum; + else if (DefinesHigherVersion(op->rd2, alias.second)) + aliasdef = opnum; + else if (op->op == shop_ifb) + aliasdef = opnum; + } // find last use if (UsesRegValue(op->rs1, alias.first)) diff --git a/core/hw/sh4/interpr/sh4_fpu.cpp b/core/hw/sh4/interpr/sh4_fpu.cpp index f25673b22..66ec61004 100644 --- a/core/hw/sh4/interpr/sh4_fpu.cpp +++ b/core/hw/sh4/interpr/sh4_fpu.cpp @@ -362,7 +362,7 @@ sh4op(i1111_nnnn_0111_1101) u32 n = GetN(op); if (fpscr.PR==0) { - fr[n] = (float)(1/sqrtf(fr[n])); + fr[n] = sqrtf(1.f / fr[n]); CHECK_FPU_32(fr[n]); } else @@ -406,23 +406,12 @@ sh4op(i1111_nnmm_1110_1101) int m=(GetN(op)&0x3)<<2; if (fpscr.PR == 0) { -#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64 - // multiplications are done with 28 bits of precision (53 - 25) and the final sum at 30 bits double idp = (double)fr[n + 0] * fr[m + 0]; idp += (double)fr[n + 1] * fr[m + 1]; idp += (double)fr[n + 2] * fr[m + 2]; idp += (double)fr[n + 3] * fr[m + 3]; fr[n + 3] = fixNaN((float)idp); -#else - float rv = fr[n + 0] * fr[m + 0]; - rv += fr[n + 1] * fr[m + 1]; - rv += fr[n + 2] * fr[m + 2]; - rv += fr[n + 3] * fr[m + 3]; - - CHECK_FPU_32(rv); - fr[n + 3] = rv; -#endif } else { @@ -521,7 +510,7 @@ sh4op(i1111_nnnn_0110_1101) } else { - setDRn(op, fixNaN64(sqrt(getDRn(op)))); + setDRn(op, fixNaN64(std::sqrt(getDRn(op)))); } } @@ -567,7 +556,7 @@ sh4op(i1111_nnnn_mmmm_1110) u32 n = GetN(op); u32 m = GetM(op); - fr[n] =(f32) ((f64)fr[n]+(f64)fr[0] * (f64)fr[m]); + fr[n] = std::fma(fr[0], fr[m], fr[n]); CHECK_FPU_32(fr[n]); } else @@ -591,7 +580,6 @@ sh4op(i1111_nn01_1111_1101) if (fpscr.PR==0) { -#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64 double v1 = (double)xf[0] * fr[n + 0] + (double)xf[4] * fr[n + 1] + (double)xf[8] * fr[n + 2] + @@ -616,39 +604,6 @@ sh4op(i1111_nn01_1111_1101) fr[n + 1] = fixNaN((float)v2); fr[n + 2] = fixNaN((float)v3); fr[n + 3] = fixNaN((float)v4); -#else - float v1, v2, v3, v4; - - v1 = xf[0] * fr[n + 0] + - xf[4] * fr[n + 1] + - xf[8] * fr[n + 2] + - xf[12] * fr[n + 3]; - - v2 = xf[1] * fr[n + 0] + - xf[5] * fr[n + 1] + - xf[9] * fr[n + 2] + - xf[13] * fr[n + 3]; - - v3 = xf[2] * fr[n + 0] + - xf[6] * fr[n + 1] + - xf[10] * fr[n + 2] + - xf[14] * fr[n + 3]; - - v4 = xf[3] * fr[n + 0] + - xf[7] * fr[n + 1] + - xf[11] * fr[n + 2] + - xf[15] * fr[n + 3]; - - CHECK_FPU_32(v1); - CHECK_FPU_32(v2); - CHECK_FPU_32(v3); - CHECK_FPU_32(v4); - - fr[n + 0] = v1; - fr[n + 1] = v2; - fr[n + 2] = v3; - fr[n + 3] = v4; -#endif } else { diff --git a/core/hw/sh4/interpr/sh4_opcodes.cpp b/core/hw/sh4/interpr/sh4_opcodes.cpp index 13f334238..77f479e34 100644 --- a/core/hw/sh4/interpr/sh4_opcodes.cpp +++ b/core/hw/sh4/interpr/sh4_opcodes.cpp @@ -16,8 +16,6 @@ #include "hw/sh4/sh4_cache.h" #endif -#define iNimp cpu_iNimp - //Read Mem macros #define ReadMemU32(to,addr) to=ReadMem32(addr) @@ -41,11 +39,6 @@ #define WriteMemBOU8(addr,offset,data) WriteMemU8(addr+offset,data) // 0xxx -void cpu_iNimp(u32 op, const char* info) -{ - ERROR_LOG(INTERPRETER, "Unimplemented opcode: %08X next_pc: %08X pr: %08X msg: %s", op, next_pc, pr, info); - die("iNimp reached\n"); -} //stc GBR, sh4op(i0000_nnnn_0001_0010) diff --git a/core/hw/sh4/sh4_opcode_list.cpp b/core/hw/sh4/sh4_opcode_list.cpp index 9fe9f007e..67ac81d2d 100644 --- a/core/hw/sh4/sh4_opcode_list.cpp +++ b/core/hw/sh4/sh4_opcode_list.cpp @@ -80,9 +80,9 @@ static u64 dec_MRd(DecParam d,DecParam s,u32 sz) { return dec_Fill(DM_ReadM,d,s, //d= reg to read from static u64 dec_MWt(DecParam d,DecParam s,u32 sz) { return dec_Fill(DM_WriteM,d,s,shop_writem,sz); } -sh4_opcodelistentry missing_opcode = {dec_illegalOp, iNotImplemented, 0, 0, ReadWritePC, "missing", 0, 0, CO, 1 }; +static sh4_opcodelistentry missing_opcode = {dec_illegalOp, iNotImplemented, 0, 0, ReadWritePC, "missing", 0, 0, CO, 1 }; -sh4_opcodelistentry opcodes[]= +static sh4_opcodelistentry opcodes[]= { //HLE {0, reios_trap, Mask_none, REIOS_OPCODE, Branch_dir, "reios_trap", 100, 100, CO, 1 }, @@ -344,7 +344,7 @@ sh4_opcodelistentry opcodes[]= {0,0,0,0,ReadWritePC}//Branch in order to stop the block and save PC ect :) }; -void BuildOpcodeTables() +static void BuildOpcodeTables() { for (int i=0;i<0x10000;i++) diff --git a/core/hw/sh4/sh4_opcode_list.h b/core/hw/sh4/sh4_opcode_list.h index 718d61164..a87be546e 100644 --- a/core/hw/sh4/sh4_opcode_list.h +++ b/core/hw/sh4/sh4_opcode_list.h @@ -76,7 +76,6 @@ struct sh4_opcodelistentry }; extern sh4_opcodelistentry* OpDesc[0x10000]; -extern sh4_opcodelistentry opcodes[]; void DissasembleOpcode(u16 opcode,u32 pc,char* Dissasm); enum DecParam diff --git a/core/rec-ARM/rec_arm.cpp b/core/rec-ARM/rec_arm.cpp index 9a194df19..21ad5faf0 100644 --- a/core/rec-ARM/rec_arm.cpp +++ b/core/rec-ARM/rec_arm.cpp @@ -1933,8 +1933,8 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op case shop_fsrra: Vmov(s1, 1.f); - Vsqrt(s0, reg.mapFReg(op->rs1)); - Vdiv(reg.mapFReg(op->rd), s1, s0); + Vdiv(s0, s1, reg.mapFReg(op->rs1)); + Vsqrt(reg.mapFReg(op->rd), s0); break; case shop_fsetgt: @@ -1986,7 +1986,6 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op case shop_fipr: { - QRegister _r1 = q0; QRegister _r2 = q0; diff --git a/core/rec-ARM64/rec_arm64.cpp b/core/rec-ARM64/rec_arm64.cpp index 69dccc976..654acf4ba 100644 --- a/core/rec-ARM64/rec_arm64.cpp +++ b/core/rec-ARM64/rec_arm64.cpp @@ -879,9 +879,9 @@ public: break; case shop_fsrra: - Fsqrt(s0, regalloc.MapVRegister(op.rs1)); Fmov(s1, 1.f); - Fdiv(regalloc.MapVRegister(op.rd), s1, s0); + Fdiv(s0, s1, regalloc.MapVRegister(op.rs1)); + Fsqrt(regalloc.MapVRegister(op.rd), s0); break; case shop_fsetgt: @@ -907,6 +907,7 @@ public: } break; + /* fall back to the canonical implementations for better precision case shop_fipr: Add(x9, x28, sh4_context_mem_operand(op.rs1.reg_ptr()).GetOffset()); Ld1(v0.V4S(), MemOperand(x9)); @@ -937,6 +938,7 @@ public: Add(x9, x28, sh4_context_mem_operand(op.rd.reg_ptr()).GetOffset()); St1(v5.V4S(), MemOperand(x9)); break; + */ case shop_frswap: Add(x9, x28, sh4_context_mem_operand(op.rs1.reg_ptr()).GetOffset()); @@ -1077,13 +1079,11 @@ public: switch (size) { case 1: - GenCallRuntime(addrspace::read8); - Sxtb(w0, w0); + GenCallRuntime(addrspace::read8SX32); break; case 2: - GenCallRuntime(addrspace::read16); - Sxth(w0, w0); + GenCallRuntime(addrspace::read16SX32); break; case 4: @@ -1497,7 +1497,7 @@ public: // w0: vaddr, w1: addr checkBlockFpu = GetCursorAddress(); Label fpu_enabled; - Ldr(w10, sh4_context_mem_operand(&sr)); + Ldr(w10, sh4_context_mem_operand(&sr.status)); Tbz(w10, 15, &fpu_enabled); // test SR.FD bit Mov(w1, Sh4Ex_FpuDisabled); // exception code diff --git a/core/rec-x64/rec_x64.cpp b/core/rec-x64/rec_x64.cpp index db2f7fe5f..ffbaa2573 100644 --- a/core/rec-x64/rec_x64.cpp +++ b/core/rec-x64/rec_x64.cpp @@ -135,7 +135,7 @@ public: if (mmu_enabled() && block->has_fpu_op) { Xbyak::Label fpu_enabled; - mov(rax, (uintptr_t)&sr); + mov(rax, (uintptr_t)&sr.status); test(dword[rax], 0x8000); // test SR.FD bit jz(fpu_enabled); mov(call_regs[0], block->vaddr); // pc diff --git a/core/rec-x64/xbyak_base.h b/core/rec-x64/xbyak_base.h index e4cbc28ea..1baa4579e 100644 --- a/core/rec-x64/xbyak_base.h +++ b/core/rec-x64/xbyak_base.h @@ -537,19 +537,18 @@ protected: break; case shop_fsrra: - // RSQRTSS has an |error| <= 1.5*2^-12 where the SH4 FSRRA needs |error| <= 2^-21 - sqrtss(xmm0, mapXRegister(op.rs1)); if (ArchX64) { mov(eax, 0x3f800000); // 1.0 - movd(mapXRegister(op.rd), eax); + movd(xmm0, eax); } else { static float one = 1.f; - movss(mapXRegister(op.rd), dword[&one]); + movss(xmm0, dword[&one]); } - divss(mapXRegister(op.rd), xmm0); + divss(xmm0, mapXRegister(op.rs1)); + sqrtss(mapXRegister(op.rd), xmm0); break; case shop_fsetgt: diff --git a/core/rec-x86/rec_x86.cpp b/core/rec-x86/rec_x86.cpp index 4e6fbba44..f78e811bc 100644 --- a/core/rec-x86/rec_x86.cpp +++ b/core/rec-x86/rec_x86.cpp @@ -108,7 +108,7 @@ void X86Compiler::compile(RuntimeBlockInfo* block, bool force_checks, bool optim if (mmu_enabled() && block->has_fpu_op) { Xbyak::Label fpu_enabled; - mov(eax, dword[&sr]); + mov(eax, dword[&sr.status]); test(eax, 0x8000); // test SR.FD bit jz(fpu_enabled); push(Sh4Ex_FpuDisabled); // exception code