sh4: use double for fipr and ftrv. divide before sqrt for fsrra. ssa fix

dynarec: use double to implement fipr and ftrv except on arm32
interpreter: always use double for fipr and ftrv
fsrra: perform division before square root
fmac: use std::fma or native fma op
get rid of unused dynarec op shop_swap
ssa: dead register pass must assume interpreter fallback modifies all
registers
ssa: replace reg+0 address by reg in constant propagation pass
decoder: replace address offset 0 by null param for indexed mem access
This commit is contained in:
Flyinghead 2024-06-12 10:54:15 +02:00
parent 9aa7371d82
commit ec3ad9b328
13 changed files with 98 additions and 197 deletions

View File

@ -508,6 +508,8 @@ static void dec_param(DecParam p,shil_param& r1,shil_param& r2, u32 op)
u32 shft=p-PRM_RN_D4_x1;
r1=mk_regi(reg_r0+GetN(op));
r2=mk_imm(GetImm4(op)<<shft);
if (r2.imm_value() == 0)
r2 = shil_param();
}
break;
@ -523,6 +525,8 @@ static void dec_param(DecParam p,shil_param& r1,shil_param& r2, u32 op)
u32 shft=p-PRM_RM_D4_x1;
r1=mk_regi(reg_r0+GetM(op));
r2=mk_imm(GetImm4(op)<<shft);
if (r2.imm_value() == 0)
r2 = shil_param();
}
break;
@ -538,6 +542,8 @@ static void dec_param(DecParam p,shil_param& r1,shil_param& r2, u32 op)
u32 shft=p-PRM_GBR_D8_x1;
r1=mk_regi(reg_gbr);
r2=mk_imm(GetImm8(op)<<shft);
if (r2.imm_value() == 0)
r2 = shil_param();
}
break;
@ -979,7 +985,7 @@ bool dec_DecodeBlock(RuntimeBlockInfo* rbi,u32 max_cycles)
blk->guest_opcodes++;
dec_updateBlockCycles(blk, op);
if (OpDesc[op]->IsFloatingPoint())
if (!blk->has_fpu_op && OpDesc[op]->IsFloatingPoint())
{
if (sr.FD == 1)
{

View File

@ -135,6 +135,28 @@ shil_compile( \
die("This opcode requires native dynarec implementation"); \
)
#if SHIL_MODE==1
// Four-term inner product: f1[0..3] multiplied element-wise with
// f2[0], f2[Stride], f2[2 * Stride], f2[3 * Stride], summed left to right.
// Stride = 1 walks f2 contiguously; a larger stride skips Stride - 1 floats
// between consecutive f2 elements.
// The result is passed through fixNaN() before being returned.
template<int Stride = 1>
static inline float innerProduct(const float *f1, const float *f2)
{
	// x86, x64 and arm64 hosts accumulate in double; all other hosts stay in float.
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64 || HOST_CPU == CPU_ARM64
	using Accum = double;
#else
	using Accum = float;
#endif
	// Keep the sum as one left-to-right expression so the evaluation order
	// (and therefore the rounding) stays the same on every host.
	const Accum sum = static_cast<Accum>(f1[0]) * f2[Stride * 0]
			+ static_cast<Accum>(f1[1]) * f2[Stride * 1]
			+ static_cast<Accum>(f1[2]) * f2[Stride * 2]
			+ static_cast<Accum>(f1[3]) * f2[Stride * 3];
	return fixNaN(static_cast<float>(sum));
}
#endif
#else
#define BIN_OP_I(z)
@ -415,24 +437,6 @@ shil_compile
shil_opc_end()
//shop_swap -- swap all bytes in word (0xAABBCCDD -> 0xDDCCBBAA)
shil_opc(swap)
shil_canonical
(
u32,f1,(u32 r1),
	// Each term moves one source byte to its mirrored position:
	//   bits 24-31 -> bits 0-7, bits 16-23 -> bits 8-15,
	//   bits 8-15  -> bits 16-23, bits 0-7 -> bits 24-31.
	// The second term has to shift right by 8: shifting by 16 before masking
	// with 0xFF00 selects bits 24-31 again and loses bits 16-23.
	return (r1 >> 24) | ((r1 >> 8) & 0xFF00) | ((r1 & 0xFF00) << 8) | (r1 << 24);
)
shil_compile
(
	shil_cf_arg_u32(rs1);
	shil_cf(f1);
	shil_cf_rv_u32(rd);
)
shil_opc_end()
//shop_shld
shil_opc(shld)
shil_canonical
@ -909,31 +913,12 @@ shil_opc_end()
//shop_fipr
shil_opc(fipr)
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
shil_canonical
(
f32,f1,(const float* fn, const float* fm),
double idp = (double)fn[0] * fm[0];
idp += (double)fn[1] * fm[1];
idp += (double)fn[2] * fm[2];
idp += (double)fn[3] * fm[3];
return fixNaN((float)idp);
return innerProduct(fn, fm);
)
#else
shil_canonical
(
f32,f1,(float* fn, float* fm),
float idp = fn[0] * fm[0];
idp+=fn[1]*fm[1];
idp+=fn[2]*fm[2];
idp+=fn[3]*fm[3];
return fixNaN(idp);
)
#endif
shil_compile
(
@ -942,74 +927,24 @@ shil_compile
shil_cf(f1);
shil_cf_rv_f32(rd);
)
shil_opc_end()
//shop_ftrv
shil_opc(ftrv)
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
shil_canonical
(
void,f1,(float* fd, const float* fn, const float* fm),
void,f1,(float *fd, const float *fn, const float *fm),
double v1 = (double)fm[0] * fn[0] +
(double)fm[4] * fn[1] +
(double)fm[8] * fn[2] +
(double)fm[12] * fn[3];
double v2 = (double)fm[1] * fn[0] +
(double)fm[5] * fn[1] +
(double)fm[9] * fn[2] +
(double)fm[13] * fn[3];
double v3 = (double)fm[2] * fn[0] +
(double)fm[6] * fn[1] +
(double)fm[10] * fn[2] +
(double)fm[14] * fn[3];
double v4 = (double)fm[3] * fn[0] +
(double)fm[7] * fn[1] +
(double)fm[11] * fn[2] +
(double)fm[15] * fn[3];
fd[0] = fixNaN((float)v1);
fd[1] = fixNaN((float)v2);
fd[2] = fixNaN((float)v3);
fd[3] = fixNaN((float)v4);
float v1 = innerProduct<4>(fn, fm);
float v2 = innerProduct<4>(fn, fm + 1);
float v3 = innerProduct<4>(fn, fm + 2);
float v4 = innerProduct<4>(fn, fm + 3);
fd[0] = v1;
fd[1] = v2;
fd[2] = v3;
fd[3] = v4;
)
#else
shil_canonical
(
void,f1,(float* fd,float* fn, float* fm),
float v1 = fm[0] * fn[0] +
fm[4] * fn[1] +
fm[8] * fn[2] +
fm[12] * fn[3];
float v2 = fm[1] * fn[0] +
fm[5] * fn[1] +
fm[9] * fn[2] +
fm[13] * fn[3];
float v3 = fm[2] * fn[0] +
fm[6] * fn[1] +
fm[10] * fn[2] +
fm[14] * fn[3];
float v4 = fm[3] * fn[0] +
fm[7] * fn[1] +
fm[11] * fn[2] +
fm[15] * fn[3];
fd[0] = fixNaN(v1);
fd[1] = fixNaN(v2);
fd[2] = fixNaN(v3);
fd[3] = fixNaN(v4);
)
#endif
shil_compile
(
shil_cf_arg_ptr(rs2);
@ -1024,7 +959,7 @@ shil_opc(fmac)
shil_canonical
(
f32,f1,(float fn, float f0,float fm),
return fixNaN(fn + f0 * fm);
return fixNaN(std::fma(f0, fm, fn));
)
shil_compile
(
@ -1038,7 +973,18 @@ shil_opc_end()
//shop_fsrra
shil_opc(fsrra)
UN_OP_F(1/sqrtf)
shil_canonical
(
f32,f1,(float fn),
return std::sqrt(1.f / fn);
)
shil_compile
(
shil_cf_arg_f32(rs1);
shil_cf(f1);
shil_cf_rv_f32(rd);
)
shil_opc_end()

View File

@ -167,9 +167,6 @@ bool SSAOptimizer::ExecuteConstOp(shil_opcode* op)
case shop_swaplb:
rd = shil_opcl_swaplb::f1::impl(rs1);
break;
case shop_swap:
rd = shil_opcl_swap::f1::impl(rs1);
break;
case shop_seteq:
rd = shil_opcl_seteq::f1::impl(rs1, rs2);
break;

View File

@ -70,6 +70,7 @@ public:
for (shil_opcode& op : block->oplist)
{
// FIXME shop_ifb should be assumed to increase versions too? (increment all reg_versions[])
AddVersionToOperand(op.rs1, false);
AddVersionToOperand(op.rs2, false);
AddVersionToOperand(op.rs3, false);
@ -212,26 +213,18 @@ private:
}
else if (op.op == shop_readm || op.op == shop_writem)
{
if (op.rs1.is_imm())
if (op.rs1.is_imm() && !op.rs3.is_reg())
{
if (op.rs3.is_imm())
{
// Merge base addr and offset
// Merge base addr and offset
if (op.rs3.is_imm()) {
op.rs1._imm += op.rs3.imm_value();
op.rs3.type = FMT_NULL;
}
else if (op.rs3.is_reg())
{
// Swap rs1 and rs3 so that rs1 is never an immediate operand
shil_param t = op.rs1;
op.rs1 = op.rs3;
op.rs3 = t;
}
// If we know the address to read and it's in the same memory page(s) as the block
// and if those pages are read-only, then we can directly read the memory at compile time
// and propagate the read value as a constant.
if (op.rs1.is_imm() && op.op == shop_readm && block->read_only
if (op.op == shop_readm && block->read_only
&& (op.rs1._imm >> 12) >= (block->vaddr >> 12)
&& (op.rs1._imm >> 12) <= ((block->vaddr + block->sh4_code_size - 1) >> 12)
&& op.size <= 4)
@ -263,6 +256,15 @@ private:
}
}
}
else
{
if (op.rs1.is_imm() && op.rs3.is_reg())
// Swap rs1 and rs3 so that rs1 is never an immediate operand
std::swap(op.rs1, op.rs3);
if (op.rs3.is_imm() && op.rs3.imm_value() == 0)
// 0 displacement has no effect
op.rs3.type = FMT_NULL;
}
}
else if (ExecuteConstOp(&op))
{
@ -440,9 +442,9 @@ private:
for (size_t opnum = 0; opnum < block->oplist.size(); opnum++)
{
shil_opcode& op = block->oplist[opnum];
if (op.rs2.is_imm())
if (op.rs2.is_imm() || op.rs2.is_null())
{
if (op.rs2.imm_value() == 0)
if (op.rs2.is_null() || op.rs2.imm_value() == 0)
{
// a & 0 == 0
// a * 0 == 0
@ -590,10 +592,15 @@ private:
defnum = opnum;
// find alias redef
if (DefinesHigherVersion(op->rd, alias.second) && aliasdef == (size_t)-1)
aliasdef = opnum;
else if (DefinesHigherVersion(op->rd2, alias.second) && aliasdef == (size_t)-1)
aliasdef = opnum;
if (aliasdef == (size_t)-1)
{
if (DefinesHigherVersion(op->rd, alias.second))
aliasdef = opnum;
else if (DefinesHigherVersion(op->rd2, alias.second))
aliasdef = opnum;
else if (op->op == shop_ifb)
aliasdef = opnum;
}
// find last use
if (UsesRegValue(op->rs1, alias.first))

View File

@ -362,7 +362,7 @@ sh4op(i1111_nnnn_0111_1101)
u32 n = GetN(op);
if (fpscr.PR==0)
{
fr[n] = (float)(1/sqrtf(fr[n]));
fr[n] = sqrtf(1.f / fr[n]);
CHECK_FPU_32(fr[n]);
}
else
@ -406,23 +406,12 @@ sh4op(i1111_nnmm_1110_1101)
int m=(GetN(op)&0x3)<<2;
if (fpscr.PR == 0)
{
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
// multiplications are done with 28 bits of precision (53 - 25) and the final sum at 30 bits
double idp = (double)fr[n + 0] * fr[m + 0];
idp += (double)fr[n + 1] * fr[m + 1];
idp += (double)fr[n + 2] * fr[m + 2];
idp += (double)fr[n + 3] * fr[m + 3];
fr[n + 3] = fixNaN((float)idp);
#else
float rv = fr[n + 0] * fr[m + 0];
rv += fr[n + 1] * fr[m + 1];
rv += fr[n + 2] * fr[m + 2];
rv += fr[n + 3] * fr[m + 3];
CHECK_FPU_32(rv);
fr[n + 3] = rv;
#endif
}
else
{
@ -521,7 +510,7 @@ sh4op(i1111_nnnn_0110_1101)
}
else
{
setDRn(op, fixNaN64(sqrt(getDRn(op))));
setDRn(op, fixNaN64(std::sqrt(getDRn(op))));
}
}
@ -567,7 +556,7 @@ sh4op(i1111_nnnn_mmmm_1110)
u32 n = GetN(op);
u32 m = GetM(op);
fr[n] =(f32) ((f64)fr[n]+(f64)fr[0] * (f64)fr[m]);
fr[n] = std::fma(fr[0], fr[m], fr[n]);
CHECK_FPU_32(fr[n]);
}
else
@ -591,7 +580,6 @@ sh4op(i1111_nn01_1111_1101)
if (fpscr.PR==0)
{
#if HOST_CPU == CPU_X86 || HOST_CPU == CPU_X64
double v1 = (double)xf[0] * fr[n + 0] +
(double)xf[4] * fr[n + 1] +
(double)xf[8] * fr[n + 2] +
@ -616,39 +604,6 @@ sh4op(i1111_nn01_1111_1101)
fr[n + 1] = fixNaN((float)v2);
fr[n + 2] = fixNaN((float)v3);
fr[n + 3] = fixNaN((float)v4);
#else
float v1, v2, v3, v4;
v1 = xf[0] * fr[n + 0] +
xf[4] * fr[n + 1] +
xf[8] * fr[n + 2] +
xf[12] * fr[n + 3];
v2 = xf[1] * fr[n + 0] +
xf[5] * fr[n + 1] +
xf[9] * fr[n + 2] +
xf[13] * fr[n + 3];
v3 = xf[2] * fr[n + 0] +
xf[6] * fr[n + 1] +
xf[10] * fr[n + 2] +
xf[14] * fr[n + 3];
v4 = xf[3] * fr[n + 0] +
xf[7] * fr[n + 1] +
xf[11] * fr[n + 2] +
xf[15] * fr[n + 3];
CHECK_FPU_32(v1);
CHECK_FPU_32(v2);
CHECK_FPU_32(v3);
CHECK_FPU_32(v4);
fr[n + 0] = v1;
fr[n + 1] = v2;
fr[n + 2] = v3;
fr[n + 3] = v4;
#endif
}
else
{

View File

@ -16,8 +16,6 @@
#include "hw/sh4/sh4_cache.h"
#endif
#define iNimp cpu_iNimp
//Read Mem macros
#define ReadMemU32(to,addr) to=ReadMem32(addr)
@ -41,11 +39,6 @@
#define WriteMemBOU8(addr,offset,data) WriteMemU8(addr+offset,data)
// 0xxx
// Reports an unimplemented opcode.
// Logs the opcode value together with next_pc, pr and the caller-supplied
// message through ERROR_LOG, then calls die().
//   op   - opcode value, printed as %08X
//   info - free-form message appended to the log line
void cpu_iNimp(u32 op, const char* info)
{
	ERROR_LOG(INTERPRETER,
			"Unimplemented opcode: %08X next_pc: %08X pr: %08X msg: %s",
			op,
			next_pc,
			pr,
			info);
	die("iNimp reached\n");
}
//stc GBR,<REG_N>
sh4op(i0000_nnnn_0001_0010)

View File

@ -80,9 +80,9 @@ static u64 dec_MRd(DecParam d,DecParam s,u32 sz) { return dec_Fill(DM_ReadM,d,s,
//d= reg to read from
static u64 dec_MWt(DecParam d,DecParam s,u32 sz) { return dec_Fill(DM_WriteM,d,s,shop_writem,sz); }
sh4_opcodelistentry missing_opcode = {dec_illegalOp, iNotImplemented, 0, 0, ReadWritePC, "missing", 0, 0, CO, 1 };
static sh4_opcodelistentry missing_opcode = {dec_illegalOp, iNotImplemented, 0, 0, ReadWritePC, "missing", 0, 0, CO, 1 };
sh4_opcodelistentry opcodes[]=
static sh4_opcodelistentry opcodes[]=
{
//HLE
{0, reios_trap, Mask_none, REIOS_OPCODE, Branch_dir, "reios_trap", 100, 100, CO, 1 },
@ -344,7 +344,7 @@ sh4_opcodelistentry opcodes[]=
{0,0,0,0,ReadWritePC}//Branch in order to stop the block and save PC ect :)
};
void BuildOpcodeTables()
static void BuildOpcodeTables()
{
for (int i=0;i<0x10000;i++)

View File

@ -76,7 +76,6 @@ struct sh4_opcodelistentry
};
extern sh4_opcodelistentry* OpDesc[0x10000];
extern sh4_opcodelistentry opcodes[];
void DissasembleOpcode(u16 opcode,u32 pc,char* Dissasm);
enum DecParam

View File

@ -1933,8 +1933,8 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
case shop_fsrra:
Vmov(s1, 1.f);
Vsqrt(s0, reg.mapFReg(op->rs1));
Vdiv(reg.mapFReg(op->rd), s1, s0);
Vdiv(s0, s1, reg.mapFReg(op->rs1));
Vsqrt(reg.mapFReg(op->rd), s0);
break;
case shop_fsetgt:
@ -1986,7 +1986,6 @@ void Arm32Assembler::compileOp(RuntimeBlockInfo* block, shil_opcode* op, bool op
case shop_fipr:
{
QRegister _r1 = q0;
QRegister _r2 = q0;

View File

@ -879,9 +879,9 @@ public:
break;
case shop_fsrra:
Fsqrt(s0, regalloc.MapVRegister(op.rs1));
Fmov(s1, 1.f);
Fdiv(regalloc.MapVRegister(op.rd), s1, s0);
Fdiv(s0, s1, regalloc.MapVRegister(op.rs1));
Fsqrt(regalloc.MapVRegister(op.rd), s0);
break;
case shop_fsetgt:
@ -907,6 +907,7 @@ public:
}
break;
/* fall back to the canonical implementations for better precision
case shop_fipr:
Add(x9, x28, sh4_context_mem_operand(op.rs1.reg_ptr()).GetOffset());
Ld1(v0.V4S(), MemOperand(x9));
@ -937,6 +938,7 @@ public:
Add(x9, x28, sh4_context_mem_operand(op.rd.reg_ptr()).GetOffset());
St1(v5.V4S(), MemOperand(x9));
break;
*/
case shop_frswap:
Add(x9, x28, sh4_context_mem_operand(op.rs1.reg_ptr()).GetOffset());
@ -1077,13 +1079,11 @@ public:
switch (size)
{
case 1:
GenCallRuntime(addrspace::read8);
Sxtb(w0, w0);
GenCallRuntime(addrspace::read8SX32);
break;
case 2:
GenCallRuntime(addrspace::read16);
Sxth(w0, w0);
GenCallRuntime(addrspace::read16SX32);
break;
case 4:
@ -1497,7 +1497,7 @@ public:
// w0: vaddr, w1: addr
checkBlockFpu = GetCursorAddress<DynaCode *>();
Label fpu_enabled;
Ldr(w10, sh4_context_mem_operand(&sr));
Ldr(w10, sh4_context_mem_operand(&sr.status));
Tbz(w10, 15, &fpu_enabled); // test SR.FD bit
Mov(w1, Sh4Ex_FpuDisabled); // exception code

View File

@ -135,7 +135,7 @@ public:
if (mmu_enabled() && block->has_fpu_op)
{
Xbyak::Label fpu_enabled;
mov(rax, (uintptr_t)&sr);
mov(rax, (uintptr_t)&sr.status);
test(dword[rax], 0x8000); // test SR.FD bit
jz(fpu_enabled);
mov(call_regs[0], block->vaddr); // pc

View File

@ -537,19 +537,18 @@ protected:
break;
case shop_fsrra:
// RSQRTSS has an |error| <= 1.5*2^-12 where the SH4 FSRRA needs |error| <= 2^-21
sqrtss(xmm0, mapXRegister(op.rs1));
if (ArchX64)
{
mov(eax, 0x3f800000); // 1.0
movd(mapXRegister(op.rd), eax);
movd(xmm0, eax);
}
else
{
static float one = 1.f;
movss(mapXRegister(op.rd), dword[&one]);
movss(xmm0, dword[&one]);
}
divss(mapXRegister(op.rd), xmm0);
divss(xmm0, mapXRegister(op.rs1));
sqrtss(mapXRegister(op.rd), xmm0);
break;
case shop_fsetgt:

View File

@ -108,7 +108,7 @@ void X86Compiler::compile(RuntimeBlockInfo* block, bool force_checks, bool optim
if (mmu_enabled() && block->has_fpu_op)
{
Xbyak::Label fpu_enabled;
mov(eax, dword[&sr]);
mov(eax, dword[&sr.status]);
test(eax, 0x8000); // test SR.FD bit
jz(fpu_enabled);
push(Sh4Ex_FpuDisabled); // exception code