SPU ASMJIT: rewrite 128-bit shifts by bit

Six instructions now use XMM registers instead of general-purpose registers (GPRs).
ROTQBII, ROTQMBII and SHLQBII (shifts by an immediate) now look cleaner.
ROTQBI, ROTQMBI and SHLQBI (shifts by a variable amount) were changed for consistency.
This commit is contained in:
Nekotekina 2018-02-02 03:49:51 +03:00
parent c7c49ab286
commit 439a78d12c
1 changed files with 66 additions and 62 deletions

View File

@ -1716,50 +1716,57 @@ void spu_recompiler::CDX(spu_opcode_t op)
// ROTQBI. Side-by-side diff: left column = removed GPR code, right column = added XMM code.
// Both columns rotate the 128-bit value of gpr[op.ra] left by n = (gpr[op.rb]._u32[3] & 7) bits
// and store the result in gpr[op.rt].
// NOTE(review): the right column modifies va and vb in place; presumably XmmGet hands back
// a scratch copy of the register contents — confirm.
void spu_recompiler::ROTQBI(spu_opcode_t op) void spu_recompiler::ROTQBI(spu_opcode_t op)
{ {
c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0)); const XmmLink& va = XmmGet(op.ra, XmmType::Int);
c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1)); const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
c->mov(*qw2, *qw0); const XmmLink& vt = XmmAlloc();
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); const XmmLink& v4 = XmmAlloc();
// Right column: the 12-byte right shift brings dword 3 of vb down to dword 0.
c->and_(*addr, 7); c->psrldq(vb, 12);
// Right column: keep only bits 0..2 of the low qword as the bit count n.
c->shld(*qw0, *qw1, *addr); c->pand(vb, XmmConst(_mm_set_epi64x(0, 7)));
c->shld(*qw1, *qw2, *addr); c->movdqa(v4, XmmConst(_mm_set_epi64x(0, 64)));
// Right column: vt = va with its two 64-bit halves swapped.
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0); c->pshufd(vt, va, 0x4e);
// Right column: v4 = 64 - n.
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1); c->psubq(v4, vb);
c->unuse(*addr); c->psllq(va, vb);
// Right column: bits carried in from the opposite half.
// NOTE(review): for n == 0 this relies on psrlq with a count of 64 yielding zero — confirm.
c->unuse(*qw0); c->psrlq(vt, v4);
c->unuse(*qw1); c->por(vt, va);
c->unuse(*qw2); c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
} }
// ROTQMBI. Side-by-side diff: left column = removed GPR code, right column = added XMM code.
// Both columns shift the 128-bit value of gpr[op.ra] right (zero fill) by
// n = ((0 - gpr[op.rb]._u32[3]) & 7) bits and store the result in gpr[op.rt].
// NOTE(review): the right column modifies va and vt in place; presumably XmmGet hands back
// a scratch copy of the register contents — confirm.
void spu_recompiler::ROTQMBI(spu_opcode_t op) void spu_recompiler::ROTQMBI(spu_opcode_t op)
{ {
c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0)); const XmmLink& va = XmmGet(op.ra, XmmType::Int);
c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1)); const XmmLink& vb = XmmAlloc();
// Right column: vt initially holds gpr[op.rb]; it is reused for the data further down.
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); const XmmLink& vt = XmmGet(op.rb, XmmType::Int);
c->neg(*addr); const XmmLink& v4 = XmmAlloc();
// Right column: the 12-byte right shift brings dword 3 of vt down to dword 0.
c->and_(*addr, 7); c->psrldq(vt, 12);
// Right column: vb = 0 - vt, i.e. the negated count (matches neg in the left column).
c->shrd(*qw0, *qw1, *addr); c->pxor(vb, vb);
c->shr(*qw1, *addr); c->psubq(vb, vt);
// Right column: keep only bits 0..2 of the low qword as the bit count n.
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0); c->pand(vb, XmmConst(_mm_set_epi64x(0, 7)));
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1); c->movdqa(v4, XmmConst(_mm_set_epi64x(0, 64)));
// Right column: vt is overwritten with a copy of va, then shifted right by 8 bytes so the
// upper qword sits in the lower lane and the upper lane is zero.
c->unuse(*addr); c->movdqa(vt, va);
c->unuse(*qw0); c->psrldq(vt, 8);
// Right column: v4 = 64 - n.
c->unuse(*qw1); c->psubq(v4, vb);
c->psrlq(va, vb);
// Bits of the upper qword that move into the lower qword.
// NOTE(review): for n == 0 this relies on psllq with a count of 64 yielding zero — confirm.
c->psllq(vt, v4);
c->por(vt, va);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
} }
// SHLQBI. Side-by-side diff: left column = removed GPR code, right column = added XMM code.
// Both columns shift the 128-bit value of gpr[op.ra] left (zero fill) by
// n = (gpr[op.rb]._u32[3] & 7) bits and store the result in gpr[op.rt].
// NOTE(review): the right column modifies va and vb in place; presumably XmmGet hands back
// a scratch copy of the register contents — confirm.
void spu_recompiler::SHLQBI(spu_opcode_t op) void spu_recompiler::SHLQBI(spu_opcode_t op)
{ {
c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0)); const XmmLink& va = XmmGet(op.ra, XmmType::Int);
c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1)); const XmmLink& vb = XmmGet(op.rb, XmmType::Int);
c->mov(*addr, SPU_OFF_32(gpr, op.rb, &v128::_u32, 3)); const XmmLink& vt = XmmAlloc();
c->and_(*addr, 7); const XmmLink& v4 = XmmAlloc();
// Right column: the 12-byte right shift brings dword 3 of vb down to dword 0.
c->shld(*qw1, *qw0, *addr); c->psrldq(vb, 12);
// Right column: keep only bits 0..2 of the low qword as the bit count n.
c->shl(*qw0, *addr); c->pand(vb, XmmConst(_mm_set_epi64x(0, 7)));
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0); c->movdqa(v4, XmmConst(_mm_set_epi64x(0, 64)));
// Right column: vt = copy of va shifted left by 8 bytes, so the lower qword sits in the
// upper lane and the lower lane is zero.
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1); c->movdqa(vt, va);
c->unuse(*addr); c->pslldq(vt, 8);
// Right column: v4 = 64 - n.
c->unuse(*qw0); c->psubq(v4, vb);
c->unuse(*qw1); c->psllq(va, vb);
// Bits of the lower qword that move into the upper qword.
// NOTE(review): for n == 0 this relies on psrlq with a count of 64 yielding zero — confirm.
c->psrlq(vt, v4);
c->por(vt, va);
c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
} }
void spu_recompiler::ROTQBY(spu_opcode_t op) void spu_recompiler::ROTQBY(spu_opcode_t op)
@ -1974,40 +1981,37 @@ void spu_recompiler::CDD(spu_opcode_t op)
// ROTQBII. Side-by-side diff: left column = removed GPR code, right column = added XMM code.
// Both columns rotate the 128-bit value of gpr[op.ra] left by the immediate n = (op.i7 & 0x7)
// bits and store the result in gpr[op.rt].
// NOTE(review): the right column modifies va in place; presumably XmmGet hands back
// a scratch copy of the register contents — confirm.
void spu_recompiler::ROTQBII(spu_opcode_t op) void spu_recompiler::ROTQBII(spu_opcode_t op)
{ {
c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0)); const XmmLink& va = XmmGet(op.ra, XmmType::Int);
c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1)); const XmmLink& vt = XmmAlloc();
c->mov(*qw2, *qw0); c->pshufd(vt, va, 0x4e); // swap 64-bit parts
c->shld(*qw0, *qw1, op.i7 & 0x7); c->psllq(va, (op.i7 & 0x7));
// Right column: bits carried in from the opposite half.
// NOTE(review): for n == 0 the immediate is 64; this relies on psrlq yielding zero then — confirm.
c->shld(*qw1, *qw2, op.i7 & 0x7); c->psrlq(vt, 64 - (op.i7 & 0x7));
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0); c->por(vt, va);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1); c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
c->unuse(*qw0);
c->unuse(*qw1);
c->unuse(*qw2);
} }
// ROTQMBII. Side-by-side diff: left column = removed GPR code, right column = added XMM code.
// Both columns shift the 128-bit value of gpr[op.ra] right (zero fill) by the immediate
// n = ((0 - op.i7) & 0x7) bits and store the result in gpr[op.rt].
// The left column's `0-op.i7 & 0x7` parses as `(0-op.i7) & 0x7`; the right column spells
// the parentheses out.
// NOTE(review): the right column modifies va in place; presumably XmmGet hands back
// a scratch copy of the register contents — confirm.
void spu_recompiler::ROTQMBII(spu_opcode_t op) void spu_recompiler::ROTQMBII(spu_opcode_t op)
{ {
c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0)); const XmmLink& va = XmmGet(op.ra, XmmType::Int);
c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1)); const XmmLink& vt = XmmAlloc();
// Right column: vt = copy of va shifted right by 8 bytes, so the upper qword sits in the
// lower lane and the upper lane is zero.
c->shrd(*qw0, *qw1, 0-op.i7 & 0x7); c->movdqa(vt, va);
c->shr(*qw1, 0-op.i7 & 0x7); c->psrldq(vt, 8);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0); c->psrlq(va, ((0 - op.i7) & 0x7));
// Right column: bits of the upper qword that move into the lower qword.
// NOTE(review): for n == 0 the immediate is 64; this relies on psllq yielding zero then — confirm.
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1); c->psllq(vt, 64 - ((0 - op.i7) & 0x7));
c->unuse(*qw0); c->por(vt, va);
c->unuse(*qw1); c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
} }
// SHLQBII. Side-by-side diff: left column = removed GPR code, right column = added XMM code.
// Both columns shift the 128-bit value of gpr[op.ra] left (zero fill) by the immediate
// n = (op.i7 & 0x7) bits and store the result in gpr[op.rt].
// NOTE(review): the right column modifies va in place; presumably XmmGet hands back
// a scratch copy of the register contents — confirm.
void spu_recompiler::SHLQBII(spu_opcode_t op) void spu_recompiler::SHLQBII(spu_opcode_t op)
{ {
c->mov(*qw0, SPU_OFF_64(gpr, op.ra, &v128::_u64, 0)); const XmmLink& va = XmmGet(op.ra, XmmType::Int);
c->mov(*qw1, SPU_OFF_64(gpr, op.ra, &v128::_u64, 1)); const XmmLink& vt = XmmAlloc();
// Right column: vt = copy of va shifted left by 8 bytes, so the lower qword sits in the
// upper lane and the lower lane is zero.
c->shld(*qw1, *qw0, op.i7 & 0x7); c->movdqa(vt, va);
c->shl(*qw0, op.i7 & 0x7); c->pslldq(vt, 8);
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 0), *qw0); c->psllq(va, (op.i7 & 0x7));
// Right column: bits of the lower qword that move into the upper qword.
// NOTE(review): for n == 0 the immediate is 64; this relies on psrlq yielding zero then — confirm.
c->mov(SPU_OFF_64(gpr, op.rt, &v128::_u64, 1), *qw1); c->psrlq(vt, 64 - (op.i7 & 0x7));
c->unuse(*qw0); c->por(vt, va);
c->unuse(*qw1); c->movdqa(SPU_OFF_128(gpr, op.rt), vt);
} }
void spu_recompiler::ROTQBYI(spu_opcode_t op) void spu_recompiler::ROTQBYI(spu_opcode_t op)