[a64] Implement `OPCODE_PACK`(2101010, 4202020, 8-in-16, 16-in-32)
This commit is contained in:
parent
7c094dc6cf
commit
40d908b596
|
@ -1423,13 +1423,237 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
|||
e.EXT(i.dest.reg().B16(), i.dest.reg().B16(), i.dest.reg().B16(), 4);
|
||||
e.REV32(i.dest.reg().H8(), i.dest.reg().H8());
|
||||
}
|
||||
static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) {
  // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt
  // XYZ are 10 bits, signed and saturated.
  // W is 2 bits, unsigned and saturated.
  //
  // Scratch usage: X0 holds constant addresses, Q0/Q1 hold temporaries.
  const QReg dest = i.dest.reg();

  // Read the input from src1. A constant operand has no register of its own,
  // so it is materialized into dest and consumed from there; otherwise the
  // operand's own register must be used (dest holds no meaningful value yet).
  const QReg src = i.src1.is_constant ? dest : i.src1.reg();
  if (i.src1.is_constant) {
    e.LoadConstantV(src, i.src1.constant());
  }

  // Saturate.
  e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MinUnpacked));
  e.LDR(Q1, X0);
  e.FMAX(dest.S4(), src.S4(), Q1.S4());

  e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaxUnpacked));
  e.LDR(Q1, X0);
  e.FMIN(dest.S4(), dest.S4(), Q1.S4());

  // Remove the unneeded bits of the floats.
  e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaskUnpacked));
  e.LDR(Q1, X0);
  e.AND(dest.B16(), dest.B16(), Q1.B16());

  // Shift the components up.
  e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_Shift));
  e.LDR(Q1, X0);
  e.USHL(dest.S4(), dest.S4(), Q1.S4());

  // Combine the components. Each step gathers a lane-permuted copy of dest
  // into Q0 and folds it back into dest.
  // NOTE(review): EOR only behaves like OR here if the masked and shifted
  // fields occupy disjoint bits - confirm against the VPackUINT_2101010_*
  // constants.

  // Step 1: byte indices select 32-bit lanes 2, 3, 0, 1.
  e.LoadConstantV(Q1, vec128i(0x03'02'01'00 + 0x04'04'04'04 * 2,
                              0x03'02'01'00 + 0x04'04'04'04 * 3,
                              0x03'02'01'00 + 0x04'04'04'04 * 0,
                              0x03'02'01'00 + 0x04'04'04'04 * 1));
  e.TBL(Q0.B16(), oaknut::List{dest.B16()}, Q1.B16());
  e.EOR(dest.B16(), dest.B16(), Q0.B16());

  // Step 2: byte indices select 32-bit lanes 1, 0, 3, 2.
  e.LoadConstantV(Q1, vec128i(0x03'02'01'00 + 0x04'04'04'04 * 1,
                              0x03'02'01'00 + 0x04'04'04'04 * 0,
                              0x03'02'01'00 + 0x04'04'04'04 * 3,
                              0x03'02'01'00 + 0x04'04'04'04 * 2));
  e.TBL(Q0.B16(), oaknut::List{dest.B16()}, Q1.B16());
  e.EOR(dest.B16(), dest.B16(), Q0.B16());
}
|
||||
static void EmitULONG_4202020(A64Emitter& e, const EmitArgType& i) {
  // Output layout:
  //   XYZ are 20 bits, signed and saturated.
  //   W is 4 bits, unsigned and saturated.
  //
  // Scratch usage: X0 holds constant addresses, Q0/Q1 hold temporaries.
  const QReg packed = i.dest.reg();

  // Loads one of the backend's 128-bit constants into Q1 (via X0).
  const auto load_into_q1 = [&e](const auto constant_id) {
    e.MOVP2R(X0, e.GetVConstPtr(constant_id));
    e.LDR(Q1, X0);
  };

  // Constant inputs are materialized into the destination register and read
  // from there.
  QReg value = i.src1;
  if (i.src1.is_constant) {
    value = packed;
    e.LoadConstantV(value, i.src1.constant());
  }

  // Saturate: clamp from below, then from above.
  load_into_q1(VPackULONG_4202020_MinUnpacked);
  e.FMAX(packed.S4(), value.S4(), Q1.S4());

  load_into_q1(VPackULONG_4202020_MaxUnpacked);
  e.FMIN(packed.S4(), packed.S4(), Q1.S4());

  // Remove the unneeded bits of the floats (so excess nibbles will also be
  // cleared).
  load_into_q1(VPackULONG_4202020_MaskUnpacked);
  e.AND(packed.B16(), packed.B16(), Q1.B16());

  // Keep a copy of every lane shifted left by 4 in Q0, so that Y and W can be
  // placed with a byte-granular table lookup.
  e.SHL(Q0.S4(), packed.S4(), 4);

  // Place XZ where they're supposed to be.
  load_into_q1(VPackULONG_4202020_PermuteXZ);
  e.TBL(packed.B16(), oaknut::List{packed.B16()}, Q1.B16());

  // Place YW.
  load_into_q1(VPackULONG_4202020_PermuteYW);
  e.TBL(Q0.B16(), oaknut::List{Q0.B16()}, Q1.B16());

  // Merge XZ and YW.
  e.EOR(packed.B16(), packed.B16(), Q0.B16());
}
|
||||
// Pack 2 16-bit vectors into an 8-bit vector.
// src1 is narrowed into the low half of dest and src2 into the high half, then
// the bytes are permuted with VByteOrderMask. `flags` selects the signedness
// of the input/output and whether the narrowing saturates.
// Scratch usage: X0 (constant address), Q0/Q1 (constant operands, byte mask).
static void Emit8_IN_16(A64Emitter& e, const EmitArgType& i, uint32_t flags) {
  const bool in_unsigned = IsPackInUnsigned(flags);
  const bool out_unsigned = IsPackOutUnsigned(flags);
  const bool saturate = IsPackOutSaturate(flags);

  // Implemented conversions:
  //   unsigned -> unsigned (with and without saturation)
  //   signed   -> unsigned + saturate
  //   signed   -> signed   + saturate
  const bool implemented =
      (in_unsigned && out_unsigned) || (!in_unsigned && saturate);
  if (!implemented) {
    assert_always();
    return;
  }

  // Returns the register holding `operand`, loading constants into `scratch`.
  const auto materialize = [&e](const auto& operand,
                                const QReg scratch) -> QReg {
    if (operand.is_constant) {
      e.LoadConstantV(scratch, operand.constant());
      return scratch;
    }
    return operand.reg();
  };

  const QReg dest = i.dest.reg();
  const QReg src1 = materialize(i.src1, Q0);
  const QReg src2 = materialize(i.src2, Q1);

  // NOTE(review): the low-half narrow writes dest before src2 is read; if the
  // register allocator can assign dest and src2 the same register this
  // sequence needs a temporary - confirm.
  if (in_unsigned) {
    if (saturate) {
      // unsigned -> unsigned + saturate
      e.UQXTN(dest.toD().B8(), src1.H8());
      e.UQXTN2(dest.B16(), src2.H8());
    } else {
      // unsigned -> unsigned
      e.XTN(dest.toD().B8(), src1.H8());
      e.XTN2(dest.B16(), src2.H8());
    }
  } else if (out_unsigned) {
    // signed -> unsigned + saturate
    // SQXTUN interprets the input as signed, so negative lanes clamp to 0.
    // (UQXTN would read them as large unsigned values and clamp to 0xFF.)
    e.SQXTUN(dest.toD().B8(), src1.H8());
    e.SQXTUN2(dest.B16(), src2.H8());
  } else {
    // signed -> signed + saturate
    e.SQXTN(dest.toD().B8(), src1.H8());
    e.SQXTN2(dest.B16(), src2.H8());
  }

  // Reorder the packed bytes. Q0 is free again: src1 has been consumed.
  e.MOVP2R(X0, e.GetVConstPtr(VByteOrderMask));
  e.LDR(Q0, X0);
  e.TBL(dest.B16(), oaknut::List{dest.B16()}, Q0.B16());
}
|
||||
// Pack 2 32-bit vectors into a 16-bit vector.
|
||||
static void Emit16_IN_32(A64Emitter& e, const EmitArgType& i,
|
||||
uint32_t flags) {}
|
||||
uint32_t flags) {
|
||||
// TODO(benvanik): handle src2 (or src1) being constant zero
|
||||
if (IsPackInUnsigned(flags)) {
|
||||
if (IsPackOutUnsigned(flags)) {
|
||||
if (IsPackOutSaturate(flags)) {
|
||||
// unsigned -> unsigned + saturate
|
||||
const QReg src1 = i.src1.is_constant ? Q0 : i.src1;
|
||||
if (i.src1.is_constant) {
|
||||
e.LoadConstantV(src1, i.src1.constant());
|
||||
}
|
||||
|
||||
const QReg src2 = i.src2.is_constant ? Q1 : i.src2;
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantV(src2, i.src2.constant());
|
||||
}
|
||||
|
||||
e.UQXTN(i.dest.reg().toD().H4(), src1.S4());
|
||||
e.UQXTN2(i.dest.reg().H8(), src2.S4());
|
||||
|
||||
e.REV32(i.dest.reg().H8(), i.dest.reg().H8());
|
||||
} else {
|
||||
// unsigned -> unsigned
|
||||
e.XTN(i.dest.reg().toD().H4(), i.src1.reg().S4());
|
||||
e.XTN2(i.dest.reg().H8(), i.src2.reg().S4());
|
||||
|
||||
e.REV32(i.dest.reg().H8(), i.dest.reg().H8());
|
||||
}
|
||||
} else {
|
||||
if (IsPackOutSaturate(flags)) {
|
||||
// unsigned -> signed + saturate
|
||||
assert_always();
|
||||
} else {
|
||||
// unsigned -> signed
|
||||
assert_always();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (IsPackOutUnsigned(flags)) {
|
||||
if (IsPackOutSaturate(flags)) {
|
||||
// signed -> unsigned + saturate
|
||||
e.UQXTN(i.dest.reg().toD().H4(), i.src1.reg().S4());
|
||||
e.UQXTN2(i.dest.reg().H8(), i.src2.reg().S4());
|
||||
|
||||
e.REV32(i.dest.reg().H8(), i.dest.reg().H8());
|
||||
} else {
|
||||
// signed -> unsigned
|
||||
assert_always();
|
||||
}
|
||||
} else {
|
||||
if (IsPackOutSaturate(flags)) {
|
||||
// signed -> signed + saturate
|
||||
const QReg src1 = i.src1.is_constant ? Q0 : i.src1;
|
||||
if (i.src1.is_constant) {
|
||||
e.LoadConstantV(src1, i.src1.constant());
|
||||
}
|
||||
|
||||
const QReg src2 = i.src2.is_constant ? Q1 : i.src2;
|
||||
if (i.src2.is_constant) {
|
||||
e.LoadConstantV(src2, i.src2.constant());
|
||||
}
|
||||
e.SQXTN(i.dest.reg().toD().H4(), src1.S4());
|
||||
e.SQXTN2(i.dest.reg().H8(), src2.S4());
|
||||
|
||||
e.REV32(i.dest.reg().H8(), i.dest.reg().H8());
|
||||
} else {
|
||||
// signed -> signed
|
||||
assert_always();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
// Associates OPCODE_PACK with the PACK sequence defined above.
// NOTE(review): presumably this registers PACK in the emitter's opcode
// dispatch table - confirm against the EMITTER_OPCODE_TABLE macro definition.
EMITTER_OPCODE_TABLE(OPCODE_PACK, PACK);
|
||||
|
||||
|
|
Loading…
Reference in New Issue