[a64] Optimize bulk VConst access with relative addressing
Load the pointer to the VConst table once and address each constant as an offset from that base, computed from the underlying enum value. This reduces the number of instructions needed for each VConst memory load.
parent 4ff43ae1a8
commit fc1a13d3b2
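A sketch of the per-constant codegen change, distilled from the hunks below (every identifier here comes from this diff). Previously each VConst access materialized an absolute 64-bit address, then loaded through it:

    e.MOVP2R(X0, e.GetVConstPtr(V3333));  // materialize absolute address per constant
    e.LDR(Q0, X0);                        // dependent 16-byte load

Now each sequence materializes the table base once into a scratch register, and every subsequent constant becomes a single base+immediate load with the offset folded at emit time:

    const XReg VConstData = X3;
    e.MOVP2R(VConstData, e.GetVConstPtr());                   // table base, once
    e.LDR(Q0, VConstData, e.GetVConstOffset(V3333));          // one LDR per constant
    e.LDR(Q0, VConstData, e.GetVConstOffset(VPackD3DCOLOR));  // reuses the same base

(That MOVP2R expands to a multi-instruction MOVZ/MOVK sequence is an assumption about oaknut's pointer materialization, not something stated in this commit.)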
@@ -837,11 +837,14 @@ void A64Emitter::FreeConstData(uintptr_t data) {
                       memory::DeallocationType::kRelease);
 }
 
-std::byte* A64Emitter::GetVConstPtr(VConst id) {
+std::byte* A64Emitter::GetVConstPtr() const {
+  return reinterpret_cast<std::byte*>(backend_->emitter_data());
+}
+
+std::byte* A64Emitter::GetVConstPtr(VConst id) const {
   // Load through fixed constant table setup by PlaceConstData.
   // It's important that the pointer is not signed, as it will be sign-extended.
-  return reinterpret_cast<std::byte*>(backend_->emitter_data() +
-                                      sizeof(vec128_t) * id);
+  return GetVConstPtr() + GetVConstOffset(id);
 }
 
 // Implies possible StashV(0, ...)!
@@ -119,8 +119,8 @@ enum VConst {
 };
 
 enum A64EmitterFeatureFlags {
-  kA64EmitLSE = 1 << 0,
-  kA64EmitF16C = 1 << 1,
+  kA64EmitLSE = 1 << 0,
+  kA64EmitF16C = 1 << 1,
 };
 
 class A64Emitter : public oaknut::CodeBlock, public oaknut::CodeGenerator {
@@ -204,7 +204,11 @@ class A64Emitter : public oaknut::CodeBlock, public oaknut::CodeGenerator {
   bool ConstantFitsIn32Reg(uint64_t v);
   void MovMem64(const oaknut::XRegSp& addr, intptr_t offset, uint64_t v);
 
-  std::byte* GetVConstPtr(VConst id);
+  std::byte* GetVConstPtr() const;
+  std::byte* GetVConstPtr(VConst id) const;
+  constexpr uintptr_t GetVConstOffset(VConst id) const {
+    return sizeof(vec128_t) * id;
+  }
   void LoadConstantV(oaknut::QReg dest, float v);
   void LoadConstantV(oaknut::QReg dest, double v);
   void LoadConstantV(oaknut::QReg dest, const vec128_t& v);
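Since each table entry is a vec128_t, GetVConstOffset(id) is simply 16 * id, known when the sequence is emitted thanks to constexpr. As a worked example with a hypothetical constant at index 7: the offset is 16 * 7 = 112 bytes, and because AArch64's LDR (immediate, SIMD&FP, Q-form) takes a 12-bit unsigned immediate scaled by the 16-byte access size (covering offsets 0 through 65520), any plausible table index fits in a single load instruction.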
@@ -542,11 +542,11 @@ struct VECTOR_SHL_V128
         e.SHL(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint8_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
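(This hunk and the shift/rotate hunks that follow also swap the stack-pointer operand in the stash-address ADDs from XSP to SP; judging from the diff alone this is a mechanical rename of the emitter's stack-pointer alias and leaves the stack-relative addressing itself unchanged.)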
@@ -566,11 +566,11 @@ struct VECTOR_SHL_V128
         e.SHL(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint16_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -590,11 +590,11 @@ struct VECTOR_SHL_V128
         e.SHL(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint32_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -655,11 +655,11 @@ struct VECTOR_SHR_V128
         e.USHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint8_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -679,11 +679,11 @@ struct VECTOR_SHR_V128
         e.USHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint16_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -703,11 +703,11 @@ struct VECTOR_SHR_V128
        e.USHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0]);
        return;
      }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint32_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -751,11 +751,11 @@ struct VECTOR_SHA_V128
         e.SSHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0] & 0x7);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int8_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -775,11 +775,11 @@ struct VECTOR_SHA_V128
         e.SSHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0] & 0xF);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int16_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -799,11 +799,11 @@ struct VECTOR_SHA_V128
         e.SSHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0] & 0x1F);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int32_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -835,11 +835,11 @@ struct VECTOR_ROTATE_LEFT_V128
                I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
     if (i.src2.is_constant) {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     switch (i.instr->flags) {
       case INT8_TYPE:
         e.CallNativeSafe(
@@ -1333,23 +1333,24 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
+
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     // Saturate to [3,3....] so that only values between 3...[00] and 3...[FF]
     // are valid - max before min to pack NaN as zero (5454082B is heavily
     // affected by the order - packs 0xFFFFFFFF in matrix code to get a 0
     // constant).
-    e.MOVP2R(X0, e.GetVConstPtr(V3333));
-    e.LDR(Q0, X0);
+    e.LDR(Q0, VConstData, e.GetVConstOffset(V3333));
     e.FMAX(i.dest.reg().S4(), i.dest.reg().S4(), Q0.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VPackD3DCOLORSat));
-    e.LDR(Q0, X0);
+    e.LDR(Q0, VConstData, e.GetVConstOffset(VPackD3DCOLORSat));
     e.FMIN(i.dest.reg().S4(), src.S4(), Q0.S4());
     // Extract bytes.
     // RGBA (XYZW) -> ARGB (WXYZ)
    // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) |
    //     ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF)
-    e.MOVP2R(X0, e.GetVConstPtr(VPackD3DCOLOR));
-    e.LDR(Q0, X0);
+    e.LDR(Q0, VConstData, e.GetVConstOffset(VPackD3DCOLOR));
     e.TBL(i.dest.reg().B16(), List{i.dest.reg().B16()}, Q0.B16());
   }
   static uint8x16_t EmulateFLOAT16_2(void*, std::byte src1[16]) {
@@ -1433,18 +1434,18 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     // Saturate
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Min));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min));
     e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Max));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Max));
     e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Pack
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_2));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_2));
     e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16());
   }
   static void EmitSHORT_4(A64Emitter& e, const EmitArgType& i) {
@@ -1454,18 +1455,18 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
-    // Saturate
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Min));
-    e.LDR(Q1, X0);
-    e.FMAXNM(i.dest.reg().S4(), src.S4(), Q1.S4());
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Max));
-    e.LDR(Q1, X0);
-    e.FMINNM(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
+    // Saturate
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min));
+    e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
+
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Max));
+    e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Pack
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_4));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_4));
     e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16());
   }
   static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) {
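(Besides adopting the shared table base, this hunk also replaces FMAXNM/FMINNM with FMAX/FMIN, bringing EmitSHORT_4's saturation in line with EmitSHORT_2 above; the two instruction families differ in how they treat NaN operands, so the swap is presumably deliberate.)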
@@ -1476,24 +1477,22 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
     if (i.src1.is_constant) {
       e.LoadConstantV(src, i.src1.constant());
     }
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
 
     // Saturate.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MinUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MinUnpacked));
     e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaxUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaxUnpacked));
     e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Remove the unneeded bits of the floats.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaskUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaskUnpacked));
     e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
 
     // Shift the components up.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_Shift));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_Shift));
     e.USHL(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Combine the components.
@@ -1519,31 +1518,29 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     // Saturate.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_MinUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MinUnpacked));
     e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_MaxUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MaxUnpacked));
     e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Remove the unneeded bits of the floats (so excess nibbles will also be
     // cleared).
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_MaskUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MaskUnpacked));
     e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
 
     // Store Y and W shifted left by 4 so vpshufb can be used with them.
     e.SHL(Q0.S4(), i.dest.reg().S4(), 4);
 
     // Place XZ where they're supposed to be.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_PermuteXZ));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_PermuteXZ));
     e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16());
     // Place YW.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_PermuteYW));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_PermuteYW));
     e.TBL(Q0.B16(), oaknut::List{Q0.B16()}, Q1.B16());
     // Merge XZ and YW.
     e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q0.B16());
@@ -1742,11 +1739,14 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
   }
   static void EmitD3DCOLOR(A64Emitter& e, const EmitArgType& i) {
     // ARGB (WXYZ) -> RGBA (XYZW)
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
+
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(VOne));
-        e.LDR(i.dest.reg(), X0);
+        e.LDR(i.dest.reg(), VConstData, e.GetVConstOffset(VOne));
         return;
       }
       src = i.dest;
@@ -1756,12 +1756,10 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     }
     // src = ZZYYXXWW
     // Unpack to 000000ZZ,000000YY,000000XX,000000WW
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackD3DCOLOR));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackD3DCOLOR));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());
     // Add 1.0f to each.
-    e.MOVP2R(X0, e.GetVConstPtr(VOne));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VOne));
     e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
     // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081.
   }
@@ -1850,12 +1848,14 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     // (VD.z) = 0.0
     // (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f)
     // src is (xx,xx,xx,VALUE)
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
         src = i.dest;
-        e.MOVP2R(X0, e.GetVConstPtr(V3301));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3301));
         return;
       }
       // TODO(benvanik): check other common constants/perform shuffle/or here.
@@ -1865,8 +1865,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
       src = i.src1;
     }
     // Shuffle bytes.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_2));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_2));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());
 
     // If negative, make smaller than 3 - sign extend before adding.
@@ -1874,17 +1873,14 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 16);
 
     // Add 3,3,0,1.
-    e.MOVP2R(X0, e.GetVConstPtr(V3301));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3301));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -1894,11 +1890,14 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     // (VD.z) = 3.0 + (VB.y>>16)*2^-22
     // (VD.w) = 3.0 + (VB.y)*2^-22
     // src is (xx,xx,VALUE,VALUE)
+
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(V3333));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3333));
         return;
       }
       // TODO(benvanik): check other common constants/perform shuffle/or here.
@@ -1908,8 +1907,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
       src = i.src1;
     }
     // Shuffle bytes.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_4));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_4));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());
 
     // If negative, make smaller than 3 - sign extend before adding.
@@ -1917,26 +1915,25 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 16);
 
     // Add 3,3,3,3.
-    e.MOVP2R(X0, e.GetVConstPtr(V3333));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3333));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
   static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) {
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(V3331));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3331));
         return;
       }
       src = i.dest;
@@ -1949,13 +1946,11 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.DUP(i.dest.reg().S4(), src.Selem()[3]);
     // Keep only the needed components.
     // Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaskPacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaskPacked));
     e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
 
     // Shift the components down.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_Shift));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_Shift));
     e.NEG(Q1.S4(), Q1.S4());
     e.USHL(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
     // If XYZ are negative, make smaller than 3 - sign extend XYZ before adding.
@@ -1963,27 +1958,26 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 22);
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 22);
     // Add 3,3,3,1.
-    e.MOVP2R(X0, e.GetVConstPtr(V3331));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3331));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackUINT_2101010_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackUINT_2101010_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
     // To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030.
     // For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB.}
   }
   static void EmitULONG_4202020(A64Emitter& e, const EmitArgType& i) {
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(V3331));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3331));
         return;
       }
       src = i.dest;
@@ -1993,8 +1987,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     }
     // Extract pairs of nibbles to XZYW. XZ will have excess 4 upper bits, YW
     // will have excess 4 lower bits.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackULONG_4202020_Permute));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackULONG_4202020_Permute));
    e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());
 
     // Drop the excess nibble of YW.
@@ -2018,16 +2011,13 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 12);
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 12);
     // Add 3,3,3,1.
-    e.MOVP2R(X0, e.GetVConstPtr(V3331));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3331));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackULONG_4202020_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackULONG_4202020_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }