[a64] Optimize bulk VConst access with relative addressing

Load the pointer to the VConst table once and address each constant as an offset from that base, computed from the underlying enum value.
This reduces the number of instructions emitted for each VConst memory load.
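
In rough sketch form (not part of the diff below; the base register and the particular constants are only illustrative, while the emitter calls are the ones the changed sequences use):

    // Before: every constant load materializes its own absolute address
    // (MOVP2R expands to a MOVZ/MOVK sequence), then loads through it.
    e.MOVP2R(X0, e.GetVConstPtr(V3333));
    e.LDR(Q0, X0);

    // After: materialize the table base once per sequence, then use
    // immediate-offset loads derived from the VConst enum value.
    const XReg VConstData = X3;
    e.MOVP2R(VConstData, e.GetVConstPtr());
    e.LDR(Q0, VConstData, e.GetVConstOffset(V3333));
    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));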
Wunkolo 2024-05-25 15:29:28 -07:00
parent 4ff43ae1a8
commit fc1a13d3b2
3 changed files with 120 additions and 123 deletions


@@ -837,11 +837,14 @@ void A64Emitter::FreeConstData(uintptr_t data) {
                      memory::DeallocationType::kRelease);
 }

-std::byte* A64Emitter::GetVConstPtr(VConst id) {
+std::byte* A64Emitter::GetVConstPtr() const {
+  return reinterpret_cast<std::byte*>(backend_->emitter_data());
+}
+
+std::byte* A64Emitter::GetVConstPtr(VConst id) const {
   // Load through fixed constant table setup by PlaceConstData.
   // It's important that the pointer is not signed, as it will be sign-extended.
-  return reinterpret_cast<std::byte*>(backend_->emitter_data() +
-                                      sizeof(vec128_t) * id);
+  return GetVConstPtr() + GetVConstOffset(id);
 }

 // Implies possible StashV(0, ...)!


@@ -204,7 +204,11 @@ class A64Emitter : public oaknut::CodeBlock, public oaknut::CodeGenerator {
   bool ConstantFitsIn32Reg(uint64_t v);
   void MovMem64(const oaknut::XRegSp& addr, intptr_t offset, uint64_t v);

-  std::byte* GetVConstPtr(VConst id);
+  std::byte* GetVConstPtr() const;
+  std::byte* GetVConstPtr(VConst id) const;
+  constexpr uintptr_t GetVConstOffset(VConst id) const {
+    return sizeof(vec128_t) * id;
+  }
   void LoadConstantV(oaknut::QReg dest, float v);
   void LoadConstantV(oaknut::QReg dest, double v);
   void LoadConstantV(oaknut::QReg dest, const vec128_t& v);


@@ -542,11 +542,11 @@ struct VECTOR_SHL_V128
         e.SHL(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint8_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -566,11 +566,11 @@ struct VECTOR_SHL_V128
         e.SHL(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint16_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -590,11 +590,11 @@ struct VECTOR_SHL_V128
         e.SHL(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint32_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -655,11 +655,11 @@ struct VECTOR_SHR_V128
         e.USHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint8_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -679,11 +679,11 @@ struct VECTOR_SHR_V128
         e.USHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint16_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -703,11 +703,11 @@ struct VECTOR_SHR_V128
         e.USHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint32_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -751,11 +751,11 @@ struct VECTOR_SHA_V128
         e.SSHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0] & 0x7);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int8_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -775,11 +775,11 @@ struct VECTOR_SHA_V128
         e.SSHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0] & 0xF);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int16_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -799,11 +799,11 @@ struct VECTOR_SHA_V128
         e.SSHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0] & 0x1F);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int32_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -835,11 +835,11 @@ struct VECTOR_ROTATE_LEFT_V128
                   I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
     if (i.src2.is_constant) {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     switch (i.instr->flags) {
       case INT8_TYPE:
         e.CallNativeSafe(
@@ -1333,23 +1333,24 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
+
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     // Saturate to [3,3....] so that only values between 3...[00] and 3...[FF]
     // are valid - max before min to pack NaN as zero (5454082B is heavily
     // affected by the order - packs 0xFFFFFFFF in matrix code to get a 0
     // constant).
-    e.MOVP2R(X0, e.GetVConstPtr(V3333));
-    e.LDR(Q0, X0);
+    e.LDR(Q0, VConstData, e.GetVConstOffset(V3333));
     e.FMAX(i.dest.reg().S4(), i.dest.reg().S4(), Q0.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VPackD3DCOLORSat));
-    e.LDR(Q0, X0);
+    e.LDR(Q0, VConstData, e.GetVConstOffset(VPackD3DCOLORSat));
     e.FMIN(i.dest.reg().S4(), src.S4(), Q0.S4());
     // Extract bytes.
     // RGBA (XYZW) -> ARGB (WXYZ)
     // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) |
     // ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF)
-    e.MOVP2R(X0, e.GetVConstPtr(VPackD3DCOLOR));
-    e.LDR(Q0, X0);
+    e.LDR(Q0, VConstData, e.GetVConstOffset(VPackD3DCOLOR));
     e.TBL(i.dest.reg().B16(), List{i.dest.reg().B16()}, Q0.B16());
   }

   static uint8x16_t EmulateFLOAT16_2(void*, std::byte src1[16]) {
@@ -1433,18 +1434,18 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
+
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     // Saturate
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Min));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min));
     e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Max));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Max));
     e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
     // Pack
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_2));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_2));
     e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16());
   }

   static void EmitSHORT_4(A64Emitter& e, const EmitArgType& i) {
@@ -1454,18 +1455,18 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }

-    // Saturate
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Min));
-    e.LDR(Q1, X0);
-    e.FMAXNM(i.dest.reg().S4(), src.S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Max));
-    e.LDR(Q1, X0);
-    e.FMINNM(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
+    // Saturate
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min));
+    e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Max));
+    e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
     // Pack
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_4));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_4));
     e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16());
   }

   static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) {
@@ -1476,24 +1477,22 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
     if (i.src1.is_constant) {
       e.LoadConstantV(src, i.src1.constant());
     }
+
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     // Saturate.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MinUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MinUnpacked));
     e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaxUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaxUnpacked));
     e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
     // Remove the unneeded bits of the floats.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaskUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaskUnpacked));
     e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
     // Shift the components up.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_Shift));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_Shift));
     e.USHL(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());

     // Combine the components.
@@ -1519,31 +1518,29 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
+
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     // Saturate.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_MinUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MinUnpacked));
     e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_MaxUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MaxUnpacked));
     e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
     // Remove the unneeded bits of the floats (so excess nibbles will also be
     // cleared).
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_MaskUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MaskUnpacked));
     e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
     // Store Y and W shifted left by 4 so vpshufb can be used with them.
     e.SHL(Q0.S4(), i.dest.reg().S4(), 4);
     // Place XZ where they're supposed to be.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_PermuteXZ));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_PermuteXZ));
     e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16());
     // Place YW.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_PermuteYW));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_PermuteYW));
     e.TBL(Q0.B16(), oaknut::List{Q0.B16()}, Q1.B16());
     // Merge XZ and YW.
     e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q0.B16());
@@ -1742,11 +1739,14 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
   }
   static void EmitD3DCOLOR(A64Emitter& e, const EmitArgType& i) {
     // ARGB (WXYZ) -> RGBA (XYZW)
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(VOne));
-        e.LDR(i.dest.reg(), X0);
+        e.LDR(i.dest.reg(), VConstData, e.GetVConstOffset(VOne));
         return;
       }
       src = i.dest;
@@ -1756,12 +1756,10 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     }
     // src = ZZYYXXWW
     // Unpack to 000000ZZ,000000YY,000000XX,000000WW
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackD3DCOLOR));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackD3DCOLOR));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());
     // Add 1.0f to each.
-    e.MOVP2R(X0, e.GetVConstPtr(VOne));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VOne));
     e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
     // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081.
   }
@@ -1850,12 +1848,14 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     // (VD.z) = 0.0
     // (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f)
     // src is (xx,xx,xx,VALUE)
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
         src = i.dest;
-        e.MOVP2R(X0, e.GetVConstPtr(V3301));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3301));
         return;
       }
       // TODO(benvanik): check other common constants/perform shuffle/or here.
@@ -1865,8 +1865,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
       src = i.src1;
     }
     // Shuffle bytes.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_2));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_2));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());

     // If negative, make smaller than 3 - sign extend before adding.
@@ -1874,17 +1873,14 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 16);

     // Add 3,3,0,1.
-    e.MOVP2R(X0, e.GetVConstPtr(V3301));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3301));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());

     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -1894,11 +1890,14 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     // (VD.z) = 3.0 + (VB.y>>16)*2^-22
     // (VD.w) = 3.0 + (VB.y)*2^-22
     // src is (xx,xx,VALUE,VALUE)
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(V3333));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3333));
         return;
       }
       // TODO(benvanik): check other common constants/perform shuffle/or here.
@@ -1908,8 +1907,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
       src = i.src1;
     }
     // Shuffle bytes.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_4));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_4));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());

     // If negative, make smaller than 3 - sign extend before adding.
@@ -1917,26 +1915,25 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 16);

     // Add 3,3,3,3.
-    e.MOVP2R(X0, e.GetVConstPtr(V3333));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3333));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());

     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }

   static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) {
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(V3331));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3331));
         return;
       }
       src = i.dest;
@@ -1949,13 +1946,11 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.DUP(i.dest.reg().S4(), src.Selem()[3]);
     // Keep only the needed components.
     // Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaskPacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaskPacked));
     e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
     // Shift the components down.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_Shift));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_Shift));
     e.NEG(Q1.S4(), Q1.S4());
     e.USHL(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
     // If XYZ are negative, make smaller than 3 - sign extend XYZ before adding.
@@ -1963,27 +1958,26 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 22);
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 22);
     // Add 3,3,3,1.
-    e.MOVP2R(X0, e.GetVConstPtr(V3331));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3331));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackUINT_2101010_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackUINT_2101010_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
     // To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030.
     // For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB.}
   }

   static void EmitULONG_4202020(A64Emitter& e, const EmitArgType& i) {
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(V3331));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3331));
         return;
       }
       src = i.dest;
@@ -1993,8 +1987,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     }
     // Extract pairs of nibbles to XZYW. XZ will have excess 4 upper bits, YW
     // will have excess 4 lower bits.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackULONG_4202020_Permute));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackULONG_4202020_Permute));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());

     // Drop the excess nibble of YW.
@@ -2018,16 +2011,13 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 12);
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 12);
     // Add 3,3,3,1.
-    e.MOVP2R(X0, e.GetVConstPtr(V3331));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3331));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackULONG_4202020_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackULONG_4202020_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }