From fc1a13d3b2d87616b1aa38b806510d410d202133 Mon Sep 17 00:00:00 2001
From: Wunkolo
Date: Sat, 25 May 2024 15:29:28 -0700
Subject: [PATCH] [a64] Optimize bulk VConst access with relative addressing

Load the pointer to the VConst table once and address each constant as an
offset from that base, derived from the underlying enum value. This reduces
the number of instructions emitted for each VConst memory load.
---
 src/xenia/cpu/backend/a64/a64_emitter.cc    |   9 +-
 src/xenia/cpu/backend/a64/a64_emitter.h     |  10 +-
 src/xenia/cpu/backend/a64/a64_seq_vector.cc | 224 ++++++++++----------
 3 files changed, 120 insertions(+), 123 deletions(-)

diff --git a/src/xenia/cpu/backend/a64/a64_emitter.cc b/src/xenia/cpu/backend/a64/a64_emitter.cc
index 8c4482200..a3cccc231 100644
--- a/src/xenia/cpu/backend/a64/a64_emitter.cc
+++ b/src/xenia/cpu/backend/a64/a64_emitter.cc
@@ -837,11 +837,14 @@ void A64Emitter::FreeConstData(uintptr_t data) {
                        memory::DeallocationType::kRelease);
 }
 
-std::byte* A64Emitter::GetVConstPtr(VConst id) {
+std::byte* A64Emitter::GetVConstPtr() const {
+  return reinterpret_cast<std::byte*>(backend_->emitter_data());
+}
+
+std::byte* A64Emitter::GetVConstPtr(VConst id) const {
   // Load through fixed constant table setup by PlaceConstData.
   // It's important that the pointer is not signed, as it will be sign-extended.
-  return reinterpret_cast<std::byte*>(backend_->emitter_data() +
-                                      sizeof(vec128_t) * id);
+  return GetVConstPtr() + GetVConstOffset(id);
 }
 
 // Implies possible StashV(0, ...)!
diff --git a/src/xenia/cpu/backend/a64/a64_emitter.h b/src/xenia/cpu/backend/a64/a64_emitter.h
index 463064ef1..3e0b35f36 100644
--- a/src/xenia/cpu/backend/a64/a64_emitter.h
+++ b/src/xenia/cpu/backend/a64/a64_emitter.h
@@ -119,8 +119,8 @@ enum VConst {
 };
 
 enum A64EmitterFeatureFlags {
-  kA64EmitLSE = 1 << 0,
-  kA64EmitF16C = 1 << 1,
+  kA64EmitLSE  = 1 << 0,
+  kA64EmitF16C = 1 << 1,
 };
 
 class A64Emitter : public oaknut::CodeBlock, public oaknut::CodeGenerator {
@@ -204,7 +204,11 @@ class A64Emitter : public oaknut::CodeBlock, public oaknut::CodeGenerator {
   bool ConstantFitsIn32Reg(uint64_t v);
   void MovMem64(const oaknut::XRegSp& addr, intptr_t offset, uint64_t v);
 
-  std::byte* GetVConstPtr(VConst id);
+  std::byte* GetVConstPtr() const;
+  std::byte* GetVConstPtr(VConst id) const;
+  constexpr uintptr_t GetVConstOffset(VConst id) const {
+    return sizeof(vec128_t) * id;
+  }
   void LoadConstantV(oaknut::QReg dest, float v);
   void LoadConstantV(oaknut::QReg dest, double v);
   void LoadConstantV(oaknut::QReg dest, const vec128_t& v);
diff --git a/src/xenia/cpu/backend/a64/a64_seq_vector.cc b/src/xenia/cpu/backend/a64/a64_seq_vector.cc
index c92312fed..4f2b3bd95 100644
--- a/src/xenia/cpu/backend/a64/a64_seq_vector.cc
+++ b/src/xenia/cpu/backend/a64/a64_seq_vector.cc
@@ -542,11 +542,11 @@ struct VECTOR_SHL_V128
         e.SHL(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast(EmulateVectorShl));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -566,11 +566,11 @@ struct VECTOR_SHL_V128
         e.SHL(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast(EmulateVectorShl));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -590,11 +590,11 @@ struct VECTOR_SHL_V128
         e.SHL(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast(EmulateVectorShl));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -655,11 +655,11 @@ struct VECTOR_SHR_V128
         e.USHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast(EmulateVectorShr));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -679,11 +679,11 @@ struct VECTOR_SHR_V128
         e.USHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast(EmulateVectorShr));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -703,11 +703,11 @@ struct VECTOR_SHR_V128
         e.USHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast(EmulateVectorShr));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -751,11 +751,11 @@ struct VECTOR_SHA_V128
         e.SSHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0] & 0x7);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast(EmulateVectorShr));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -775,11 +775,11 @@ struct VECTOR_SHA_V128
         e.SSHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0] & 0xF);
         return;
      }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast(EmulateVectorShr));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -799,11 +799,11 @@ struct VECTOR_SHA_V128
         e.SSHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0] & 0x1F);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast(EmulateVectorShr));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -835,11 +835,11 @@ struct VECTOR_ROTATE_LEFT_V128
                I> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
     if (i.src2.is_constant) {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     switch (i.instr->flags) {
       case INT8_TYPE:
         e.CallNativeSafe(
@@ -1333,23 +1333,24 @@ struct PACK : Sequence> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
+
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     // Saturate to [3,3....] so that only values between 3...[00] and 3...[FF]
     // are valid - max before min to pack NaN as zero (5454082B is heavily
     // affected by the order - packs 0xFFFFFFFF in matrix code to get a 0
     // constant).
-    e.MOVP2R(X0, e.GetVConstPtr(V3333));
-    e.LDR(Q0, X0);
+    e.LDR(Q0, VConstData, e.GetVConstOffset(V3333));
     e.FMAX(i.dest.reg().S4(), i.dest.reg().S4(), Q0.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VPackD3DCOLORSat));
-    e.LDR(Q0, X0);
+    e.LDR(Q0, VConstData, e.GetVConstOffset(VPackD3DCOLORSat));
     e.FMIN(i.dest.reg().S4(), src.S4(), Q0.S4());
     // Extract bytes.
     // RGBA (XYZW) -> ARGB (WXYZ)
     // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) |
     //     ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF)
-    e.MOVP2R(X0, e.GetVConstPtr(VPackD3DCOLOR));
-    e.LDR(Q0, X0);
+    e.LDR(Q0, VConstData, e.GetVConstOffset(VPackD3DCOLOR));
     e.TBL(i.dest.reg().B16(), List{i.dest.reg().B16()}, Q0.B16());
   }
   static uint8x16_t EmulateFLOAT16_2(void*, std::byte src1[16]) {
@@ -1433,18 +1434,18 @@ struct PACK : Sequence> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     // Saturate
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Min));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min));
     e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Max));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Max));
     e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Pack
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_2));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_2));
     e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16());
   }
   static void EmitSHORT_4(A64Emitter& e, const EmitArgType& i) {
@@ -1454,18 +1455,18 @@ struct PACK : Sequence> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
-    // Saturate
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Min));
-    e.LDR(Q1, X0);
-    e.FMAXNM(i.dest.reg().S4(), src.S4(), Q1.S4());
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Max));
-    e.LDR(Q1, X0);
-    e.FMINNM(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
+    // Saturate
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min));
+    e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
+
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Max));
+    e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Pack
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_4));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_4));
     e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16());
   }
   static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) {
@@ -1476,24 +1477,22 @@ struct PACK : Sequence> {
     if (i.src1.is_constant) {
       e.LoadConstantV(src, i.src1.constant());
     }
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
 
     // Saturate.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MinUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MinUnpacked));
     e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaxUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaxUnpacked));
     e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Remove the unneeded bits of the floats.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaskUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaskUnpacked));
     e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
 
     // Shift the components up.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_Shift));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_Shift));
     e.USHL(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Combine the components.
@@ -1519,31 +1518,29 @@ struct PACK : Sequence> {
       src = i.dest;
      e.LoadConstantV(src, i.src1.constant());
     }
 
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     // Saturate.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_MinUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MinUnpacked));
     e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_MaxUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MaxUnpacked));
     e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Remove the unneeded bits of the floats (so excess nibbles will also be
     // cleared).
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_MaskUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MaskUnpacked));
     e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
 
     // Store Y and W shifted left by 4 so vpshufb can be used with them.
     e.SHL(Q0.S4(), i.dest.reg().S4(), 4);
     // Place XZ where they're supposed to be.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_PermuteXZ));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_PermuteXZ));
     e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16());
     // Place YW.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_PermuteYW));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_PermuteYW));
     e.TBL(Q0.B16(), oaknut::List{Q0.B16()}, Q1.B16());
     // Merge XZ and YW.
     e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q0.B16());
@@ -1742,11 +1739,14 @@ struct UNPACK : Sequence> {
   }
   static void EmitD3DCOLOR(A64Emitter& e, const EmitArgType& i) {
     // ARGB (WXYZ) -> RGBA (XYZW)
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
+
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(VOne));
-        e.LDR(i.dest.reg(), X0);
+        e.LDR(i.dest.reg(), VConstData, e.GetVConstOffset(VOne));
         return;
       }
       src = i.dest;
@@ -1756,12 +1756,10 @@ struct UNPACK : Sequence> {
     }
     // src = ZZYYXXWW
     // Unpack to 000000ZZ,000000YY,000000XX,000000WW
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackD3DCOLOR));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackD3DCOLOR));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());
     // Add 1.0f to each.
-    e.MOVP2R(X0, e.GetVConstPtr(VOne));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VOne));
     e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
     // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081.
   }
@@ -1850,12 +1848,14 @@ struct UNPACK : Sequence> {
     // (VD.z) = 0.0
     // (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f)
     // src is (xx,xx,xx,VALUE)
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
         src = i.dest;
-        e.MOVP2R(X0, e.GetVConstPtr(V3301));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3301));
         return;
       }
       // TODO(benvanik): check other common constants/perform shuffle/or here.
@@ -1865,8 +1865,7 @@ struct UNPACK : Sequence> {
       src = i.src1;
     }
     // Shuffle bytes.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_2));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_2));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());
 
     // If negative, make smaller than 3 - sign extend before adding.
@@ -1874,17 +1873,14 @@ struct UNPACK : Sequence> {
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 16);
 
     // Add 3,3,0,1.
-    e.MOVP2R(X0, e.GetVConstPtr(V3301));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3301));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -1894,11 +1890,14 @@ struct UNPACK : Sequence> {
     // (VD.z) = 3.0 + (VB.y>>16)*2^-22
     // (VD.w) = 3.0 + (VB.y)*2^-22
     // src is (xx,xx,VALUE,VALUE)
+
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(V3333));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3333));
         return;
       }
       // TODO(benvanik): check other common constants/perform shuffle/or here.
@@ -1908,8 +1907,7 @@ struct UNPACK : Sequence> {
      src = i.src1;
     }
     // Shuffle bytes.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_4));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_4));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());
 
     // If negative, make smaller than 3 - sign extend before adding.
@@ -1917,26 +1915,25 @@ struct UNPACK : Sequence> {
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 16);
 
     // Add 3,3,3,3.
-    e.MOVP2R(X0, e.GetVConstPtr(V3333));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3333));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
   static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) {
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(V3331));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3331));
         return;
       }
       src = i.dest;
@@ -1949,13 +1946,11 @@ struct UNPACK : Sequence> {
     e.DUP(i.dest.reg().S4(), src.Selem()[3]);
     // Keep only the needed components.
     // Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaskPacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaskPacked));
     e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
 
     // Shift the components down.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_Shift));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_Shift));
     e.NEG(Q1.S4(), Q1.S4());
     e.USHL(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
     // If XYZ are negative, make smaller than 3 - sign extend XYZ before adding.
@@ -1963,27 +1958,26 @@ struct UNPACK : Sequence> {
     e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 22);
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 22);
     // Add 3,3,3,1.
-    e.MOVP2R(X0, e.GetVConstPtr(V3331));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3331));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackUINT_2101010_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackUINT_2101010_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
     // To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030.
     // For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB.}
   }
   static void EmitULONG_4202020(A64Emitter& e, const EmitArgType& i) {
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(V3331));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3331));
         return;
       }
       src = i.dest;
@@ -1993,8 +1987,7 @@ struct UNPACK : Sequence> {
     }
     // Extract pairs of nibbles to XZYW. XZ will have excess 4 upper bits, YW
     // will have excess 4 lower bits.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackULONG_4202020_Permute));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackULONG_4202020_Permute));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());
 
     // Drop the excess nibble of YW.
@@ -2018,16 +2011,13 @@ struct UNPACK : Sequence> {
     e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 12);
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 12);
     // Add 3,3,3,1.
-    e.MOVP2R(X0, e.GetVConstPtr(V3331));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3331));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackULONG_4202020_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackULONG_4202020_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
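
Editor's illustrative note (not part of the patch): the standalone sketch below
shows the offset arithmetic the new GetVConstOffset() helper relies on and, in
its comments, the before/after instruction pattern this change targets. The
VConst enumerator values used here are placeholders chosen only for the
example; the real indices live in the VConst enum in a64_emitter.h.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Stand-in for Xenia's 16-byte vec128_t; only its size matters here.
struct vec128_t {
  uint8_t bytes[16];
};

// Placeholder indices for illustration; the real enumerators are defined in
// a64_emitter.h.
enum VConst { V3333 = 2, VPackD3DCOLORSat = 5 };

// Mirrors the helper added to A64Emitter: each constant is one vec128_t, so
// constant `id` sits at byte offset 16 * id from the table base.
constexpr std::uintptr_t GetVConstOffset(VConst id) {
  return sizeof(vec128_t) * id;
}

int main() {
  // Old pattern: every access materializes a full 64-bit absolute address
  // (MOVP2R expands to a MOV/MOVK sequence) and then loads through it:
  //   MOVP2R X0, &table[V3333]              ; several instructions
  //   LDR    Q0, [X0]
  // New pattern: the table base is materialized once per emitted sequence and
  // each constant becomes a single immediate-offset load:
  //   MOVP2R X3, &table[0]                  ; once
  //   LDR    Q0, [X3, #GetVConstOffset(V3333)]
  //   LDR    Q1, [X3, #GetVConstOffset(VPackD3DCOLORSat)]
  std::printf("V3333 offset: %zu bytes\n",
              static_cast<std::size_t>(GetVConstOffset(V3333)));
  std::printf("VPackD3DCOLORSat offset: %zu bytes\n",
              static_cast<std::size_t>(GetVConstOffset(VPackD3DCOLORSat)));
  return 0;
}

Because each VConst entry is 16 bytes, any index in the table stays well within
the scaled immediate range of an unsigned-offset LDR, which is what lets every
constant access collapse to a single load once the base is held in a register.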