diff --git a/src/xenia/cpu/backend/a64/a64_emitter.cc b/src/xenia/cpu/backend/a64/a64_emitter.cc
index 8c4482200..a3cccc231 100644
--- a/src/xenia/cpu/backend/a64/a64_emitter.cc
+++ b/src/xenia/cpu/backend/a64/a64_emitter.cc
@@ -837,11 +837,14 @@ void A64Emitter::FreeConstData(uintptr_t data) {
                        memory::DeallocationType::kRelease);
 }
 
-std::byte* A64Emitter::GetVConstPtr(VConst id) {
+std::byte* A64Emitter::GetVConstPtr() const {
+  return reinterpret_cast<std::byte*>(backend_->emitter_data());
+}
+
+std::byte* A64Emitter::GetVConstPtr(VConst id) const {
   // Load through fixed constant table setup by PlaceConstData.
   // It's important that the pointer is not signed, as it will be sign-extended.
-  return reinterpret_cast<std::byte*>(backend_->emitter_data() +
-                                      sizeof(vec128_t) * id);
+  return GetVConstPtr() + GetVConstOffset(id);
 }
 
 // Implies possible StashV(0, ...)!
diff --git a/src/xenia/cpu/backend/a64/a64_emitter.h b/src/xenia/cpu/backend/a64/a64_emitter.h
index 463064ef1..3e0b35f36 100644
--- a/src/xenia/cpu/backend/a64/a64_emitter.h
+++ b/src/xenia/cpu/backend/a64/a64_emitter.h
@@ -119,8 +119,8 @@ enum VConst {
 };
 
 enum A64EmitterFeatureFlags {
-  kA64EmitLSE = 1 << 0,
-  kA64EmitF16C = 1 << 1,
+  kA64EmitLSE = 1 << 0,
+  kA64EmitF16C = 1 << 1,
 };
 
 class A64Emitter : public oaknut::CodeBlock, public oaknut::CodeGenerator {
@@ -204,7 +204,11 @@ class A64Emitter : public oaknut::CodeBlock, public oaknut::CodeGenerator {
   bool ConstantFitsIn32Reg(uint64_t v);
   void MovMem64(const oaknut::XRegSp& addr, intptr_t offset, uint64_t v);
 
-  std::byte* GetVConstPtr(VConst id);
+  std::byte* GetVConstPtr() const;
+  std::byte* GetVConstPtr(VConst id) const;
+  constexpr uintptr_t GetVConstOffset(VConst id) const {
+    return sizeof(vec128_t) * id;
+  }
   void LoadConstantV(oaknut::QReg dest, float v);
   void LoadConstantV(oaknut::QReg dest, double v);
   void LoadConstantV(oaknut::QReg dest, const vec128_t& v);
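The emitter change above splits the old single-argument GetVConstPtr(VConst) into a base pointer (GetVConstPtr()) plus a constexpr byte offset (GetVConstOffset(id)). This lets the sequences below materialize the constant-table base once per emitted block and fetch each vec128_t with a single immediate-offset LDR, instead of a MOVP2R/LDR pair per constant. A minimal sketch of the call-site pattern, lifted from the hunks that follow (the X3 scratch register is the choice those hunks make, not a requirement of the API):

    // Before: two instructions for every constant referenced.
    e.MOVP2R(X0, e.GetVConstPtr(V3333));
    e.LDR(Q0, X0);

    // After: one base materialization, then one load per constant.
    const XReg VConstData = X3;
    e.MOVP2R(VConstData, e.GetVConstPtr());
    e.LDR(Q0, VConstData, e.GetVConstOffset(V3333));
    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
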
diff --git a/src/xenia/cpu/backend/a64/a64_seq_vector.cc b/src/xenia/cpu/backend/a64/a64_seq_vector.cc
index c92312fed..4f2b3bd95 100644
--- a/src/xenia/cpu/backend/a64/a64_seq_vector.cc
+++ b/src/xenia/cpu/backend/a64/a64_seq_vector.cc
@@ -542,11 +542,11 @@ struct VECTOR_SHL_V128
         e.SHL(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint8_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -566,11 +566,11 @@ struct VECTOR_SHL_V128
         e.SHL(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint16_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -590,11 +590,11 @@ struct VECTOR_SHL_V128
         e.SHL(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint32_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -655,11 +655,11 @@ struct VECTOR_SHR_V128
         e.USHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint8_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -679,11 +679,11 @@ struct VECTOR_SHR_V128
         e.USHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint16_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -703,11 +703,11 @@ struct VECTOR_SHR_V128
         e.USHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0]);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint32_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -751,11 +751,11 @@ struct VECTOR_SHA_V128
         e.SSHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0] & 0x7);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int8_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -775,11 +775,11 @@ struct VECTOR_SHA_V128
         e.SSHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0] & 0xF);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
    }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int16_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -799,11 +799,11 @@ struct VECTOR_SHA_V128
         e.SSHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0] & 0x1F);
         return;
       }
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int32_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -835,11 +835,11 @@ struct VECTOR_ROTATE_LEFT_V128
               I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
     if (i.src2.is_constant) {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), SP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), SP, e.StashV(1, i.src2));
     }
-    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), SP, e.StashV(0, i.src1));
     switch (i.instr->flags) {
       case INT8_TYPE:
         e.CallNativeSafe(
@@ -1333,23 +1333,24 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
+
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     // Saturate to [3,3....] so that only values between 3...[00] and 3...[FF]
     // are valid - max before min to pack NaN as zero (5454082B is heavily
     // affected by the order - packs 0xFFFFFFFF in matrix code to get a 0
     // constant).
-    e.MOVP2R(X0, e.GetVConstPtr(V3333));
-    e.LDR(Q0, X0);
+    e.LDR(Q0, VConstData, e.GetVConstOffset(V3333));
     e.FMAX(i.dest.reg().S4(), i.dest.reg().S4(), Q0.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VPackD3DCOLORSat));
-    e.LDR(Q0, X0);
+    e.LDR(Q0, VConstData, e.GetVConstOffset(VPackD3DCOLORSat));
     e.FMIN(i.dest.reg().S4(), src.S4(), Q0.S4());
     // Extract bytes.
     // RGBA (XYZW) -> ARGB (WXYZ)
     // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) |
     //     ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF)
-    e.MOVP2R(X0, e.GetVConstPtr(VPackD3DCOLOR));
-    e.LDR(Q0, X0);
+    e.LDR(Q0, VConstData, e.GetVConstOffset(VPackD3DCOLOR));
     e.TBL(i.dest.reg().B16(), List{i.dest.reg().B16()}, Q0.B16());
   }
   static uint8x16_t EmulateFLOAT16_2(void*, std::byte src1[16]) {
@@ -1433,18 +1434,18 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     // Saturate
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Min));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min));
     e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Max));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Max));
     e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Pack
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_2));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_2));
     e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16());
   }
   static void EmitSHORT_4(A64Emitter& e, const EmitArgType& i) {
@@ -1454,18 +1455,18 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
-    // Saturate
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Min));
-    e.LDR(Q1, X0);
-    e.FMAXNM(i.dest.reg().S4(), src.S4(), Q1.S4());
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
 
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_Max));
-    e.LDR(Q1, X0);
-    e.FMINNM(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
+    // Saturate
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Min));
+    e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
+
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_Max));
+    e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Pack
-    e.MOVP2R(X0, e.GetVConstPtr(VPackSHORT_4));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackSHORT_4));
     e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16());
   }
   static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) {
@@ -1476,24 +1477,22 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
     if (i.src1.is_constant) {
       e.LoadConstantV(src, i.src1.constant());
     }
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
 
     // Saturate.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MinUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MinUnpacked));
     e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaxUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaxUnpacked));
     e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Remove the unneeded bits of the floats.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaskUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaskUnpacked));
     e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
 
     // Shift the components up.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_Shift));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_Shift));
     e.USHL(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Combine the components.
@@ -1519,31 +1518,29 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       src = i.dest;
       e.LoadConstantV(src, i.src1.constant());
     }
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     // Saturate.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_MinUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MinUnpacked));
     e.FMAX(i.dest.reg().S4(), src.S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_MaxUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MaxUnpacked));
     e.FMIN(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Remove the unneeded bits of the floats (so excess nibbles will also be
     // cleared).
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_MaskUnpacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_MaskUnpacked));
     e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
 
     // Store Y and W shifted left by 4 so vpshufb can be used with them.
     e.SHL(Q0.S4(), i.dest.reg().S4(), 4);
 
     // Place XZ where they're supposed to be.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_PermuteXZ));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_PermuteXZ));
     e.TBL(i.dest.reg().B16(), oaknut::List{i.dest.reg().B16()}, Q1.B16());
     // Place YW.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackULONG_4202020_PermuteYW));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackULONG_4202020_PermuteYW));
     e.TBL(Q0.B16(), oaknut::List{Q0.B16()}, Q1.B16());
     // Merge XZ and YW.
     e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q0.B16());
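One behavioral note besides the offset-load conversion: EmitSHORT_4 above also drops FMAXNM/FMINNM in favor of the plain FMAX/FMIN used by EmitSHORT_2, so the two SHORT pack paths now clamp identically. The difference is only in NaN handling; per the AArch64 rules:

    // Per-lane semantics when one operand is a quiet NaN (q) and the
    // other a number (x):
    //   FMAXNM(q, x) -> x     (maxNum: prefers the number)
    //   FMAX(q, x)   -> NaN   (propagates the NaN)

With the NM forms, a NaN input would have been silently clamped to VPackSHORT_Min/Max; with FMAX/FMIN it propagates through the clamp, matching the other pack paths.
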
@@ -1742,11 +1739,14 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
   }
   static void EmitD3DCOLOR(A64Emitter& e, const EmitArgType& i) {
     // ARGB (WXYZ) -> RGBA (XYZW)
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
+
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(VOne));
-        e.LDR(i.dest.reg(), X0);
+        e.LDR(i.dest.reg(), VConstData, e.GetVConstOffset(VOne));
         return;
       }
       src = i.dest;
@@ -1756,12 +1756,10 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     }
     // src = ZZYYXXWW
     // Unpack to 000000ZZ,000000YY,000000XX,000000WW
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackD3DCOLOR));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackD3DCOLOR));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());
     // Add 1.0f to each.
-    e.MOVP2R(X0, e.GetVConstPtr(VOne));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VOne));
     e.EOR(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
     // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081.
   }
@@ -1850,12 +1848,14 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     // (VD.z) = 0.0
     // (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f)
     // src is (xx,xx,xx,VALUE)
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
         src = i.dest;
-        e.MOVP2R(X0, e.GetVConstPtr(V3301));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3301));
         return;
       }
       // TODO(benvanik): check other common constants/perform shuffle/or here.
@@ -1865,8 +1865,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
       src = i.src1;
     }
     // Shuffle bytes.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_2));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_2));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());
 
     // If negative, make smaller than 3 - sign extend before adding.
@@ -1874,17 +1873,14 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 16);
 
     // Add 3,3,0,1.
-    e.MOVP2R(X0, e.GetVConstPtr(V3301));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3301));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
@@ -1894,11 +1890,14 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     // (VD.z) = 3.0 + (VB.y>>16)*2^-22
     // (VD.w) = 3.0 + (VB.y)*2^-22
     // src is (xx,xx,VALUE,VALUE)
+
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(V3333));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3333));
         return;
       }
       // TODO(benvanik): check other common constants/perform shuffle/or here.
@@ -1908,8 +1907,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
       src = i.src1;
     }
     // Shuffle bytes.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_4));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_4));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());
 
     // If negative, make smaller than 3 - sign extend before adding.
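The overflow handling in the hunks that follow is a three-instruction lane select, unchanged by this diff apart from where Q1's constant comes from: CMEQ writes an all-ones mask into each lane whose biased result equals the overflow pattern, and BSL then uses that mask to substitute a quiet NaN. Annotated:

    e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());   // Q0 = (dest == overflow) ? ~0 : 0, per lane
    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
    e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16()); // Q0 = (Q0 & QNaN) | (~Q0 & dest)
    e.MOV(i.dest.reg().B16(), Q0.B16());
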
@@ -1917,26 +1915,25 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 16);
 
     // Add 3,3,3,3.
-    e.MOVP2R(X0, e.GetVConstPtr(V3333));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3333));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackSHORT_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackSHORT_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
   static void EmitUINT_2101010(A64Emitter& e, const EmitArgType& i) {
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(V3331));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3331));
         return;
       }
       src = i.dest;
@@ -1949,13 +1946,11 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.DUP(i.dest.reg().S4(), src.Selem()[3]);
     // Keep only the needed components.
     // Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_MaskPacked));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_MaskPacked));
     e.AND(i.dest.reg().B16(), i.dest.reg().B16(), Q1.B16());
 
     // Shift the components down.
-    e.MOVP2R(X0, e.GetVConstPtr(VPackUINT_2101010_Shift));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VPackUINT_2101010_Shift));
     e.NEG(Q1.S4(), Q1.S4());
     e.USHL(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
     // If XYZ are negative, make smaller than 3 - sign extend XYZ before adding.
@@ -1963,27 +1958,26 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 22);
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 22);
     // Add 3,3,3,1.
-    e.MOVP2R(X0, e.GetVConstPtr(V3331));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3331));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackUINT_2101010_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackUINT_2101010_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
     // To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030.
     // For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB.}
   }
   static void EmitULONG_4202020(A64Emitter& e, const EmitArgType& i) {
+    const XReg VConstData = X3;
+    e.MOVP2R(VConstData, e.GetVConstPtr());
+
     QReg src(0);
     if (i.src1.is_constant) {
       if (i.src1.value->IsConstantZero()) {
-        e.MOVP2R(X0, e.GetVConstPtr(V3331));
-        e.LDR(i.dest, X0);
+        e.LDR(i.dest, VConstData, e.GetVConstOffset(V3331));
         return;
       }
       src = i.dest;
@@ -1993,8 +1987,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     }
     // Extract pairs of nibbles to XZYW. XZ will have excess 4 upper bits, YW
     // will have excess 4 lower bits.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackULONG_4202020_Permute));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackULONG_4202020_Permute));
     e.TBL(i.dest.reg().B16(), oaknut::List{src.B16()}, Q1.B16());
 
     // Drop the excess nibble of YW.
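The SHL #12 / SSHR #12 pair opening the final hunk is the usual in-register sign extension for a 20-bit field: the left shift moves the field's sign bit up to bit 31, and the arithmetic right shift smears it back down (the 2101010 path above does the same with shifts of 22). A scalar sketch of the idiom:

    // Sign-extend a 20-bit value held in the low bits of a 32-bit lane.
    int32_t sign_extend_20(uint32_t v) {
      return static_cast<int32_t>(v << 12) >> 12;  // SHL #12, then SSHR #12
    }
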
@@ -2018,16 +2011,13 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.SHL(i.dest.reg().S4(), i.dest.reg().S4(), 12);
     e.SSHR(i.dest.reg().S4(), i.dest.reg().S4(), 12);
     // Add 3,3,3,1.
-    e.MOVP2R(X0, e.GetVConstPtr(V3331));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(V3331));
     e.ADD(i.dest.reg().S4(), i.dest.reg().S4(), Q1.S4());
 
     // Return quiet NaNs in case of negative overflow.
-    e.MOVP2R(X0, e.GetVConstPtr(VUnpackULONG_4202020_Overflow));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VUnpackULONG_4202020_Overflow));
     e.CMEQ(Q0.S4(), i.dest.reg().S4(), Q1.S4());
-    e.MOVP2R(X0, e.GetVConstPtr(VQNaN));
-    e.LDR(Q1, X0);
+    e.LDR(Q1, VConstData, e.GetVConstOffset(VQNaN));
     e.BSL(Q0.B16(), Q1.B16(), i.dest.reg().B16());
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
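Why the immediate offsets are always encodable here: each VConst slot is one vec128_t, so GetVConstOffset returns 16 * id, and the 128-bit LDR (immediate, unsigned offset) form encodes any multiple of 16 up to 65520. Assuming oaknut's three-argument LDR overload maps to that encoding, no VConst index in the table needs extra address arithmetic. A standalone sketch of the bound (simplified types, not the emitter's actual declarations):

    #include <cstdint>

    struct vec128_t {
      uint8_t b[16];
    };
    constexpr uintptr_t GetVConstOffset(unsigned id) {
      return sizeof(vec128_t) * id;  // 16-byte slots, as in the emitter
    }
    // LDR (immediate, SIMD&FP, 128-bit) takes unsigned offsets 0..65520 in
    // steps of 16, so indices up to 4095 encode directly.
    static_assert(GetVConstOffset(4095) == 65520);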