diff --git a/src/xenia/cpu/backend/a64/a64_seq_vector.cc b/src/xenia/cpu/backend/a64/a64_seq_vector.cc
index fb13ad044..c64134fe6 100644
--- a/src/xenia/cpu/backend/a64/a64_seq_vector.cc
+++ b/src/xenia/cpu/backend/a64/a64_seq_vector.cc
@@ -424,7 +424,23 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB);
 // ============================================================================
 // OPCODE_VECTOR_SHL
 // ============================================================================
+template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
+static uint8x16_t EmulateVectorShl(void*, std::byte src1[16],
+                                   std::byte src2[16]) {
+  alignas(16) T value[16 / sizeof(T)];
+  alignas(16) T shamt[16 / sizeof(T)];
+  // Load NEON registers into a C array.
+  vst1q_u8(reinterpret_cast<uint8_t*>(value), vld1q_u8(reinterpret_cast<uint8_t*>(src1)));
+  vst1q_u8(reinterpret_cast<uint8_t*>(shamt), vld1q_u8(reinterpret_cast<uint8_t*>(src2)));
+
+  for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
+    value[i] = value[i] << (shamt[i] & ((sizeof(T) * 8) - 1));
+  }
+
+  // Store result and return it.
+  return vld1q_u8(reinterpret_cast<uint8_t*>(value));
+}
 struct VECTOR_SHL_V128
     : Sequence<VECTOR_SHL_V128,
                I<OPCODE_VECTOR_SHL, V128Op, V128Op, V128Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
@@ -444,11 +460,77 @@ struct VECTOR_SHL_V128
     }
   }
 
-  static void EmitInt8(A64Emitter& e, const EmitArgType& i) {}
+  static void EmitInt8(A64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      const auto& shamt = i.src2.constant();
+      bool all_same = true;
+      for (size_t n = 0; n < 16 - 1; ++n) {
+        if (shamt.u8[n] != shamt.u8[n + 1]) {
+          all_same = false;
+          break;
+        }
+      }
+      if (all_same) {
+        // Every count is the same, so we can use SHL with an immediate.
+        e.SHL(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0] & 0x7);
+        return;
+      }
+      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+    } else {
+      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+    }
+    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint8_t>));
+    e.MOV(i.dest.reg().B16(), Q0.B16());
+  }
 
-  static void EmitInt16(A64Emitter& e, const EmitArgType& i) {}
+  static void EmitInt16(A64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      const auto& shamt = i.src2.constant();
+      bool all_same = true;
+      for (size_t n = 0; n < 8 - 1; ++n) {
+        if (shamt.u16[n] != shamt.u16[n + 1]) {
+          all_same = false;
+          break;
+        }
+      }
+      if (all_same) {
+        // Every count is the same, so we can use SHL with an immediate.
+        e.SHL(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0] & 0xF);
+        return;
+      }
+      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+    } else {
+      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+    }
+    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint16_t>));
+    e.MOV(i.dest.reg().B16(), Q0.B16());
+  }
 
-  static void EmitInt32(A64Emitter& e, const EmitArgType& i) {}
+  static void EmitInt32(A64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      const auto& shamt = i.src2.constant();
+      bool all_same = true;
+      for (size_t n = 0; n < 4 - 1; ++n) {
+        if (shamt.u32[n] != shamt.u32[n + 1]) {
+          all_same = false;
+          break;
+        }
+      }
+      if (all_same) {
+        // Every count is the same, so we can use SHL with an immediate.
+        e.SHL(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0] & 0x1F);
+        return;
+      }
+      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+    } else {
+      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+    }
+    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint32_t>));
+    e.MOV(i.dest.reg().B16(), Q0.B16());
+  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128);
 
@@ -491,7 +573,7 @@ static uint8x16_t EmulateVectorShr(void*, uint8x16_t src1, uint8x16_t src2) {
   alignas(16) T value[16 / sizeof(T)];
   alignas(16) T shamt[16 / sizeof(T)];
 
-  // Load SSE registers into a C array.
+  // Load NEON registers into a C array.
   vst1q_u8(reinterpret_cast<uint8_t*>(value), src1);
   vst1q_u8(reinterpret_cast<uint8_t*>(shamt), src2);
 
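
Reviewer note (not part of the patch): the constant fast paths above mask the splatted count to the lane width (& 0x7 / & 0xF / & 0x1F) so the SHL immediate stays encodable and agrees with the EmulateVectorShl fallback, which masks per lane. A minimal standalone C++ sketch of that equivalence for the 16-bit case, using hypothetical values:

  #include <cassert>
  #include <cstdint>

  int main() {
    // A splatted VMX shift count may carry garbage in its upper bits; only
    // the low log2(lane_bits) bits take effect. The immediate fast path must
    // therefore encode (count & (lane_bits - 1)), matching the fallback loop.
    const uint16_t count = 0x12;   // hypothetical splatted count, > 15
    const uint16_t lane = 0x00FF;  // hypothetical source lane

    const unsigned imm = count & 0xF;  // what the SHL immediate should encode
    const uint16_t fast = static_cast<uint16_t>(lane << imm);
    const uint16_t slow =
        static_cast<uint16_t>(lane << (count & ((sizeof(uint16_t) * 8) - 1)));
    assert(fast == slow);
    return 0;
  }

Without the mask, a count above the lane width would be an invalid SHL immediate, whereas the fallback path would silently wrap it.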