diff --git a/src/xenia/cpu/backend/a64/a64_seq_vector.cc b/src/xenia/cpu/backend/a64/a64_seq_vector.cc
index c64134fe6..19345a294 100644
--- a/src/xenia/cpu/backend/a64/a64_seq_vector.cc
+++ b/src/xenia/cpu/backend/a64/a64_seq_vector.cc
@@ -537,7 +537,23 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128);
 
 // ============================================================================
 // OPCODE_VECTOR_SHR
 // ============================================================================
+template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
+static uint8x16_t EmulateVectorShr(void*, std::byte src1[16],
+                                   std::byte src2[16]) {
+  alignas(16) T value[16 / sizeof(T)];
+  alignas(16) T shamt[16 / sizeof(T)];
+  // Load NEON registers into a C array.
+  vst1q_u8(reinterpret_cast<uint8_t*>(value), vld1q_u8(src1));
+  vst1q_u8(reinterpret_cast<uint8_t*>(shamt), vld1q_u8(src2));
+
+  for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
+    value[i] = value[i] >> (shamt[i] & ((sizeof(T) * 8) - 1));
+  }
+
+  // Store result and return it.
+  return vld1q_u8(value);
+}
 struct VECTOR_SHR_V128
     : Sequence<VECTOR_SHR_V128,
                I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
@@ -557,33 +573,83 @@ struct VECTOR_SHR_V128
     }
   }
 
-  static void EmitInt8(A64Emitter& e, const EmitArgType& i) {}
+  static void EmitInt8(A64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      const auto& shamt = i.src2.constant();
+      bool all_same = true;
+      for (size_t n = 0; n < 16 - n; ++n) {
+        if (shamt.u8[n] != shamt.u8[n + 1]) {
+          all_same = false;
+          break;
+        }
+      }
+      if (all_same) {
+        // Every count is the same, so we can use USHR
+        e.USHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0]);
+        return;
+      }
+      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+    } else {
+      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+    }
+    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint8_t>));
+    e.MOV(i.dest.reg().B16(), Q0.B16());
+  }
 
-  static void EmitInt16(A64Emitter& e, const EmitArgType& i) {}
+  static void EmitInt16(A64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      const auto& shamt = i.src2.constant();
+      bool all_same = true;
+      for (size_t n = 0; n < 8 - n; ++n) {
+        if (shamt.u16[n] != shamt.u16[n + 1]) {
+          all_same = false;
+          break;
+        }
+      }
+      if (all_same) {
+        // Every count is the same, so we can use USHR
+        e.USHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0]);
+        return;
+      }
+      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+    } else {
+      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+    }
+    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint16_t>));
+    e.MOV(i.dest.reg().B16(), Q0.B16());
+  }
 
-  static void EmitInt32(A64Emitter& e, const EmitArgType& i) {}
+  static void EmitInt32(A64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      const auto& shamt = i.src2.constant();
+      bool all_same = true;
+      for (size_t n = 0; n < 4 - n; ++n) {
+        if (shamt.u32[n] != shamt.u32[n + 1]) {
+          all_same = false;
+          break;
+        }
+      }
+      if (all_same) {
+        // Every count is the same, so we can use USHR
+        e.USHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0]);
+        return;
+      }
+      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+    } else {
+      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+    }
+    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint32_t>));
+    e.MOV(i.dest.reg().B16(), Q0.B16());
+  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128);
 
 // ============================================================================
 // OPCODE_VECTOR_SHA
 // ============================================================================
-template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
-static uint8x16_t EmulateVectorShr(void*, uint8x16_t src1, uint8x16_t src2) {
-  alignas(16) T value[16 / sizeof(T)];
-  alignas(16) T shamt[16 / sizeof(T)];
-
-  // Load NEON registers into a C array.
-  vst1q_u8(reinterpret_cast<uint8_t*>(value), src1);
-  vst1q_u8(reinterpret_cast<uint8_t*>(shamt), src2);
-
-  for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
-    value[i] = value[i] >> (shamt[i] & ((sizeof(T) * 8) - 1));
-  }
-
-  // Store result and return it.
-  return vld1q_f32(value);
-}
 struct VECTOR_SHA_V128
     : Sequence<VECTOR_SHA_V128,
                I<OPCODE_VECTOR_SHA, V128Op, V128Op, V128Op>> {
   static void Emit(A64Emitter& e, const EmitArgType& i) {
@@ -618,18 +684,62 @@ struct VECTOR_SHA_V128
         e.SSHR(i.dest.reg().B16(), i.src1.reg().B16(), shamt.u8[0]);
         return;
       }
-      e.LDR(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
     } else {
-      e.LDR(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
     }
-    e.LDR(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int8_t>));
     e.MOV(i.dest.reg().B16(), Q0.B16());
   }
 
-  static void EmitInt16(A64Emitter& e, const EmitArgType& i) {}
+  static void EmitInt16(A64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      const auto& shamt = i.src2.constant();
+      bool all_same = true;
+      for (size_t n = 0; n < 8 - n; ++n) {
+        if (shamt.u16[n] != shamt.u16[n + 1]) {
+          all_same = false;
+          break;
+        }
+      }
+      if (all_same) {
+        // Every count is the same, so we can use SSHR
+        e.SSHR(i.dest.reg().H8(), i.src1.reg().H8(), shamt.u16[0]);
+        return;
+      }
+      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+    } else {
+      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+    }
+    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int16_t>));
+    e.MOV(i.dest.reg().B16(), Q0.B16());
+  }
 
-  static void EmitInt32(A64Emitter& e, const EmitArgType& i) {}
+  static void EmitInt32(A64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      const auto& shamt = i.src2.constant();
+      bool all_same = true;
+      for (size_t n = 0; n < 4 - n; ++n) {
+        if (shamt.u32[n] != shamt.u32[n + 1]) {
+          all_same = false;
+          break;
+        }
+      }
+      if (all_same) {
+        // Every count is the same, so we can use SSHR
+        e.SSHR(i.dest.reg().S4(), i.src1.reg().S4(), shamt.u32[0]);
+        return;
+      }
+      e.ADD(e.GetNativeParam(1), XSP, e.StashConstantV(1, i.src2.constant()));
+    } else {
+      e.ADD(e.GetNativeParam(1), XSP, e.StashV(1, i.src2));
+    }
+    e.ADD(e.GetNativeParam(0), XSP, e.StashV(0, i.src1));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int32_t>));
+    e.MOV(i.dest.reg().B16(), Q0.B16());
+  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128);
 