From d19519e63c45ffb3b392218edf8024ce1dea4b05 Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Wed, 11 Feb 2015 12:46:37 -0800
Subject: [PATCH] Implementing the vavg instructions (mostly). Fixes #155.

---
 src/alloy/backend/x64/x64_sequences.cc     | 78 ++++++++++++++++++++++
 src/alloy/frontend/ppc/ppc_emit_altivec.cc | 36 ++++++----
 src/alloy/hir/hir_builder.cc               | 18 +++++
 src/alloy/hir/hir_builder.h                |  2 +
 src/alloy/hir/opcodes.h                    |  1 +
 src/alloy/hir/opcodes.inl                  |  6 ++
 6 files changed, 129 insertions(+), 12 deletions(-)

diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc
index 6425d827c..3744f751f 100644
--- a/src/alloy/backend/x64/x64_sequences.cc
+++ b/src/alloy/backend/x64/x64_sequences.cc
@@ -4748,6 +4748,83 @@ EMITTER_OPCODE_TABLE(
     VECTOR_ROTATE_LEFT_V128);
 
 
+// ============================================================================
+// OPCODE_VECTOR_AVERAGE
+// ============================================================================
+EMITTER(VECTOR_AVERAGE, MATCH(I<OPCODE_VECTOR_AVERAGE, V128<>, V128<>, V128<>>)) {
+  static __m128i EmulateVectorAverageUnsignedI32(void*, __m128i src1, __m128i src2) {
+    alignas(16) uint32_t src1v[4];
+    alignas(16) uint32_t src2v[4];
+    alignas(16) uint32_t value[4];
+    _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1);
+    _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2);
+    for (size_t i = 0; i < 4; ++i) {
+      auto t = (uint64_t(src1v[i]) + uint64_t(src2v[i]) + 1) >> 1;
+      value[i] = uint32_t(t);
+    }
+    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
+  }
+  static __m128i EmulateVectorAverageSignedI32(void*, __m128i src1, __m128i src2) {
+    alignas(16) int32_t src1v[4];
+    alignas(16) int32_t src2v[4];
+    alignas(16) int32_t value[4];
+    _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1);
+    _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2);
+    for (size_t i = 0; i < 4; ++i) {
+      auto t = (int64_t(src1v[i]) + int64_t(src2v[i]) + 1) >> 1;
+      value[i] = int32_t(t);
+    }
+    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
+  }
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest,
+                                          const Xmm& src1, const Xmm& src2) {
+      const TypeName part_type = static_cast<TypeName>(i.instr->flags & 0xFF);
+      const uint32_t arithmetic_flags = i.instr->flags >> 8;
+      bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED);
+      switch (part_type) {
+        case INT8_TYPE:
+          if (is_unsigned) {
+            e.vpavgb(dest, src1, src2);
+          } else {
+            assert_always();
+          }
+          break;
+        case INT16_TYPE:
+          if (is_unsigned) {
+            e.vpavgw(dest, src1, src2);
+          } else {
+            assert_always();
+          }
+          break;
+        case INT32_TYPE:
+          // No 32bit averages in AVX.
+          if (is_unsigned) {
+            e.lea(e.r8, e.StashXmm(0, i.src1));
+            e.lea(e.r9, e.StashXmm(1, i.src2));
+            e.CallNativeSafe(
+                reinterpret_cast<void*>(EmulateVectorAverageUnsignedI32));
+            e.vmovaps(i.dest, e.xmm0);
+          } else {
+            e.lea(e.r8, e.StashXmm(0, i.src1));
+            e.lea(e.r9, e.StashXmm(1, i.src2));
+            e.CallNativeSafe(
+                reinterpret_cast<void*>(EmulateVectorAverageSignedI32));
+            e.vmovaps(i.dest, e.xmm0);
+          }
+          break;
+        default:
+          assert_unhandled_case(part_type);
+          break;
+      }
+    });
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_VECTOR_AVERAGE,
+    VECTOR_AVERAGE);
+
+
 // ============================================================================
 // OPCODE_BYTE_SWAP
 // ============================================================================
@@ -5751,6 +5828,7 @@ void RegisterSequences() {
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ROTATE_LEFT);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_AVERAGE);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_BYTE_SWAP);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CNTLZ);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_INSERT);
diff --git a/src/alloy/frontend/ppc/ppc_emit_altivec.cc b/src/alloy/frontend/ppc/ppc_emit_altivec.cc
index 09e5fddb8..b2f9ab80d 100644
--- a/src/alloy/frontend/ppc/ppc_emit_altivec.cc
+++ b/src/alloy/frontend/ppc/ppc_emit_altivec.cc
@@ -460,13 +460,17 @@ XEEMITTER(vandc128, VX128(5, 592), VX128)(PPCHIRBuilder& f, InstrData& i) {
 }
 
 XEEMITTER(vavgsb, 0x10000502, VX)(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v =
+      f.VectorAverage(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT8_TYPE, 0);
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vavgsh, 0x10000542, VX)(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v =
+      f.VectorAverage(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT16_TYPE, 0);
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vavgsw, 0x10000582, VX)(PPCHIRBuilder& f, InstrData& i) {
@@ -474,23 +478,31 @@ XEEMITTER(vavgsw, 0x10000582, VX)(PPCHIRBuilder& f, InstrData& i) {
   // aop = EXTS((VRA)i:i + 31)
   // bop = EXTS((VRB)i:i + 31)
   // VRTi:i + 31 = Chop((aop + int bop + int 1) >> 1, 32)
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v =
+      f.VectorAverage(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE, 0);
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vavgub, 0x10000402, VX)(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v = f.VectorAverage(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT8_TYPE,
+                             ARITHMETIC_UNSIGNED);
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vavguh, 0x10000442, VX)(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v = f.VectorAverage(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT16_TYPE,
+                             ARITHMETIC_UNSIGNED);
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vavguw, 0x10000482, VX)(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v = f.VectorAverage(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT32_TYPE,
+                             ARITHMETIC_UNSIGNED);
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 int InstrEmit_vcfsx_(PPCHIRBuilder& f, uint32_t vd, uint32_t vb,
diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc
index 06ddda5b4..45c09bad2 100644
--- a/src/alloy/hir/hir_builder.cc
+++ b/src/alloy/hir/hir_builder.cc
@@ -1779,6 +1779,24 @@ Value* HIRBuilder::VectorRotateLeft(Value* value1, Value* value2,
   return i->dest;
 }
 
+Value* HIRBuilder::VectorAverage(Value* value1, Value* value2,
+                                 TypeName part_type,
+                                 uint32_t arithmetic_flags) {
+  ASSERT_VECTOR_TYPE(value1);
+  ASSERT_VECTOR_TYPE(value2);
+
+  // This is shady.
+  uint32_t flags = part_type | (arithmetic_flags << 8);
+  assert_zero(flags >> 16);
+
+  Instr* i = AppendInstr(OPCODE_VECTOR_AVERAGE_info, uint16_t(flags),
+                         AllocValue(value1->type));
+  i->set_src1(value1);
+  i->set_src2(value2);
+  i->src3.value = NULL;
+  return i->dest;
+}
+
 Value* HIRBuilder::ByteSwap(Value* value) {
   if (value->type == INT8_TYPE) {
     return value;
diff --git a/src/alloy/hir/hir_builder.h b/src/alloy/hir/hir_builder.h
index 96862fc37..aac4b9c99 100644
--- a/src/alloy/hir/hir_builder.h
+++ b/src/alloy/hir/hir_builder.h
@@ -204,6 +204,8 @@ class HIRBuilder {
   Value* VectorSha(Value* value1, Value* value2, TypeName part_type);
   Value* RotateLeft(Value* value1, Value* value2);
   Value* VectorRotateLeft(Value* value1, Value* value2, TypeName part_type);
+  Value* VectorAverage(Value* value1, Value* value2, TypeName part_type,
+                       uint32_t arithmetic_flags);
   Value* ByteSwap(Value* value);
   Value* CountLeadingZeros(Value* value);
   Value* Insert(Value* value, Value* index, Value* part);
diff --git a/src/alloy/hir/opcodes.h b/src/alloy/hir/opcodes.h
index f5b9a215b..e14069ed0 100644
--- a/src/alloy/hir/opcodes.h
+++ b/src/alloy/hir/opcodes.h
@@ -200,6 +200,7 @@ enum Opcode {
   OPCODE_VECTOR_SHA,
   OPCODE_ROTATE_LEFT,
   OPCODE_VECTOR_ROTATE_LEFT,
+  OPCODE_VECTOR_AVERAGE,
   OPCODE_BYTE_SWAP,
   OPCODE_CNTLZ,
   OPCODE_INSERT,
diff --git a/src/alloy/hir/opcodes.inl b/src/alloy/hir/opcodes.inl
index d7e27cab5..47d9eb764 100644
--- a/src/alloy/hir/opcodes.inl
+++ b/src/alloy/hir/opcodes.inl
@@ -551,6 +551,12 @@ DEFINE_OPCODE(
     OPCODE_SIG_V_V_V,
     0)
 
+DEFINE_OPCODE(
+    OPCODE_VECTOR_AVERAGE,
+    "vector_average",
+    OPCODE_SIG_V_V_V,
+    0)
+
 DEFINE_OPCODE(
     OPCODE_BYTE_SWAP,
     "byte_swap",
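
Note (not part of the patch): every vavg variant computes the PowerPC rounded average lane by lane: widen, add the two elements plus 1, then shift right by one. For unsigned bytes and halfwords that is exactly what vpavgb/vpavgw already do, which is why only the 32-bit lanes (and the signed cases) fall back to the EmulateVectorAverage*I32 helpers above. The sketch below is a standalone scalar reference for that formula plus an exhaustive check of the unsigned byte case against _mm_avg_epu8 (the intrinsic form of (v)pavgb); the file and function names are illustrative only, not part of Alloy.

// vavg_reference.cc -- illustrative only; e.g. g++ -O2 vavg_reference.cc
#include <cstdint>
#include <cstdio>
#include <emmintrin.h>

// PowerPC vavg semantics: (a + b + 1) >> 1 in a wider intermediate type.
static uint8_t AvgU8Ref(uint8_t a, uint8_t b) {
  return uint8_t((uint16_t(a) + uint16_t(b) + 1) >> 1);
}

// Same formula EmulateVectorAverageSignedI32 uses; the 64-bit intermediate
// keeps (INT32_MAX + INT32_MAX + 1) from overflowing before the shift.
static int32_t AvgS32Ref(int32_t a, int32_t b) {
  return int32_t((int64_t(a) + int64_t(b) + 1) >> 1);
}

int main() {
  // Exhaustively confirm the byte reference against the instruction the
  // unsigned INT8_TYPE path emits (vpavgb == _mm_avg_epu8).
  for (int a = 0; a < 256; ++a) {
    for (int b = 0; b < 256; ++b) {
      __m128i hw = _mm_avg_epu8(_mm_set1_epi8(char(a)), _mm_set1_epi8(char(b)));
      uint8_t lane0 = uint8_t(_mm_cvtsi128_si32(hw) & 0xFF);
      if (lane0 != AvgU8Ref(uint8_t(a), uint8_t(b))) {
        std::printf("mismatch at %d, %d\n", a, b);
        return 1;
      }
    }
  }
  // Spot-check the signed 32-bit fallback formula.
  std::printf("%d %d\n", AvgS32Ref(-3, 4), AvgS32Ref(INT32_MAX, INT32_MAX));
  return 0;
}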
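
A second note on the flags plumbing, since it is the part hir_builder.cc itself calls "shady": VectorAverage packs the part type into the low byte of instr->flags and the arithmetic flags into the next byte, and the VECTOR_AVERAGE sequence unpacks them the same way. A minimal round-trip of that convention follows; the TypeName/ARITHMETIC_UNSIGNED values are placeholders, not Alloy's real definitions.

// flags_packing.cc -- illustrative only.
#include <cassert>
#include <cstdint>

using TypeName = uint32_t;                         // stand-in for hir::TypeName
constexpr TypeName INT16_TYPE = 1;                 // placeholder value
constexpr uint32_t ARITHMETIC_UNSIGNED = 1u << 2;  // placeholder value

// Pack as HIRBuilder::VectorAverage does.
uint16_t PackFlags(TypeName part_type, uint32_t arithmetic_flags) {
  uint32_t flags = part_type | (arithmetic_flags << 8);
  assert((flags >> 16) == 0);  // both fields must fit the 16-bit instr->flags
  return uint16_t(flags);
}

int main() {
  uint16_t flags = PackFlags(INT16_TYPE, ARITHMETIC_UNSIGNED);
  // Unpack as the x64 VECTOR_AVERAGE emitter does.
  TypeName part_type = flags & 0xFF;
  uint32_t arithmetic_flags = flags >> 8;
  assert(part_type == INT16_TYPE);
  assert(arithmetic_flags == ARITHMETIC_UNSIGNED);
  return 0;
}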