From ff59f23de07b95b1a66c2063bf5ad53c51ba74d3 Mon Sep 17 00:00:00 2001
From: Ben Vanik <ben.vanik@gmail.com>
Date: Tue, 5 Aug 2014 18:57:34 -0700
Subject: [PATCH] VectorRotateLeft for vrl*.

---
 src/alloy/backend/ivm/ivm_intcode.cc       | 36 +++++------
 src/alloy/backend/x64/x64_emitter.cc       |  1 +
 src/alloy/backend/x64/x64_emitter.h        |  1 +
 src/alloy/backend/x64/x64_sequences.cc     | 71 ++++++++++++++++++++++
 src/alloy/frontend/ppc/ppc_emit_altivec.cc | 24 +++++---
 src/alloy/hir/hir_builder.cc               | 11 ++++
 src/alloy/hir/hir_builder.h                |  1 +
 src/alloy/hir/opcodes.h                    |  1 +
 src/alloy/hir/opcodes.inl                  |  6 ++
 src/poly/math.h                            | 23 +++++++
 10 files changed, 150 insertions(+), 25 deletions(-)

diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc
index 5e8d7c9c4..741b167ae 100644
--- a/src/alloy/backend/ivm/ivm_intcode.cc
+++ b/src/alloy/backend/ivm/ivm_intcode.cc
@@ -3640,30 +3640,26 @@ int Translate_VECTOR_SHA(TranslationContext& ctx, Instr* i) {
   return DispatchToC(ctx, i, fns[i->flags]);
 }
 
-template <typename T>
-T ROTL(T v, int8_t sh) {
-  return (T(v) << sh) | (T(v) >> ((sizeof(T) * 8) - sh));
-}
 uint32_t IntCode_ROTATE_LEFT_I8(IntCodeState& ics, const IntCode* i) {
-  ics.rf[i->dest_reg].i8 =
-      ROTL(ics.rf[i->src1_reg].i8, ics.rf[i->src2_reg].i8);
+  ics.rf[i->dest_reg].i8 = poly::rotate_left<uint8_t>(ics.rf[i->src1_reg].i8,
+                                                      ics.rf[i->src2_reg].i8);
   return IA_NEXT;
 }
 uint32_t IntCode_ROTATE_LEFT_I16(IntCodeState& ics, const IntCode* i) {
-  ics.rf[i->dest_reg].i16 =
-      ROTL(ics.rf[i->src1_reg].i16, ics.rf[i->src2_reg].i8);
+  ics.rf[i->dest_reg].i16 = poly::rotate_left<uint16_t>(
+      ics.rf[i->src1_reg].i16, ics.rf[i->src2_reg].i8);
   return IA_NEXT;
 }
 uint32_t IntCode_ROTATE_LEFT_I32(IntCodeState& ics, const IntCode* i) {
   // TODO(benvanik): use _rtol on vc++
-  ics.rf[i->dest_reg].i32 =
-      ROTL(ics.rf[i->src1_reg].i32, ics.rf[i->src2_reg].i8);
+  ics.rf[i->dest_reg].i32 = poly::rotate_left<uint32_t>(
+      ics.rf[i->src1_reg].i32, ics.rf[i->src2_reg].i8);
   return IA_NEXT;
 }
 uint32_t IntCode_ROTATE_LEFT_I64(IntCodeState& ics, const IntCode* i) {
   // TODO(benvanik): use _rtol64 on vc++
-  ics.rf[i->dest_reg].i64 =
-      ROTL(ics.rf[i->src1_reg].i64, ics.rf[i->src2_reg].i8);
+  ics.rf[i->dest_reg].i64 = poly::rotate_left<uint64_t>(
+      ics.rf[i->src1_reg].i64, ics.rf[i->src2_reg].i8);
   return IA_NEXT;
 }
 int Translate_ROTATE_LEFT(TranslationContext& ctx, Instr* i) {
@@ -3675,6 +3671,11 @@ int Translate_ROTATE_LEFT(TranslationContext& ctx, Instr* i) {
   return DispatchToC(ctx, i, fns[i->dest->type]);
 }
 
+int Translate_VECTOR_ROTATE_LEFT(TranslationContext& ctx, Instr* i) {
+  assert_always();
+  return 1;
+}
+
 uint32_t IntCode_BYTE_SWAP_I16(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].i16 = poly::byte_swap(ics.rf[i->src1_reg].i16);
   return IA_NEXT;
@@ -4218,11 +4219,12 @@ static const TranslateFn dispatch_table[] = {
     Translate_SHL, Translate_VECTOR_SHL,
     Translate_SHR, Translate_VECTOR_SHR,
     Translate_SHA, Translate_VECTOR_SHA,
-    Translate_ROTATE_LEFT, Translate_BYTE_SWAP,
-    Translate_CNTLZ, Translate_INSERT,
-    Translate_EXTRACT, Translate_SPLAT,
-    Translate_PERMUTE, Translate_SWIZZLE,
-    Translate_PACK, Translate_UNPACK,
+    Translate_ROTATE_LEFT, Translate_VECTOR_ROTATE_LEFT,
+    Translate_BYTE_SWAP, Translate_CNTLZ,
+    Translate_INSERT, Translate_EXTRACT,
+    Translate_SPLAT, Translate_PERMUTE,
+    Translate_SWIZZLE, Translate_PACK,
+    Translate_UNPACK,
     TranslateInvalid,  // Translate_COMPARE_EXCHANGE,
     Translate_ATOMIC_EXCHANGE,
     TranslateInvalid,  // Translate_ATOMIC_ADD,
diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc
index b9850b80f..a7997334c 100644
--- a/src/alloy/backend/x64/x64_emitter.cc
+++ b/src/alloy/backend/x64/x64_emitter.cc
@@ -550,6 +550,7 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) {
       /* XMMUnsignedDwordMax */ vec128i(0xFFFFFFFFu, 0x00000000u,
                                         0xFFFFFFFFu, 0x00000000u),
       /* XMM255              */ vec128f(255.0f, 255.0f, 255.0f, 255.0f),
+      /* XMMPI32             */ vec128i(32, 32, 32, 32),
       /* XMMSignMaskI8       */ vec128i(0x80808080u, 0x80808080u,
                                         0x80808080u, 0x80808080u),
       /* XMMSignMaskI16      */ vec128i(0x80008000u, 0x80008000u,
diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h
index 414a94899..785108d71 100644
--- a/src/alloy/backend/x64/x64_emitter.h
+++ b/src/alloy/backend/x64/x64_emitter.h
@@ -60,6 +60,7 @@ enum XmmConst {
   XMMShiftByteMask,
   XMMUnsignedDwordMax,
   XMM255,
+  XMMPI32,
   XMMSignMaskI8,
   XMMSignMaskI16,
   XMMSignMaskI32,
diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc
index aaa34ff2c..064c76295 100644
--- a/src/alloy/backend/x64/x64_sequences.cc
+++ b/src/alloy/backend/x64/x64_sequences.cc
@@ -4475,6 +4475,76 @@ EMITTER_OPCODE_TABLE(
     ROTATE_LEFT_I64);
 
 
+// ============================================================================
+// OPCODE_VECTOR_ROTATE_LEFT
+// ============================================================================
+// TODO(benvanik): AVX512 has a native variable rotate (rolv).
+EMITTER(VECTOR_ROTATE_LEFT_V128, MATCH(I<OPCODE_VECTOR_ROTATE_LEFT, V128<>, V128<>, V128<>>)) {
+  static __m128i EmulateVectorRotateLeftI8(__m128i src1, __m128i src2) {
+    alignas(16) __m128i value;
+    alignas(16) __m128i shamt;
+    _mm_store_si128(&value, src1);
+    _mm_store_si128(&shamt, src2);
+    for (size_t i = 0; i < 16; ++i) {
+      value.m128i_u8[i] = poly::rotate_left<uint8_t>(
+          value.m128i_u8[i], shamt.m128i_u8[i] & 0x7);
+    }
+    return _mm_load_si128(&value);
+  }
+  static __m128i EmulateVectorRotateLeftI16(__m128i src1, __m128i src2) {
+    alignas(16) __m128i value;
+    alignas(16) __m128i shamt;
+    _mm_store_si128(&value, src1);
+    _mm_store_si128(&shamt, src2);
+    for (size_t i = 0; i < 8; ++i) {
+      value.m128i_u16[i] = poly::rotate_left<uint16_t>(
+          value.m128i_u16[i], shamt.m128i_u16[i] & 0xF);
+    }
+    return _mm_load_si128(&value);
+  }
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    switch (i.instr->flags) {
+      case INT8_TYPE:
+        // TODO(benvanik): native version (with shift magic).
+        e.lea(e.r8, e.StashXmm(i.src1));
+        e.lea(e.r9, e.StashXmm(i.src2));
+        e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorRotateLeftI8));
+        e.vmovaps(i.dest, e.xmm0);
+        break;
+      case INT16_TYPE:
+        // TODO(benvanik): native version (with shift magic).
+        e.lea(e.r8, e.StashXmm(i.src1));
+        e.lea(e.r9, e.StashXmm(i.src2));
+        e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorRotateLeftI16));
+        e.vmovaps(i.dest, e.xmm0);
+        break;
+      case INT32_TYPE: {
+        Xmm temp = i.dest;
+        if (i.dest == i.src1 || i.dest == i.src2) {
+          temp = e.xmm2;
+        }
+        // Shift left (to get high bits):
+        e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
+        e.vpsllvd(e.xmm1, i.src1, e.xmm0);
+        // Shift right (to get low bits):
+        e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
+        e.vpsubd(temp, e.xmm0);
+        e.vpsrlvd(i.dest, i.src1, temp);
+        // Merge:
+        e.vpor(i.dest, e.xmm1);
+        break;
+      }
+      default:
+        assert_always();
+        break;
+    }
+  }
+};
+EMITTER_OPCODE_TABLE(
+    OPCODE_VECTOR_ROTATE_LEFT,
+    VECTOR_ROTATE_LEFT_V128);
+
+
 // ============================================================================
 // OPCODE_BYTE_SWAP
 // ============================================================================
@@ -5287,6 +5357,7 @@ void RegisterSequences() {
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ROTATE_LEFT);
+  REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_BYTE_SWAP);
   REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CNTLZ);
   //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_INSERT);
diff --git a/src/alloy/frontend/ppc/ppc_emit_altivec.cc b/src/alloy/frontend/ppc/ppc_emit_altivec.cc
index 04ab5eebf..a75c20026 100644
--- a/src/alloy/frontend/ppc/ppc_emit_altivec.cc
+++ b/src/alloy/frontend/ppc/ppc_emit_altivec.cc
@@ -1212,22 +1212,30 @@ XEEMITTER(vrfiz128, VX128_3(6, 1008), VX128_3)(PPCHIRBuilder& f, InstrData& i) {
 }
 
 XEEMITTER(vrlb, 0x10000004, VX)(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  // (VD) <- ROTL((VA), (VB)&0x7)
+  Value* v = f.VectorRotateLeft(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT8_TYPE);
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vrlh, 0x10000044, VX)(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  // (VD) <- ROTL((VA), (VB)&0xF)
+  Value* v = f.VectorRotateLeft(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT16_TYPE);
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
+int InstrEmit_vrlw_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb) {
+  // (VD) <- ROTL((VA), (VB)&0x1F)
+  Value* v = f.VectorRotateLeft(f.LoadVR(va), f.LoadVR(vb), INT32_TYPE);
+  f.StoreVR(vd, v);
+  return 0;
+}
 XEEMITTER(vrlw, 0x10000084, VX)(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vrlw_(f, i.VX.VD, i.VX.VA, i.VX.VB);
 }
 
 XEEMITTER(vrlw128, VX128(6, 80), VX128)(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  return InstrEmit_vrlw_(f, VX128_VD128, VX128_VA128, VX128_VB128);
 }
 XEEMITTER(vrlimi128, VX128_4(6, 1808), VX128_4)(PPCHIRBuilder& f,
diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc
index 30ac78358..29d404af1 100644
--- a/src/alloy/hir/hir_builder.cc
+++ b/src/alloy/hir/hir_builder.cc
@@ -1661,6 +1661,17 @@ Value* HIRBuilder::RotateLeft(Value* value1, Value* value2) {
   return i->dest;
 }
 
+Value* HIRBuilder::VectorRotateLeft(Value* value1, Value* value2, TypeName part_type) {
+  ASSERT_VECTOR_TYPE(value1);
+  ASSERT_VECTOR_TYPE(value2);
+
+  Instr* i = AppendInstr(OPCODE_VECTOR_ROTATE_LEFT_info, part_type, AllocValue(value1->type));
+  i->set_src1(value1);
+  i->set_src2(value2);
+  i->src3.value = NULL;
+  return i->dest;
+}
+
 Value* HIRBuilder::ByteSwap(Value* value) {
   if (value->type == INT8_TYPE) {
     return value;
diff --git a/src/alloy/hir/hir_builder.h b/src/alloy/hir/hir_builder.h
index 44f149b8b..c4440e6b5 100644
--- a/src/alloy/hir/hir_builder.h
+++ b/src/alloy/hir/hir_builder.h
@@ -199,6 +199,7 @@ class HIRBuilder {
   Value* Sha(Value* value1, int8_t value2);
   Value* VectorSha(Value* value1, Value* value2, TypeName part_type);
   Value* RotateLeft(Value* value1, Value* value2);
+  Value* VectorRotateLeft(Value* value1, Value* value2, TypeName part_type);
   Value* ByteSwap(Value* value);
   Value* CountLeadingZeros(Value* value);
   Value* Insert(Value* value, Value* index, Value* part);
diff --git a/src/alloy/hir/opcodes.h b/src/alloy/hir/opcodes.h
index c7b3c0e50..ca63f69eb 100644
--- a/src/alloy/hir/opcodes.h
+++ b/src/alloy/hir/opcodes.h
@@ -165,6 +165,7 @@ enum Opcode {
   OPCODE_SHA,
   OPCODE_VECTOR_SHA,
   OPCODE_ROTATE_LEFT,
+  OPCODE_VECTOR_ROTATE_LEFT,
   OPCODE_BYTE_SWAP,
   OPCODE_CNTLZ,
   OPCODE_INSERT,
diff --git a/src/alloy/hir/opcodes.inl b/src/alloy/hir/opcodes.inl
index b1c153c71..14c1dbf8b 100644
--- a/src/alloy/hir/opcodes.inl
+++ b/src/alloy/hir/opcodes.inl
@@ -539,6 +539,12 @@ DEFINE_OPCODE(
     OPCODE_SIG_V_V_V,
     0)
 
+DEFINE_OPCODE(
+    OPCODE_VECTOR_ROTATE_LEFT,
+    "vector_rotate_left",
+    OPCODE_SIG_V_V_V,
+    0)
+
 DEFINE_OPCODE(
     OPCODE_BYTE_SWAP,
     "byte_swap",
diff --git a/src/poly/math.h b/src/poly/math.h
index 57b0190d1..1783eb02a 100644
--- a/src/poly/math.h
+++ b/src/poly/math.h
@@ -108,6 +108,29 @@ inline bool bit_scan_forward(int64_t v, uint32_t* out_first_set_index) {
   return bit_scan_forward(static_cast<uint64_t>(v), out_first_set_index);
 }
 
+template <typename T>
+inline T rotate_left(T v, uint8_t sh) {
+  return (T(v) << sh) | (T(v) >> ((sizeof(T) * 8 - sh) & (sizeof(T) * 8 - 1)));
+}
+#if XE_COMPILER_MSVC
+template <>
+inline uint8_t rotate_left(uint8_t v, uint8_t sh) {
+  return _rotl8(v, sh);
+}
+template <>
+inline uint16_t rotate_left(uint16_t v, uint8_t sh) {
+  return _rotl16(v, sh);
+}
+template <>
+inline uint32_t rotate_left(uint32_t v, uint8_t sh) {
+  return _rotl(v, sh);
+}
+template <>
+inline uint64_t rotate_left(uint64_t v, uint8_t sh) {
+  return _rotl64(v, sh);
+}
+#endif  // XE_COMPILER_MSVC
+
 // Utilities for SSE values.
 template <int N>
 float m128_f32(const __m128& v) {
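--
Note (not part of the commit): the INT32_TYPE case above synthesizes a
per-lane rotate from AVX2 variable shifts, since x86 gains a native vector
rotate (vprolvd) only with AVX512: rotl(x, n) == (x << n) | (x >> (32 - n)),
where vpsrlvd conveniently produces zero for the n == 0 lane because its
count becomes 32. A minimal scalar sketch of that
vpand/vpsllvd/vpsubd/vpsrlvd/vpor sequence for one lane follows, useful as a
reference when verifying the emitted code; it is standalone C++, and the
function name is illustrative rather than anything defined in the patch.

  #include <cstdint>
  #include <cstdio>

  // Mirrors the emitted instruction sequence for a single 32-bit lane.
  uint32_t rotl32_via_shifts(uint32_t x, uint32_t n) {
    n &= 0x1F;             // vpand with XMMShiftMaskPS: keep the low 5 bits.
    uint32_t hi = x << n;  // vpsllvd: variable left shift.
    // vpsrlvd zeroes a lane whose count is >= 32, so n == 0 (count 32) is
    // safe; the 64-bit shift below reproduces that behavior in scalar code.
    uint32_t lo = static_cast<uint32_t>(uint64_t(x) >> (32 - n));
    return hi | lo;        // vpor: merge the high and low halves.
  }

  int main() {
    const uint32_t x = 0x12345678u;
    for (uint32_t n = 0; n < 32; ++n) {
      const uint32_t expected = n ? ((x << n) | (x >> (32 - n))) : x;
      if (rotl32_via_shifts(x, n) != expected) {
        std::printf("mismatch at n=%u\n", n);
        return 1;
      }
    }
    std::printf("ok\n");
    return 0;
  }

The same harness, pointed at poly::rotate_left with the masks the vrl*
emitters apply (0x7, 0xF, 0x1F), exercises the byte, halfword, and word
element widths on the IVM path.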