VectorRotateLeft for vrl*.

This commit is contained in:
Ben Vanik 2014-08-05 18:57:34 -07:00
parent 333fc71b44
commit ff59f23de0
10 changed files with 150 additions and 25 deletions

View File

@ -3640,30 +3640,26 @@ int Translate_VECTOR_SHA(TranslationContext& ctx, Instr* i) {
return DispatchToC(ctx, i, fns[i->flags]); return DispatchToC(ctx, i, fns[i->flags]);
} }
// Rotates the bits of v left by sh positions, for an unsigned integer type T.
// The count is reduced modulo the bit width of T, so a count of zero (or any
// multiple of the width) returns v unchanged instead of performing a shift by
// the full width, which is undefined for 32- and 64-bit operands.
template <typename T>
T ROTL(T v, int8_t sh) {
  const unsigned bits = static_cast<unsigned>(sizeof(T) * 8);
  // Converting a negative count to unsigned and masking yields the count
  // modulo the (power-of-two) width.
  const unsigned left = static_cast<unsigned>(sh) & (bits - 1);
  // (bits - left) would be the full width when left == 0; mask it back to 0.
  const unsigned right = (bits - left) & (bits - 1);
  return static_cast<T>((T(v) << left) | (T(v) >> right));
}
uint32_t IntCode_ROTATE_LEFT_I8(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_ROTATE_LEFT_I8(IntCodeState& ics, const IntCode* i) {
ics.rf[i->dest_reg].i8 = ics.rf[i->dest_reg].i8 = poly::rotate_left<uint8_t>(ics.rf[i->src1_reg].i8,
ROTL<uint8_t>(ics.rf[i->src1_reg].i8, ics.rf[i->src2_reg].i8); ics.rf[i->src2_reg].i8);
return IA_NEXT; return IA_NEXT;
} }
uint32_t IntCode_ROTATE_LEFT_I16(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_ROTATE_LEFT_I16(IntCodeState& ics, const IntCode* i) {
ics.rf[i->dest_reg].i16 = ics.rf[i->dest_reg].i16 = poly::rotate_left<uint16_t>(ics.rf[i->src1_reg].i16,
ROTL<uint16_t>(ics.rf[i->src1_reg].i16, ics.rf[i->src2_reg].i8); ics.rf[i->src2_reg].i8);
return IA_NEXT; return IA_NEXT;
} }
uint32_t IntCode_ROTATE_LEFT_I32(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_ROTATE_LEFT_I32(IntCodeState& ics, const IntCode* i) {
// TODO(benvanik): use _rtol on vc++ // TODO(benvanik): use _rtol on vc++
ics.rf[i->dest_reg].i32 = ics.rf[i->dest_reg].i32 = poly::rotate_left<uint32_t>(ics.rf[i->src1_reg].i32,
ROTL<uint32_t>(ics.rf[i->src1_reg].i32, ics.rf[i->src2_reg].i8); ics.rf[i->src2_reg].i8);
return IA_NEXT; return IA_NEXT;
} }
uint32_t IntCode_ROTATE_LEFT_I64(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_ROTATE_LEFT_I64(IntCodeState& ics, const IntCode* i) {
// TODO(benvanik): use _rtol64 on vc++ // TODO(benvanik): use _rtol64 on vc++
ics.rf[i->dest_reg].i64 = ics.rf[i->dest_reg].i64 = poly::rotate_left<uint64_t>(ics.rf[i->src1_reg].i64,
ROTL<uint64_t>(ics.rf[i->src1_reg].i64, ics.rf[i->src2_reg].i8); ics.rf[i->src2_reg].i8);
return IA_NEXT; return IA_NEXT;
} }
int Translate_ROTATE_LEFT(TranslationContext& ctx, Instr* i) { int Translate_ROTATE_LEFT(TranslationContext& ctx, Instr* i) {
@ -3675,6 +3671,11 @@ int Translate_ROTATE_LEFT(TranslationContext& ctx, Instr* i) {
return DispatchToC(ctx, i, fns[i->dest->type]); return DispatchToC(ctx, i, fns[i->dest->type]);
} }
// Translation entry for the vector rotate-left opcode (it is listed in
// dispatch_table right after Translate_ROTATE_LEFT). Not implemented yet: it
// unconditionally trips assert_always() and returns 1 without touching ctx
// or i.
// NOTE(review): presumably a nonzero return reports a translation failure -
// confirm against the other Translate_* functions.
int Translate_VECTOR_ROTATE_LEFT(TranslationContext& ctx, Instr* i) {
assert_always();
return 1;
}
uint32_t IntCode_BYTE_SWAP_I16(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_BYTE_SWAP_I16(IntCodeState& ics, const IntCode* i) {
ics.rf[i->dest_reg].i16 = poly::byte_swap(ics.rf[i->src1_reg].i16); ics.rf[i->dest_reg].i16 = poly::byte_swap(ics.rf[i->src1_reg].i16);
return IA_NEXT; return IA_NEXT;
@ -4218,11 +4219,12 @@ static const TranslateFn dispatch_table[] = {
Translate_SHL, Translate_VECTOR_SHL, Translate_SHL, Translate_VECTOR_SHL,
Translate_SHR, Translate_VECTOR_SHR, Translate_SHR, Translate_VECTOR_SHR,
Translate_SHA, Translate_VECTOR_SHA, Translate_SHA, Translate_VECTOR_SHA,
Translate_ROTATE_LEFT, Translate_BYTE_SWAP, Translate_ROTATE_LEFT, Translate_VECTOR_ROTATE_LEFT,
Translate_CNTLZ, Translate_INSERT, Translate_BYTE_SWAP, Translate_CNTLZ,
Translate_EXTRACT, Translate_SPLAT, Translate_INSERT, Translate_EXTRACT,
Translate_PERMUTE, Translate_SWIZZLE, Translate_SPLAT, Translate_PERMUTE,
Translate_PACK, Translate_UNPACK, Translate_SWIZZLE, Translate_PACK,
Translate_UNPACK,
TranslateInvalid, // Translate_COMPARE_EXCHANGE, TranslateInvalid, // Translate_COMPARE_EXCHANGE,
Translate_ATOMIC_EXCHANGE, Translate_ATOMIC_EXCHANGE,
TranslateInvalid, // Translate_ATOMIC_ADD, TranslateInvalid, // Translate_ATOMIC_ADD,

View File

@ -550,6 +550,7 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) {
/* XMMUnsignedDwordMax */ vec128i(0xFFFFFFFFu, 0x00000000u, /* XMMUnsignedDwordMax */ vec128i(0xFFFFFFFFu, 0x00000000u,
0xFFFFFFFFu, 0x00000000u), 0xFFFFFFFFu, 0x00000000u),
/* XMM255 */ vec128f(255.0f, 255.0f, 255.0f, 255.0f), /* XMM255 */ vec128f(255.0f, 255.0f, 255.0f, 255.0f),
/* XMMPI32 */ vec128i(32, 32, 32, 32),
/* XMMSignMaskI8 */ vec128i(0x80808080u, 0x80808080u, /* XMMSignMaskI8 */ vec128i(0x80808080u, 0x80808080u,
0x80808080u, 0x80808080u), 0x80808080u, 0x80808080u),
/* XMMSignMaskI16 */ vec128i(0x80008000u, 0x80008000u, /* XMMSignMaskI16 */ vec128i(0x80008000u, 0x80008000u,

View File

@ -60,6 +60,7 @@ enum XmmConst {
XMMShiftByteMask, XMMShiftByteMask,
XMMUnsignedDwordMax, XMMUnsignedDwordMax,
XMM255, XMM255,
XMMPI32,
XMMSignMaskI8, XMMSignMaskI8,
XMMSignMaskI16, XMMSignMaskI16,
XMMSignMaskI32, XMMSignMaskI32,

View File

@ -4475,6 +4475,76 @@ EMITTER_OPCODE_TABLE(
ROTATE_LEFT_I64); ROTATE_LEFT_I64);
// ============================================================================
// OPCODE_VECTOR_ROTATE_LEFT
// ============================================================================
// TODO(benvanik): AVX512 has a native variable rotate (vprolvd/vprolvq).
// Emits x64 code for OPCODE_VECTOR_ROTATE_LEFT on 128-bit vectors. The lane
// width is taken from the instruction flags: 8- and 16-bit lanes call out to
// the scalar helpers below, 32-bit lanes are built from two variable shifts
// that are OR-ed together.
EMITTER(VECTOR_ROTATE_LEFT_V128, MATCH(I<OPCODE_VECTOR_ROTATE_LEFT, V128<>, V128<>, V128<>>)) {
  // Scalar fallback: rotates each of the 16 bytes of src1 left by the count
  // held in the matching byte of src2.
  static __m128i EmulateVectorRotateLeftI8(__m128i src1, __m128i src2) {
    alignas(16) __m128i value;
    alignas(16) __m128i shamt;
    _mm_store_si128(&value, src1);
    _mm_store_si128(&shamt, src2);
    for (size_t i = 0; i < 16; ++i) {
      // An 8-bit lane has eight distinct rotations, so the count is the low
      // three bits (0x7). Masking with 0x3 dropped bit 2 and turned rotations
      // by 4..7 into rotations by 0..3.
      value.m128i_u8[i] = poly::rotate_left<uint8_t>(
          value.m128i_u8[i], shamt.m128i_u8[i] & 0x7);
    }
    return _mm_load_si128(&value);
  }

  // Scalar fallback: rotates each of the 8 halfwords of src1 left by the low
  // four bits of the matching halfword of src2.
  static __m128i EmulateVectorRotateLeftI16(__m128i src1, __m128i src2) {
    alignas(16) __m128i value;
    alignas(16) __m128i shamt;
    _mm_store_si128(&value, src1);
    _mm_store_si128(&shamt, src2);
    for (size_t i = 0; i < 8; ++i) {
      value.m128i_u16[i] = poly::rotate_left<uint16_t>(
          value.m128i_u16[i], shamt.m128i_u16[i] & 0xF);
    }
    return _mm_load_si128(&value);
  }

  static void Emit(X64Emitter& e, const EmitArgType& i) {
    switch (i.instr->flags) {
      case INT8_TYPE:
        // TODO(benvanik): native version (with shift magic).
        // Stash both operands, hand their addresses over in r8/r9, call the
        // helper and pick the result up from xmm0.
        // NOTE(review): both operands go through StashXmm - confirm the two
        // calls yield distinct stash slots.
        e.lea(e.r8, e.StashXmm(i.src1));
        e.lea(e.r9, e.StashXmm(i.src2));
        e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorRotateLeftI8));
        e.vmovaps(i.dest, e.xmm0);
        break;
      case INT16_TYPE:
        // TODO(benvanik): native version (with shift magic).
        // Same call-out sequence as the 8-bit case, using the 16-bit helper.
        e.lea(e.r8, e.StashXmm(i.src1));
        e.lea(e.r9, e.StashXmm(i.src2));
        e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorRotateLeftI16));
        e.vmovaps(i.dest, e.xmm0);
        break;
      case INT32_TYPE: {
        // The right-shift counts are built in dest unless dest aliases one
        // of the sources, in which case xmm2 is used as scratch instead.
        Xmm temp = i.dest;
        if (i.dest == i.src1 || i.dest == i.src2) {
          temp = e.xmm2;
        }
        // Shift left (to get high bits):
        // xmm0 = per-lane count (src2 masked with XMMShiftMaskPS),
        // xmm1 = src1 << count.
        e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
        e.vpsllvd(e.xmm1, i.src1, e.xmm0);
        // Shift right (to get low bits):
        // temp = 32 - count per lane. The right shift must use temp, not the
        // left count in xmm0; using xmm0 produced (x << n) | (x >> n).
        e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
        e.vpsubd(temp, e.xmm0);
        e.vpsrlvd(i.dest, i.src1, temp);
        // Merge:
        e.vpor(i.dest, e.xmm1);
        break;
      }
      default:
        // No other lane widths are handled.
        assert_always();
        break;
    }
  }
};
// Emitter table for OPCODE_VECTOR_ROTATE_LEFT; VECTOR_ROTATE_LEFT_V128 is the
// only sequence listed for this opcode.
EMITTER_OPCODE_TABLE(
OPCODE_VECTOR_ROTATE_LEFT,
VECTOR_ROTATE_LEFT_V128);
// ============================================================================ // ============================================================================
// OPCODE_BYTE_SWAP // OPCODE_BYTE_SWAP
// ============================================================================ // ============================================================================
@ -5287,6 +5357,7 @@ void RegisterSequences() {
REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR);
REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA);
REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ROTATE_LEFT); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_ROTATE_LEFT);
REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT);
REGISTER_EMITTER_OPCODE_TABLE(OPCODE_BYTE_SWAP); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_BYTE_SWAP);
REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CNTLZ); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CNTLZ);
//REGISTER_EMITTER_OPCODE_TABLE(OPCODE_INSERT); //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_INSERT);

View File

@ -1212,22 +1212,30 @@ XEEMITTER(vrfiz128, VX128_3(6, 1008), VX128_3)(PPCHIRBuilder& f, InstrData& i) {
} }
XEEMITTER(vrlb, 0x10000004, VX)(PPCHIRBuilder& f, InstrData& i) { XEEMITTER(vrlb, 0x10000004, VX)(PPCHIRBuilder& f, InstrData& i) {
XEINSTRNOTIMPLEMENTED(); // (VD) <- ROTL((VA), (VB)&0x7)
return 1; Value* v = f.VectorRotateLeft(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT8_TYPE);
f.StoreVR(i.VX.VD, v);
return 0;
} }
XEEMITTER(vrlh, 0x10000044, VX)(PPCHIRBuilder& f, InstrData& i) { XEEMITTER(vrlh, 0x10000044, VX)(PPCHIRBuilder& f, InstrData& i) {
XEINSTRNOTIMPLEMENTED(); // (VD) <- ROTL((VA), (VB)&0xF)
return 1; Value* v = f.VectorRotateLeft(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT16_TYPE);
f.StoreVR(i.VX.VD, v);
return 0;
} }
// Common body for the vrlw and vrlw128 emitters: loads vector registers va and
// vb, builds a VectorRotateLeft over them with INT32_TYPE parts and stores the
// result into vector register vd. Always returns 0.
int InstrEmit_vrlw_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb) {
// (VD) <- ROTL((VA), (VB)&0x1F)
// Both LoadVR calls are function arguments, so the order in which they run is
// left to the compiler.
Value* v = f.VectorRotateLeft(f.LoadVR(va), f.LoadVR(vb), INT32_TYPE);
f.StoreVR(vd, v);
return 0;
}
XEEMITTER(vrlw, 0x10000084, VX)(PPCHIRBuilder& f, InstrData& i) { XEEMITTER(vrlw, 0x10000084, VX)(PPCHIRBuilder& f, InstrData& i) {
XEINSTRNOTIMPLEMENTED(); return InstrEmit_vrlw_(f, i.VX.VD, i.VX.VA, i.VX.VB);
return 1;
} }
XEEMITTER(vrlw128, VX128(6, 80), VX128)(PPCHIRBuilder& f, InstrData& i) { XEEMITTER(vrlw128, VX128(6, 80), VX128)(PPCHIRBuilder& f, InstrData& i) {
XEINSTRNOTIMPLEMENTED(); return InstrEmit_vrlw_(f, VX128_VD128, VX128_VA128, VX128_VB128);
return 1;
} }
XEEMITTER(vrlimi128, VX128_4(6, 1808), VX128_4)(PPCHIRBuilder& f, XEEMITTER(vrlimi128, VX128_4(6, 1808), VX128_4)(PPCHIRBuilder& f,

View File

@ -1661,6 +1661,17 @@ Value* HIRBuilder::RotateLeft(Value* value1, Value* value2) {
return i->dest; return i->dest;
} }
// Appends an OPCODE_VECTOR_ROTATE_LEFT instruction taking value1 and value2 as
// its two sources and carrying part_type as the instruction flags. Both
// operands are checked with ASSERT_VECTOR_TYPE. The destination is a freshly
// allocated value of value1's type, and the instruction's dest is returned.
Value* HIRBuilder::VectorRotateLeft(Value* value1, Value* value2, TypeName part_type) {
  ASSERT_VECTOR_TYPE(value1);
  ASSERT_VECTOR_TYPE(value2);

  Value* result = AllocValue(value1->type);
  Instr* rotate =
      AppendInstr(OPCODE_VECTOR_ROTATE_LEFT_info, part_type, result);
  rotate->set_src1(value1);
  rotate->set_src2(value2);
  // This opcode has no third operand.
  rotate->src3.value = NULL;
  return rotate->dest;
}
Value* HIRBuilder::ByteSwap(Value* value) { Value* HIRBuilder::ByteSwap(Value* value) {
if (value->type == INT8_TYPE) { if (value->type == INT8_TYPE) {
return value; return value;

View File

@ -199,6 +199,7 @@ class HIRBuilder {
Value* Sha(Value* value1, int8_t value2); Value* Sha(Value* value1, int8_t value2);
Value* VectorSha(Value* value1, Value* value2, TypeName part_type); Value* VectorSha(Value* value1, Value* value2, TypeName part_type);
Value* RotateLeft(Value* value1, Value* value2); Value* RotateLeft(Value* value1, Value* value2);
Value* VectorRotateLeft(Value* value1, Value* value2, TypeName part_type);
Value* ByteSwap(Value* value); Value* ByteSwap(Value* value);
Value* CountLeadingZeros(Value* value); Value* CountLeadingZeros(Value* value);
Value* Insert(Value* value, Value* index, Value* part); Value* Insert(Value* value, Value* index, Value* part);

View File

@ -165,6 +165,7 @@ enum Opcode {
OPCODE_SHA, OPCODE_SHA,
OPCODE_VECTOR_SHA, OPCODE_VECTOR_SHA,
OPCODE_ROTATE_LEFT, OPCODE_ROTATE_LEFT,
OPCODE_VECTOR_ROTATE_LEFT,
OPCODE_BYTE_SWAP, OPCODE_BYTE_SWAP,
OPCODE_CNTLZ, OPCODE_CNTLZ,
OPCODE_INSERT, OPCODE_INSERT,

View File

@ -539,6 +539,12 @@ DEFINE_OPCODE(
OPCODE_SIG_V_V_V, OPCODE_SIG_V_V_V,
0) 0)
// Opcode descriptor for the vector rotate-left: display name
// "vector_rotate_left", signature OPCODE_SIG_V_V_V, no opcode flags (0).
// NOTE(review): OPCODE_SIG_V_V_V presumably means one value result and two
// value operands - confirm against the signature definitions.
DEFINE_OPCODE(
OPCODE_VECTOR_ROTATE_LEFT,
"vector_rotate_left",
OPCODE_SIG_V_V_V,
0)
DEFINE_OPCODE( DEFINE_OPCODE(
OPCODE_BYTE_SWAP, OPCODE_BYTE_SWAP,
"byte_swap", "byte_swap",

View File

@ -108,6 +108,29 @@ inline bool bit_scan_forward(int64_t v, uint32_t* out_first_set_index) {
return bit_scan_forward(static_cast<uint64_t>(v), out_first_set_index); return bit_scan_forward(static_cast<uint64_t>(v), out_first_set_index);
} }
// Rotates the bits of v left by sh positions, for an unsigned integer type T.
// The count is reduced modulo the bit width of T. A count of zero therefore
// returns v unchanged rather than shifting right by the full width, which is
// undefined for 32- and 64-bit operands, and counts at or above the width
// wrap around.
template <typename T>
inline T rotate_left(T v, uint8_t sh) {
  const unsigned bits = static_cast<unsigned>(sizeof(T) * 8);
  const unsigned left = sh & (bits - 1);
  // (bits - left) would be the full width when left == 0; mask it back to 0.
  const unsigned right = (bits - left) & (bits - 1);
  return static_cast<T>((T(v) << left) | (T(v) >> right));
}
// Specializations of rotate_left compiled only when XE_COMPILER_MSVC is
// non-zero. Each one forwards to the compiler rotate intrinsic for its operand
// width in place of the generic shift-and-or form.
#if XE_COMPILER_MSVC
// 8-bit rotate via _rotl8.
template <>
inline uint8_t rotate_left(uint8_t v, uint8_t sh) {
return _rotl8(v, sh);
}
// 16-bit rotate via _rotl16.
template <>
inline uint16_t rotate_left(uint16_t v, uint8_t sh) {
return _rotl16(v, sh);
}
// 32-bit rotate via _rotl.
template <>
inline uint32_t rotate_left(uint32_t v, uint8_t sh) {
return _rotl(v, sh);
}
// 64-bit rotate via _rotl64.
template <>
inline uint64_t rotate_left(uint64_t v, uint8_t sh) {
return _rotl64(v, sh);
}
#endif // XE_COMPILER_MSVC
// Utilities for SSE values. // Utilities for SSE values.
template <int N> template <int N>
float m128_f32(const __m128& v) { float m128_f32(const __m128& v) {