diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 567a3494a..89fba86a8 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -1340,46 +1340,6 @@ int Translate_LOAD(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->dest->type]); } -uint32_t IntCode_LOAD_VECTOR_LEFT_V128(IntCodeState& ics, const IntCode* i) { - const uint32_t address = ics.rf[i->src1_reg].u32; - const size_t eb = address & 0xF; - const size_t size = 16 - eb; - const uint8_t* p = ics.membase + address; - vec128_t& dest = ics.rf[i->dest_reg].v128; - for (size_t i = 0; i < size; i++) { - dest.b16[size - 1 - i] = p[i]; - } - DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = load_vector_left v128 %.8X\n", - dest.f4[0], dest.f4[1], dest.f4[2], dest.f4[3], - dest.i4[0], dest.i4[1], dest.i4[2], dest.i4[3], - address); - DFLUSH(); - return IA_NEXT; -} -int Translate_LOAD_VECTOR_LEFT(TranslationContext& ctx, Instr* i) { - return DispatchToC(ctx, i, IntCode_LOAD_VECTOR_LEFT_V128); -} - -uint32_t IntCode_LOAD_VECTOR_RIGHT_V128(IntCodeState& ics, const IntCode* i) { - const uint32_t address = ics.rf[i->src1_reg].u32; - const size_t eb = address & 0xF; - const size_t size = eb; - const uint8_t* p = ics.membase + address; - vec128_t& dest = ics.rf[i->dest_reg].v128; - for (size_t i = 0; i < size; i++) { - dest.b16[i] = p[size - 1 - i]; - } - DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = load_vector_right v128 %.8X\n", - dest.f4[0], dest.f4[1], dest.f4[2], dest.f4[3], - dest.i4[0], dest.i4[1], dest.i4[2], dest.i4[3], - address); - DFLUSH(); - return IA_NEXT; -} -int Translate_LOAD_VECTOR_RIGHT(TranslationContext& ctx, Instr* i) { - return DispatchToC(ctx, i, IntCode_LOAD_VECTOR_RIGHT_V128); -} - uint32_t IntCode_LOAD_ACQUIRE_I8(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; xe_atomic_exchange_32(address, ics.reserve_address); @@ -1614,38 +1574,6 @@ int Translate_STORE_RELEASE(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->src2.value->type]); } -uint32_t IntCode_STORE_VECTOR_LEFT_V128(IntCodeState& ics, const IntCode* i) { - const uint32_t address = ics.rf[i->src1_reg].u32; - const size_t eb = address & 0xF; - const size_t size = 16 - eb; - uint8_t* p = ics.membase + address; - const vec128_t& src = ics.rf[i->src2_reg].v128; - // Note that if the input is already 16b aligned no bytes are stored. - for (size_t i = 0; i < size; i++) { - p[i] = src.b16[15 - i]; - } - return IA_NEXT; -} -int Translate_STORE_VECTOR_LEFT(TranslationContext& ctx, Instr* i) { - return DispatchToC(ctx, i, IntCode_STORE_VECTOR_LEFT_V128); -} - -uint32_t IntCode_STORE_VECTOR_RIGHT_V128(IntCodeState& ics, const IntCode* i) { - const uint32_t address = ics.rf[i->src1_reg].u32; - const size_t eb = address & 0xF; - const size_t size = eb; - uint8_t* p = ics.membase + (address & ~0xF); - const vec128_t& src = ics.rf[i->src2_reg].v128; - // Note that if the input is already 16b aligned no bytes are stored. - for (size_t i = 0; i < size; i++) { - p[size - 1 - i] = src.b16[i]; - } - return IA_NEXT; -} -int Translate_STORE_VECTOR_RIGHT(TranslationContext& ctx, Instr* i) { - return DispatchToC(ctx, i, IntCode_STORE_VECTOR_RIGHT_V128); -} - uint32_t IntCode_PREFETCH(IntCodeState& ics, const IntCode* i) { return IA_NEXT; } @@ -3222,12 +3150,8 @@ static const TranslateFn dispatch_table[] = { Translate_LOAD, Translate_LOAD_ACQUIRE, - Translate_LOAD_VECTOR_LEFT, - Translate_LOAD_VECTOR_RIGHT, Translate_STORE, Translate_STORE_RELEASE, - Translate_STORE_VECTOR_LEFT, - Translate_STORE_VECTOR_RIGHT, Translate_PREFETCH, TranslateInvalid, //Translate_MAX, diff --git a/src/alloy/frontend/ppc/ppc_emit_altivec.cc b/src/alloy/frontend/ppc/ppc_emit_altivec.cc index 86dab38e2..88c94ae5f 100644 --- a/src/alloy/frontend/ppc/ppc_emit_altivec.cc +++ b/src/alloy/frontend/ppc/ppc_emit_altivec.cc @@ -239,7 +239,14 @@ XEEMITTER(stvxl128, VX128_1(4, 963), VX128_1)(PPCFunctionBuilder& f, Inst // https://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/C40E4C6133B31EE8872570B500791108/$file/vector_simd_pem_v_2.07c_26Oct2006_cell.pdf int InstrEmit_lvlx_(PPCFunctionBuilder& f, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { Value* ea = ra ? f.Add(f.LoadGPR(ra), f.LoadGPR(rb)) : f.LoadGPR(rb); - Value* v = f.ByteSwap(f.LoadVectorLeft(ea, VEC128_TYPE)); + Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstant((int8_t)0xF)); + // ea &= ~0xF (load takes care of this) + // v = (new << eb) + Value* v = f.Permute( + f.LoadVectorShl(eb), + f.ByteSwap(f.Load(ea, VEC128_TYPE)), + f.LoadZero(VEC128_TYPE), + INT8_TYPE); f.StoreVR(vd, v); return 0; } @@ -258,7 +265,14 @@ XEEMITTER(lvlxl128, VX128_1(4, 1539), VX128_1)(PPCFunctionBuilder& f, Inst int InstrEmit_lvrx_(PPCFunctionBuilder& f, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { Value* ea = ra ? f.Add(f.LoadGPR(ra), f.LoadGPR(rb)) : f.LoadGPR(rb); - Value* v = f.ByteSwap(f.LoadVectorRight(ea, VEC128_TYPE)); + Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstant((int8_t)0xF)); + // ea &= ~0xF (load takes care of this) + // v = (new >> (16 - eb)) + Value* v = f.Permute( + f.LoadVectorShr(f.Sub(f.LoadConstant((int8_t)16), eb)), + f.LoadZero(VEC128_TYPE), + f.ByteSwap(f.Load(ea, VEC128_TYPE)), + INT8_TYPE); f.StoreVR(vd, v); return 0; } @@ -276,9 +290,30 @@ XEEMITTER(lvrxl128, VX128_1(4, 1603), VX128_1)(PPCFunctionBuilder& f, Inst } int InstrEmit_stvlx_(PPCFunctionBuilder& f, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { + // NOTE: if eb == 0 (so 16b aligned) this equals new_value + // we could optimize this to prevent the other load/mask, in that case. Value* ea = ra ? f.Add(f.LoadGPR(ra), f.LoadGPR(rb)) : f.LoadGPR(rb); - Value* v = f.ByteSwap(f.LoadVR(vd)); - f.StoreVectorLeft(ea, v); + Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstant((int8_t)0xF)); + Value* new_value = f.ByteSwap(f.LoadVR(vd)); + // ea &= ~0xF (load takes care of this) + Value* old_value = f.Load(ea, VEC128_TYPE); + // v = (new >> eb) | (old & (ONE << (16 - eb))) + Value* v = f.Permute( + f.LoadVectorShr(eb), + f.LoadZero(VEC128_TYPE), + new_value, + INT8_TYPE); + v = f.Or( + v, + f.And( + old_value, + f.Permute( + f.LoadVectorShl(f.Sub(f.LoadConstant((int8_t)16), eb)), + f.Not(f.LoadZero(VEC128_TYPE)), + f.LoadZero(VEC128_TYPE), + INT8_TYPE))); + // ea &= ~0xF (store takes care of this) + f.Store(ea, v); return 0; } XEEMITTER(stvlx, 0x7C00050E, X )(PPCFunctionBuilder& f, InstrData& i) { @@ -295,9 +330,31 @@ XEEMITTER(stvlxl128, VX128_1(4, 1795), VX128_1)(PPCFunctionBuilder& f, Inst } int InstrEmit_stvrx_(PPCFunctionBuilder& f, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { + // NOTE: if eb == 0 (so 16b aligned) this equals new_value + // we could optimize this to prevent the other load/mask, in that case. Value* ea = ra ? f.Add(f.LoadGPR(ra), f.LoadGPR(rb)) : f.LoadGPR(rb); - Value* v = f.ByteSwap(f.LoadVR(vd)); - f.StoreVectorRight(ea, v); + Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstant((int8_t)0xF)); + Value* ebits = f.Mul(eb, f.LoadConstant((int8_t)8)); + Value* new_value = f.ByteSwap(f.LoadVR(vd)); + // ea &= ~0xF (load takes care of this) + Value* old_value = f.Load(ea, VEC128_TYPE); + // v = (new << (16 - eb)) | (old & (ONE >> eb)) + Value* v = f.Permute( + f.LoadVectorShl(f.Sub(f.LoadConstant((int8_t)16), eb)), + new_value, + f.LoadZero(VEC128_TYPE), + INT8_TYPE); + v = f.Or( + v, + f.And( + old_value, + f.Permute( + f.LoadVectorShr(eb), + f.LoadZero(VEC128_TYPE), + f.Not(f.LoadZero(VEC128_TYPE)), + INT8_TYPE))); + // ea &= ~0xF (store takes care of this) + f.Store(ea, v); return 0; } XEEMITTER(stvrx, 0x7C00054E, X )(PPCFunctionBuilder& f, InstrData& i) { diff --git a/src/alloy/hir/function_builder.cc b/src/alloy/hir/function_builder.cc index eb3750b20..8c909e716 100644 --- a/src/alloy/hir/function_builder.cc +++ b/src/alloy/hir/function_builder.cc @@ -781,28 +781,6 @@ Value* FunctionBuilder::Load( return i->dest; } -Value* FunctionBuilder::LoadVectorLeft( - Value* address, TypeName type, uint32_t load_flags) { - ASSERT_ADDRESS_TYPE(address); - Instr* i = AppendInstr( - OPCODE_LOAD_VECTOR_LEFT_info, load_flags, - AllocValue(type)); - i->set_src1(address); - i->src2.value = i->src3.value = NULL; - return i->dest; -} - -Value* FunctionBuilder::LoadVectorRight( - Value* address, TypeName type, uint32_t load_flags) { - ASSERT_ADDRESS_TYPE(address); - Instr* i = AppendInstr( - OPCODE_LOAD_VECTOR_RIGHT_info, load_flags, - AllocValue(type)); - i->set_src1(address); - i->src2.value = i->src3.value = NULL; - return i->dest; -} - Value* FunctionBuilder::LoadAcquire( Value* address, TypeName type, uint32_t load_flags) { ASSERT_ADDRESS_TYPE(address); @@ -834,26 +812,6 @@ Value* FunctionBuilder::StoreRelease( return i->dest; } -void FunctionBuilder::StoreVectorLeft( - Value* address, Value* value, uint32_t store_flags) { - ASSERT_ADDRESS_TYPE(address); - ASSERT_VECTOR_TYPE(value); - Instr* i = AppendInstr(OPCODE_STORE_VECTOR_LEFT_info, store_flags); - i->set_src1(address); - i->set_src2(value); - i->src3.value = NULL; -} - -void FunctionBuilder::StoreVectorRight( - Value* address, Value* value, uint32_t store_flags) { - ASSERT_ADDRESS_TYPE(address); - ASSERT_VECTOR_TYPE(value); - Instr* i = AppendInstr(OPCODE_STORE_VECTOR_RIGHT_info, store_flags); - i->set_src1(address); - i->set_src2(value); - i->src3.value = NULL; -} - void FunctionBuilder::Prefetch( Value* address, size_t length, uint32_t prefetch_flags) { ASSERT_ADDRESS_TYPE(address); diff --git a/src/alloy/hir/function_builder.h b/src/alloy/hir/function_builder.h index 62647542c..37d2ae89a 100644 --- a/src/alloy/hir/function_builder.h +++ b/src/alloy/hir/function_builder.h @@ -118,15 +118,8 @@ public: Value* Load(Value* address, TypeName type, uint32_t load_flags = 0); Value* LoadAcquire(Value* address, TypeName type, uint32_t load_flags = 0); - Value* LoadVectorLeft(Value* address, TypeName type, - uint32_t load_flags = 0); - Value* LoadVectorRight(Value* address, TypeName type, - uint32_t load_flags = 0); void Store(Value* address, Value* value, uint32_t store_flags = 0); Value* StoreRelease(Value* address, Value* value, uint32_t store_flags = 0); - void StoreVectorLeft(Value* address, Value* value, uint32_t store_flags = 0); - void StoreVectorRight(Value* address, Value* value, - uint32_t store_flags = 0); void Prefetch(Value* address, size_t length, uint32_t prefetch_flags = 0); Value* Max(Value* value1, Value* value2); diff --git a/src/alloy/hir/opcodes.h b/src/alloy/hir/opcodes.h index 1d0b1d560..3ee12a0a2 100644 --- a/src/alloy/hir/opcodes.h +++ b/src/alloy/hir/opcodes.h @@ -105,12 +105,8 @@ enum Opcode { OPCODE_LOAD, OPCODE_LOAD_ACQUIRE, - OPCODE_LOAD_VECTOR_LEFT, - OPCODE_LOAD_VECTOR_RIGHT, OPCODE_STORE, OPCODE_STORE_RELEASE, - OPCODE_STORE_VECTOR_LEFT, - OPCODE_STORE_VECTOR_RIGHT, OPCODE_PREFETCH, OPCODE_MAX, diff --git a/src/alloy/hir/opcodes.inl b/src/alloy/hir/opcodes.inl index b3b8f8250..d09853282 100644 --- a/src/alloy/hir/opcodes.inl +++ b/src/alloy/hir/opcodes.inl @@ -194,18 +194,6 @@ DEFINE_OPCODE( OPCODE_SIG_V_V, OPCODE_FLAG_MEMORY | OPCODE_FLAG_VOLATILE); -DEFINE_OPCODE( - OPCODE_LOAD_VECTOR_LEFT, - "load_vector_left", - OPCODE_SIG_V_V, - OPCODE_FLAG_MEMORY); - -DEFINE_OPCODE( - OPCODE_LOAD_VECTOR_RIGHT, - "load_vector_right", - OPCODE_SIG_V_V, - OPCODE_FLAG_MEMORY); - DEFINE_OPCODE( OPCODE_STORE, "store", @@ -218,18 +206,6 @@ DEFINE_OPCODE( OPCODE_SIG_V_V_V, OPCODE_FLAG_MEMORY | OPCODE_FLAG_VOLATILE); -DEFINE_OPCODE( - OPCODE_STORE_VECTOR_LEFT, - "store_vector_left", - OPCODE_SIG_X_V_V, - OPCODE_FLAG_MEMORY); - -DEFINE_OPCODE( - OPCODE_STORE_VECTOR_RIGHT, - "store_vector_right", - OPCODE_SIG_X_V_V, - OPCODE_FLAG_MEMORY); - DEFINE_OPCODE( OPCODE_PREFETCH, "prefetch", diff --git a/src/alloy/hir/value.cc b/src/alloy/hir/value.cc index 9fc2f2bf3..ab53318cd 100644 --- a/src/alloy/hir/value.cc +++ b/src/alloy/hir/value.cc @@ -401,6 +401,10 @@ void Value::Not() { case INT64_TYPE: constant.i64 = ~constant.i64; break; + case VEC128_TYPE: + constant.v128.low = ~constant.v128.low; + constant.v128.high = ~constant.v128.high; + break; default: XEASSERTALWAYS(); break;