From 3fbebcfa08dd098b5def1127bdfc2fe21bd1ca8f Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Thu, 9 Jan 2014 21:57:07 -0800
Subject: [PATCH] VectorAdd and saturation checks.

---
 src/alloy/backend/ivm/ivm_function.cc      |   1 +
 src/alloy/backend/ivm/ivm_intcode.cc       | 149 +++++++++++++++++++++
 src/alloy/backend/ivm/ivm_intcode.h        |   1 +
 src/alloy/frontend/ppc/ppc_context.h       |   2 +
 src/alloy/frontend/ppc/ppc_emit_altivec.cc |  63 ++++++---
 src/alloy/frontend/ppc/ppc_hir_builder.cc  |   8 ++
 src/alloy/frontend/ppc/ppc_hir_builder.h   |   2 +
 src/alloy/hir/hir_builder.cc               |  27 ++++
 src/alloy/hir/hir_builder.h                |   7 +-
 src/alloy/hir/opcodes.h                    |   2 +
 src/alloy/hir/opcodes.inl                  |  12 +-
 11 files changed, 251 insertions(+), 23 deletions(-)

diff --git a/src/alloy/backend/ivm/ivm_function.cc b/src/alloy/backend/ivm/ivm_function.cc
index e52c5039f..1706a27dc 100644
--- a/src/alloy/backend/ivm/ivm_function.cc
+++ b/src/alloy/backend/ivm/ivm_function.cc
@@ -115,6 +115,7 @@ int IVMFunction::CallImpl(ThreadState* thread_state, uint64_t return_address) {
   ics.membase = memory->membase();
   ics.reserve_address = memory->reserve_address();
   ics.did_carry = 0;
+  ics.did_saturate = 0;  // cleared once per call; intcodes only ever set it
   ics.access_callbacks = thread_state->runtime()->access_callbacks();
   ics.thread_state = thread_state;
   ics.return_address = return_address;
diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc
index 1dd567e0a..04249b95e 100644
--- a/src/alloy/backend/ivm/ivm_intcode.cc
+++ b/src/alloy/backend/ivm/ivm_intcode.cc
@@ -1164,8 +1164,10 @@ uint32_t IntCode_VECTOR_CONVERT_F2I_SAT(IntCodeState& ics, const IntCode* i) {
       float src = src1.f4[n];
       if (src < 0) {
         dest.i4[n] = 0;
+        ics.did_saturate = 1;
       } else if (src > UINT_MAX) {
         dest.i4[n] = UINT_MAX;
+        ics.did_saturate = 1;
       } else {
         dest.i4[n] = (uint32_t)src;
       }
@@ -1175,8 +1177,10 @@ uint32_t IntCode_VECTOR_CONVERT_F2I_SAT(IntCodeState& ics, const IntCode* i) {
       float src = src1.f4[n];
       if (src < INT_MIN) {
         dest.i4[n] = INT_MIN;
+        ics.did_saturate = 1;
       } else if (src > INT_MAX) {
         dest.i4[n] = INT_MAX;
+        ics.did_saturate = 1;
       } else {
         dest.i4[n] = (int32_t)src;
       }
@@ -2018,6 +2022,14 @@ int Translate_DID_CARRY(TranslationContext& ctx, Instr* i) {
   return DispatchToC(ctx, i, IntCode_DID_CARRY);
 }
 
+uint32_t IntCode_DID_SATURATE(IntCodeState& ics, const IntCode* i) {
+  ics.rf[i->dest_reg].i8 = ics.did_saturate;  // copies the flag as-is
+  return IA_NEXT;
+}
+int Translate_DID_SATURATE(TranslationContext& ctx, Instr* i) {
+  return DispatchToC(ctx, i, IntCode_DID_SATURATE);
+}
+
 #define VECTOR_COMPARER(type, value, count, op) \
   const vec128_t& src1 = ics.rf[i->src1_reg].v128; \
   const vec128_t& src2 = ics.rf[i->src2_reg].v128; \
@@ -2157,6 +2169,7 @@ uint32_t IntCode_ADD_F64_F64(IntCodeState& ics, const IntCode* i) {
   return IA_NEXT;
 }
 uint32_t IntCode_ADD_V128_V128(IntCodeState& ics, const IntCode* i) {
+  XEASSERT(!i->flags);
   const vec128_t& src1 = ics.rf[i->src1_reg].v128;
   const vec128_t& src2 = ics.rf[i->src2_reg].v128;
   vec128_t& dest = ics.rf[i->dest_reg].v128;
@@ -2235,6 +2248,140 @@ int Translate_ADD_CARRY(TranslationContext& ctx, Instr* i) {
   return DispatchToC(ctx, i, fns[i->dest->type]);
 }
 
+uint32_t Translate_VECTOR_ADD_I8(IntCodeState& ics, const IntCode* i) {
+  const vec128_t& src1 = ics.rf[i->src1_reg].v128;
+  const vec128_t& src2 = ics.rf[i->src2_reg].v128;
+  vec128_t& dest = ics.rf[i->dest_reg].v128;
+  const uint32_t arithmetic_flags = i->flags >> 8;  // low byte = part type
+  if (arithmetic_flags & ARITHMETIC_SATURATE) {
+    if (arithmetic_flags & ARITHMETIC_UNSIGNED) {
+      for (int n = 0; n < 16; n++) {
+        uint16_t v = src1.b16[n] + src2.b16[n];
+        if (v > 0xFF) {
+          dest.b16[n] = 0xFF;
+          ics.did_saturate = 1;
+        } else {
+          dest.b16[n] = (uint8_t)v;
+        }
+      }
+    } else {
+      for (int n = 0; n < 16; n++) {
+        int16_t v = (int8_t)src1.b16[n] + (int8_t)src2.b16[n];
+        if (v > 0x7F) {
+          dest.b16[n] = 0x7F;
+          ics.did_saturate = 1;
+        } else if (v < -0x80) {
+          dest.b16[n] = -0x80;
+          ics.did_saturate = 1;
+        } else {
+          dest.b16[n] = (uint8_t)v;
+        }
+      }
+    }
+  } else {
+    for (int n = 0; n < 16; n++) {
+      dest.b16[n] = src1.b16[n] + src2.b16[n];
+    }
+  }
+  return IA_NEXT;
+}
+uint32_t Translate_VECTOR_ADD_I16(IntCodeState& ics, const IntCode* i) {
+  const vec128_t& src1 = ics.rf[i->src1_reg].v128;
+  const vec128_t& src2 = ics.rf[i->src2_reg].v128;
+  vec128_t& dest = ics.rf[i->dest_reg].v128;
+  const uint32_t arithmetic_flags = i->flags >> 8;  // low byte = part type
+  if (arithmetic_flags & ARITHMETIC_SATURATE) {
+    if (arithmetic_flags & ARITHMETIC_UNSIGNED) {
+      for (int n = 0; n < 8; n++) {
+        uint32_t v = src1.s8[n] + src2.s8[n];
+        if (v > 0xFFFF) {
+          dest.s8[n] = 0xFFFF;
+          ics.did_saturate = 1;
+        } else {
+          dest.s8[n] = (uint16_t)v;
+        }
+      }
+    } else {
+      for (int n = 0; n < 8; n++) {
+        int32_t v = (int16_t)src1.s8[n] + (int16_t)src2.s8[n];
+        if (v > 0x7FFF) {
+          dest.s8[n] = 0x7FFF;
+          ics.did_saturate = 1;
+        } else if (v < -0x8000) {
+          dest.s8[n] = -0x8000;
+          ics.did_saturate = 1;
+        } else {
+          dest.s8[n] = (uint16_t)v;
+        }
+      }
+    }
+  } else {
+    for (int n = 0; n < 8; n++) {
+      dest.s8[n] = src1.s8[n] + src2.s8[n];
+    }
+  }
+  return IA_NEXT;
+}
+uint32_t Translate_VECTOR_ADD_I32(IntCodeState& ics, const IntCode* i) {
+  const vec128_t& src1 = ics.rf[i->src1_reg].v128;
+  const vec128_t& src2 = ics.rf[i->src2_reg].v128;
+  vec128_t& dest = ics.rf[i->dest_reg].v128;
+  const uint32_t arithmetic_flags = i->flags >> 8;  // low byte = part type
+  if (arithmetic_flags & ARITHMETIC_SATURATE) {
+    if (arithmetic_flags & ARITHMETIC_UNSIGNED) {
+      for (int n = 0; n < 4; n++) {
+        uint64_t v = (uint64_t)src1.i4[n] + src2.i4[n];  // widen before add
+        if (v > 0xFFFFFFFF) {
+          dest.i4[n] = 0xFFFFFFFF;
+          ics.did_saturate = 1;
+        } else {
+          dest.i4[n] = (uint32_t)v;
+        }
+      }
+    } else {
+      for (int n = 0; n < 4; n++) {
+        int64_t v = (int64_t)(int32_t)src1.i4[n] + (int32_t)src2.i4[n];
+        if (v > 0x7FFFFFFF) {
+          dest.i4[n] = 0x7FFFFFFF;
+          ics.did_saturate = 1;
+        } else if (v < -0x80000000ll) {  // signed compare against INT32 min
+          dest.i4[n] = 0x80000000;
+          ics.did_saturate = 1;
+        } else {
+          dest.i4[n] = (uint32_t)v;
+        }
+      }
+    }
+  } else {
+    for (int n = 0; n < 4; n++) {
+      dest.i4[n] = src1.i4[n] + src2.i4[n];
+    }
+  }
+  return IA_NEXT;
+}
+uint32_t Translate_VECTOR_ADD_F32(IntCodeState& ics, const IntCode* i) {
+  const vec128_t& src1 = ics.rf[i->src1_reg].v128;
+  const vec128_t& src2 = ics.rf[i->src2_reg].v128;
+  vec128_t& dest = ics.rf[i->dest_reg].v128;
+  for (int n = 0; n < 4; n++) {
+    dest.f4[n] = src1.f4[n] + src2.f4[n];  // arithmetic flags are ignored
+  }
+  return IA_NEXT;
+}
+int Translate_VECTOR_ADD(TranslationContext& ctx, Instr* i) {
+  TypeName part_type = (TypeName)(i->flags & 0xFF);  // indexes fns[] below
+  static IntCodeFn fns[] = {
+    Translate_VECTOR_ADD_I8,
+    Translate_VECTOR_ADD_I16,
+    Translate_VECTOR_ADD_I32,
+    IntCode_INVALID_TYPE,
+    Translate_VECTOR_ADD_F32,
+    IntCode_INVALID_TYPE,
+    IntCode_INVALID_TYPE,
+  };
+  return DispatchToC(ctx, i, fns[part_type]);
+}
+
 #define SUB_DID_CARRY(a, b) \
     ((b) == 0) || CHECK_DID_CARRY(a, 0 - b)
 uint32_t IntCode_SUB_I8_I8(IntCodeState& ics, const IntCode* i) {
@@ -3670,6 +3817,7 @@ static const TranslateFn dispatch_table[] = {
   Translate_COMPARE_UGE,
   Translate_DID_CARRY,
   TranslateInvalid, //Translate_DID_OVERFLOW,
+  Translate_DID_SATURATE,
   Translate_VECTOR_COMPARE_EQ,
   Translate_VECTOR_COMPARE_SGT,
   Translate_VECTOR_COMPARE_SGE,
@@ -3678,6 +3826,7 @@ static const TranslateFn dispatch_table[] = {
 
   Translate_ADD,
   Translate_ADD_CARRY,
+  Translate_VECTOR_ADD,
   Translate_SUB,
   Translate_MUL,
   Translate_MUL_HI,
diff --git a/src/alloy/backend/ivm/ivm_intcode.h b/src/alloy/backend/ivm/ivm_intcode.h
index d609e94a5..296a180cc 100644
--- a/src/alloy/backend/ivm/ivm_intcode.h
+++ b/src/alloy/backend/ivm/ivm_intcode.h
@@ -45,6 +45,7 @@ typedef struct {
   uint8_t* membase;
   uint32_t* reserve_address;
   int8_t did_carry;
+  int8_t did_saturate;  // set to 1 by saturating intcodes when they clamp
   runtime::RegisterAccessCallbacks* access_callbacks;
   runtime::ThreadState* thread_state;
   uint64_t return_address;
diff --git a/src/alloy/frontend/ppc/ppc_context.h b/src/alloy/frontend/ppc/ppc_context.h
index a7e984754..08205e349 100644
--- a/src/alloy/frontend/ppc/ppc_context.h
+++ b/src/alloy/frontend/ppc/ppc_context.h
@@ -175,6 +175,8 @@ typedef struct
XECACHEALIGN64 PPCContext_s {
     } bits;
   } fpscr; // Floating-point status and control register
 
+  uint8_t vscr_sat;  // saturation flag; accessed via LoadSAT()/StoreSAT()
+
   double f[32]; // Floating-point registers
   vec128_t v[128]; // VMX128 vector registers
 
diff --git a/src/alloy/frontend/ppc/ppc_emit_altivec.cc b/src/alloy/frontend/ppc/ppc_emit_altivec.cc
index fcdf79b5f..57ec71c7a 100644
--- a/src/alloy/frontend/ppc/ppc_emit_altivec.cc
+++ b/src/alloy/frontend/ppc/ppc_emit_altivec.cc
@@ -404,48 +404,75 @@ XEEMITTER(vaddfp128, VX128(5, 16), VX128 )(PPCHIRBuilder& f, InstrData
 }
 
 XEEMITTER(vaddsbs, 0x10000300, VX )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB),
+                         INT8_TYPE, ARITHMETIC_SATURATE);
+  f.StoreSAT(f.DidSaturate(v));
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vaddshs, 0x10000340, VX )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB),
+                         INT16_TYPE, ARITHMETIC_SATURATE);
+  f.StoreSAT(f.DidSaturate(v));
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vaddsws, 0x10000380, VX )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB),
+                         INT32_TYPE, ARITHMETIC_SATURATE);
+  f.StoreSAT(f.DidSaturate(v));
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vaddubm, 0x10000000, VX )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB),
+                         INT8_TYPE, ARITHMETIC_UNSIGNED);
+  // Wrapping add (no ARITHMETIC_SATURATE): the SAT flag is left untouched.
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vaddubs, 0x10000200, VX )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB),
+                         INT8_TYPE, ARITHMETIC_UNSIGNED | ARITHMETIC_SATURATE);
+  f.StoreSAT(f.DidSaturate(v));
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vadduhm, 0x10000040, VX )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB),
+                         INT16_TYPE, ARITHMETIC_UNSIGNED);
+  // Wrapping add (no ARITHMETIC_SATURATE): the SAT flag is left untouched.
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vadduhs, 0x10000240, VX )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB),
+                         INT16_TYPE, ARITHMETIC_UNSIGNED | ARITHMETIC_SATURATE);
+  f.StoreSAT(f.DidSaturate(v));
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vadduwm, 0x10000080, VX )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB),
+                         INT32_TYPE, ARITHMETIC_UNSIGNED);
+  // Wrapping add (no ARITHMETIC_SATURATE): the SAT flag is left untouched.
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 XEEMITTER(vadduws, 0x10000280, VX )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  Value* v = f.VectorAdd(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB),
+                         INT32_TYPE, ARITHMETIC_UNSIGNED | ARITHMETIC_SATURATE);
+  f.StoreSAT(f.DidSaturate(v));
+  f.StoreVR(i.VX.VD, v);
+  return 0;
 }
 
 int InstrEmit_vand_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb) {
diff --git a/src/alloy/frontend/ppc/ppc_hir_builder.cc b/src/alloy/frontend/ppc/ppc_hir_builder.cc
index 3be8296c7..15fd487f7 100644
--- a/src/alloy/frontend/ppc/ppc_hir_builder.cc
+++ b/src/alloy/frontend/ppc/ppc_hir_builder.cc
@@ -286,6 +286,14 @@ void PPCHIRBuilder::StoreCA(Value* value) {
   StoreContext(offsetof(PPCContext, xer_ca), value);
 }
 
+Value* PPCHIRBuilder::LoadSAT() {
+  return LoadContext(offsetof(PPCContext, vscr_sat), INT8_TYPE);
+}
+
+void PPCHIRBuilder::StoreSAT(Value* value) {
+  StoreContext(offsetof(PPCContext, vscr_sat), value);  // overwrites the flag
+}
+
 Value* PPCHIRBuilder::LoadGPR(uint32_t reg) {
   return LoadContext(
       offsetof(PPCContext, r) + reg * 8, INT64_TYPE);
diff --git a/src/alloy/frontend/ppc/ppc_hir_builder.h b/src/alloy/frontend/ppc/ppc_hir_builder.h
index ba9b6c4da..f2eecb86a 100644
---
a/src/alloy/frontend/ppc/ppc_hir_builder.h
+++ b/src/alloy/frontend/ppc/ppc_hir_builder.h
@@ -55,6 +55,8 @@ public:
   //void StoreOV(Value* value);
   Value* LoadCA();
   void StoreCA(Value* value);
+  Value* LoadSAT();
+  void StoreSAT(Value* value);
 
   Value* LoadGPR(uint32_t reg);
   void StoreGPR(uint32_t reg, Value* value);
diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc
index 3a58cb4a9..481347635 100644
--- a/src/alloy/hir/hir_builder.cc
+++ b/src/alloy/hir/hir_builder.cc
@@ -1052,6 +1052,15 @@ Value* HIRBuilder::DidOverflow(Value* value) {
   return i->dest;
 }
 
+// Appends a did_saturate instr with |value| as src1; yields an INT8 value.
+Value* HIRBuilder::DidSaturate(Value* value) {
+  Instr* instr = AppendInstr(OPCODE_DID_SATURATE_info, 0,
+                             AllocValue(INT8_TYPE));
+  instr->set_src1(value);
+  instr->src2.value = instr->src3.value = NULL;
+  return instr->dest;
+}
+
 Value* HIRBuilder::VectorCompareXX(
     const OpcodeInfo& opcode, Value* value1, Value* value2,
     TypeName part_type) {
@@ -1140,6 +1149,24 @@ Value* HIRBuilder::AddWithCarry(
   return i->dest;
 }
 
+Value* HIRBuilder::VectorAdd(Value* value1, Value* value2, TypeName part_type,
+                             uint32_t arithmetic_flags) {
+  ASSERT_VECTOR_TYPE(value1);
+  ASSERT_VECTOR_TYPE(value2);
+
+  // Both settings share the 16-bit instr flags: the part type sits in the low
+  // byte and the arithmetic flags in the byte above it.
+  const uint32_t packed = (arithmetic_flags << 8) | part_type;
+  XEASSERTZERO(packed >> 16);
+
+  Instr* instr = AppendInstr(OPCODE_VECTOR_ADD_info, (uint16_t)packed,
+                             AllocValue(value1->type));
+  instr->set_src1(value1);
+  instr->set_src2(value2);
+  instr->src3.value = NULL;
+  return instr->dest;
+}
+
 Value* HIRBuilder::Sub(
     Value* value1, Value* value2, uint32_t arithmetic_flags) {
   ASSERT_TYPES_EQUAL(value1, value2);
diff --git a/src/alloy/hir/hir_builder.h b/src/alloy/hir/hir_builder.h
index a6f5f5692..c59e4d50d 100644
--- a/src/alloy/hir/hir_builder.h
+++ b/src/alloy/hir/hir_builder.h
@@ -94,10 +94,6 @@ public:
   Value* Convert(Value* value, TypeName target_type,
                  RoundMode round_mode = ROUND_TO_ZERO);
   Value* Round(Value* value, RoundMode round_mode);
-
-  // TODO(benvanik): make this cleaner -- not happy with it.
-  // It'd be nice if Convert() supported this, however then we'd need a
-  // VEC128_INT32_TYPE or something.
   Value* VectorConvertI2F(Value* value, uint32_t arithmetic_flags = 0);
   Value* VectorConvertF2I(Value* value, uint32_t arithmetic_flags = 0);
 
@@ -143,6 +139,7 @@ public:
   Value* CompareUGE(Value* value1, Value* value2);
   Value* DidCarry(Value* value);
   Value* DidOverflow(Value* value);
+  Value* DidSaturate(Value* value);
   Value* VectorCompareEQ(Value* value1, Value* value2, TypeName part_type);
   Value* VectorCompareSGT(Value* value1, Value* value2, TypeName part_type);
   Value* VectorCompareSGE(Value* value1, Value* value2, TypeName part_type);
@@ -152,6 +149,8 @@ public:
   Value* Add(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
   Value* AddWithCarry(Value* value1, Value* value2, Value* value3,
                       uint32_t arithmetic_flags = 0);
+  Value* VectorAdd(Value* value1, Value* value2, TypeName part_type,
+                   uint32_t arithmetic_flags = 0);
   Value* Sub(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
   Value* Mul(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
 
diff --git a/src/alloy/hir/opcodes.h b/src/alloy/hir/opcodes.h
index 9d184fc3d..114ee5a1c 100644
--- a/src/alloy/hir/opcodes.h
+++ b/src/alloy/hir/opcodes.h
@@ -130,6 +130,7 @@ enum Opcode {
   OPCODE_COMPARE_UGE,
   OPCODE_DID_CARRY,
   OPCODE_DID_OVERFLOW,
+  OPCODE_DID_SATURATE,
   OPCODE_VECTOR_COMPARE_EQ,
   OPCODE_VECTOR_COMPARE_SGT,
   OPCODE_VECTOR_COMPARE_SGE,
@@ -138,6 +139,7 @@ enum Opcode {
 
   OPCODE_ADD,
   OPCODE_ADD_CARRY,
+  OPCODE_VECTOR_ADD,
   OPCODE_SUB,
   OPCODE_MUL,
   OPCODE_MUL_HI, // TODO(benvanik): remove this and add INT128 type.
diff --git a/src/alloy/hir/opcodes.inl b/src/alloy/hir/opcodes.inl
index 75e27014b..97ca4ff3b 100644
--- a/src/alloy/hir/opcodes.inl
+++ b/src/alloy/hir/opcodes.inl
@@ -292,12 +292,18 @@ DEFINE_OPCODE(
     "did_carry",
     OPCODE_SIG_V_V,
     0);
 
 DEFINE_OPCODE(
     OPCODE_DID_OVERFLOW,
     "did_overflow",
     OPCODE_SIG_V_V,
     0);
 
+DEFINE_OPCODE(
+    OPCODE_DID_SATURATE,  // appended by HIRBuilder::DidSaturate
+    "did_saturate",
+    OPCODE_SIG_V_V,
+    0);
+
 DEFINE_OPCODE(
     OPCODE_VECTOR_COMPARE_EQ,
@@ -337,6 +343,12 @@ DEFINE_OPCODE(
     OPCODE_SIG_V_V_V_V,
     OPCODE_FLAG_COMMUNATIVE);
 
+DEFINE_OPCODE(
+    OPCODE_VECTOR_ADD,  // appended by HIRBuilder::VectorAdd
+    "vector_add",
+    OPCODE_SIG_V_V_V,
+    OPCODE_FLAG_COMMUNATIVE);
+
 DEFINE_OPCODE(
     OPCODE_SUB,
     "sub",