From c451fda819547f73b8edd3f750ce384feb39dfc6 Mon Sep 17 00:00:00 2001
From: "Dr. Chat"
Date: Sat, 17 Nov 2018 21:21:26 -0600
Subject: [PATCH] [x64] Template-ize a few vector emulation functions

---
 src/xenia/cpu/backend/x64/x64_seq_vector.cc | 286 ++++++++------------
 1 file changed, 108 insertions(+), 178 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
index 9e8bf19de..0ee776b0d 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@@ -670,6 +670,23 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SUB, VECTOR_SUB);
 // ============================================================================
 // OPCODE_VECTOR_SHL
 // ============================================================================
+template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
+static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
+  alignas(16) T value[16 / sizeof(T)];
+  alignas(16) T shamt[16 / sizeof(T)];
+
+  // Load SSE registers into a C array.
+  _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
+  _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
+
+  for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
+    value[i] = value[i] << (shamt[i] & ((sizeof(T) * 8) - 1));
+  }
+
+  // Store result and return it.
+  return _mm_load_si128(reinterpret_cast<__m128i*>(value));
+}
+
 struct VECTOR_SHL_V128
     : Sequence<VECTOR_SHL_V128, I<OPCODE_VECTOR_SHL, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
@@ -688,16 +705,7 @@ struct VECTOR_SHL_V128
         break;
     }
   }
-  static __m128i EmulateVectorShlI8(void*, __m128i src1, __m128i src2) {
-    alignas(16) uint8_t value[16];
-    alignas(16) uint8_t shamt[16];
-    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
-    for (size_t i = 0; i < 16; ++i) {
-      value[i] = value[i] << (shamt[i] & 0x7);
-    }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
-  }
+
   static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
     // TODO(benvanik): native version (with shift magic).
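Note (editorial, not part of the patch): the hunk above is the heart of the change. One SFINAE-constrained template replaces the three hand-copied EmulateVectorShlI8/I16/I32 helpers, and each call site now picks the lane width explicitly (EmulateVectorShl<uint8_t>, <uint16_t>, <uint32_t>). Masking the shift count to the lane width keeps the C++ shift well-defined and matches the modulo semantics of the vslb/vslh/vslw family being emulated. A self-contained sketch of the pattern follows; EmulateVectorShl is copied from the diff, while the main() harness is hypothetical and only for illustration:

    #include <emmintrin.h>  // SSE2
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <type_traits>

    template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
    static __m128i EmulateVectorShl(void*, __m128i src1, __m128i src2) {
      alignas(16) T value[16 / sizeof(T)];
      alignas(16) T shamt[16 / sizeof(T)];
      _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
      _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
      for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
        // Mask the count to the lane width, like the hardware does.
        value[i] = value[i] << (shamt[i] & ((sizeof(T) * 8) - 1));
      }
      return _mm_load_si128(reinterpret_cast<__m128i*>(value));
    }

    int main() {
      alignas(16) uint8_t v[16] = {1, 1, 1, 1, 1, 1, 1, 1};
      alignas(16) uint8_t s[16] = {0, 1, 2, 3, 4, 5, 6, 7};
      // One template, explicit lane type -- replaces EmulateVectorShlI8.
      __m128i r = EmulateVectorShl<uint8_t>(
          nullptr, _mm_load_si128(reinterpret_cast<__m128i*>(v)),
          _mm_load_si128(reinterpret_cast<__m128i*>(s)));
      alignas(16) uint8_t out[16];
      _mm_store_si128(reinterpret_cast<__m128i*>(out), r);
      for (int i = 0; i < 8; ++i) printf("%u ", out[i]);  // 1 2 4 ... 128
      printf("\n");
      return 0;
    }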
     if (i.src2.is_constant) {
@@ -707,19 +715,10 @@ struct VECTOR_SHL_V128
       e.lea(e.r9, e.StashXmm(1, i.src2));
     }
     e.lea(e.r8, e.StashXmm(0, i.src1));
-    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShlI8));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint8_t>));
     e.vmovaps(i.dest, e.xmm0);
   }
-  static __m128i EmulateVectorShlI16(void*, __m128i src1, __m128i src2) {
-    alignas(16) uint16_t value[8];
-    alignas(16) uint16_t shamt[8];
-    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
-    for (size_t i = 0; i < 8; ++i) {
-      value[i] = value[i] << (shamt[i] & 0xF);
-    }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
-  }
+
   static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
     Xmm src1;
     if (i.src1.is_constant) {
@@ -773,22 +772,13 @@ struct VECTOR_SHL_V128
     } else {
       e.lea(e.r9, e.StashXmm(1, i.src2));
     }
-    e.lea(e.r8, e.StashXmm(0, i.src1));
-    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShlI16));
+    e.lea(e.r8, e.StashXmm(0, src1));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint16_t>));
     e.vmovaps(i.dest, e.xmm0);
     e.L(end);
   }
-  static __m128i EmulateVectorShlI32(void*, __m128i src1, __m128i src2) {
-    alignas(16) uint32_t value[4];
-    alignas(16) uint32_t shamt[4];
-    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
-    for (size_t i = 0; i < 4; ++i) {
-      value[i] = value[i] << (shamt[i] & 0x1F);
-    }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
-  }
+
   static void EmitInt32(X64Emitter& e, const EmitArgType& i) {
     Xmm src1;
     if (i.src1.is_constant) {
@@ -860,7 +850,7 @@ struct VECTOR_SHL_V128
       e.lea(e.r9, e.StashXmm(1, i.src2));
     }
     e.lea(e.r8, e.StashXmm(0, src1));
-    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShlI32));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShl<uint32_t>));
     e.vmovaps(i.dest, e.xmm0);
     e.L(end);
@@ -872,6 +862,23 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHL, VECTOR_SHL_V128);
 // ============================================================================
 // OPCODE_VECTOR_SHR
 // ============================================================================
+template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
+static __m128i EmulateVectorShr(void*, __m128i src1, __m128i src2) {
+  alignas(16) T value[16 / sizeof(T)];
+  alignas(16) T shamt[16 / sizeof(T)];
+
+  // Load SSE registers into a C array.
+  _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
+  _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
+
+  for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
+    value[i] = value[i] >> (shamt[i] & ((sizeof(T) * 8) - 1));
+  }
+
+  // Store result and return it.
+  return _mm_load_si128(reinterpret_cast<__m128i*>(value));
+}
+
 struct VECTOR_SHR_V128
     : Sequence<VECTOR_SHR_V128, I<OPCODE_VECTOR_SHR, V128Op, V128Op, V128Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
@@ -890,16 +897,7 @@ struct VECTOR_SHR_V128
         break;
     }
   }
-  static __m128i EmulateVectorShrI8(void*, __m128i src1, __m128i src2) {
-    alignas(16) uint8_t value[16];
-    alignas(16) uint8_t shamt[16];
-    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
-    for (size_t i = 0; i < 16; ++i) {
-      value[i] = value[i] >> (shamt[i] & 0x7);
-    }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
-  }
+
   static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
     // TODO(benvanik): native version (with shift magic).
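Note (editorial, not part of the patch): EmulateVectorShr is the zero-filling twin of EmulateVectorShl. These round-trips through memory exist because SSE2 has no per-lane variable shifts at all; the unchanged EmitInt16/EmitInt32 code the hunks skip over appears to try native instruction sequences first and only falls back to CallNativeSafe. On an AVX2 host the 32-bit case has a one-instruction equivalent, sketched here for contrast (illustrative, assumes AVX2; not code from the patch):

    #include <immintrin.h>  // AVX2: _mm_srlv_epi32 (vpsrlvd)

    // Per-lane variable logical right shift for four uint32 lanes.
    __m128i VariableShr32(__m128i value, __m128i shamt) {
      // vpsrlvd zeroes lanes whose count exceeds 31, so mask the counts
      // first to match the modulo behavior of the emulated helper.
      shamt = _mm_and_si128(shamt, _mm_set1_epi32(31));
      return _mm_srlv_epi32(value, shamt);
    }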
     if (i.src2.is_constant) {
@@ -909,19 +907,10 @@ struct VECTOR_SHR_V128
       e.lea(e.r9, e.StashXmm(1, i.src2));
     }
     e.lea(e.r8, e.StashXmm(0, i.src1));
-    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShrI8));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint8_t>));
     e.vmovaps(i.dest, e.xmm0);
   }
-  static __m128i EmulateVectorShrI16(void*, __m128i src1, __m128i src2) {
-    alignas(16) uint16_t value[8];
-    alignas(16) uint16_t shamt[8];
-    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
-    for (size_t i = 0; i < 8; ++i) {
-      value[i] = value[i] >> (shamt[i] & 0xF);
-    }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
-  }
+
   static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
     if (i.src2.is_constant) {
       const auto& shamt = i.src2.constant();
@@ -968,21 +957,12 @@ struct VECTOR_SHR_V128
       e.lea(e.r9, e.StashXmm(1, i.src2));
     }
     e.lea(e.r8, e.StashXmm(0, i.src1));
-    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShrI16));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint16_t>));
     e.vmovaps(i.dest, e.xmm0);
     e.L(end);
   }
-  static __m128i EmulateVectorShrI32(void*, __m128i src1, __m128i src2) {
-    alignas(16) uint32_t value[4];
-    alignas(16) uint32_t shamt[4];
-    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
-    for (size_t i = 0; i < 4; ++i) {
-      value[i] = value[i] >> (shamt[i] & 0x1F);
-    }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
-  }
+
   static void EmitInt32(X64Emitter& e, const EmitArgType& i) {
     Xmm src1;
     if (i.src1.is_constant) {
@@ -1054,7 +1034,7 @@ struct VECTOR_SHR_V128
       e.lea(e.r9, e.StashXmm(1, i.src2));
     }
     e.lea(e.r8, e.StashXmm(0, src1));
-    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShrI32));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<uint32_t>));
     e.vmovaps(i.dest, e.xmm0);
     e.L(end);
@@ -1068,15 +1048,21 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128);
 // ============================================================================
 struct VECTOR_SHA_V128
     : Sequence<VECTOR_SHA_V128, I<OPCODE_VECTOR_SHA, V128Op, V128Op, V128Op>> {
-  static __m128i EmulateVectorShaI8(void*, __m128i src1, __m128i src2) {
-    alignas(16) int8_t value[16];
-    alignas(16) int8_t shamt[16];
-    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
-    for (size_t i = 0; i < 16; ++i) {
-      value[i] = value[i] >> (shamt[i] & 0x7);
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    switch (i.instr->flags) {
+      case INT8_TYPE:
+        EmitInt8(e, i);
+        break;
+      case INT16_TYPE:
+        EmitInt16(e, i);
+        break;
+      case INT32_TYPE:
+        EmitInt32(e, i);
+        break;
+      default:
+        assert_always();
+        break;
     }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
   }
 
   static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
@@ -1088,21 +1074,10 @@ struct VECTOR_SHA_V128
       e.lea(e.r9, e.StashXmm(1, i.src2));
     }
     e.lea(e.r8, e.StashXmm(0, i.src1));
-    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShaI8));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int8_t>));
     e.vmovaps(i.dest, e.xmm0);
   }
-  static __m128i EmulateVectorShaI16(void*, __m128i src1, __m128i src2) {
-    alignas(16) int16_t value[8];
-    alignas(16) int16_t shamt[8];
-    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
-    for (size_t i = 0; i < 8; ++i) {
-      value[i] = value[i] >> (shamt[i] & 0xF);
-    }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
-  }
-
   static void EmitInt16(X64Emitter& e, const EmitArgType& i) {
     if (i.src2.is_constant) {
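Note (editorial, not part of the patch): VECTOR_SHA_V128 loses its dedicated helpers entirely. Arithmetic shift right falls out of the same EmulateVectorShr template instantiated with signed lane types, because operator>> on a signed value sign-propagates on every compiler Xenia builds with (implementation-defined before C++20, but arithmetic on MSVC, GCC, and Clang). The distinction in two lines (illustrative):

    #include <cstdint>

    // Same bit pattern in, different fill bit out: T's signedness decides.
    int8_t sha8(int8_t v, uint8_t s) { return int8_t(v >> (s & 7)); }     // sign-fill
    uint8_t shr8(uint8_t v, uint8_t s) { return uint8_t(v >> (s & 7)); }  // zero-fill
    // sha8(-128, 1) == -64 (0xC0); shr8(0x80, 1) == 0x40.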
       const auto& shamt = i.src2.constant();
@@ -1149,23 +1124,12 @@ struct VECTOR_SHA_V128
       e.lea(e.r9, e.StashXmm(1, i.src2));
     }
     e.lea(e.r8, e.StashXmm(0, i.src1));
-    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShaI16));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int16_t>));
     e.vmovaps(i.dest, e.xmm0);
     e.L(end);
   }
-
-  static __m128i EmulateVectorShaI32(void*, __m128i src1, __m128i src2) {
-    alignas(16) int32_t value[4];
-    alignas(16) int32_t shamt[4];
-    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
-    for (size_t i = 0; i < 4; ++i) {
-      value[i] = value[i] >> (shamt[i] & 0x1F);
-    }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
-  }
-
   static void EmitInt32(X64Emitter& e, const EmitArgType& i) {
     if (i.src2.is_constant) {
       const auto& shamt = i.src2.constant();
@@ -1222,69 +1186,39 @@ struct VECTOR_SHA_V128
       e.lea(e.r9, e.StashXmm(1, i.src2));
     }
     e.lea(e.r8, e.StashXmm(0, i.src1));
-    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShaI32));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShr<int32_t>));
     e.vmovaps(i.dest, e.xmm0);
     e.L(end);
     }
   }
-
-  static void Emit(X64Emitter& e, const EmitArgType& i) {
-    switch (i.instr->flags) {
-      case INT8_TYPE:
-        EmitInt8(e, i);
-        break;
-      case INT16_TYPE:
-        EmitInt16(e, i);
-        break;
-      case INT32_TYPE:
-        EmitInt32(e, i);
-        break;
-      default:
-        assert_always();
-        break;
-    }
-  }
 };
 EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHA, VECTOR_SHA_V128);
 
 // ============================================================================
 // OPCODE_VECTOR_ROTATE_LEFT
 // ============================================================================
+template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
+static __m128i EmulateVectorRotateLeft(void*, __m128i src1, __m128i src2) {
+  alignas(16) T value[16 / sizeof(T)];
+  alignas(16) T shamt[16 / sizeof(T)];
+
+  // Load SSE registers into a C array.
+  _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
+  _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
+
+  for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
+    value[i] = xe::rotate_left<T>(value[i], shamt[i] & ((sizeof(T) * 8) - 1));
+  }
+
+  // Store result and return it.
+  return _mm_load_si128(reinterpret_cast<__m128i*>(value));
+}
+
 // TODO(benvanik): AVX512 has a native variable rotate (rolv).
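Note (editorial, not part of the patch): the rotate template defers to xe::rotate_left from xenia/base rather than open-coding the rotate. For reference, a plain-C++ rotate with the shape that helper is assumed to have (a sketch under that assumption, for unsigned lane types; not Xenia's actual implementation):

    #include <cstdint>

    template <typename T>
    T rotate_left(T value, uint8_t count) {
      constexpr unsigned bits = sizeof(T) * 8;
      count &= bits - 1;             // callers also pre-mask, as above
      if (count == 0) return value;  // avoid a full-width shift (UB)
      return T((value << count) | (value >> (bits - count)));
    }
    // rotate_left<uint8_t>(0x81, 1) == 0x03

The "rolv" in the TODO refers to the AVX-512 vprolv family, which performs this per-lane variable rotate in a single instruction for 32- and 64-bit lanes.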
 struct VECTOR_ROTATE_LEFT_V128
     : Sequence<VECTOR_ROTATE_LEFT_V128,
                I<OPCODE_VECTOR_ROTATE_LEFT, V128Op, V128Op, V128Op>> {
-  static __m128i EmulateVectorRotateLeftI8(void*, __m128i src1, __m128i src2) {
-    alignas(16) uint8_t value[16];
-    alignas(16) uint8_t shamt[16];
-    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
-    for (size_t i = 0; i < 16; ++i) {
-      value[i] = xe::rotate_left<uint8_t>(value[i], shamt[i] & 0x7);
-    }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
-  }
-  static __m128i EmulateVectorRotateLeftI16(void*, __m128i src1, __m128i src2) {
-    alignas(16) uint16_t value[8];
-    alignas(16) uint16_t shamt[8];
-    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
-    for (size_t i = 0; i < 8; ++i) {
-      value[i] = xe::rotate_left<uint16_t>(value[i], shamt[i] & 0xF);
-    }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
-  }
-  static __m128i EmulateVectorRotateLeftI32(void*, __m128i src1, __m128i src2) {
-    alignas(16) uint32_t value[4];
-    alignas(16) uint32_t shamt[4];
-    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
-    for (size_t i = 0; i < 4; ++i) {
-      value[i] = xe::rotate_left<uint32_t>(value[i], shamt[i] & 0x1F);
-    }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
-  }
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     switch (i.instr->flags) {
       case INT8_TYPE:
@@ -1296,7 +1230,8 @@ struct VECTOR_ROTATE_LEFT_V128
         } else {
           e.lea(e.r9, e.StashXmm(1, i.src2));
         }
-        e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorRotateLeftI8));
+        e.CallNativeSafe(
+            reinterpret_cast<void*>(EmulateVectorRotateLeft<uint8_t>));
         e.vmovaps(i.dest, e.xmm0);
         break;
       case INT16_TYPE:
@@ -1308,7 +1243,8 @@ struct VECTOR_ROTATE_LEFT_V128
         } else {
          e.lea(e.r9, e.StashXmm(1, i.src2));
         }
-        e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorRotateLeftI16));
+        e.CallNativeSafe(
+            reinterpret_cast<void*>(EmulateVectorRotateLeft<uint16_t>));
         e.vmovaps(i.dest, e.xmm0);
         break;
       case INT32_TYPE: {
@@ -1335,7 +1271,8 @@ struct VECTOR_ROTATE_LEFT_V128
         } else {
           e.lea(e.r9, e.StashXmm(1, i.src2));
         }
-        e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorRotateLeftI32));
+        e.CallNativeSafe(
+            reinterpret_cast<void*>(EmulateVectorRotateLeft<uint32_t>));
         e.vmovaps(i.dest, e.xmm0);
       } break;
@@ -1351,35 +1288,28 @@ EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT, VECTOR_ROTATE_LEFT_V128);
 // ============================================================================
 // OPCODE_VECTOR_AVERAGE
 // ============================================================================
+template <typename T, std::enable_if_t<std::is_integral<T>::value, int> = 0>
+static __m128i EmulateVectorAverage(void*, __m128i src1, __m128i src2) {
+  alignas(16) T src1v[16 / sizeof(T)];
+  alignas(16) T src2v[16 / sizeof(T)];
+  alignas(16) T value[16 / sizeof(T)];
+
+  // Load SSE registers into a C array.
+  _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1);
+  _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2);
+
+  for (size_t i = 0; i < (16 / sizeof(T)); ++i) {
+    auto t = (uint64_t(src1v[i]) + uint64_t(src2v[i]) + 1) / 2;
+    value[i] = T(t);
+  }
+
+  // Store result and return it.
+  return _mm_load_si128(reinterpret_cast<__m128i*>(value));
+}
+
 struct VECTOR_AVERAGE
     : Sequence<VECTOR_AVERAGE,
                I<OPCODE_VECTOR_AVERAGE, V128Op, V128Op, V128Op>> {
-  static __m128i EmulateVectorAverageUnsignedI32(void*, __m128i src1,
-                                                 __m128i src2) {
-    alignas(16) uint32_t src1v[4];
-    alignas(16) uint32_t src2v[4];
-    alignas(16) uint32_t value[4];
-    _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2);
-    for (size_t i = 0; i < 4; ++i) {
-      auto t = (uint64_t(src1v[i]) + uint64_t(src2v[i]) + 1) >> 1;
-      value[i] = uint32_t(t);
-    }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
-  }
-  static __m128i EmulateVectorAverageSignedI32(void*, __m128i src1,
-                                               __m128i src2) {
-    alignas(16) int32_t src1v[4];
-    alignas(16) int32_t src2v[4];
-    alignas(16) int32_t value[4];
-    _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1);
-    _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2);
-    for (size_t i = 0; i < 4; ++i) {
-      auto t = (int64_t(src1v[i]) + int64_t(src2v[i]) + 1) >> 1;
-      value[i] = int32_t(t);
-    }
-    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
-  }
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     EmitCommutativeBinaryXmmOp(
         e, i,
@@ -1414,7 +1344,7 @@ struct VECTOR_AVERAGE
         }
         e.lea(e.r8, e.StashXmm(0, i.src1));
         e.CallNativeSafe(
-            reinterpret_cast<void*>(EmulateVectorAverageUnsignedI32));
+            reinterpret_cast<void*>(EmulateVectorAverage<uint32_t>));
         e.vmovaps(i.dest, e.xmm0);
       } else {
         if (i.src2.is_constant) {
@@ -1425,7 +1355,7 @@ struct VECTOR_AVERAGE
         }
         e.lea(e.r8, e.StashXmm(0, i.src1));
         e.CallNativeSafe(
-            reinterpret_cast<void*>(EmulateVectorAverageSignedI32));
+            reinterpret_cast<void*>(EmulateVectorAverage<int32_t>));
         e.vmovaps(i.dest, e.xmm0);
       }
       break;
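Note (editorial, not part of the patch): one subtlety worth recording about the average template. The deleted code used uint64_t with >> 1 for the unsigned variant and int64_t with >> 1 for the signed one, while the template does uint64_t arithmetic with truncating division for both instantiations. That remains correct for EmulateVectorAverage<int32_t>: sign-extending into uint64_t, letting the sum wrap, and dividing the nonnegative result by two yields floor((a + b + 1) / 2), which is exactly the rounding average vavgsw requires. A quick self-check (illustrative, not from the patch):

    #include <cassert>
    #include <cstdint>

    int32_t avg_s32(int32_t a, int32_t b) {
      // Negative inputs sign-extend, the sum wraps mod 2^64, and the
      // truncating unsigned division still lands on floor((a + b + 1) / 2).
      auto t = (uint64_t(a) + uint64_t(b) + 1) / 2;
      return int32_t(t);
    }

    int main() {
      assert(avg_s32(-1, -1) == -1);
      assert(avg_s32(-5, 0) == -2);  // floor((-5 + 0 + 1) / 2) = -2
      assert(avg_s32(3, 4) == 4);    // ties round toward +infinity
      assert(avg_s32(INT32_MAX, INT32_MAX) == INT32_MAX);  // no overflow
      return 0;
    }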