diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc
index 5ee8bed75..b4593f222 100644
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@@ -4493,12 +4493,39 @@ EMITTER(SHL_I64, MATCH(I<OPCODE_SHL, I64<>, I64<>, I8<>>)) {
     EmitShlXX<SHL_I64, Reg64>(e, i);
   }
 };
+EMITTER(SHL_V128, MATCH(I<OPCODE_SHL, V128<>, V128<>, I8<>>)) {
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    // TODO(benvanik): native version (with shift magic).
+    if (i.src2.is_constant) {
+      e.mov(e.r9, i.src2.constant());
+    } else {
+      e.mov(e.r9, i.src2);
+    }
+    e.lea(e.r8, e.StashXmm(0, i.src1));
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateShlV128));
+    e.vmovaps(i.dest, e.xmm0);
+  }
+  static __m128i EmulateShlV128(void*, __m128i src1, uint8_t src2) {
+    // Almost all instances use shamt = 1, but the value is non-constant.
+    // shamt is in [0, 7].
+    uint8_t shamt = src2 & 0x7;
+    alignas(16) vec128_t value;
+    _mm_store_si128(reinterpret_cast<__m128i*>(&value), src1);
+    for (int i = 0; i < 15; ++i) {
+      value.u8[i ^ 0x3] = (value.u8[i ^ 0x3] << shamt) |
+                          (value.u8[(i + 1) ^ 0x3] >> (8 - shamt));
+    }
+    value.u8[15 ^ 0x3] = value.u8[15 ^ 0x3] << shamt;
+    return _mm_load_si128(reinterpret_cast<__m128i*>(&value));
+  }
+};
 EMITTER_OPCODE_TABLE(
     OPCODE_SHL,
     SHL_I8,
     SHL_I16,
     SHL_I32,
-    SHL_I64);
+    SHL_I64,
+    SHL_V128);
 
 
 // ============================================================================
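
EmulateShlV128 shifts the whole 128-bit value left by 0-7 bits, one byte at a time. vec128_t stores the guest's big-endian bytes swapped within each 32-bit word on the little-endian host, so indexing with i ^ 0x3 walks the value in guest byte order, most significant byte first: each byte keeps its own bits shifted up and borrows the high bits of the byte to its right, and the last byte has nothing to borrow. Below is a minimal standalone sketch of the same byte-carry technique, assuming a plain 16-byte array already in big-endian order so no word swizzle is needed; shl_v128_bytes is a hypothetical name for illustration, not part of the xenia tree:

    // Byte-carry whole-vector left shift, sketched over a plain
    // big-endian byte array (no vec128_t word swizzle required).
    #include <cstdint>

    void shl_v128_bytes(uint8_t bytes[16], uint8_t shamt) {
      shamt &= 0x7;  // Whole-vector shifts only cover [0, 7] bits.
      if (shamt == 0) {
        return;  // shamt == 0 is an identity; skip the loop.
      }
      for (int i = 0; i < 15; ++i) {
        // This byte's bits move up; the gap is filled by the high
        // bits of the byte to its right.
        bytes[i] = static_cast<uint8_t>((bytes[i] << shamt) |
                                        (bytes[i + 1] >> (8 - shamt)));
      }
      // The last byte has no neighbor to borrow from.
      bytes[15] = static_cast<uint8_t>(bytes[15] << shamt);
    }

The CallNativeSafe round trip leaves JIT code on every vector shift; the TODO presumably refers to replacing it with inline SSE (for example, a pair of 64-bit lane shifts plus carrying the bits across the lane boundary), which would avoid the call entirely.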