x86emitter: Add some AVX/AVX2 instructions and YMM registers

This commit is contained in:
Connor McLaughlin 2022-10-12 17:24:20 +10:00 committed by refractionpcsx2
parent ac10e00d7c
commit 375c0a02bb
10 changed files with 455 additions and 5 deletions

View File

@ -32,6 +32,7 @@ target_sources(common PRIVATE
Timer.cpp Timer.cpp
ThreadPool.cpp ThreadPool.cpp
WindowInfo.cpp WindowInfo.cpp
emitter/avx.cpp
emitter/bmi.cpp emitter/bmi.cpp
emitter/cpudetect.cpp emitter/cpudetect.cpp
emitter/fpu.cpp emitter/fpu.cpp

View File

@ -105,6 +105,7 @@
<ClCompile Include="Windows\WinThreads.cpp" /> <ClCompile Include="Windows\WinThreads.cpp" />
<ClCompile Include="Misc.cpp" /> <ClCompile Include="Misc.cpp" />
<ClCompile Include="Semaphore.cpp" /> <ClCompile Include="Semaphore.cpp" />
<ClCompile Include="emitter\avx.cpp" />
<ClCompile Include="emitter\bmi.cpp" /> <ClCompile Include="emitter\bmi.cpp" />
<ClCompile Include="emitter\cpudetect.cpp" /> <ClCompile Include="emitter\cpudetect.cpp" />
<ClCompile Include="emitter\fpu.cpp" /> <ClCompile Include="emitter\fpu.cpp" />
@ -188,6 +189,7 @@
<ClInclude Include="Vulkan\Util.h" /> <ClInclude Include="Vulkan\Util.h" />
<ClInclude Include="WindowInfo.h" /> <ClInclude Include="WindowInfo.h" />
<ClInclude Include="Threading.h" /> <ClInclude Include="Threading.h" />
<ClInclude Include="emitter\implement\avx.h" />
<ClInclude Include="emitter\implement\bmi.h" /> <ClInclude Include="emitter\implement\bmi.h" />
<ClInclude Include="emitter\cpudetect_internal.h" /> <ClInclude Include="emitter\cpudetect_internal.h" />
<ClInclude Include="emitter\instructions.h" /> <ClInclude Include="emitter\instructions.h" />

View File

@ -34,6 +34,9 @@
<ClCompile Include="emitter\LnxCpuDetect.cpp"> <ClCompile Include="emitter\LnxCpuDetect.cpp">
<Filter>Source Files</Filter> <Filter>Source Files</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="emitter\avx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="Linux\LnxHostSys.cpp"> <ClCompile Include="Linux\LnxHostSys.cpp">
<Filter>Source Files</Filter> <Filter>Source Files</Filter>
</ClCompile> </ClCompile>
@ -297,6 +300,9 @@
<ClInclude Include="emitter\implement\movs.h"> <ClInclude Include="emitter\implement\movs.h">
<Filter>Header Files</Filter> <Filter>Header Files</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="emitter\implement\avx.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="Threading.h"> <ClInclude Include="Threading.h">
<Filter>Header Files</Filter> <Filter>Header Files</Filter>
</ClInclude> </ClInclude>

174
common/emitter/avx.cpp Normal file
View File

@ -0,0 +1,174 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2022 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "common/emitter/internal.h"
#include "common/emitter/tools.h"
namespace x86Emitter
{
const xImplAVX_Move xVMOVAPS = {0x00, 0x28, 0x29};
const xImplAVX_Move xVMOVUPS = {0x00, 0x10, 0x11};
const xImplAVX_ArithFloat xVADD = {
{0x00, 0x58}, // VADDPS
{0x66, 0x58}, // VADDPD
{0xF3, 0x58}, // VADDSS
{0xF2, 0x58}, // VADDSD
};
const xImplAVX_ArithFloat xVSUB = {
{0x00, 0x5C}, // VSUBPS
{0x66, 0x5C}, // VSUBPD
{0xF3, 0x5C}, // VSUBSS
{0xF2, 0x5C}, // VSUBSD
};
const xImplAVX_ArithFloat xVMUL = {
{0x00, 0x59}, // VMULPS
{0x66, 0x59}, // VMULPD
{0xF3, 0x59}, // VMULSS
{0xF2, 0x59}, // VMULSD
};
const xImplAVX_ArithFloat xVDIV = {
{0x00, 0x5E}, // VDIVPS
{0x66, 0x5E}, // VDIVPD
{0xF3, 0x5E}, // VDIVSS
{0xF2, 0x5E}, // VDIVSD
};
const xImplAVX_CmpFloat xVCMP = {
{SSE2_Equal},
{SSE2_Less},
{SSE2_LessOrEqual},
{SSE2_Unordered},
{SSE2_NotEqual},
{SSE2_NotLess},
{SSE2_NotLessOrEqual},
{SSE2_Ordered},
};
const xImplAVX_ThreeArgYMM xVPAND = {0x66, 0xDB};
const xImplAVX_ThreeArgYMM xVPANDN = {0x66, 0xDF};
const xImplAVX_ThreeArgYMM xVPOR = {0x66, 0xEB};
const xImplAVX_ThreeArgYMM xVPXOR = {0x66, 0xEF};
const xImplAVX_CmpInt xVPCMP = {
{0x66, 0x74}, // VPCMPEQB
{0x66, 0x75}, // VPCMPEQW
{0x66, 0x76}, // VPCMPEQD
{0x66, 0x64}, // VPCMPGTB
{0x66, 0x65}, // VPCMPGTW
{0x66, 0x66}, // VPCMPGTD
};
void xVMOVMSKPS(const xRegister32& to, const xRegisterSSE& from)
{
xOpWriteC5(0x00, 0x50, to, xRegister32(), from);
}
void xVMOVMSKPD(const xRegister32& to, const xRegisterSSE& from)
{
xOpWriteC5(0x66, 0x50, to, xRegister32(), from);
}
void xVZEROUPPER()
{
// rather than dealing with nonexistant operands..
xWrite8(0xc5);
xWrite8(0xf8);
xWrite8(0x77);
}
void xImplAVX_Move::operator()(const xRegisterSSE& to, const xRegisterSSE& from) const
{
if (to != from)
xOpWriteC5(Prefix, LoadOpcode, to, xRegisterSSE(), from);
}
void xImplAVX_Move::operator()(const xRegisterSSE& to, const xIndirectVoid& from) const
{
xOpWriteC5(Prefix, LoadOpcode, to, xRegisterSSE(), from);
}
void xImplAVX_Move::operator()(const xIndirectVoid& to, const xRegisterSSE& from) const
{
xOpWriteC5(Prefix, StoreOpcode, from, xRegisterSSE(), to);
}
void xImplAVX_ThreeArg::operator()(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const
{
pxAssert(!to.IsWideSIMD() && !from1.IsWideSIMD() && !from2.IsWideSIMD());
xOpWriteC5(Prefix, Opcode, to, from1, from2);
}
void xImplAVX_ThreeArg::operator()(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const
{
pxAssert(!to.IsWideSIMD() && !from1.IsWideSIMD());
xOpWriteC5(Prefix, Opcode, to, from1, from2);
}
void xImplAVX_ThreeArgYMM::operator()(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const
{
xOpWriteC5(Prefix, Opcode, to, from1, from2);
}
void xImplAVX_ThreeArgYMM::operator()(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const
{
xOpWriteC5(Prefix, Opcode, to, from1, from2);
}
void xImplAVX_CmpFloatHelper::PS(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const
{
xOpWriteC5(0x00, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::PS(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const
{
xOpWriteC5(0x00, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::PD(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const
{
xOpWriteC5(0x66, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::PD(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const
{
xOpWriteC5(0x66, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::SS(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const
{
xOpWriteC5(0xF3, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::SS(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const
{
xOpWriteC5(0xF3, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::SD(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const
{
xOpWriteC5(0xF2, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
void xImplAVX_CmpFloatHelper::SD(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const
{
xOpWriteC5(0xF2, 0xC2, to, from1, from2);
xWrite8(static_cast<u8>(CType));
}
} // namespace x86Emitter

View File

@ -0,0 +1,113 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2022 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
namespace x86Emitter
{
struct xImplAVX_Move
{
u8 Prefix;
u8 LoadOpcode;
u8 StoreOpcode;
void operator()(const xRegisterSSE& to, const xRegisterSSE& from) const;
void operator()(const xRegisterSSE& to, const xIndirectVoid& from) const;
void operator()(const xIndirectVoid& to, const xRegisterSSE& from) const;
};
struct xImplAVX_ThreeArg
{
u8 Prefix;
u8 Opcode;
void operator()(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const;
void operator()(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const;
};
struct xImplAVX_ThreeArgYMM : xImplAVX_ThreeArg
{
void operator()(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const;
void operator()(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const;
};
struct xImplAVX_ArithFloat
{
xImplAVX_ThreeArgYMM PS;
xImplAVX_ThreeArgYMM PD;
xImplAVX_ThreeArg SS;
xImplAVX_ThreeArg SD;
};
struct xImplAVX_CmpFloatHelper
{
SSE2_ComparisonType CType;
void PS(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const;
void PS(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const;
void PD(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const;
void PD(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const;
void SS(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const;
void SS(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const;
void SD(const xRegisterSSE& to, const xRegisterSSE& from1, const xRegisterSSE& from2) const;
void SD(const xRegisterSSE& to, const xRegisterSSE& from1, const xIndirectVoid& from2) const;
};
struct xImplAVX_CmpFloat
{
xImplAVX_CmpFloatHelper EQ;
xImplAVX_CmpFloatHelper LT;
xImplAVX_CmpFloatHelper LE;
xImplAVX_CmpFloatHelper UO;
xImplAVX_CmpFloatHelper NE;
xImplAVX_CmpFloatHelper GE;
xImplAVX_CmpFloatHelper GT;
xImplAVX_CmpFloatHelper OR;
};
struct xImplAVX_CmpInt
{
// Compare packed bytes for equality.
// If a data element in dest is equal to the corresponding date element src, the
// corresponding data element in dest is set to all 1s; otherwise, it is set to all 0s.
const xImplAVX_ThreeArgYMM EQB;
// Compare packed words for equality.
// If a data element in dest is equal to the corresponding date element src, the
// corresponding data element in dest is set to all 1s; otherwise, it is set to all 0s.
const xImplAVX_ThreeArgYMM EQW;
// Compare packed doublewords [32-bits] for equality.
// If a data element in dest is equal to the corresponding date element src, the
// corresponding data element in dest is set to all 1s; otherwise, it is set to all 0s.
const xImplAVX_ThreeArgYMM EQD;
// Compare packed signed bytes for greater than.
// If a data element in dest is greater than the corresponding date element src, the
// corresponding data element in dest is set to all 1s; otherwise, it is set to all 0s.
const xImplAVX_ThreeArgYMM GTB;
// Compare packed signed words for greater than.
// If a data element in dest is greater than the corresponding date element src, the
// corresponding data element in dest is set to all 1s; otherwise, it is set to all 0s.
const xImplAVX_ThreeArgYMM GTW;
// Compare packed signed doublewords [32-bits] for greater than.
// If a data element in dest is greater than the corresponding date element src, the
// corresponding data element in dest is set to all 1s; otherwise, it is set to all 0s.
const xImplAVX_ThreeArgYMM GTD;
};
} // namespace x86Emitter

View File

@ -622,4 +622,24 @@ namespace x86Emitter
extern const SimdImpl_Pack xPACK; extern const SimdImpl_Pack xPACK;
extern const xImplSimd_PInsert xPINSR; extern const xImplSimd_PInsert xPINSR;
extern const SimdImpl_PExtract xPEXTR; extern const SimdImpl_PExtract xPEXTR;
// ------------------------------------------------------------------------
extern const xImplAVX_Move xVMOVAPS;
extern const xImplAVX_Move xVMOVUPS;
extern const xImplAVX_ArithFloat xVADD;
extern const xImplAVX_ArithFloat xVSUB;
extern const xImplAVX_ArithFloat xVMUL;
extern const xImplAVX_ArithFloat xVDIV;
extern const xImplAVX_CmpFloat xVCMP;
extern const xImplAVX_ThreeArgYMM xVPAND;
extern const xImplAVX_ThreeArgYMM xVPANDN;
extern const xImplAVX_ThreeArgYMM xVPOR;
extern const xImplAVX_ThreeArgYMM xVPXOR;
extern const xImplAVX_CmpInt xVPCMP;
extern void xVMOVMSKPS(const xRegister32& to, const xRegisterSSE& from);
extern void xVMOVMSKPD(const xRegister32& to, const xRegisterSSE& from);
extern void xVZEROUPPER();
} // namespace x86Emitter } // namespace x86Emitter

View File

@ -123,12 +123,18 @@ namespace x86Emitter
{ {
pxAssert(prefix == 0 || prefix == 0x66 || prefix == 0xF3 || prefix == 0xF2); pxAssert(prefix == 0 || prefix == 0x66 || prefix == 0xF3 || prefix == 0xF2);
const xRegisterInt& reg = param1.IsReg() ? param1 : param2; const xRegisterBase& reg = param1.IsReg() ? param1 : param2;
u8 nR = reg.IsExtended() ? 0x00 : 0x80; u8 nR = reg.IsExtended() ? 0x00 : 0x80;
u8 L = reg.IsWideSIMD() ? 4 : 0; u8 L;
u8 nv = (~param2.GetId() & 0xF) << 3; // Needed for 256-bit movemask.
if constexpr (std::is_same_v<T3, xRegisterSSE>)
L = param3.IsWideSIMD() ? 4 : 0;
else
L = reg.IsWideSIMD() ? 4 : 0;
u8 nv = (param2.IsEmpty() ? 0xF : ((~param2.GetId() & 0xF))) << 3;
u8 p = u8 p =
prefix == 0xF2 ? 3 : prefix == 0xF2 ? 3 :

View File

@ -120,6 +120,16 @@ const xRegisterSSE
xmm12(12), xmm13(13), xmm12(12), xmm13(13),
xmm14(14), xmm15(15); xmm14(14), xmm15(15);
const xRegisterSSE
ymm0(0, xRegisterYMMTag()), ymm1(1, xRegisterYMMTag()),
ymm2(2, xRegisterYMMTag()), ymm3(3, xRegisterYMMTag()),
ymm4(4, xRegisterYMMTag()), ymm5(5, xRegisterYMMTag()),
ymm6(6, xRegisterYMMTag()), ymm7(7, xRegisterYMMTag()),
ymm8(8, xRegisterYMMTag()), ymm9(9, xRegisterYMMTag()),
ymm10(10, xRegisterYMMTag()), ymm11(11, xRegisterYMMTag()),
ymm12(12, xRegisterYMMTag()), ymm13(13, xRegisterYMMTag()),
ymm14(14, xRegisterYMMTag()), ymm15(15, xRegisterYMMTag());
const xAddressReg const xAddressReg
rax(0), rbx(3), rax(0), rbx(3),
rcx(1), rdx(2), rcx(1), rdx(2),

View File

@ -420,6 +420,8 @@ namespace x86Emitter
// This register type is provided to allow legal syntax for instructions that accept // This register type is provided to allow legal syntax for instructions that accept
// an XMM register as a parameter, but do not allow for a GPR. // an XMM register as a parameter, but do not allow for a GPR.
struct xRegisterYMMTag {};
class xRegisterSSE : public xRegisterBase class xRegisterSSE : public xRegisterBase
{ {
typedef xRegisterBase _parent; typedef xRegisterBase _parent;
@ -430,11 +432,16 @@ namespace x86Emitter
: _parent(16, regId) : _parent(16, regId)
{ {
} }
xRegisterSSE(int regId, xRegisterYMMTag)
: _parent(32, regId)
{
}
bool operator==(const xRegisterSSE& src) const { return this->Id == src.Id; } bool operator==(const xRegisterSSE& src) const { return this->Id == src.Id; }
bool operator!=(const xRegisterSSE& src) const { return this->Id != src.Id; } bool operator!=(const xRegisterSSE& src) const { return this->Id != src.Id; }
static const inline xRegisterSSE& GetInstance(uint id); static const inline xRegisterSSE& GetInstance(uint id);
static const inline xRegisterSSE& GetYMMInstance(uint id);
}; };
class xRegisterCL : public xRegister8 class xRegisterCL : public xRegister8
@ -570,13 +577,19 @@ namespace x86Emitter
extern const xRegisterEmpty xEmptyReg; extern const xRegisterEmpty xEmptyReg;
// clang-format off // clang-format off
extern const xRegisterSSE
extern const xRegisterSSE
xmm0, xmm1, xmm2, xmm3, xmm0, xmm1, xmm2, xmm3,
xmm4, xmm5, xmm6, xmm7, xmm4, xmm5, xmm6, xmm7,
xmm8, xmm9, xmm10, xmm11, xmm8, xmm9, xmm10, xmm11,
xmm12, xmm13, xmm14, xmm15; xmm12, xmm13, xmm14, xmm15;
// TODO: This needs to be _M_SSE >= 0x500'ed, but we can't do it atm because common doesn't have variants.
extern const xRegisterSSE
ymm0, ymm1, ymm2, ymm3,
ymm4, ymm5, ymm6, ymm7,
ymm8, ymm9, ymm10, ymm11,
ymm12, ymm13, ymm14, ymm15;
extern const xAddressReg extern const xAddressReg
rax, rbx, rcx, rdx, rax, rbx, rcx, rdx,
rsi, rdi, rbp, rsp, rsi, rdi, rbp, rsp,
@ -627,6 +640,19 @@ extern const xRegister32
return *m_tbl_xmmRegs[id]; return *m_tbl_xmmRegs[id];
} }
const xRegisterSSE& xRegisterSSE::GetYMMInstance(uint id)
{
static const xRegisterSSE* const m_tbl_ymmRegs[] =
{
&ymm0, &ymm1, &ymm2, &ymm3,
&ymm4, &ymm5, &ymm6, &ymm7,
&ymm8, &ymm9, &ymm10, &ymm11,
&ymm12, &ymm13, &ymm14, &ymm15};
pxAssert(id < iREGCNT_XMM);
return *m_tbl_ymmRegs[id];
}
// -------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------
// xAddressVoid // xAddressVoid
// -------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------
@ -949,3 +975,4 @@ extern const xRegister32
#include "implement/jmpcall.h" #include "implement/jmpcall.h"
#include "implement/bmi.h" #include "implement/bmi.h"
#include "implement/avx.h"

View File

@ -167,3 +167,94 @@ TEST(CodegenTests, SSETest)
CODEGEN_TEST_64(xBLEND.PD(xmm8, xmm9, 0xaa), "66 45 0f 3a 0d c1 aa"); CODEGEN_TEST_64(xBLEND.PD(xmm8, xmm9, 0xaa), "66 45 0f 3a 0d c1 aa");
CODEGEN_TEST_64(xEXTRACTPS(ptr32[base], xmm1, 2), "66 0f 3a 17 0d f6 ff ff ff 02"); CODEGEN_TEST_64(xEXTRACTPS(ptr32[base], xmm1, 2), "66 0f 3a 17 0d f6 ff ff ff 02");
} }
TEST(CodegenTests, AVXTest)
{
CODEGEN_TEST_64(xVMOVAPS(xmm0, xmm1), "c5 f8 28 c1");
CODEGEN_TEST_64(xVMOVAPS(xmm0, ptr32[rdi]), "c5 f8 28 07");
CODEGEN_TEST_64(xVMOVAPS(ptr32[rdi], xmm0), "c5 f8 29 07");
CODEGEN_TEST_64(xVMOVUPS(xmm0, ptr32[rdi]), "c5 f8 10 07");
CODEGEN_TEST_64(xVMOVUPS(ptr32[rdi], xmm0), "c5 f8 11 07");
CODEGEN_TEST_64(xVADD.PS(xmm0, xmm1, xmm2), "c5 f0 58 c2");
CODEGEN_TEST_64(xVADD.PD(xmm0, xmm1, xmm2), "c5 f1 58 c2");
CODEGEN_TEST_64(xVADD.SS(xmm0, xmm1, xmm2), "c5 f2 58 c2");
CODEGEN_TEST_64(xVADD.SD(xmm0, xmm1, xmm2), "c5 f3 58 c2");
CODEGEN_TEST_64(xVSUB.PS(xmm0, xmm1, xmm2), "c5 f0 5c c2");
CODEGEN_TEST_64(xVSUB.PD(xmm0, xmm1, xmm2), "c5 f1 5c c2");
CODEGEN_TEST_64(xVSUB.SS(xmm0, xmm1, xmm2), "c5 f2 5c c2");
CODEGEN_TEST_64(xVSUB.SD(xmm0, xmm1, xmm2), "c5 f3 5c c2");
CODEGEN_TEST_64(xVMUL.PS(xmm0, xmm1, xmm2), "c5 f0 59 c2");
CODEGEN_TEST_64(xVMUL.PD(xmm0, xmm1, xmm2), "c5 f1 59 c2");
CODEGEN_TEST_64(xVMUL.SS(xmm0, xmm1, xmm2), "c5 f2 59 c2");
CODEGEN_TEST_64(xVMUL.SD(xmm0, xmm1, xmm2), "c5 f3 59 c2");
CODEGEN_TEST_64(xVDIV.PS(xmm0, xmm1, xmm2), "c5 f0 5e c2");
CODEGEN_TEST_64(xVDIV.PD(xmm0, xmm1, xmm2), "c5 f1 5e c2");
CODEGEN_TEST_64(xVDIV.SS(xmm0, xmm1, xmm2), "c5 f2 5e c2");
CODEGEN_TEST_64(xVDIV.SD(xmm0, xmm1, xmm2), "c5 f3 5e c2");
// Don't need to test all variants, since they just change the condition immediate.
CODEGEN_TEST_64(xVCMP.EQ.PS(xmm0, xmm1, xmm2), "c5 f0 c2 c2 00");
CODEGEN_TEST_64(xVCMP.EQ.PD(xmm0, xmm1, xmm2), "c5 f1 c2 c2 00");
CODEGEN_TEST_64(xVCMP.EQ.SS(xmm0, xmm1, xmm2), "c5 f2 c2 c2 00");
CODEGEN_TEST_64(xVCMP.EQ.SD(xmm0, xmm1, xmm2), "c5 f3 c2 c2 00");
CODEGEN_TEST_64(xVCMP.LE.PS(xmm0, xmm1, xmm2), "c5 f0 c2 c2 02");
CODEGEN_TEST_64(xVCMP.LE.PD(xmm0, xmm1, xmm2), "c5 f1 c2 c2 02");
CODEGEN_TEST_64(xVCMP.LE.SS(xmm0, xmm1, xmm2), "c5 f2 c2 c2 02");
CODEGEN_TEST_64(xVCMP.LE.SD(xmm0, xmm1, xmm2), "c5 f3 c2 c2 02");
CODEGEN_TEST_64(xVPCMP.EQB(xmm0, xmm1, xmm2), "c5 f1 74 c2");
CODEGEN_TEST_64(xVPCMP.EQW(xmm0, xmm1, xmm2), "c5 f1 75 c2");
CODEGEN_TEST_64(xVPCMP.EQD(xmm0, xmm1, xmm2), "c5 f1 76 c2");
CODEGEN_TEST_64(xVPCMP.GTB(xmm0, xmm1, xmm2), "c5 f1 64 c2");
CODEGEN_TEST_64(xVPCMP.GTW(xmm0, xmm1, xmm2), "c5 f1 65 c2");
CODEGEN_TEST_64(xVPCMP.GTD(xmm0, xmm1, xmm2), "c5 f1 66 c2");
CODEGEN_TEST_64(xVPAND(xmm0, xmm1, xmm2), "c5 f1 db c2");
CODEGEN_TEST_64(xVPANDN(xmm0, xmm1, xmm2), "c5 f1 df c2");
CODEGEN_TEST_64(xVPOR(xmm0, xmm1, xmm2), "c5 f1 eb c2");
CODEGEN_TEST_64(xVPXOR(xmm0, xmm1, xmm2), "c5 f1 ef c2");
CODEGEN_TEST_64(xVMOVMSKPS(eax, xmm1), "c5 f8 50 c1");
CODEGEN_TEST_64(xVMOVMSKPD(eax, xmm1), "c5 f9 50 c1");
}
TEST(CodegenTests, AVX256Test)
{
CODEGEN_TEST_64(xVMOVAPS(ymm0, ymm1), "c5 fc 28 c1");
CODEGEN_TEST_64(xVMOVAPS(ymm0, ptr32[rdi]), "c5 fc 28 07");
CODEGEN_TEST_64(xVMOVAPS(ptr32[rdi], ymm0), "c5 fc 29 07");
CODEGEN_TEST_64(xVMOVUPS(ymm0, ptr32[rdi]), "c5 fc 10 07");
CODEGEN_TEST_64(xVMOVUPS(ptr32[rdi], ymm0), "c5 fc 11 07");
CODEGEN_TEST_64(xVZEROUPPER(), "c5 f8 77");
CODEGEN_TEST_64(xVADD.PS(ymm0, ymm1, ymm2), "c5 f4 58 c2");
CODEGEN_TEST_64(xVADD.PD(ymm0, ymm1, ymm2), "c5 f5 58 c2");
CODEGEN_TEST_64(xVSUB.PS(ymm0, ymm1, ymm2), "c5 f4 5c c2");
CODEGEN_TEST_64(xVSUB.PD(ymm0, ymm1, ymm2), "c5 f5 5c c2");
CODEGEN_TEST_64(xVMUL.PS(ymm0, ymm1, ymm2), "c5 f4 59 c2");
CODEGEN_TEST_64(xVMUL.PD(ymm0, ymm1, ymm2), "c5 f5 59 c2");
CODEGEN_TEST_64(xVDIV.PS(ymm0, ymm1, ymm2), "c5 f4 5e c2");
CODEGEN_TEST_64(xVDIV.PD(ymm0, ymm1, ymm2), "c5 f5 5e c2");
CODEGEN_TEST_64(xVCMP.EQ.PS(ymm0, ymm1, ymm2), "c5 f4 c2 c2 00");
CODEGEN_TEST_64(xVCMP.EQ.PD(ymm0, ymm1, ymm2), "c5 f5 c2 c2 00");
CODEGEN_TEST_64(xVCMP.LE.PS(ymm0, ymm1, ymm2), "c5 f4 c2 c2 02");
CODEGEN_TEST_64(xVCMP.LE.PD(ymm0, ymm1, ymm2), "c5 f5 c2 c2 02");
CODEGEN_TEST_64(xVPCMP.EQB(ymm0, ymm1, ymm2), "c5 f5 74 c2");
CODEGEN_TEST_64(xVPCMP.EQW(ymm0, ymm1, ymm2), "c5 f5 75 c2");
CODEGEN_TEST_64(xVPCMP.EQD(ymm0, ymm1, ymm2), "c5 f5 76 c2");
CODEGEN_TEST_64(xVPCMP.GTB(ymm0, ymm1, ymm2), "c5 f5 64 c2");
CODEGEN_TEST_64(xVPCMP.GTW(ymm0, ymm1, ymm2), "c5 f5 65 c2");
CODEGEN_TEST_64(xVPCMP.GTD(ymm0, ymm1, ymm2), "c5 f5 66 c2");
CODEGEN_TEST_64(xVPAND(ymm0, ymm1, ymm2), "c5 f5 db c2");
CODEGEN_TEST_64(xVPANDN(ymm0, ymm1, ymm2), "c5 f5 df c2");
CODEGEN_TEST_64(xVPOR(ymm0, ymm1, ymm2), "c5 f5 eb c2");
CODEGEN_TEST_64(xVPXOR(ymm0, ymm1, ymm2), "c5 f5 ef c2");
CODEGEN_TEST_64(xVMOVMSKPS(eax, ymm1), "c5 fc 50 c1");
CODEGEN_TEST_64(xVMOVMSKPD(eax, ymm1), "c5 fd 50 c1");
}