Mostly code cleanups, XBYAK 2.99, VEX conversion for the sw renderer (3-5% faster), GSState::Move fix for dark cloud 2 invention crash.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4287 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2011-02-07 01:59:05 +00:00
parent e2d36a53a4
commit ca7abd983a
56 changed files with 6404 additions and 3150 deletions

View File

@ -193,6 +193,7 @@ static const int __pagesize = PCSX2_PAGESIZE;
# define __aligned(alig) __declspec(align(alig))
# define __aligned16 __declspec(align(16))
# define __aligned32 __declspec(align(32))
# define __pagealigned __declspec(align(PCSX2_PAGESIZE))
// Deprecated; use __align instead.

View File

@ -153,7 +153,7 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
{
GSDevice* dev = NULL;
if( renderer == -1 )
if(renderer == -1)
{
renderer = theApp.GetConfig("renderer", 0);
}
@ -167,6 +167,7 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
// GSopen call then they'll get corrupted graphics, but that's not my problem.
delete s_gs;
s_gs = NULL;
}
@ -178,20 +179,25 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
case 12: case 13: new GSDeviceNull(); break;
}
if( !dev ) return -1;
if(!dev) return -1;
if( !s_gs )
if(!s_gs)
{
switch(renderer)
{
default:
case 0: s_gs = new GSRendererDX9(); break;
case 3: s_gs = new GSRendererDX11(); break;
case 0:
s_gs = new GSRendererDX9();
break;
case 3:
s_gs = new GSRendererDX11();
break;
case 2: case 5: case 8: case 11: case 13:
s_gs = new GSRendererNull(); break;
s_gs = new GSRendererNull();
break;
case 1: case 4: case 7: case 10: case 12:
s_gs = new GSRendererSW(); break;
s_gs = new GSRendererSW();
break;
}
s_renderer = renderer;
@ -519,72 +525,6 @@ EXPORT_C GSsetFrameLimit(int limit)
#ifdef _WINDOWS
// Returns false if the window's been closed or an invalid packet was encountered.
static __forceinline bool LoopDatPacket_Thingamajig(HWND hWnd, uint8 (&regs)[0x2000], vector<uint8>& buff, FILE* fp, long start)
{
switch(fgetc(fp))
{
case EOF:
fseek(fp, start, 0);
return !!IsWindowVisible(hWnd);
case 0:
{
uint32 index = fgetc(fp);
uint32 size;
fread(&size, 4, 1, fp);
switch(index)
{
case 0:
{
if(buff.size() < 0x4000) buff.resize(0x4000);
uint32 addr = 0x4000 - size;
fread(&buff[0] + addr, size, 1, fp);
GSgifTransfer1(&buff[0], addr);
}
break;
case 1:
if(buff.size() < size) buff.resize(size);
fread(&buff[0], size, 1, fp);
GSgifTransfer2(&buff[0], size / 16);
break;
case 2:
if(buff.size() < size) buff.resize(size);
fread(&buff[0], size, 1, fp);
GSgifTransfer3(&buff[0], size / 16);
break;
}
}
break;
case 1:
GSvsync(fgetc(fp));
return !!IsWindowVisible(hWnd);
case 2:
{
uint32 size;
fread(&size, 4, 1, fp);
if(buff.size() < size) buff.resize(size);
GSreadFIFO2(&buff[0], size / 16);
}
break;
case 3:
fread(regs, 0x2000, 1, fp);
break;
default:
return false;
}
return true;
}
// lpszCmdLine:
// First parameter is the renderer.
// Second parameter is the gs file to load and run.
@ -634,7 +574,73 @@ EXPORT_C GSReplay(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow)
GSvsync(1);
while( LoopDatPacket_Thingamajig(hWnd, regs, buff, fp, start) ) ;
bool exit = false;
while(!exit)
{
uint32 index;
uint32 size;
uint32 addr;
int pos;
switch(fgetc(fp))
{
case EOF:
fseek(fp, start, 0);
exit = !IsWindowVisible(hWnd);
break;
case 0:
index = fgetc(fp);
fread(&size, 4, 1, fp);
switch(index)
{
case 0:
if(buff.size() < 0x4000) buff.resize(0x4000);
addr = 0x4000 - size;
fread(buff.data() + addr, size, 1, fp);
GSgifTransfer1(buff.data(), addr);
break;
case 1:
if(buff.size() < size) buff.resize(size);
fread(buff.data(), size, 1, fp);
GSgifTransfer2(buff.data(), size / 16);
break;
case 2:
if(buff.size() < size) buff.resize(size);
fread(buff.data(), size, 1, fp);
GSgifTransfer3(buff.data(), size / 16);
break;
case 3:
if(buff.size() < size) buff.resize(size);
fread(buff.data(), size, 1, fp);
GSgifTransfer(buff.data(), size / 16);
break;
}
break;
case 1:
GSvsync(fgetc(fp));
exit = !IsWindowVisible(hWnd);
break;
case 2:
fread(&size, 4, 1, fp);
if(buff.size() < size) buff.resize(size);
GSreadFIFO2(&buff[0], size / 16);
break;
case 3:
fread(regs, 0x2000, 1, fp);
break;
}
}
GSclose();
GSshutdown();
@ -672,7 +678,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
{PSM_PSMZ16S, "16ZS"},
};
uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 16);
uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);
for(int i = 0; i < 1024 * 1024 * 4; i++) ptr[i] = (uint8)i;
@ -809,7 +815,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
{
GSLocalMemory mem;
uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 16);
uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);
for(int i = 0; i < 1024 * 1024 * 4; i++) ptr[i] = (uint8)i;

View File

@ -77,6 +77,7 @@ enum GIF_REG
GIF_REG_CLAMP_1 = 0x08,
GIF_REG_CLAMP_2 = 0x09,
GIF_REG_FOG = 0x0a,
GIF_REG_INVALID = 0x0b,
GIF_REG_XYZF3 = 0x0c,
GIF_REG_XYZ3 = 0x0d,
GIF_REG_A_D = 0x0e,
@ -1077,7 +1078,7 @@ REG128_SET(GIFPackedReg)
GIFPackedNOP NOP;
REG_SET_END
__aligned16 struct GIFPath
__aligned32 struct GIFPath
{
GIFTag tag;
uint32 reg;
@ -1107,9 +1108,12 @@ __aligned16 struct GIFPath
if((++reg & 0xf) == nreg)
{
reg = 0;
if(--nloop == 0)
{
return false;
}
}
return true;
}

View File

@ -1201,7 +1201,7 @@ public:
#else
/*
__aligned16 uint32 block[8 * 8];
__aligned32 uint32 block[8 * 8];
UnpackBlock4HL(src, srcpitch, block);
@ -1316,7 +1316,7 @@ public:
#else
/*
__aligned16 uint32 block[8 * 8];
__aligned32 uint32 block[8 * 8];
UnpackBlock4HH(src, srcpitch, block);
@ -1467,7 +1467,7 @@ public:
#else
__aligned16 uint8 block[16 * 16];
__aligned32 uint8 block[16 * 16];
ReadBlock8<true>(src, (uint8*)block, sizeof(block) / 16);
@ -1542,7 +1542,7 @@ public:
#else
__aligned16 uint8 block[(32 / 2) * 16];
__aligned32 uint8 block[(32 / 2) * 16];
ReadBlock4<true>(src, (uint8*)block, sizeof(block) / 16);
@ -1583,7 +1583,7 @@ public:
#else
__aligned16 uint32 block[8 * 8];
__aligned32 uint32 block[8 * 8];
ReadBlock32<true>(src, (uint8*)block, sizeof(block) / 8);
@ -1624,7 +1624,7 @@ public:
#else
__aligned16 uint32 block[8 * 8];
__aligned32 uint32 block[8 * 8];
ReadBlock32<true>(src, (uint8*)block, sizeof(block) / 8);
@ -1665,7 +1665,7 @@ public:
#else
__aligned16 uint32 block[8 * 8];
__aligned32 uint32 block[8 * 8];
ReadBlock32<true>(src, (uint8*)block, sizeof(block) / 8);

View File

@ -68,7 +68,8 @@ void GSCaptureDlg::OnInit()
ComboBoxAppend(IDC_CODECS, "Uncompressed", 0, true);
CoInitialize(0);
CoInitialize(0); // this is obviously wrong here, each thread should call this on start, and where is CoUninitalize?
BeginEnumSysDev(CLSID_VideoCompressorCategory, moniker)
{
Codec c;
@ -195,6 +196,7 @@ bool GSCaptureDlg::OnCommand(HWND hWnd, UINT id, UINT code)
if (ris != 2)
{
wstring s = wstring(c.DisplayName.m_str);
theApp.SetConfig("CaptureVideoCodecDisplayName", string(s.begin(), s.end()).c_str());
}
else

View File

@ -126,7 +126,7 @@ void GSClut::Write(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
void GSClut::WriteCLUT32_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
ALIGN_STACK(16);
ALIGN_STACK(32);
ASSERT(TEX0.CSA == 0);
@ -135,7 +135,7 @@ void GSClut::WriteCLUT32_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TE
void GSClut::WriteCLUT32_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{
ALIGN_STACK(16);
ALIGN_STACK(32);
ASSERT(TEX0.CSA < 16);

View File

@ -28,7 +28,7 @@
class GSLocalMemory;
__aligned16 class GSClut : public GSAlignedClass<16>
__aligned32 class GSClut : public GSAlignedClass<32>
{
GSLocalMemory* m_mem;
@ -37,7 +37,7 @@ __aligned16 class GSClut : public GSAlignedClass<16>
uint32* m_buff32;
uint64* m_buff64;
__aligned16 struct WriteState
__aligned32 struct WriteState
{
GIFRegTEX0 TEX0;
GIFRegTEXCLUT TEXCLUT;
@ -45,7 +45,7 @@ __aligned16 class GSClut : public GSAlignedClass<16>
bool IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
} m_write;
__aligned16 struct ReadState
__aligned32 struct ReadState
{
GIFRegTEX0 TEX0;
GIFRegTEXA TEXA;

View File

@ -145,8 +145,11 @@ void GSDevice::Recycle(GSTexture* t)
if(t)
{
t->last_frame_used = m_frame;
m_pool.push_front(t);
//printf("%d\n",m_pool.size());
while(m_pool.size() > 300)
{
delete m_pool.back();
@ -159,9 +162,11 @@ void GSDevice::Recycle(GSTexture* t)
void GSDevice::AgePool()
{
m_frame++;
while (m_pool.size() > 20 && m_frame - m_pool.back()->last_frame_used > 10)
while(m_pool.size() > 20 && m_frame - m_pool.back()->last_frame_used > 10)
{
delete m_pool.back();
m_pool.pop_back();
}
}

View File

@ -46,7 +46,7 @@ struct InterlaceConstantBuffer
#pragma pack(pop)
class GSDevice : public GSAlignedClass<16>
class GSDevice : public GSAlignedClass<32>
{
list<GSTexture*> m_pool;
@ -66,7 +66,7 @@ protected:
struct {size_t stride, start, count, limit;} m_vertices;
uint32 m_msaa;
DXGI_SAMPLE_DESC m_msaa_desc;
unsigned m_frame; // for ageing the pool
unsigned int m_frame; // for ageing the pool
virtual GSTexture* Create(int type, int w, int h, bool msaa, int format) = 0;

View File

@ -229,8 +229,10 @@ bool GSDevice11::Create(GSWnd* wnd)
}
}
if (m_msaa_desc.Count == 1)
if(m_msaa_desc.Count == 1)
{
m_msaa = 0;
}
// convert
@ -378,7 +380,7 @@ bool GSDevice11::Create(GSWnd* wnd)
if(m_wnd->IsManaged())
{
SetExclusive( !theApp.GetConfig("windowed", 1) );
SetExclusive(!theApp.GetConfig("windowed", 1));
}
return true;
@ -392,11 +394,14 @@ bool GSDevice11::Reset(int w, int h)
if(m_swapchain)
{
DXGI_SWAP_CHAIN_DESC scd;
memset(&scd, 0, sizeof(scd));
m_swapchain->GetDesc(&scd);
m_swapchain->ResizeBuffers(scd.BufferCount, w, h, scd.BufferDesc.Format, 0);
CComPtr<ID3D11Texture2D> backbuffer;
if(FAILED(m_swapchain->GetBuffer(0, __uuidof(ID3D11Texture2D), (void**)&backbuffer)))
{
return false;
@ -422,9 +427,12 @@ void GSDevice11::SetExclusive(bool isExcl)
m_swapchain->ResizeTarget(&desc);
*/
HRESULT hr = m_swapchain->SetFullscreenState( isExcl, NULL );
HRESULT hr = m_swapchain->SetFullscreenState(isExcl, NULL);
if(hr == DXGI_ERROR_NOT_CURRENTLY_AVAILABLE)
{
fprintf(stderr, "(GSdx10) SetExclusive(%s) failed; request unavailable.", isExcl ? "true" : "false");
}
}
void GSDevice11::Flip()
@ -885,10 +893,13 @@ void GSDevice11::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1)
void GSDevice11::PSSetShaderResource(int i, GSTexture* sr)
{
ID3D11ShaderResourceView* srv = NULL;
if (sr) srv = *(GSTexture11*)sr;
if (m_state.ps_srv[i] != srv) {
if(sr) srv = *(GSTexture11*)sr;
if(m_state.ps_srv[i] != srv)
{
m_state.ps_srv[i] = srv;
m_srv_changed = true;
}
}
@ -914,13 +925,17 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb)
m_ctx->PSSetShader(ps, NULL, 0);
}
if (m_srv_changed) {
if (m_srv_changed)
{
m_ctx->PSSetShaderResources(0, 3, m_state.ps_srv);
m_srv_changed = false;
}
if (m_ss_changed) {
if(m_ss_changed)
{
m_ctx->PSSetSamplers(0, 3, m_state.ps_ss);
m_ss_changed = false;
}
@ -982,8 +997,8 @@ void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector
vp.TopLeftX = 0;
vp.TopLeftY = 0;
vp.Width = (FLOAT)rt->GetWidth();
vp.Height = (FLOAT)rt->GetHeight();
vp.Width = (float)rt->GetWidth();
vp.Height = (float)rt->GetHeight();
vp.MinDepth = 0.0f;
vp.MaxDepth = 1.0f;

View File

@ -31,7 +31,6 @@ GSDevice9::GSDevice9()
memset(&m_pp, 0, sizeof(m_pp));
memset(&m_d3dcaps, 0, sizeof(m_d3dcaps));
memset(&m_state, 0, sizeof(m_state));
m_state.bf = 0xffffffff;
@ -39,81 +38,109 @@ GSDevice9::GSDevice9()
GSDevice9::~GSDevice9()
{
for_each(m_mskfix.begin(), m_mskfix.end(), delete_second());
for_each(m_om_bs.begin(), m_om_bs.end(), delete_second());
for_each(m_om_dss.begin(), m_om_dss.end(), delete_second());
for_each(m_ps_ss.begin(), m_ps_ss.end(), delete_second());
for_each(m_mskfix.begin(), m_mskfix.end(), delete_second());
if(m_state.vs_cb) _aligned_free(m_state.vs_cb);
if(m_state.ps_cb) _aligned_free(m_state.ps_cb);
}
// if supported and null != msaa_desc, msaa_desc will contain requested Count and Quality
static bool IsMsaaSupported(IDirect3D9* d3d, D3DFORMAT depth_format, uint msaaCount, DXGI_SAMPLE_DESC* msaa_desc = NULL)
{
if(msaaCount > 16) return false;
//if supported and null!=msaa_desc, msaa_desc will contain requested Count and Quality
static bool IsMsaaSupported(CComPtr<IDirect3D9>& d3d, D3DFORMAT depth_format, uint msaaCount, OUT DXGI_SAMPLE_DESC* msaa_desc=NULL){
D3DCAPS9 d3dcaps;
if (msaaCount>16) return false;
memset(&d3dcaps, 0, sizeof(d3dcaps));
d3d->GetDeviceCaps(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, &d3dcaps);
DWORD quality[2] = {0, 0};
if(SUCCEEDED(d3d->CheckDeviceMultiSampleType(d3dcaps.AdapterOrdinal, d3dcaps.DeviceType, D3DFMT_A8R8G8B8, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &quality[0])) && quality[0] >0
&& SUCCEEDED(d3d->CheckDeviceMultiSampleType(d3dcaps.AdapterOrdinal, d3dcaps.DeviceType, depth_format, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &quality[1])) && quality[1] >0
){
if (msaa_desc){
if(SUCCEEDED(d3d->CheckDeviceMultiSampleType(d3dcaps.AdapterOrdinal, d3dcaps.DeviceType, D3DFMT_A8R8G8B8, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &quality[0])) && quality[0] > 0
&& SUCCEEDED(d3d->CheckDeviceMultiSampleType(d3dcaps.AdapterOrdinal, d3dcaps.DeviceType, depth_format, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &quality[1])) && quality[1] > 0)
{
if(msaa_desc)
{
msaa_desc->Count = msaaCount;
msaa_desc->Quality = std::min<DWORD>(quality[0] - 1, quality[1] - 1);
}
return true;
}
return false;
}
static bool TestDepthFormat(CComPtr<IDirect3D9> &d3d, D3DFORMAT format)
static bool TestDepthFormat(IDirect3D9* d3d, D3DFORMAT format)
{
if (FAILED(d3d->CheckDeviceFormat(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, D3DFMT_X8R8G8B8, D3DUSAGE_DEPTHSTENCIL, D3DRTYPE_SURFACE, format)))
if(FAILED(d3d->CheckDeviceFormat(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, D3DFMT_X8R8G8B8, D3DUSAGE_DEPTHSTENCIL, D3DRTYPE_SURFACE, format)))
{
return false;
if (FAILED(d3d->CheckDepthStencilMatch(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, D3DFMT_X8R8G8B8, D3DFMT_X8R8G8B8, format)))
}
if(FAILED(d3d->CheckDepthStencilMatch(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, D3DFMT_X8R8G8B8, D3DFMT_X8R8G8B8, format)))
{
return false;
}
return true;
}
// Returns the first depth-stencil format, in descending order of preference, that passes
// TestDepthFormat and, when msaaCount is non-zero, IsMsaaSupported for that sample count
// (which fills in msaa_desc when it is non-null).
//
// d3d       - Direct3D 9 interface to query; a null pointer yields D3DFMT_UNKNOWN
// msaaCount - requested multisample count; 1 is treated the same as 0 (no msaa)
// msaa_desc - optional, receives the sample count and quality of the accepted format
//
// Returns D3DFMT_UNKNOWN when none of the candidate formats qualifies.
static D3DFORMAT BestD3dFormat(IDirect3D9* d3d, int msaaCount = 0, DXGI_SAMPLE_DESC* msaa_desc = NULL)
{
	// In descending order of preference

	static const D3DFORMAT fmts[] =
	{
		D3DFMT_D32,
		D3DFMT_D32F_LOCKABLE,
		D3DFMT_D24S8
	};

	if(!d3d) return D3DFMT_UNKNOWN;

	if(1 == msaaCount) msaaCount = 0;

	// Bound the loop by the number of entries: sizeof(fmts) is the size in bytes and
	// would let the index run past the end of the table.

	for(size_t i = 0; i < countof(fmts); i++)
	{
		if(TestDepthFormat(d3d, fmts[i]) && (!msaaCount || IsMsaaSupported(d3d, fmts[i], msaaCount, msaa_desc)))
		{
			return fmts[i];
		}
	}

	return D3DFMT_UNKNOWN;
}
//return: 32, 24, or 0 if not supported. if 1==msaa, considered as msaa=0
uint GSDevice9::GetMaxDepth(uint msaa=0){
// return: 32, 24, or 0 if not supported. if 1==msaa, considered as msaa=0
uint GSDevice9::GetMaxDepth(uint msaa = 0)
{
CComPtr<IDirect3D9> d3d;
d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION));
D3DFORMAT f=BestD3dFormat(d3d, msaa);
switch (f){
case D3DFMT_D32: case D3DFMT_D32F_LOCKABLE: return 32;
case D3DFMT_D24S8: return 24;
switch(BestD3dFormat(d3d, msaa))
{
case D3DFMT_D32:
case D3DFMT_D32F_LOCKABLE:
return 32;
case D3DFMT_D24S8:
return 24;
}
return 0;
}
void GSDevice9::ForceValidMsaaConfig(){
if (0==GetMaxDepth(theApp.GetConfig("msaa", 0)))
theApp.SetConfig("msaa", 0);//replace invalid msaa value in ini file with 0.
void GSDevice9::ForceValidMsaaConfig()
{
if(0 == GetMaxDepth(theApp.GetConfig("msaa", 0)))
{
theApp.SetConfig("msaa", 0); // replace invalid msaa value in ini file with 0.
}
};
bool GSDevice9::Create(GSWnd* wnd)
@ -128,17 +155,26 @@ bool GSDevice9::Create(GSWnd* wnd)
m_d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION));
if(!m_d3d) return false;
ForceValidMsaaConfig();
//Get best format/depth for msaa. Assumption is that if the resulting depth is 24 instead of possible 32,
// the user was already warned when she selected it. (Lower res z buffer without warning is unacceptable).
m_depth_format=BestD3dFormat(m_d3d, m_msaa, &m_msaa_desc);
if (D3DFMT_UNKNOWN == m_depth_format){
//can't find a format with requested msaa, try without.
m_depth_format = BestD3dFormat(m_d3d, 0);
if (D3DFMT_UNKNOWN == m_depth_format)
return false;
m_msaa=0;
ForceValidMsaaConfig();
// Get best format/depth for msaa. Assumption is that if the resulting depth is 24 instead of possible 32,
// the user was already warned when she selected it. (Lower res z buffer without warning is unacceptable).
m_depth_format = BestD3dFormat(m_d3d, m_msaa, &m_msaa_desc);
if(D3DFMT_UNKNOWN == m_depth_format)
{
// can't find a format with requested msaa, try without.
m_depth_format = BestD3dFormat(m_d3d, 0);
if(D3DFMT_UNKNOWN == m_depth_format)
{
return false;
}
m_msaa = 0;
}
memset(&m_d3dcaps, 0, sizeof(m_d3dcaps));
@ -180,7 +216,6 @@ bool GSDevice9::Create(GSWnd* wnd)
return false;
}
if(!Reset(1, 1))
{
return false;
@ -274,7 +309,8 @@ bool GSDevice9::Create(GSWnd* wnd)
void GSDevice9::SetVsync(bool enable)
{
if( m_vsync == enable ) return;
if(m_vsync == enable) return;
__super::SetVsync(enable);
// Clever trick: Delete the backbuffer, so that the next Present will fail and
@ -282,6 +318,7 @@ void GSDevice9::SetVsync(bool enable)
// vsync settings. :)
delete m_backbuffer;
m_backbuffer = NULL;
}
@ -293,6 +330,7 @@ bool GSDevice9::Reset(int w, int h)
HRESULT hr;
int mode = (!m_wnd->IsManaged() || theApp.GetConfig("windowed", 1)) ? Windowed : Fullscreen;
if(mode == DontCare)
{
mode = m_pp.Windowed ? Windowed : Fullscreen;
@ -707,11 +745,11 @@ void GSDevice9::StretchRect(GSTexture* st, const GSVector4& sr, GSTexture* dt, c
IASetVertexBuffer(vertices, sizeof(vertices[0]), countof(vertices));
IASetPrimitiveTopology(D3DPT_TRIANGLESTRIP);
IASetInputLayout(m_convert.il);
// vs
VSSetShader(m_convert.vs, NULL, 0);
IASetInputLayout(m_convert.il);
// ps
@ -904,7 +942,7 @@ void GSDevice9::VSSetShader(IDirect3DVertexShader9* vs, const float* vs_cb, int
{
if(m_state.vs_cb) _aligned_free(m_state.vs_cb);
m_state.vs_cb = (float*)_aligned_malloc(size, 16);
m_state.vs_cb = (float*)_aligned_malloc(size, 32);
}
m_state.vs_cb_len = vs_cb_len;
@ -926,10 +964,13 @@ void GSDevice9::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1)
void GSDevice9::PSSetShaderResource(int i, GSTexture* sr)
{
IDirect3DTexture9* srv = NULL;
if (sr) srv = *(GSTexture9*)sr;
if (m_state.ps_srvs[i] != srv) {
if(sr) srv = *(GSTexture9*)sr;
if(m_state.ps_srvs[i] != srv)
{
m_state.ps_srvs[i] = srv;
m_dev->SetTexture(i, srv);
}
}
@ -953,7 +994,7 @@ void GSDevice9::PSSetShader(IDirect3DPixelShader9* ps, const float* ps_cb, int p
{
if(m_state.ps_cb) _aligned_free(m_state.ps_cb);
m_state.ps_cb = (float*)_aligned_malloc(size, 16);
m_state.ps_cb = (float*)_aligned_malloc(size, 32);
}
m_state.ps_cb_len = ps_cb_len;

View File

@ -30,7 +30,7 @@ class GSDeviceDX : public GSDevice
public:
#pragma pack(push, 1)
__aligned16 struct VSConstantBuffer
__aligned32 struct VSConstantBuffer
{
GSVector4 VertexScale;
GSVector4 VertexOffset;
@ -86,7 +86,7 @@ public:
VSSelector() : key(0) {}
};
__aligned16 struct PSConstantBuffer
__aligned32 struct PSConstantBuffer
{
GSVector4 FogColor_AREF;
GSVector4 HalfTexel;

File diff suppressed because it is too large Load Diff

View File

@ -67,10 +67,10 @@ class GSDrawScanlineCodeGenerator : public CodeGenerator
void mix16(const Xmm& a, const Xmm& b, const Xmm& temp);
void clamp16(const Xmm& a, const Xmm& temp);
void alltrue();
void blend8(const Xmm& a, const Xmm& b);
void blend(const Xmm& a, const Xmm& b, const Xmm& mask);
void blend8r(const Xmm& b, const Xmm& a);
void blendr(const Xmm& b, const Xmm& a, const Xmm& mask);
void blend8(const Xmm& a, const Xmm& b);
void blend8r(const Xmm& b, const Xmm& a);
public:
GSDrawScanlineCodeGenerator(GSScanlineEnvironment& env, uint64 key, void* ptr, size_t maxsize);

View File

@ -26,7 +26,7 @@
#pragma pack(push, 1)
__aligned16 class GSDrawingContext
__aligned32 class GSDrawingContext
{
public:
GIFRegXYOFFSET XYOFFSET;
@ -43,7 +43,7 @@ public:
GIFRegFRAME FRAME;
GIFRegZBUF ZBUF;
__aligned16 struct
__aligned32 struct
{
GSVector4i dx10;
GSVector4 dx9;

View File

@ -25,7 +25,7 @@
#pragma pack(push, 1)
__aligned16 class GSDrawingEnvironment
__aligned32 class GSDrawingEnvironment
{
public:
GIFRegPRIM PRIM;

View File

@ -56,14 +56,14 @@ uint32 GSLocalMemory::pageOffset16SZ[32][64][64];
uint32 GSLocalMemory::pageOffset8[32][64][128];
uint32 GSLocalMemory::pageOffset4[32][128][128];
int GSLocalMemory::rowOffset32[2048];
int GSLocalMemory::rowOffset32Z[2048];
int GSLocalMemory::rowOffset16[2048];
int GSLocalMemory::rowOffset16S[2048];
int GSLocalMemory::rowOffset16Z[2048];
int GSLocalMemory::rowOffset16SZ[2048];
int GSLocalMemory::rowOffset8[2][2048];
int GSLocalMemory::rowOffset4[2][2048];
int GSLocalMemory::rowOffset32[4096];
int GSLocalMemory::rowOffset32Z[4096];
int GSLocalMemory::rowOffset16[4096];
int GSLocalMemory::rowOffset16S[4096];
int GSLocalMemory::rowOffset16Z[4096];
int GSLocalMemory::rowOffset16SZ[4096];
int GSLocalMemory::rowOffset8[2][4096];
int GSLocalMemory::rowOffset4[2][4096];
short GSLocalMemory::blockOffset32[256];
short GSLocalMemory::blockOffset32Z[256];
@ -116,44 +116,44 @@ GSLocalMemory::GSLocalMemory()
for(int x = 0; x < countof(rowOffset32); x++)
{
rowOffset32[x] = (int)PixelAddress32(x, 0, 0, 32) - (int)PixelAddress32(0, 0, 0, 32);
rowOffset32[x] = (int)PixelAddress32(x & 0x7ff, 0, 0, 32) - (int)PixelAddress32(0, 0, 0, 32);
}
for(int x = 0; x < countof(rowOffset32Z); x++)
{
rowOffset32Z[x] = (int)PixelAddress32Z(x, 0, 0, 32) - (int)PixelAddress32Z(0, 0, 0, 32);
rowOffset32Z[x] = (int)PixelAddress32Z(x & 0x7ff, 0, 0, 32) - (int)PixelAddress32Z(0, 0, 0, 32);
}
for(int x = 0; x < countof(rowOffset16); x++)
{
rowOffset16[x] = (int)PixelAddress16(x, 0, 0, 32) - (int)PixelAddress16(0, 0, 0, 32);
rowOffset16[x] = (int)PixelAddress16(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16(0, 0, 0, 32);
}
for(int x = 0; x < countof(rowOffset16S); x++)
{
rowOffset16S[x] = (int)PixelAddress16S(x, 0, 0, 32) - (int)PixelAddress16S(0, 0, 0, 32);
rowOffset16S[x] = (int)PixelAddress16S(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16S(0, 0, 0, 32);
}
for(int x = 0; x < countof(rowOffset16Z); x++)
{
rowOffset16Z[x] = (int)PixelAddress16Z(x, 0, 0, 32) - (int)PixelAddress16Z(0, 0, 0, 32);
rowOffset16Z[x] = (int)PixelAddress16Z(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16Z(0, 0, 0, 32);
}
for(int x = 0; x < countof(rowOffset16SZ); x++)
{
rowOffset16SZ[x] = (int)PixelAddress16SZ(x, 0, 0, 32) - (int)PixelAddress16SZ(0, 0, 0, 32);
rowOffset16SZ[x] = (int)PixelAddress16SZ(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16SZ(0, 0, 0, 32);
}
for(int x = 0; x < countof(rowOffset8[0]); x++)
{
rowOffset8[0][x] = (int)PixelAddress8(x, 0, 0, 32) - (int)PixelAddress8(0, 0, 0, 32);
rowOffset8[1][x] = (int)PixelAddress8(x, 2, 0, 32) - (int)PixelAddress8(0, 2, 0, 32);
rowOffset8[0][x] = (int)PixelAddress8(x & 0x7ff, 0, 0, 32) - (int)PixelAddress8(0, 0, 0, 32);
rowOffset8[1][x] = (int)PixelAddress8(x & 0x7ff, 2, 0, 32) - (int)PixelAddress8(0, 2, 0, 32);
}
for(int x = 0; x < countof(rowOffset4[0]); x++)
{
rowOffset4[0][x] = (int)PixelAddress4(x, 0, 0, 32) - (int)PixelAddress4(0, 0, 0, 32);
rowOffset4[1][x] = (int)PixelAddress4(x, 2, 0, 32) - (int)PixelAddress4(0, 2, 0, 32);
rowOffset4[0][x] = (int)PixelAddress4(x & 0x7ff, 0, 0, 32) - (int)PixelAddress4(0, 0, 0, 32);
rowOffset4[1][x] = (int)PixelAddress4(x & 0x7ff, 2, 0, 32) - (int)PixelAddress4(0, 2, 0, 32);
}
for(int x = 0; x < countof(blockOffset32); x++)
@ -459,7 +459,7 @@ GSOffset* GSLocalMemory::GetOffset(uint32 bp, uint32 bw, uint32 psm)
return i->second;
}
GSOffset* o = (GSOffset*)_aligned_malloc(sizeof(GSOffset), 16);
GSOffset* o = (GSOffset*)_aligned_malloc(sizeof(GSOffset), 32);
o->hash = hash;
@ -474,9 +474,9 @@ GSOffset* GSLocalMemory::GetOffset(uint32 bp, uint32 bw, uint32 psm)
pixelAddress pa = m_psm[psm].pa;
for(int i = 0; i < 2048; i++)
for(int i = 0; i < 4096; i++)
{
o->pixel.row[i] = (int)pa(0, i, bp, bw);
o->pixel.row[i] = (int)pa(0, i & 0x7ff, bp, bw);
}
for(int i = 0; i < 8; i++)
@ -513,7 +513,7 @@ GSPixelOffset4* GSLocalMemory::GetPixelOffset4(const GIFRegFRAME& FRAME, const G
return i->second;
}
GSPixelOffset4* o = (GSPixelOffset4*)_aligned_malloc(sizeof(GSPixelOffset4), 16);
GSPixelOffset4* o = (GSPixelOffset4*)_aligned_malloc(sizeof(GSPixelOffset4), 32);
o->hash = hash;
@ -628,7 +628,7 @@ void GSLocalMemory::WriteImageLeftRight(int l, int r, int y, int h, const uint8*
template<int psm, int bsx, int bsy, int trbpp>
void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF)
{
__aligned16 uint8 buff[64]; // merge buffer for one column
__aligned32 uint8 buff[64]; // merge buffer for one column
uint32 bp = BITBLTBUF.DBP;
uint32 bw = BITBLTBUF.DBW;
@ -1438,7 +1438,7 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i&
void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
__aligned16 uint16 block[16 * 8];
__aligned32 uint16 block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32)
{
@ -1451,7 +1451,7 @@ void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i&
void GSLocalMemory::ReadTexture16S(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
__aligned16 uint16 block[16 * 8];
__aligned32 uint16 block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32)
{
@ -1548,7 +1548,7 @@ void GSLocalMemory::ReadTexture24Z(const GSOffset* RESTRICT o, const GSVector4i&
void GSLocalMemory::ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
__aligned16 uint16 block[16 * 8];
__aligned32 uint16 block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32)
{
@ -1561,7 +1561,7 @@ void GSLocalMemory::ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i&
void GSLocalMemory::ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
__aligned16 uint16 block[16 * 8];
__aligned32 uint16 block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32)
{
@ -1576,14 +1576,14 @@ void GSLocalMemory::ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i
void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(16);
ALIGN_STACK(32);
ReadBlock32<true>(BlockPtr(bp), dst, dstpitch);
}
void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(16);
ALIGN_STACK(32);
if(TEXA.AEM)
{
@ -1597,7 +1597,7 @@ void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, cons
void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
__aligned16 uint16 block[16 * 8];
__aligned32 uint16 block[16 * 8];
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
@ -1606,7 +1606,7 @@ void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, cons
void GSLocalMemory::ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
__aligned16 uint16 block[16 * 8];
__aligned32 uint16 block[16 * 8];
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
@ -1615,49 +1615,49 @@ void GSLocalMemory::ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, con
void GSLocalMemory::ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(16);
ALIGN_STACK(32);
ReadAndExpandBlock8_32(BlockPtr(bp), dst, dstpitch, m_clut);
}
void GSLocalMemory::ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(16);
ALIGN_STACK(32);
ReadAndExpandBlock4_32(BlockPtr(bp), dst, dstpitch, m_clut);
}
void GSLocalMemory::ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(16);
ALIGN_STACK(32);
ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut);
}
void GSLocalMemory::ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(16);
ALIGN_STACK(32);
ReadAndExpandBlock4HL_32(BlockPtr(bp), dst, dstpitch, m_clut);
}
void GSLocalMemory::ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(16);
ALIGN_STACK(32);
ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut);
}
void GSLocalMemory::ReadTextureBlock32Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(16);
ALIGN_STACK(32);
ReadBlock32<true>(BlockPtr(bp), dst, dstpitch);
}
void GSLocalMemory::ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(16);
ALIGN_STACK(32);
if(TEXA.AEM)
{
@ -1671,7 +1671,7 @@ void GSLocalMemory::ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, con
void GSLocalMemory::ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
__aligned16 uint16 block[16 * 8];
__aligned32 uint16 block[16 * 8];
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
@ -1680,7 +1680,7 @@ void GSLocalMemory::ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, con
void GSLocalMemory::ReadTextureBlock16SZ(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
__aligned16 uint16 block[16 * 8];
__aligned32 uint16 block[16 * 8];
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
@ -1823,28 +1823,28 @@ void GSLocalMemory::ReadTextureBlock8P(uint32 bp, uint8* dst, int dstpitch, cons
void GSLocalMemory::ReadTextureBlock4P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(16);
ALIGN_STACK(32);
ReadBlock4P(BlockPtr(bp), dst, dstpitch);
}
void GSLocalMemory::ReadTextureBlock8HP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(16);
ALIGN_STACK(32);
ReadBlock8HP(BlockPtr(bp), dst, dstpitch);
}
void GSLocalMemory::ReadTextureBlock4HLP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(16);
ALIGN_STACK(32);
ReadBlock4HLP(BlockPtr(bp), dst, dstpitch);
}
void GSLocalMemory::ReadTextureBlock4HHP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(16);
ALIGN_STACK(32);
ReadBlock4HHP(BlockPtr(bp), dst, dstpitch);
}
@ -1855,7 +1855,7 @@ HRESULT GSLocalMemory::SaveBMP(const string& fn, uint32 bp, uint32 bw, uint32 ps
{
int pitch = w * 4;
int size = pitch * h;
void* bits = ::_aligned_malloc(size, 16);
void* bits = _aligned_malloc(size, 32);
GIFRegTEX0 TEX0;

View File

@ -39,7 +39,7 @@ struct GSOffset
struct
{
int row[2048]; // yn (n = 0 1 2 ...)
int row[4096]; // yn (n = 0 1 2 ...) NOTE: this wraps around above 2048, only transfers should address the upper half (dark cloud 2 inventing)
int* col[8]; // rowOffset*
} pixel;
@ -116,14 +116,14 @@ protected:
static uint32 pageOffset8[32][64][128];
static uint32 pageOffset4[32][128][128];
static int rowOffset32[2048];
static int rowOffset32Z[2048];
static int rowOffset16[2048];
static int rowOffset16S[2048];
static int rowOffset16Z[2048];
static int rowOffset16SZ[2048];
static int rowOffset8[2][2048];
static int rowOffset4[2][2048];
static int rowOffset32[4096];
static int rowOffset32Z[4096];
static int rowOffset16[4096];
static int rowOffset16S[4096];
static int rowOffset16Z[4096];
static int rowOffset16SZ[4096];
static int rowOffset8[2][4096];
static int rowOffset4[2][4096];
static short blockOffset32[256];
static short blockOffset32Z[256];

View File

@ -29,18 +29,20 @@
// Using a spinning finish on the main (MTGS) thread is apparently a big win still, over trying
// to wait out all the pending m_finished semaphores. It leaves one spinwait in the rasterizer,
// but that's still worlds better than 2-6 spinning threads like before.
#define UseSpinningFinish 1
#define UseSpinningFinish
// Set this to 1 to remove a lot of non-const div/modulus ops from the rasterization process.
// Likely a measurable speedup, but it limits threading to 1, 2, 4, or 8 threads.
// Note by rama: the speedup is around 5% on average.
#define UseConstThreadCount 0
#if UseConstThreadCount
// #define UseConstThreadCount
#ifdef UseConstThreadCount
// ThreadsConst - const number of threads. User-configured threads (in GSdx panel) must match
// this value if UseConstThreadCount is enabled. [yeah, it's hacky for now]
static const int ThreadsConst = 2;
static const int ThreadMaskConst = ThreadsConst-1;
static const int ThreadMaskConst = ThreadsConst - 1;
#endif
GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads)
@ -57,11 +59,15 @@ GSRasterizer::~GSRasterizer()
// Tells whether `scanline` is assigned to this rasterizer instance.
// In the constant-thread-count branches the scanline is masked with
// ThreadMaskConst and compared against m_id (a mask of 0 accepts every
// scanline); in the fallback branch the test is scanline % m_threads == m_id.
// NOTE(review): both `#if UseConstThreadCount` and `#ifdef UseConstThreadCount`
// are present and nest inside each other — this looks like the before/after
// lines of a diff. Confirm which single conditional is intended.
__forceinline bool GSRasterizer::IsOneOfMyScanlines(int scanline) const
{
#if UseConstThreadCount
return (ThreadMaskConst==0) || ((scanline & ThreadMaskConst) == m_id);
#else
#ifdef UseConstThreadCount
return ThreadMaskConst == 0 || (scanline & ThreadMaskConst) == m_id;
#else
return (scanline % m_threads) == m_id;
#endif
#endif
}
void GSRasterizer::Draw(const GSRasterizerData* data)
@ -871,7 +877,7 @@ void GSRasterizerMT::ThreadProc()
{
// _mm_setcsr(MXCSR);
while( true )
while(true)
{
sem_wait(&m_semaphore);
@ -879,10 +885,15 @@ void GSRasterizerMT::ThreadProc()
__super::Draw(m_data);
if( UseSpinningFinish )
_interlockedbittestandreset( &m_sync, m_id );
else
#ifdef UseSpinningFinish
_interlockedbittestandreset(&m_sync, m_id);
#else
sem_post(&m_finished);
#endif
}
sem_post(&m_stopped);
@ -917,33 +928,36 @@ void GSRasterizerList::Draw(const GSRasterizerData* data)
m_sync = m_syncstart;
for(unsigned i=1; i<size(); ++i)
for(size_t i = 1; i < size(); i++)
{
(*this)[i]->Draw(data);
}
(*this)[0]->Draw(data);
if( UseSpinningFinish )
{
#ifdef UseSpinningFinish
while(m_sync) _mm_pause();
}
else
#else
for(size_t i = 1; i < size(); i++)
{
for(unsigned i=1; i<size(); ++i )
sem_wait(&m_finished);
}
#endif
m_stats.ticks = __rdtsc() - start;
for(unsigned i=0; i<size(); ++i)
for(size_t i = 0; i < size(); i++)
{
GSRasterizerStats s;
(*this)[i]->GetStats(s);
m_stats.pixels += s.pixels;
m_stats.prims = max(m_stats.prims, s.prims);
m_stats.prims = std::max<int>(m_stats.prims, s.prims);
}
}

View File

@ -30,7 +30,7 @@
#include "pthread.h"
#include "semaphore.h"
__aligned16 class GSRasterizerData
__aligned32 class GSRasterizerData
{
public:
GSVector4i scissor;
@ -50,7 +50,7 @@ public:
virtual void PrintStats() = 0;
};
class IDrawScanline : public GSAlignedClass<16>
class IDrawScanline : public GSAlignedClass<32>
{
public:
typedef void (__fastcall *DrawScanlineStaticPtr)(int right, int left, int top, const GSVertexSW& v);
@ -153,9 +153,11 @@ public:
push_back(new GSRasterizer(new DS(parent, 0), 0, threads));
m_syncstart = 0;
for(int i = 1; i < threads; i++)
{
push_back(new GSRasterizerMT(new DS(parent, i), i, threads, m_finished, m_sync));
_interlockedbittestandset(&m_syncstart, i);
}
}

View File

@ -24,7 +24,7 @@
GSRenderer::GSRenderer()
: GSState()
, m_tex_buff( (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 16) )
, m_tex_buff((uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32))
, m_vt(this)
, m_dev(NULL)
, m_shader(0)
@ -61,9 +61,10 @@ GSRenderer::~GSRenderer()
m_dev->Reset(1, 1, GSDevice::Windowed);
}*/
_aligned_free( m_tex_buff );
_aligned_free(m_tex_buff);
delete m_dev;
DeleteCriticalSection(&m_pGSsetTitle_Crit);
}
@ -220,13 +221,6 @@ bool GSRenderer::Merge(int field)
r.bottom = r.top + y;
}
// Breaks the blur filter, and actually makes games blurry again.
// This might have to do with earlier changes to device size detection.
/*if(blurdetected && i == 1)
{
r += GSVector4i(0, 1).xyxy();
}*/
GSVector4 scale = GSVector4(tex[i]->GetScale()).xyxy();
src[i] = GSVector4(r) * scale / GSVector4(tex[i]->GetSize()).xyxy();
@ -380,8 +374,8 @@ void GSRenderer::VSync(int field)
EnterCriticalSection(&m_pGSsetTitle_Crit);
strncpy(m_GStitleInfoBuffer, s.c_str(), ArraySize(m_GStitleInfoBuffer)-1);
m_GStitleInfoBuffer[sizeof(m_GStitleInfoBuffer)-1] = 0;// make sure null terminated even if text overflows
strncpy(m_GStitleInfoBuffer, s.c_str(), countof(m_GStitleInfoBuffer) - 1);
m_GStitleInfoBuffer[sizeof(m_GStitleInfoBuffer) - 1] = 0;// make sure null terminated even if text overflows
LeaveCriticalSection(&m_pGSsetTitle_Crit);
}

View File

@ -158,12 +158,13 @@ protected:
// Enlarges the vertex buffer: raises m_maxcount to max(10000, m_maxcount * 3/2),
// reallocates m_vertices to hold that many Vertex entries, then lowers
// m_maxcount by 100 (presumably to keep spare room beyond the reported
// capacity — TODO confirm against the callers).
void GrowVertexBuffer()
{
m_maxcount = max(10000, m_maxcount * 3/2);
// NOTE(review): the reallocation appears twice, once with 16-byte and once with
// 32-byte alignment; this looks like the before/after lines of a diff. Only one
// should remain — _aligned_realloc is documented as not allowing the alignment
// of an existing block to be changed.
m_vertices = (Vertex*)_aligned_realloc(m_vertices, sizeof(Vertex) * m_maxcount, 16);
m_vertices = (Vertex*)_aligned_realloc(m_vertices, sizeof(Vertex) * m_maxcount, 32);
m_maxcount -= 100;
}
// Returns a pointer to the drawing vertex. Can return NULL!
template<uint32 prim> __fi Vertex* BaseDrawingKick(int& count)
template<uint32 prim> __forceinline Vertex* DrawingKick(bool skip, int& count)
{
switch(prim)
{
@ -237,7 +238,7 @@ protected:
__assume(0);
}
return v;
return !skip ? v : NULL;
}
virtual void Draw() = 0;

View File

@ -249,7 +249,9 @@ public:
ps_sel.clr1 = om_bsel.IsCLR1();
ps_sel.fba = context->FBA.FBA;
ps_sel.aout = context->FRAME.PSM == PSM_PSMCT16 || context->FRAME.PSM == PSM_PSMCT16S || (context->FRAME.FBMSK & 0xff000000) == 0x7f000000 ? 1 : 0;
if (UserHacks_AlphaHack) ps_sel.aout = 1;
if(PRIM->FGE)
{
ps_sel.fog = 1;

View File

@ -38,20 +38,20 @@ bool GSRendererDX11::CreateDevice(GSDevice* dev)
return true;
}
void GSRendererDX11::DoVertexKick()
template<uint32 prim, uint32 tme, uint32 fst>
void GSRendererDX11::VertexKick(bool skip)
{
const bool tme = PRIM->TME;
const bool fst = PRIM->FST;
GSVertexHW11& dst = m_vl.AddTail();
dst.vi[0] = m_v.vi[0];
dst.vi[1] = m_v.vi[1];
#ifdef USE_UPSCALE_HACKS
if(tme && fst)
{
//GSVector4::storel(&dst.ST, m_v.GetUV());
int Udiff = 0;
int Vdiff = 0;
int Uadjust = 0;
@ -95,6 +95,7 @@ void GSRendererDX11::DoVertexKick()
else if (Vdiff <= 1) { Vadjust = 1; }
}
}
dst.ST.S = (float)m_v.UV.U - Uadjust;
dst.ST.T = (float)m_v.UV.V - Vadjust;
}
@ -104,22 +105,20 @@ void GSRendererDX11::DoVertexKick()
//dst.XYZ.X += 5;
//dst.XYZ.Y += 5;
}
#else
if(tme && fst)
{
GSVector4::storel(&dst.ST, m_v.GetUV());
}
#endif
}
template< uint32 prim >
void GSRendererDX11::DrawingKick( bool skip )
{
int count;
GSVertexHW11* v = BaseDrawingKick<prim>(count);
if (skip || !v) return;
int count = 0;
if(GSVertexHW11* v = DrawingKick<prim>(skip, count))
{
GSVector4i scissor = m_context->scissor.dx10;
GSVector4i pmin, pmax;
@ -202,6 +201,7 @@ void GSRendererDX11::DrawingKick( bool skip )
}
m_count += count;
}
}
void GSRendererDX11::Draw(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex)

View File

@ -36,8 +36,5 @@ public:
bool CreateDevice(GSDevice* dev);
template<uint32 prim>
void DrawingKick( bool skip );
void DoVertexKick();
template<uint32 prim, uint32 tme, uint32 fst> void VertexKick(bool skip);
};

View File

@ -57,11 +57,9 @@ bool GSRendererDX9::CreateDevice(GSDevice* dev)
return true;
}
void GSRendererDX9::DoVertexKick()
template<uint32 prim, uint32 tme, uint32 fst>
void GSRendererDX9::VertexKick(bool skip)
{
const bool tme = PRIM->TME;
const bool fst = PRIM->FST;
GSVertexHW9& dst = m_vl.AddTail();
dst.p = GSVector4(((GSVector4i)m_v.XYZ).upl16());
@ -142,20 +140,17 @@ void GSRendererDX9::DoVertexKick()
dst.c0 = m_v.RGBAQ.u32[0];
dst.c1 = m_v.FOG.u32[1];
}
template< uint32 prim >
void GSRendererDX9::DrawingKick( bool skip )
{
int count;
//
// BaseDrawingKick can never return NULL here because the DrawingKick function
// tables route to DrawingKickNull for GS_INVALID prim types (and that's the only
// condition where this function would return NULL).
GSVertexHW9* v = BaseDrawingKick<prim>(count);
if (skip || !v) return;
int count = 0;
if(GSVertexHW9* v = DrawingKick<prim>(skip, count))
{
GSVector4 scissor = m_context->scissor.dx9;
GSVector4 pmin, pmax;
@ -228,6 +223,7 @@ void GSRendererDX9::DrawingKick( bool skip )
}
m_count += count;
}
}
void GSRendererDX9::Draw(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex)

View File

@ -43,8 +43,5 @@ public:
bool CreateDevice(GSDevice* dev);
template<uint32 prim>
void DrawingKick( bool skip );
void DoVertexKick();
template<uint32 prim, uint32 tme, uint32 fst> void VertexKick(bool skip);
};

View File

@ -43,10 +43,7 @@ public:
InitVertexKick<GSRendererNull>();
}
virtual ~GSRendererNull() {}
template<uint32 prim>
void DrawingKick( bool skip ) {}
void DoVertexKick() {}
template<uint32 prim, uint32 tme, uint32 fst> void VertexKick(bool skip)
{
}
};

View File

@ -94,6 +94,7 @@ GSTexture* GSRendererSW::GetOutput(int i)
if(m_dev->ResizeTexture(&m_texture[i], w, h))
{
uint8* buff = GetTextureBufferLock();
static int pitch = 1024 * 4;
GSVector4i r(0, 0, w, h);
@ -113,6 +114,7 @@ GSTexture* GSRendererSW::GetOutput(int i)
s_n++;
}
ReleaseTextureBufferLock();
}
@ -427,24 +429,22 @@ void GSRendererSW::GetScanlineParam(GSScanlineParam& p, GS_PRIM_CLASS primclass)
}
}
void GSRendererSW::DoVertexKick()
template<uint32 prim, uint32 tme, uint32 fst>
void GSRendererSW::VertexKick(bool skip)
{
const bool tme = PRIM->TME;
const bool fst = PRIM->FST;
const GSDrawingContext& context = *m_context;
const GSDrawingContext* context = m_context;
GSVector4i xy = GSVector4i::load((int)m_v.XYZ.u32[0]);
xy = xy.insert16<3>(m_v.FOG.F);
xy = xy.upl16();
xy -= context.XYOFFSET;
xy -= context->XYOFFSET;
GSVertexSW& dst = m_vl.AddTail();
GSVertexSW v;
dst.p = GSVector4(xy) * g_pos_scale;
v.p = GSVector4(xy) * g_pos_scale;
dst.c = GSVector4(GSVector4i::load((int)m_v.RGBAQ.u32[0]).u8to32() << 7);
v.c = GSVector4(GSVector4i::load((int)m_v.RGBAQ.u32[0]).u8to32() << 7);
if(tme)
{
@ -452,37 +452,31 @@ void GSRendererSW::DoVertexKick()
if(fst)
{
dst.t = GSVector4(((GSVector4i)m_v.UV).upl16() << (16 - 4));
v.t = GSVector4(((GSVector4i)m_v.UV).upl16() << (16 - 4));
q = 1.0f;
}
else
{
dst.t = GSVector4(m_v.ST.S, m_v.ST.T);
dst.t *= GSVector4(0x10000 << context.TEX0.TW, 0x10000 << context.TEX0.TH);
v.t = GSVector4(m_v.ST.S, m_v.ST.T);
v.t *= GSVector4(0x10000 << context->TEX0.TW, 0x10000 << context->TEX0.TH);
q = m_v.RGBAQ.Q;
}
dst.t = dst.t.xyxy(GSVector4::load(q));
v.t = v.t.xyxy(GSVector4::load(q));
}
GSVertexSW& dst = m_vl.AddTail();
dst = v;
dst.p.z = (float)min(m_v.XYZ.Z, 0xffffff00); // max value which can survive the uint32 => float => uint32 conversion
}
int count = 0;
template< uint32 prim >
void GSRendererSW::DrawingKick( bool skip )
{
int count;
// BaseDrawingKick can never return NULL here because the DrawingKick function
// tables route to DrawingKickNull for GS_INVALID prim types (and that's the only
// condition where this function would return NULL).
GSVertexSW* v = BaseDrawingKick<prim>(count);
if (skip || !v) return;
if(!m_dump)
if(GSVertexSW* v = DrawingKick<prim>(skip, count))
{
if(!m_dump)
{
GSVector4 pmin, pmax;
switch(prim)
@ -505,7 +499,7 @@ void GSRendererSW::DrawingKick( bool skip )
break;
}
GSVector4 scissor = m_context->scissor.ex;
GSVector4 scissor = context->scissor.ex;
GSVector4 test = (pmax < scissor) | (pmin > scissor.zwxy());
@ -534,8 +528,7 @@ void GSRendererSW::DrawingKick( bool skip )
{
return;
}
}
}
switch(prim)
{
case GS_POINTLIST:
@ -602,4 +595,5 @@ void GSRendererSW::DrawingKick( bool skip )
}
m_count += count;
}
}

View File

@ -47,13 +47,6 @@ public:
GSRendererSW();
virtual ~GSRendererSW();
template<uint32 prim>
void DrawingKick( bool skip );
void DoVertexKick();
void InvalidateTextureCache()
{
m_tc->RemoveAll();
}
template<uint32 prim, uint32 tme, uint32 fst>
void VertexKick(bool skip);
};

View File

@ -99,7 +99,7 @@ union GSScanlineSelector
}
};
__aligned16 struct GSScanlineParam
__aligned32 struct GSScanlineParam
{
GSScanlineSelector sel;
@ -115,7 +115,7 @@ __aligned16 struct GSScanlineParam
uint32 fm, zm;
};
__aligned16 struct GSScanlineEnvironment
__aligned32 struct GSScanlineEnvironment
{
void* vm;
const void* tex;

View File

@ -88,7 +88,9 @@ void GSSettingsDlg::OnInit()
ComboBoxAppend(IDC_RESOLUTION, "Please select...", (LPARAM)&m_modes.back(), true);
CComPtr<IDirect3D9> d3d;
d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION));
if(d3d)
{
uint32 w = theApp.GetConfig("ModeWidth", 0);
@ -151,10 +153,13 @@ void GSSettingsDlg::OnInit()
SendMessage(GetDlgItem(m_hWnd, IDC_RESY), UDM_SETRANGE, 0, MAKELPARAM(8192, 256));
SendMessage(GetDlgItem(m_hWnd, IDC_RESY), UDM_SETPOS, 0, MAKELPARAM(theApp.GetConfig("resy", 1024), 0));
int r=theApp.GetConfig("Renderer", 0);
if (r>=0 && r<=2){//DX9
int r = theApp.GetConfig("Renderer", 0);
if(r >= 0 && r <= 2) // DX9
{
GSDevice9::ForceValidMsaaConfig();
m_lastValidMsaa=theApp.GetConfig("msaa", 0);
m_lastValidMsaa = theApp.GetConfig("msaa", 0);
}
SendMessage(GetDlgItem(m_hWnd, IDC_MSAA), UDM_SETRANGE, 0, MAKELPARAM(16, 0));

View File

@ -48,7 +48,14 @@ void GSSetupPrimCodeGenerator::Generate()
{
for(int i = 0; i < 5; i++)
{
movaps(Xmm(3 + i), xmmword[&m_shift[i]]);
if(m_cpu.has(util::Cpu::tAVX))
{
vmovaps(Xmm(3 + i), ptr[&m_shift[i]]);
}
else
{
movaps(Xmm(3 + i), ptr[&m_shift[i]]);
}
}
}
@ -68,11 +75,119 @@ void GSSetupPrimCodeGenerator::Depth()
return;
}
if(m_cpu.has(util::Cpu::tAVX))
{
if(!m_sel.sprite)
{
// GSVector4 t = dscan.p;
movaps(xmm0, xmmword[edx + 16]);
vmovaps(xmm0, ptr[edx + 16]);
if(m_en.f)
{
// GSVector4 df = p.wwww();
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
// m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();
vmulps(xmm2, xmm1, xmm3);
vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_env.d4.f], xmm2);
for(int i = 0; i < 4; i++)
{
// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
vmulps(xmm2, xmm1, Xmm(4 + i));
vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_env.d[i].f], xmm2);
}
}
if(m_en.z)
{
// GSVector4 dz = p.zzzz();
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_env.d4.z = dz * 4.0f;
vmulps(xmm1, xmm0, xmm3);
vmovdqa(ptr[&m_env.d4.z], xmm1);
for(int i = 0; i < 4; i++)
{
// m_env.d[i].z = dz * m_shift[i];
vmulps(xmm1, xmm0, Xmm(4 + i));
vmovdqa(ptr[&m_env.d[i].z], xmm1);
}
}
}
else
{
// GSVector4 p = vertices[0].p;
vmovaps(xmm0, ptr[ecx + 16]);
if(m_en.f)
{
// m_env.p.f = GSVector4i(p).zzzzh().zzzz();
vcvttps2dq(xmm1, xmm0);
vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[&m_env.p.f], xmm1);
}
if(m_en.z)
{
// GSVector4 z = p.zzzz();
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
if(m_sel.zoverflow)
{
// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
static const float half = 0.5f;
vmovss(xmm1, dword[&half]);
vshufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
vmulps(xmm1, xmm0);
vcvttps2dq(xmm1, xmm1);
vpslld(xmm1, 1);
vcvttps2dq(xmm0, xmm0);
vpcmpeqd(xmm2, xmm2);
vpsrld(xmm2, 31);
vpand(xmm0, xmm2);
vpor(xmm0, xmm1);
}
else
{
// m_env.p.z = GSVector4i(z);
vcvttps2dq(xmm0, xmm0);
}
vmovdqa(ptr[&m_env.p.z], xmm0);
}
}
}
else
{
if(!m_sel.sprite)
{
// GSVector4 t = dscan.p;
movaps(xmm0, ptr[edx + 16]);
if(m_en.f)
{
@ -88,7 +203,7 @@ void GSSetupPrimCodeGenerator::Depth()
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(xmmword[&m_env.d4.f], xmm2);
movdqa(ptr[&m_env.d4.f], xmm2);
for(int i = 0; i < 4; i++)
{
@ -99,7 +214,7 @@ void GSSetupPrimCodeGenerator::Depth()
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(xmmword[&m_env.d[i].f], xmm2);
movdqa(ptr[&m_env.d[i].f], xmm2);
}
}
@ -113,7 +228,7 @@ void GSSetupPrimCodeGenerator::Depth()
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
movdqa(xmmword[&m_env.d4.z], xmm1);
movdqa(ptr[&m_env.d4.z], xmm1);
for(int i = 0; i < 4; i++)
{
@ -121,7 +236,7 @@ void GSSetupPrimCodeGenerator::Depth()
movaps(xmm1, xmm0);
mulps(xmm1, Xmm(4 + i));
movdqa(xmmword[&m_env.d[i].z], xmm1);
movdqa(ptr[&m_env.d[i].z], xmm1);
}
}
}
@ -129,17 +244,16 @@ void GSSetupPrimCodeGenerator::Depth()
{
// GSVector4 p = vertices[0].p;
movaps(xmm0, xmmword[ecx + 16]);
movaps(xmm0, ptr[ecx + 16]);
if(m_en.f)
{
// m_env.p.f = GSVector4i(p).zzzzh().zzzz();
movaps(xmm1, xmm0);
cvttps2dq(xmm1, xmm1);
cvttps2dq(xmm1, xmm0);
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(xmmword[&m_env.p.f], xmm1);
movdqa(ptr[&m_env.p.f], xmm1);
}
if(m_en.z)
@ -174,7 +288,8 @@ void GSSetupPrimCodeGenerator::Depth()
cvttps2dq(xmm0, xmm0);
}
movdqa(xmmword[&m_env.p.z], xmm0);
movdqa(ptr[&m_env.p.z], xmm0);
}
}
}
}
@ -186,9 +301,73 @@ void GSSetupPrimCodeGenerator::Texture()
return;
}
if(m_cpu.has(util::Cpu::tAVX))
{
// GSVector4 t = dscan.t;
movaps(xmm0, xmmword[edx + 32]);
vmovaps(xmm0, ptr[edx + 32]);
vmulps(xmm1, xmm0, xmm3);
if(m_sel.fst)
{
// m_env.d4.st = GSVector4i(t * 4.0f);
vcvttps2dq(xmm1, xmm1);
vmovdqa(ptr[&m_env.d4.st], xmm1);
}
else
{
// m_env.d4.stq = t * 4.0f;
vmovaps(ptr[&m_env.d4.stq], xmm1);
}
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
// GSVector4 dq = t.zzzz();
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < 4; i++)
{
// GSVector4 v = ds/dt * m_shift[i];
vmulps(xmm2, xmm1, Xmm(4 + i));
if(m_sel.fst)
{
// m_env.d[i].si/ti = GSVector4i(v);
vcvttps2dq(xmm2, xmm2);
switch(j)
{
case 0: vmovdqa(ptr[&m_env.d[i].si], xmm2); break;
case 1: vmovdqa(ptr[&m_env.d[i].ti], xmm2); break;
}
}
else
{
// m_env.d[i].s/t/q = v;
switch(j)
{
case 0: vmovaps(ptr[&m_env.d[i].s], xmm2); break;
case 1: vmovaps(ptr[&m_env.d[i].t], xmm2); break;
case 2: vmovaps(ptr[&m_env.d[i].q], xmm2); break;
}
}
}
}
}
else
{
// GSVector4 t = dscan.t;
movaps(xmm0, ptr[edx + 32]);
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
@ -198,13 +377,13 @@ void GSSetupPrimCodeGenerator::Texture()
// m_env.d4.st = GSVector4i(t * 4.0f);
cvttps2dq(xmm1, xmm1);
movdqa(xmmword[&m_env.d4.st], xmm1);
movdqa(ptr[&m_env.d4.st], xmm1);
}
else
{
// m_env.d4.stq = t * 4.0f;
movaps(xmmword[&m_env.d4.stq], xmm1);
movaps(ptr[&m_env.d4.stq], xmm1);
}
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
@ -231,8 +410,8 @@ void GSSetupPrimCodeGenerator::Texture()
switch(j)
{
case 0: movdqa(xmmword[&m_env.d[i].si], xmm2); break;
case 1: movdqa(xmmword[&m_env.d[i].ti], xmm2); break;
case 0: movdqa(ptr[&m_env.d[i].si], xmm2); break;
case 1: movdqa(ptr[&m_env.d[i].ti], xmm2); break;
}
}
else
@ -241,9 +420,10 @@ void GSSetupPrimCodeGenerator::Texture()
switch(j)
{
case 0: movaps(xmmword[&m_env.d[i].s], xmm2); break;
case 1: movaps(xmmword[&m_env.d[i].t], xmm2); break;
case 2: movaps(xmmword[&m_env.d[i].q], xmm2); break;
case 0: movaps(ptr[&m_env.d[i].s], xmm2); break;
case 1: movaps(ptr[&m_env.d[i].t], xmm2); break;
case 2: movaps(ptr[&m_env.d[i].q], xmm2); break;
}
}
}
}
@ -257,11 +437,115 @@ void GSSetupPrimCodeGenerator::Color()
return;
}
if(m_cpu.has(util::Cpu::tAVX))
{
if(m_sel.iip)
{
// GSVector4 c = dscan.c;
movaps(xmm0, xmmword[edx]);
vmovaps(xmm0, ptr[edx]);
// m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
vmulps(xmm1, xmm0, xmm3);
vcvttps2dq(xmm1, xmm1);
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
vpackssdw(xmm1, xmm1);
vmovdqa(ptr[&m_env.d4.c], xmm1);
// xmm3 is not needed anymore
// GSVector4 dr = c.xxxx();
// GSVector4 db = c.zzzz();
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < 4; i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
vmulps(xmm0, xmm2, Xmm(4 + i));
vcvttps2dq(xmm0, xmm0);
vpackssdw(xmm0, xmm0);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
vmulps(xmm1, xmm3, Xmm(4 + i));
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_env.d[i].rb = r.upl16(b);
vpunpcklwd(xmm0, xmm1);
vmovdqa(ptr[&m_env.d[i].rb], xmm0);
}
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[edx]); // not enough regs, have to reload it
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
vmulps(xmm0, xmm2, Xmm(4 + i));
vcvttps2dq(xmm0, xmm0);
vpackssdw(xmm0, xmm0);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
vmulps(xmm1, xmm3, Xmm(4 + i));
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_env.d[i].ga = g.upl16(a);
vpunpcklwd(xmm0, xmm1);
vmovdqa(ptr[&m_env.d[i].ga], xmm0);
}
}
else
{
// GSVector4i c = GSVector4i(vertices[0].c);
vcvttps2dq(xmm0, ptr[ecx]);
// c = c.upl16(c.zwxy());
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
vpunpcklwd(xmm0, xmm1);
// if(!tme) c = c.srl16(7);
if(m_sel.tfx == TFX_NONE)
{
vpsrlw(xmm0, 7);
}
// m_env.c.rb = c.xxxx();
// m_env.c.ga = c.zzzz();
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[&m_env.c.rb], xmm1);
vmovdqa(ptr[&m_env.c.ga], xmm2);
}
}
else
{
if(m_sel.iip)
{
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[edx]);
movaps(xmm1, xmm0);
// m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
@ -271,7 +555,7 @@ void GSSetupPrimCodeGenerator::Color()
cvttps2dq(xmm2, xmm2);
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
packssdw(xmm2, xmm2);
movdqa(xmmword[&m_env.d4.c], xmm2);
movdqa(ptr[&m_env.d4.c], xmm2);
// xmm3 is not needed anymore
@ -300,12 +584,12 @@ void GSSetupPrimCodeGenerator::Color()
// m_env.d[i].rb = r.upl16(b);
punpcklwd(xmm2, xmm3);
movdqa(xmmword[&m_env.d[i].rb], xmm2);
movdqa(ptr[&m_env.d[i].rb], xmm2);
}
// GSVector4 c = dscan.c;
movaps(xmm0, xmmword[edx]); // not enough regs, have to reload it
movaps(xmm0, ptr[edx]); // not enough regs, have to reload it
movaps(xmm1, xmm0);
// GSVector4 dg = c.yyyy();
@ -333,20 +617,19 @@ void GSSetupPrimCodeGenerator::Color()
// m_env.d[i].ga = g.upl16(a);
punpcklwd(xmm2, xmm3);
movdqa(xmmword[&m_env.d[i].ga], xmm2);
movdqa(ptr[&m_env.d[i].ga], xmm2);
}
}
else
{
// GSVector4i c = GSVector4i(vertices[0].c);
movaps(xmm0, xmmword[ecx]);
movaps(xmm0, ptr[ecx]);
cvttps2dq(xmm0, xmm0);
// c = c.upl16(c.zwxy());
movdqa(xmm1, xmm0);
pshufd(xmm1, xmm1, _MM_SHUFFLE(1, 0, 3, 2));
pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
punpcklwd(xmm0, xmm1);
// if(!tme) c = c.srl16(7);
@ -359,11 +642,12 @@ void GSSetupPrimCodeGenerator::Color()
// m_env.c.rb = c.xxxx();
// m_env.c.ga = c.zzzz();
movdqa(xmm1, xmm0);
pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(xmmword[&m_env.c.rb], xmm0);
movdqa(xmmword[&m_env.c.ga], xmm1);
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[&m_env.c.rb], xmm1);
movdqa(ptr[&m_env.c.ga], xmm2);
}
}
}

View File

@ -84,7 +84,7 @@ GSState::GSState()
m_sssize += sizeof(m_tr.x);
m_sssize += sizeof(m_tr.y);
m_sssize += m_mem.m_vmsize;
m_sssize += (sizeof(m_path[0].tag) + sizeof(m_path[0].reg)) * ArraySize(m_path);
m_sssize += (sizeof(m_path[0].tag) + sizeof(m_path[0].reg)) * countof(m_path);
m_sssize += sizeof(m_q);
PRIM = &m_env.PRIM;
@ -103,6 +103,7 @@ GSState::~GSState()
// Points m_regs at the caller-supplied register memory.
// basemem must be non-null (checked with ASSERT); the buffer is reinterpreted
// as a GSPrivRegSet, no copy is made.
void GSState::SetRegsMem(uint8* basemem)
{
	ASSERT(basemem);

	GSPrivRegSet* const regs = reinterpret_cast<GSPrivRegSet*>(basemem);

	m_regs = regs;
}
@ -111,15 +112,16 @@ void GSState::SetIrqCallback(void (*irq)())
m_irq = irq;
}
void GSState::SetMultithreaded( bool isMT )
void GSState::SetMultithreaded(bool mt)
{
// Some older versions of PCSX2 didn't properly set the irq callback to NULL
// in multithreaded mode (possibly because ZeroGS itself would assert in such
// cases), and didn't bind them to a dummy callback either. PCSX2 handles all
// IRQs internally when multithreaded anyway -- so let's ignore them here:
m_mt = isMT;
if( isMT )
m_mt = mt;
if(mt)
{
m_fpGIFRegHandlers[GIF_A_D_REG_SIGNAL] = &GSState::GIFRegHandlerNull;
m_fpGIFRegHandlers[GIF_A_D_REG_FINISH] = &GSState::GIFRegHandlerNull;
@ -136,11 +138,11 @@ void GSState::SetMultithreaded( bool isMT )
void GSState::SetFrameSkip(int skip)
{
if(m_frameskip == skip) return;
m_frameskip = skip;
if(skip)
{
#if !UsePackedRegSwitch
m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = &GSState::GIFPackedRegHandlerNOP;
@ -148,7 +150,6 @@ void GSState::SetFrameSkip(int skip)
m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = &GSState::GIFPackedRegHandlerNOP;
#endif
m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerNOP;
@ -163,7 +164,6 @@ void GSState::SetFrameSkip(int skip)
}
else
{
#if !UsePackedRegSwitch
m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerXYZF2;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerXYZ2;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<0>;
@ -171,7 +171,6 @@ void GSState::SetFrameSkip(int skip)
m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerFOG;
m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZF3;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZ3;
#endif
m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerPRIM;
m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerRGBAQ;
@ -188,7 +187,7 @@ void GSState::SetFrameSkip(int skip)
void GSState::Reset()
{
memset(&m_path[0], 0, sizeof(m_path[0]) * ArraySize(m_path));
memset(&m_path[0], 0, sizeof(m_path[0]) * countof(m_path));
memset(&m_v, 0, sizeof(m_v));
// PRIM = &m_env.PRIM;
@ -203,7 +202,6 @@ void GSState::Reset()
void GSState::ResetHandlers()
{
#if !UsePackedRegSwitch
for(int i = 0; i < countof(m_fpGIFPackedRegHandlers); i++)
{
m_fpGIFPackedRegHandlers[i] = &GSState::GIFPackedRegHandlerNull;
@ -224,7 +222,6 @@ void GSState::ResetHandlers()
m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZ3;
m_fpGIFPackedRegHandlers[GIF_REG_A_D] = &GSState::GIFPackedRegHandlerA_D;
m_fpGIFPackedRegHandlers[GIF_REG_NOP] = &GSState::GIFPackedRegHandlerNOP;
#endif
for(int i = 0; i < countof(m_fpGIFRegHandlers); i++)
{
@ -284,7 +281,7 @@ void GSState::ResetHandlers()
m_fpGIFRegHandlers[GIF_A_D_REG_TRXDIR] = &GSState::GIFRegHandlerTRXDIR;
m_fpGIFRegHandlers[GIF_A_D_REG_HWREG] = &GSState::GIFRegHandlerHWREG;
SetMultithreaded( m_mt );
SetMultithreaded(m_mt);
}
GSVector4i GSState::GetDisplayRect(int i)
@ -375,22 +372,24 @@ int GSState::GetFPS()
// GIFPackedRegHandler*
// Packed-register handler that does nothing; the assertion in its body is
// commented out, so the argument is ignored.
// NOTE(review): the signature line appears twice (with and without
// __forceinline); this looks like the before/after lines of a diff — confirm
// which one is intended.
void GSState::GIFPackedRegHandlerNull(const GIFPackedReg* r)
__forceinline void GSState::GIFPackedRegHandlerNull(const GIFPackedReg* r)
{
// ASSERT(0);
}
void __fi GSState::GIFPackedRegHandlerRGBA(const GIFPackedReg* r)
__forceinline void GSState::GIFPackedRegHandlerRGBA(const GIFPackedReg* r)
{
#if _M_SSE >= 0x301
GSVector4i mask = GSVector4i::load(0x0c080400);
GSVector4i v = GSVector4i::load<false>(r).shuffle8(mask);
m_v.RGBAQ.u32[0] = (uint32)GSVector4i::store(v);
#elif _M_SSE >= 0x200
GSVector4i v = GSVector4i::load<false>(r) & GSVector4i::x000000ff();
m_v.RGBAQ.u32[0] = v.rgba32();
#else
@ -405,7 +404,7 @@ void __fi GSState::GIFPackedRegHandlerRGBA(const GIFPackedReg* r)
m_v.RGBAQ.Q = m_q;
}
void __fi GSState::GIFPackedRegHandlerSTQ(const GIFPackedReg* r)
__forceinline void GSState::GIFPackedRegHandlerSTQ(const GIFPackedReg* r)
{
#if defined(_M_AMD64)
@ -426,7 +425,7 @@ void __fi GSState::GIFPackedRegHandlerSTQ(const GIFPackedReg* r)
m_q = r->STQ.Q;
}
void __fi GSState::GIFPackedRegHandlerUV(const GIFPackedReg* r)
__forceinline void GSState::GIFPackedRegHandlerUV(const GIFPackedReg* r)
{
#if _M_SSE >= 0x200
@ -441,7 +440,7 @@ void __fi GSState::GIFPackedRegHandlerUV(const GIFPackedReg* r)
#endif
}
void __fi GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* r)
__forceinline void GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* r)
{
m_v.XYZ.X = r->XYZF2.X;
m_v.XYZ.Y = r->XYZF2.Y;
@ -451,7 +450,7 @@ void __fi GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* r)
VertexKick(r->XYZF2.ADC);
}
void __fi GSState::GIFPackedRegHandlerXYZ2(const GIFPackedReg* r)
__forceinline void GSState::GIFPackedRegHandlerXYZ2(const GIFPackedReg* r)
{
m_v.XYZ.X = r->XYZ2.X;
m_v.XYZ.Y = r->XYZ2.Y;
@ -460,17 +459,17 @@ void __fi GSState::GIFPackedRegHandlerXYZ2(const GIFPackedReg* r)
VertexKick(r->XYZ2.ADC);
}
// Copies the F field of the packed FOG register into m_v.FOG.F.
// NOTE(review): the signature line appears twice (`__fi` and `__forceinline`
// spellings); this looks like the before/after lines of a diff — confirm which
// one is intended.
void __fi GSState::GIFPackedRegHandlerFOG(const GIFPackedReg* r)
__forceinline void GSState::GIFPackedRegHandlerFOG(const GIFPackedReg* r)
{
m_v.FOG.F = r->FOG.F;
}
// Dispatches an A+D packed register: looks up the member-function pointer in
// m_fpGIFRegHandlers at index r->A_D.ADDR and invokes it on this object with
// &r->r as the argument.
// NOTE(review): the signature line appears twice (`__fi` and `__forceinline`
// spellings); this looks like the before/after lines of a diff — confirm which
// one is intended.
void __fi GSState::GIFPackedRegHandlerA_D(const GIFPackedReg* r)
__forceinline void GSState::GIFPackedRegHandlerA_D(const GIFPackedReg* r)
{
(this->*m_fpGIFRegHandlers[r->A_D.ADDR])(&r->r);
}
// Packed-register handler with an empty body; the argument is ignored.
// NOTE(review): the signature line appears twice (with and without
// __forceinline); this looks like the before/after lines of a diff — confirm
// which one is intended.
void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* r)
__forceinline void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* r)
{
}
@ -502,6 +501,8 @@ __forceinline void GSState::ApplyPRIM(const GIFRegPRIM& prim)
m_context = &m_env.CTXT[PRIM->CTXT];
UpdateVertexKick();
ResetPrim();
}
@ -510,22 +511,22 @@ void GSState::GIFRegHandlerPRIM(const GIFReg* r)
ApplyPRIM(r->PRIM);
}
// Stores the incoming RGBAQ register, converted through GSVector4i, into
// m_v.RGBAQ.
// NOTE(review): the signature line appears twice (with and without
// __forceinline); this looks like the before/after lines of a diff — confirm
// which one is intended.
void GSState::GIFRegHandlerRGBAQ(const GIFReg* r)
__forceinline void GSState::GIFRegHandlerRGBAQ(const GIFReg* r)
{
m_v.RGBAQ = (GSVector4i)r->RGBAQ;
}
// Stores the incoming ST register, converted through GSVector4i, into m_v.ST.
// NOTE(review): the signature line appears twice (with and without
// __forceinline); this looks like the before/after lines of a diff — confirm
// which one is intended.
void GSState::GIFRegHandlerST(const GIFReg* r)
__forceinline void GSState::GIFRegHandlerST(const GIFReg* r)
{
m_v.ST = (GSVector4i)r->ST;
}
// Stores the first 32-bit word of the incoming UV register into m_v.UV,
// masked with 0x3fff3fff so only the low 14 bits of each 16-bit half are kept.
// NOTE(review): the signature line appears twice (with and without
// __forceinline); this looks like the before/after lines of a diff — confirm
// which one is intended.
void GSState::GIFRegHandlerUV(const GIFReg* r)
__forceinline void GSState::GIFRegHandlerUV(const GIFReg* r)
{
m_v.UV.u32[0] = r->UV.u32[0] & 0x3fff3fff;
}
__fi void GSState::GIFRegHandlerXYZF2(const GIFReg* r)
void GSState::GIFRegHandlerXYZF2(const GIFReg* r)
{
/*
m_v.XYZ.X = r->XYZF.X;
@ -540,14 +541,14 @@ __fi void GSState::GIFRegHandlerXYZF2(const GIFReg* r)
VertexKick(false);
}
// Stores the incoming XYZ register, converted through GSVector4i, into m_v.XYZ
// and then calls VertexKick(false).
// NOTE(review): the signature line appears twice (with and without `__fi`);
// this looks like the before/after lines of a diff — confirm which one is
// intended.
__fi void GSState::GIFRegHandlerXYZ2(const GIFReg* r)
void GSState::GIFRegHandlerXYZ2(const GIFReg* r)
{
m_v.XYZ = (GSVector4i)r->XYZ;
VertexKick(false);
}
__fi void GSState::ApplyTEX0( uint i, GIFRegTEX0& TEX0 )
void GSState::ApplyTEX0(uint i, GIFRegTEX0& TEX0)
{
// even if TEX0 did not change, a new palette may have been uploaded and will overwrite the currently queued for drawing
@ -578,7 +579,7 @@ __fi void GSState::ApplyTEX0( uint i, GIFRegTEX0& TEX0 )
}
}
template<int i> __fi void GSState::GIFRegHandlerTEX0(const GIFReg* r)
template<int i> void GSState::GIFRegHandlerTEX0(const GIFReg* r)
{
GIFRegTEX0 TEX0 = r->TEX0;
@ -588,7 +589,7 @@ template<int i> __fi void GSState::GIFRegHandlerTEX0(const GIFReg* r)
ApplyTEX0( i, TEX0 );
}
template<int i> __fi void GSState::GIFRegHandlerCLAMP(const GIFReg* r)
template<int i> void GSState::GIFRegHandlerCLAMP(const GIFReg* r)
{
if(PRIM->CTXT == i && r->CLAMP != m_env.CTXT[i].CLAMP)
{
@ -603,7 +604,7 @@ void GSState::GIFRegHandlerFOG(const GIFReg* r)
m_v.FOG = (GSVector4i)r->FOG;
}
__fi void GSState::GIFRegHandlerXYZF3(const GIFReg* r)
void GSState::GIFRegHandlerXYZF3(const GIFReg* r)
{
/*
m_v.XYZ.X = r->XYZF.X;
@ -618,7 +619,7 @@ __fi void GSState::GIFRegHandlerXYZF3(const GIFReg* r)
VertexKick(true);
}
__fi void GSState::GIFRegHandlerXYZ3(const GIFReg* r)
void GSState::GIFRegHandlerXYZ3(const GIFReg* r)
{
m_v.XYZ = (GSVector4i)r->XYZ;
@ -629,7 +630,7 @@ void GSState::GIFRegHandlerNOP(const GIFReg* r)
{
}
template<int i> __fi void GSState::GIFRegHandlerTEX1(const GIFReg* r)
template<int i> void GSState::GIFRegHandlerTEX1(const GIFReg* r)
{
if(PRIM->CTXT == i && r->TEX1 != m_env.CTXT[i].TEX1)
{
@ -639,7 +640,7 @@ template<int i> __fi void GSState::GIFRegHandlerTEX1(const GIFReg* r)
m_env.CTXT[i].TEX1 = (GSVector4i)r->TEX1;
}
template<int i> __fi void GSState::GIFRegHandlerTEX2(const GIFReg* r)
template<int i> void GSState::GIFRegHandlerTEX2(const GIFReg* r)
{
// m_env.CTXT[i].TEX2 = r->TEX2; // not used
@ -656,7 +657,7 @@ template<int i> __fi void GSState::GIFRegHandlerTEX2(const GIFReg* r)
ApplyTEX0(i, TEX0);
}
template<int i> __fi void GSState::GIFRegHandlerXYOFFSET(const GIFReg* r)
template<int i> void GSState::GIFRegHandlerXYOFFSET(const GIFReg* r)
{
GSVector4i o = (GSVector4i)r->XYOFFSET & GSVector4i::x0000ffff();
@ -670,7 +671,7 @@ template<int i> __fi void GSState::GIFRegHandlerXYOFFSET(const GIFReg* r)
m_env.CTXT[i].UpdateScissor();
}
__fi void GSState::GIFRegHandlerPRMODECONT(const GIFReg* r)
void GSState::GIFRegHandlerPRMODECONT(const GIFReg* r)
{
if(r->PRMODECONT != m_env.PRMODECONT)
{
@ -684,9 +685,11 @@ __fi void GSState::GIFRegHandlerPRMODECONT(const GIFReg* r)
// if(PRIM->PRIM == 7) printf("Invalid PRMODECONT/PRIM\n");
m_context = &m_env.CTXT[PRIM->CTXT];
UpdateVertexKick();
}
__fi void GSState::GIFRegHandlerPRMODE(const GIFReg* r)
void GSState::GIFRegHandlerPRMODE(const GIFReg* r)
{
if(!m_env.PRMODECONT.AC)
{
@ -698,9 +701,11 @@ __fi void GSState::GIFRegHandlerPRMODE(const GIFReg* r)
m_env.PRMODE._PRIM = _PRIM;
m_context = &m_env.CTXT[PRIM->CTXT];
UpdateVertexKick();
}
__fi void GSState::GIFRegHandlerTEXCLUT(const GIFReg* r)
void GSState::GIFRegHandlerTEXCLUT(const GIFReg* r)
{
if(r->TEXCLUT != m_env.TEXCLUT)
{
@ -730,7 +735,7 @@ template<int i> void GSState::GIFRegHandlerMIPTBP1(const GIFReg* r)
m_env.CTXT[i].MIPTBP1 = (GSVector4i)r->MIPTBP1;
}
template<int i> __fi void GSState::GIFRegHandlerMIPTBP2(const GIFReg* r)
template<int i> void GSState::GIFRegHandlerMIPTBP2(const GIFReg* r)
{
if(PRIM->CTXT == i && r->MIPTBP2 != m_env.CTXT[i].MIPTBP2)
{
@ -767,7 +772,7 @@ void GSState::GIFRegHandlerTEXFLUSH(const GIFReg* r)
// InvalidateTextureCache();
}
template<int i> __fi void GSState::GIFRegHandlerSCISSOR(const GIFReg* r)
template<int i> void GSState::GIFRegHandlerSCISSOR(const GIFReg* r)
{
if(PRIM->CTXT == i && r->SCISSOR != m_env.CTXT[i].SCISSOR)
{
@ -779,7 +784,7 @@ template<int i> __fi void GSState::GIFRegHandlerSCISSOR(const GIFReg* r)
m_env.CTXT[i].UpdateScissor();
}
template<int i> __fi void GSState::GIFRegHandlerALPHA(const GIFReg* r)
template<int i> void GSState::GIFRegHandlerALPHA(const GIFReg* r)
{
ASSERT(r->ALPHA.A != 3);
ASSERT(r->ALPHA.B != 3);
@ -1142,66 +1147,6 @@ void GSState::Read(uint8* mem, int len)
m_mem.ReadImageX(m_tr.x, m_tr.y, mem, len, m_env.BITBLTBUF, m_env.TRXPOS, m_env.TRXREG);
}
// Use version 1 of the optimized local > local transfer, as per revision 887.
// Later (more optimized?) versions cause a crash in Dark Cloud 2.
#if 1
// Local-to-local transfer: copies the RRW x RRH pixel rectangle described by TRXPOS/TRXREG
// from the BITBLTBUF source buffer (SBP/SBW/SPSM) to the destination buffer (DBP/DBW/DPSM),
// one pixel at a time through m_mem.
void GSState::Move()
{
// ffxii uses this to move the top/bottom of the scrolling menus offscreen and then blends them back over the text to create a shading effect
// guitar hero copies the far end of the board to do a similar blend too
int sx = m_env.TRXPOS.SSAX;
int dx = m_env.TRXPOS.DSAX;
int sy = m_env.TRXPOS.SSAY;
int dy = m_env.TRXPOS.DSAY;
int w = m_env.TRXREG.RRW;
int h = m_env.TRXREG.RRH;
int xinc = 1;
int yinc = 1;
// Invalidate the source and destination rectangles before any pixel is copied.
// NOTE(review): presumably this lets the renderer sync/drop its own copies of these areas - confirm against the overrides.
InvalidateLocalMem(m_env.BITBLTBUF, GSVector4i(sx, sy, sx + w, sy + h));
InvalidateVideoMem(m_env.BITBLTBUF, GSVector4i(dx, dy, dx + w, dy + h));
// When the source starts before the destination on an axis, begin at the far edge and step backwards.
// NOTE(review): presumably so overlapping rectangles are read before they are overwritten - confirm.
if(sx < dx) sx += w-1, dx += w-1, xinc = -1;
if(sy < dy) sy += h-1, dy += h-1, yinc = -1;
const GSLocalMemory::psm_t& spsm = GSLocalMemory::m_psm[m_env.BITBLTBUF.SPSM];
const GSLocalMemory::psm_t& dpsm = GSLocalMemory::m_psm[m_env.BITBLTBUF.DPSM];
if(m_env.BITBLTBUF.SPSM == PSM_PSMCT32 && m_env.BITBLTBUF.DPSM == PSM_PSMCT32)
{
// Both formats are PSMCT32: use the direct 32-bit pixel accessors.
// The row step also rewinds sx/dx by xinc*w, undoing the w steps taken by the inner loop.
for(int y = 0; y < h; y++, sy += yinc, dy += yinc, sx -= xinc*w, dx -= xinc*w)
{
DWORD sbase = spsm.pa(0, sy, m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW);
int* soffset = spsm.rowOffset[sy & 7];
DWORD dbase = dpsm.pa(0, dy, m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW);
int* doffset = dpsm.rowOffset[dy & 7];
for(int x = 0; x < w; x++, sx += xinc, dx += xinc)
{
m_mem.WritePixel32(dbase + doffset[dx], m_mem.ReadPixel32(sbase + soffset[sx]));
}
}
}
else
{
// Generic path: same traversal, but pixels go through each format's rpa/wpa member-function pointers.
for(int y = 0; y < h; y++, sy += yinc, dy += yinc, sx -= xinc*w, dx -= xinc*w)
{
DWORD sbase = spsm.pa(0, sy, m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW);
int* soffset = spsm.rowOffset[sy & 7];
DWORD dbase = dpsm.pa(0, dy, m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW);
int* doffset = dpsm.rowOffset[dy & 7];
for(int x = 0; x < w; x++, sx += xinc, dx += xinc)
{
(m_mem.*dpsm.wpa)(dbase + doffset[dx], (m_mem.*spsm.rpa)(sbase + soffset[sx]));
}
}
}
}
#else
void GSState::Move()
{
// ffxii uses this to move the top/bottom of the scrolling menus offscreen and then blends them back over the text to create a shading effect
@ -1346,10 +1291,7 @@ void GSState::Move()
int* RESTRICT scol = &spo->pixel.col[sy & 7][sx];
int* RESTRICT dcol = &dpo->pixel.col[dy & 7][dx];
for(int x = 0; x > -w; x--) {
printf("%d",x); //Dark Cloud 2 crashes at x = -63
d[dcol[x]] = s[scol[x]];
}
for(int x = 0; x > -w; x--) d[dcol[x]] = s[scol[x]];
}
}
}
@ -1412,7 +1354,7 @@ void GSState::Move()
}
}
}
#endif
void GSState::SoftReset(uint32 mask)
{
if(mask & 1)
@ -1508,91 +1450,7 @@ template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
{
do
{
uint32 reg = path.GetReg();
#if 0
// I assume this was some sort of debugging code? Why intercept and perform
// special handling for the first three entries in the table, and then do
// a LUT for the rest? Either do a switch for the whole table (best idea)
// or do a LUT for the whole table.
switch(reg)
{
case GIF_REG_RGBA:
GIFPackedRegHandlerRGBA((GIFPackedReg*)mem);
break;
case GIF_REG_STQ:
GIFPackedRegHandlerSTQ((GIFPackedReg*)mem);
break;
case GIF_REG_UV:
GIFPackedRegHandlerUV((GIFPackedReg*)mem);
break;
default:
(this->*m_fpGIFPackedRegHandlers[reg])((GIFPackedReg*)mem);
break;
}
#endif
#if UsePackedRegSwitch
// This is a switch statement version of the LUT above. Since there are only
// 16 entries, this is almost certainly ideal, since the compiler can inline
// all the handlers, and PGO will further optimize the switch dispatcher.
if (FrameSkipIt)
{
// When skipping frames it looks like we only need to bother with the A_D handler
// and the TEX handlers. (and I'm thinking the TEX handlers might not be necessary
// if the PCSX2 side of the frameskipper is smart enough anyway).
switch(reg)
{
case GIF_REG_A_D: GIFPackedRegHandlerA_D ((GIFPackedReg*)mem); break;
case GIF_REG_TEX0_1: GIFRegHandlerTEX0<0> ((GIFReg*)mem); break;
case GIF_REG_TEX0_2: GIFRegHandlerTEX0<1> ((GIFReg*)mem); break;
// Should RGBA/STQ/UV be NOPs when skipping frames? I think so, but maybe the original
// switch() (above) was some hack to enable them in frameskipping mode. --air
case GIF_REG_RGBA: //GIFPackedRegHandlerRGBA ((GIFPackedReg*)mem); break;
case GIF_REG_STQ: //GIFPackedRegHandlerSTQ ((GIFPackedReg*)mem); break;
case GIF_REG_UV: //GIFPackedRegHandlerUV ((GIFPackedReg*)mem); break;
case GIF_REG_XYZF2: //GIFPackedRegHandlerXYZF2((GIFPackedReg*)mem); break;
case GIF_REG_XYZ2: //GIFPackedRegHandlerXYZ2 ((GIFPackedReg*)mem); break;
case GIF_REG_CLAMP_1: //GIFRegHandlerCLAMP<0> ((GIFReg*)mem); break;
case GIF_REG_CLAMP_2: //GIFRegHandlerCLAMP<1> ((GIFReg*)mem); break;
case GIF_REG_FOG: //GIFPackedRegHandlerFOG ((GIFPackedReg*)mem); break;
case GIF_REG_XYZF3: //GIFRegHandlerXYZF3 ((GIFReg*)mem); break;
case GIF_REG_XYZ3: //GIFRegHandlerXYZ3 ((GIFReg*)mem); break;
case GIF_REG_NOP: break;
}
}
else
{
switch(reg)
{
case GIF_REG_RGBA: GIFPackedRegHandlerRGBA ((GIFPackedReg*)mem); break;
case GIF_REG_STQ: GIFPackedRegHandlerSTQ ((GIFPackedReg*)mem); break;
case GIF_REG_UV: GIFPackedRegHandlerUV ((GIFPackedReg*)mem); break;
case GIF_REG_XYZF2: GIFPackedRegHandlerXYZF2((GIFPackedReg*)mem); break;
case GIF_REG_XYZ2: GIFPackedRegHandlerXYZ2 ((GIFPackedReg*)mem); break;
case GIF_REG_TEX0_1: GIFRegHandlerTEX0<0> ((GIFReg*)mem); break;
case GIF_REG_TEX0_2: GIFRegHandlerTEX0<1> ((GIFReg*)mem); break;
case GIF_REG_CLAMP_1: GIFRegHandlerCLAMP<0> ((GIFReg*)mem); break;
case GIF_REG_CLAMP_2: GIFRegHandlerCLAMP<1> ((GIFReg*)mem); break;
case GIF_REG_FOG: GIFPackedRegHandlerFOG ((GIFPackedReg*)mem); break;
case GIF_REG_XYZF3: GIFRegHandlerXYZF3 ((GIFReg*)mem); break;
case GIF_REG_XYZ3: GIFRegHandlerXYZ3 ((GIFReg*)mem); break;
case GIF_REG_A_D: GIFPackedRegHandlerA_D ((GIFPackedReg*)mem); break;
case GIF_REG_NOP: break;
}
}
#else
// This is the original LUT implementation of the packed reg dispatcher.
// Simple and clean, but the switch system below is probably more efficient.
(this->*m_fpGIFPackedRegHandlers[reg])((GIFPackedReg*)mem);
#endif
(this->*m_fpGIFPackedRegHandlers[path.GetReg()])((GIFPackedReg*)mem);
mem += sizeof(GIFPackedReg);
size--;
@ -1779,7 +1637,7 @@ int GSState::Freeze(GSFreezeData* fd, bool sizeonly)
WriteState(data, &m_tr.y);
WriteState(data, m_mem.m_vm8, m_mem.m_vmsize);
for(int i = 0; i < ArraySize(m_path); i++)
for(int i = 0; i < countof(m_path); i++)
{
m_path[i].tag.NREG = m_path[i].nreg;
m_path[i].tag.NLOOP = m_path[i].nloop;
@ -1874,7 +1732,7 @@ int GSState::Defrost(const GSFreezeData* fd)
m_tr.total = 0; // TODO: restore transfer state
for(int i = 0; i < ArraySize(m_path); i++)
for(int i = 0; i < countof(m_path); i++)
{
ReadState(&m_path[i].tag, data);
ReadState(&m_path[i].reg, data);
@ -1888,6 +1746,8 @@ int GSState::Defrost(const GSFreezeData* fd)
m_context = &m_env.CTXT[PRIM->CTXT];
UpdateVertexKick();
m_env.UpdateDIMX();
for(int i = 0; i < 2; i++)
@ -1918,7 +1778,7 @@ GSState::GSTransferBuffer::GSTransferBuffer()
{
x = y = 0;
start = end = total = 0;
buff = (uint8*)_aligned_malloc(1024 * 1024 * 4, 16);
buff = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);
}
GSState::GSTransferBuffer::~GSTransferBuffer()

View File

@ -36,17 +36,11 @@
#include "GSAlignedClass.h"
#include "GSDump.h"
// Set this to 1 to enable a switch statement instead of a LUT for the packed register handler
// in the GifTransfer code. Switch statement is probably faster, but it isn't fully implemented
// yet (not properly supporting frameskipping).
#define UsePackedRegSwitch 0
class GSState : public GSAlignedClass<16>
class GSState : public GSAlignedClass<32>
{
#if !UsePackedRegSwitch
typedef void (GSState::*GIFPackedRegHandler)(const GIFPackedReg* r);
GIFPackedRegHandler m_fpGIFPackedRegHandlers[16];
#endif
void GIFPackedRegHandlerNull(const GIFPackedReg* r);
void GIFPackedRegHandlerRGBA(const GIFPackedReg* r);
@ -62,7 +56,7 @@ class GSState : public GSAlignedClass<16>
GIFRegHandler m_fpGIFRegHandlers[256];
void ApplyTEX0( uint i, GIFRegTEX0& TEX0 );
void ApplyTEX0(uint i, GIFRegTEX0& TEX0);
void ApplyPRIM(const GIFRegPRIM& PRIM);
void GIFRegHandlerNull(const GIFReg* r);
@ -136,33 +130,67 @@ class GSState : public GSAlignedClass<16>
protected:
bool IsBadFrame(int& skip, int UserHacks_SkipDraw);
typedef void (GSState::*DrawingKickPtr)(bool skip);
typedef void (GSState::*VertexKickPtr)(bool skip);
DrawingKickPtr m_dk[8];
VertexKickPtr m_vk[8][2][2];
VertexKickPtr m_vkf;
// Fills the kick dispatch tables with member-function pointers to renderer type T's
// per-primitive template instantiations; the GS_INVALID slots get the GSState null handlers.
template<class T> void InitVertexKick()
{
m_dk[GS_POINTLIST] = (DrawingKickPtr)&T::DrawingKick<GS_POINTLIST>;
m_dk[GS_LINELIST] = (DrawingKickPtr)&T::DrawingKick<GS_LINELIST>;
m_dk[GS_LINESTRIP] = (DrawingKickPtr)&T::DrawingKick<GS_LINESTRIP>;
m_dk[GS_TRIANGLELIST] = (DrawingKickPtr)&T::DrawingKick<GS_TRIANGLELIST>;
m_dk[GS_TRIANGLESTRIP] = (DrawingKickPtr)&T::DrawingKick<GS_TRIANGLESTRIP>;
m_dk[GS_TRIANGLEFAN] = (DrawingKickPtr)&T::DrawingKick<GS_TRIANGLEFAN>;
m_dk[GS_SPRITE] = (DrawingKickPtr)&T::DrawingKick<GS_SPRITE>;
m_dk[GS_INVALID] = &GSState::DrawingKickNull;
// m_vk is indexed [primitive][a][b]; UpdateVertexKick() looks entries up as
// [PRIM->PRIM][PRIM->TME][PRIM->FST], so a/b correspond to the TME/FST bits.
// NOTE(review): every [0][1] slot reuses the <prim, 0, 0> instantiation - presumably FST is
// irrelevant when TME is 0, so no separate <prim, 0, 1> is instantiated; confirm this is intentional.
m_vk[GS_POINTLIST][0][0] = (VertexKickPtr)&T::VertexKick<GS_POINTLIST, 0, 0>;
m_vk[GS_POINTLIST][0][1] = (VertexKickPtr)&T::VertexKick<GS_POINTLIST, 0, 0>;
m_vk[GS_POINTLIST][1][0] = (VertexKickPtr)&T::VertexKick<GS_POINTLIST, 1, 0>;
m_vk[GS_POINTLIST][1][1] = (VertexKickPtr)&T::VertexKick<GS_POINTLIST, 1, 1>;
m_vk[GS_LINELIST][0][0] = (VertexKickPtr)&T::VertexKick<GS_LINELIST, 0, 0>;
m_vk[GS_LINELIST][0][1] = (VertexKickPtr)&T::VertexKick<GS_LINELIST, 0, 0>;
m_vk[GS_LINELIST][1][0] = (VertexKickPtr)&T::VertexKick<GS_LINELIST, 1, 0>;
m_vk[GS_LINELIST][1][1] = (VertexKickPtr)&T::VertexKick<GS_LINELIST, 1, 1>;
m_vk[GS_LINESTRIP][0][0] = (VertexKickPtr)&T::VertexKick<GS_LINESTRIP, 0, 0>;
m_vk[GS_LINESTRIP][0][1] = (VertexKickPtr)&T::VertexKick<GS_LINESTRIP, 0, 0>;
m_vk[GS_LINESTRIP][1][0] = (VertexKickPtr)&T::VertexKick<GS_LINESTRIP, 1, 0>;
m_vk[GS_LINESTRIP][1][1] = (VertexKickPtr)&T::VertexKick<GS_LINESTRIP, 1, 1>;
m_vk[GS_TRIANGLELIST][0][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLELIST, 0, 0>;
m_vk[GS_TRIANGLELIST][0][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLELIST, 0, 0>;
m_vk[GS_TRIANGLELIST][1][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLELIST, 1, 0>;
m_vk[GS_TRIANGLELIST][1][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLELIST, 1, 1>;
m_vk[GS_TRIANGLESTRIP][0][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLESTRIP, 0, 0>;
m_vk[GS_TRIANGLESTRIP][0][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLESTRIP, 0, 0>;
m_vk[GS_TRIANGLESTRIP][1][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLESTRIP, 1, 0>;
m_vk[GS_TRIANGLESTRIP][1][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLESTRIP, 1, 1>;
m_vk[GS_TRIANGLEFAN][0][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLEFAN, 0, 0>;
m_vk[GS_TRIANGLEFAN][0][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLEFAN, 0, 0>;
m_vk[GS_TRIANGLEFAN][1][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLEFAN, 1, 0>;
m_vk[GS_TRIANGLEFAN][1][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLEFAN, 1, 1>;
m_vk[GS_SPRITE][0][0] = (VertexKickPtr)&T::VertexKick<GS_SPRITE, 0, 0>;
m_vk[GS_SPRITE][0][1] = (VertexKickPtr)&T::VertexKick<GS_SPRITE, 0, 0>;
m_vk[GS_SPRITE][1][0] = (VertexKickPtr)&T::VertexKick<GS_SPRITE, 1, 0>;
m_vk[GS_SPRITE][1][1] = (VertexKickPtr)&T::VertexKick<GS_SPRITE, 1, 1>;
m_vk[GS_INVALID][0][0] = &GSState::VertexKickNull;
m_vk[GS_INVALID][0][1] = &GSState::VertexKickNull;
m_vk[GS_INVALID][1][0] = &GSState::VertexKickNull;
m_vk[GS_INVALID][1][1] = &GSState::VertexKickNull;
}
void DrawingKickNull(bool skip)
// Looks up the vertex-kick handler matching the current PRIM fields (PRIM, TME, FST) in m_vk
// and caches it in m_vkf, the pointer that VertexKick() invokes.
void UpdateVertexKick()
{
m_vkf = m_vk[PRIM->PRIM][PRIM->TME][PRIM->FST];
}
// Handler stored in the GS_INVALID slots of m_vk by InitVertexKick(); it ignores `skip`
// and asserts if it is ever called.
void VertexKickNull(bool skip)
{
ASSERT(0);
}
virtual void DoVertexKick()=0;
__fi void VertexKick(bool skip)
void VertexKick(bool skip)
{
DoVertexKick();
(this->*m_dk[PRIM->PRIM])(skip);
(this->*m_vkf)(skip);
}
public:
@ -221,6 +249,6 @@ public:
void SetFrameSkip(int skip);
void SetRegsMem(uint8* basemem);
void SetIrqCallback(void (*irq)());
void SetMultithreaded(bool isMT=true);
void SetMultithreaded(bool mt = true);
};

View File

@ -37,9 +37,12 @@ extern const uint8 clutTableT32I8[128];
extern const uint8 clutTableT32I4[16];
extern const uint8 clutTableT16I8[32];
extern const uint8 clutTableT16I4[16];
struct D3D9Blend {
struct D3D9Blend
{
int bogus;
D3DBLENDOP op;
D3DBLEND src, dst;
};
extern const D3D9Blend blendMapD3D9[3*3*3*3];

View File

@ -27,6 +27,6 @@ GSTexture::GSTexture()
, m_size(0, 0)
, m_type(None)
, m_msaa(false)
, LikelyOffset (false)
, LikelyOffset(false)
{
}

View File

@ -836,11 +836,11 @@ GSTextureCache::Source::Source(GSRenderer* r)
{
memset(m_valid, 0, sizeof(m_valid));
m_clut = (uint32*)_aligned_malloc(256 * sizeof(uint32), 16);
m_clut = (uint32*)_aligned_malloc(256 * sizeof(uint32), 32);
// m_clut is a pointer, so sizeof(m_clut) is only the pointer size (4 or 8 bytes);
// clear the whole 256-entry table that was just allocated instead.
memset(m_clut, 0, 256 * sizeof(uint32));
m_write.rect = (GSVector4i*)_aligned_malloc(3 * sizeof(GSVector4i), 16);
m_write.rect = (GSVector4i*)_aligned_malloc(3 * sizeof(GSVector4i), 32);
m_write.count = 0;
}
@ -1082,7 +1082,7 @@ void GSTextureCache::Target::Update()
}
else
{
static uint8* buff = (uint8*)::_aligned_malloc(1024 * 1024 * 4, 16);
static uint8* buff = (uint8*)::_aligned_malloc(1024 * 1024 * 4, 32);
int pitch = ((w + 3) & ~3) * 4;

View File

@ -39,7 +39,7 @@ public:
FMT_8,
};
class Surface : public GSAlignedClass<16>
class Surface : public GSAlignedClass<32>
{
protected:
GSRenderer* m_renderer;

View File

@ -253,7 +253,7 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX
if(m_buff == NULL)
{
m_buff = _aligned_malloc(tw * th * sizeof(uint32), 16);
m_buff = _aligned_malloc(tw * th * sizeof(uint32), 32);
if(m_buff == NULL)
{

View File

@ -137,6 +137,7 @@ void GSDevice11::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
}
VSSetShader(i->second.vs, m_vs_cb);
IASetInputLayout(i->second.il);
}

View File

@ -69,7 +69,7 @@ void GSDevice9::SetupIA(const void* vertices, int count, int prim)
void GSDevice9::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
{
hash_map< uint32, GSVertexShader9 >::const_iterator i = m_vs.find(sel);
hash_map<uint32, GSVertexShader9>::const_iterator i = m_vs.find(sel);
if(i == m_vs.end())
{
@ -110,6 +110,7 @@ void GSDevice9::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
}
VSSetShader(i->second.vs, (const float*)cb, sizeof(*cb) / sizeof(GSVector4));
IASetInputLayout(i->second.il);
}

View File

@ -27,26 +27,6 @@ const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f);
const GSVector4 GSVector4::m_x3f800000(_mm_castsi128_ps(_mm_set1_epi32(0x3f800000)));
const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000)));
GSVector4i::GSVector4i(const GSVector4& v)
{
m = _mm_cvttps_epi32(v);
}
GSVector4::GSVector4(const GSVector4i& v)
{
m = _mm_cvtepi32_ps(v);
}
GSVector4i GSVector4i::cast(const GSVector4& v)
{
return GSVector4i(_mm_castps_si128(v.m));
}
GSVector4 GSVector4::cast(const GSVector4i& v)
{
return GSVector4(_mm_castsi128_ps(v.m));
}
GSVector4i GSVector4i::fit(int arx, int ary) const
{
GSVector4i r = *this;

File diff suppressed because it is too large Load Diff

View File

@ -28,7 +28,7 @@
#pragma pack(push, 1)
__aligned16 struct GSVertex
__aligned32 struct GSVertex
{
union
{

View File

@ -26,7 +26,7 @@
#pragma pack(push, 1)
__aligned16 union GSVertexHW9
__aligned32 union GSVertexHW9
{
struct
{
@ -56,7 +56,7 @@ __aligned16 union GSVertexHW9
float GetQ() {return p.w;}
};
__aligned16 union GSVertexHW11
__aligned32 union GSVertexHW11
{
struct
{

View File

@ -31,7 +31,7 @@ public:
GSVertexList()
: m_count(0)
{
m_base = _aligned_malloc(sizeof(Vertex) * countof(m_v), 16);
m_base = _aligned_malloc(sizeof(Vertex) * countof(m_v), 32);
for(int i = 0; i < countof(m_v); i++)
{

View File

@ -23,12 +23,16 @@
#include "GSVector.h"
__aligned16 union GSVertexSW
__aligned32 union GSVertexSW
{
struct {GSVector4 c, p, t;};
struct {GSVector4 v[3];};
struct {float f[12];};
#if _M_SSE >= 0x500
struct {GSVector8 cp, t_;};
#endif
GSVertexSW() {}
GSVertexSW(const GSVertexSW& v) {*this = v;}
@ -213,4 +217,3 @@ __forceinline GSVertexSW operator / (const GSVertexSW& v, float f)
v0.t = v.t / vf;
return v0;
}

View File

@ -120,8 +120,8 @@ void GSVertexTrace::Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primc
using namespace Xbyak;
GSVertexTrace::CGSW::CGSW(uint32 key, void* ptr, size_t maxsize)
: CodeGenerator(maxsize, ptr)
GSVertexTrace::CGSW::CGSW(uint32 key, void* code, size_t maxsize)
: CodeGenerator(maxsize, code)
{
#if _M_AMD64
#error TODO
@ -161,10 +161,10 @@ GSVertexTrace::CGSW::CGSW(uint32 key, void* ptr, size_t maxsize)
static const float fmin = -FLT_MAX;
static const float fmax = FLT_MAX;
movss(xmm0, xmmword[&fmax]);
movss(xmm0, ptr[&fmax]);
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
movss(xmm1, xmmword[&fmin]);
movss(xmm1, ptr[&fmin]);
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
if(color)
@ -202,7 +202,7 @@ L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
movaps(xmm1, xmmword[edx + 1 * sizeof(GSVertexSW) + 32]);
movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
@ -213,7 +213,7 @@ L("loop");
// min.c = min.c.minv(v[i + j].c);
// max.c = max.c.maxv(v[i + j].c);
movaps(xmm0, xmmword[edx + j * sizeof(GSVertexSW)]);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]);
minps(xmm2, xmm0);
maxps(xmm3, xmm0);
@ -222,7 +222,7 @@ L("loop");
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, xmmword[edx + j * sizeof(GSVertexSW) + 16]);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]);
minps(xmm4, xmm0);
maxps(xmm5, xmm0);
@ -232,7 +232,7 @@ L("loop");
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
movaps(xmm0, xmmword[edx + j * sizeof(GSVertexSW) + 32]);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]);
if(!fst)
{
@ -265,27 +265,27 @@ L("loop");
{
cvttps2dq(xmm2, xmm2);
psrld(xmm2, 7);
movaps(xmmword[eax], xmm2);
movaps(ptr[eax], xmm2);
cvttps2dq(xmm3, xmm3);
psrld(xmm3, 7);
movaps(xmmword[edx], xmm3);
movaps(ptr[edx], xmm3);
}
movaps(xmmword[eax + 16], xmm4);
movaps(xmmword[edx + 16], xmm5);
movaps(ptr[eax + 16], xmm4);
movaps(ptr[edx + 16], xmm5);
if(tme)
{
movaps(xmmword[eax + 32], xmm6);
movaps(xmmword[edx + 32], xmm7);
movaps(ptr[eax + 32], xmm6);
movaps(ptr[edx + 32], xmm7);
}
ret();
}
GSVertexTrace::CGHW9::CGHW9(uint32 key, void* ptr, size_t maxsize)
: CodeGenerator(maxsize, ptr)
GSVertexTrace::CGHW9::CGHW9(uint32 key, void* code, size_t maxsize)
: CodeGenerator(maxsize, code)
{
#if _M_AMD64
#error TODO
@ -327,10 +327,10 @@ GSVertexTrace::CGHW9::CGHW9(uint32 key, void* ptr, size_t maxsize)
static const float fmin = -FLT_MAX;
static const float fmax = FLT_MAX;
movss(xmm0, xmmword[&fmax]);
movss(xmm0, ptr[&fmax]);
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
movss(xmm1, xmmword[&fmin]);
movss(xmm1, ptr[&fmin]);
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
if(color)
@ -368,7 +368,7 @@ L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
movaps(xmm1, xmmword[edx + 5 * sizeof(GSVertexHW9) + 16]);
movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
@ -377,7 +377,7 @@ L("loop");
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, xmmword[edx + j * sizeof(GSVertexHW9) + 16]);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]);
minps(xmm4, xmm0);
maxps(xmm5, xmm0);
@ -390,7 +390,7 @@ L("loop");
if(color && (iip || j == n - 1) || tme)
{
movaps(xmm0, xmmword[edx + j * sizeof(GSVertexHW9)]);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]);
}
if(color && (iip || j == n - 1))
@ -455,15 +455,15 @@ L("loop");
punpcklwd(xmm3, xmm0);
}
movaps(xmmword[eax], xmm2);
movaps(xmmword[edx], xmm3);
movaps(ptr[eax], xmm2);
movaps(ptr[edx], xmm3);
}
// m_min.p = pmin;
// m_max.p = pmax;
movaps(xmmword[eax + 16], xmm4);
movaps(xmmword[edx + 16], xmm5);
movaps(ptr[eax + 16], xmm4);
movaps(ptr[edx + 16], xmm5);
if(tme)
{
@ -473,15 +473,15 @@ L("loop");
shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(xmmword[eax + 32], xmm6);
movaps(xmmword[edx + 32], xmm7);
movaps(ptr[eax + 32], xmm6);
movaps(ptr[edx + 32], xmm7);
}
ret();
}
GSVertexTrace::CGHW11::CGHW11(uint32 key, void* ptr, size_t maxsize)
: CodeGenerator(maxsize, ptr)
GSVertexTrace::CGHW11::CGHW11(uint32 key, void* code, size_t maxsize)
: CodeGenerator(maxsize, code)
{
#if _M_AMD64
#error TODO
@ -521,10 +521,10 @@ GSVertexTrace::CGHW11::CGHW11(uint32 key, void* ptr, size_t maxsize)
static const float fmin = -FLT_MAX;
static const float fmax = FLT_MAX;
movss(xmm0, xmmword[&fmax]);
movss(xmm0, ptr[&fmax]);
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
movss(xmm1, xmmword[&fmin]);
movss(xmm1, ptr[&fmin]);
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
if(color)
@ -564,7 +564,7 @@ L("loop");
{
if(color && (iip || j == n - 1) || tme)
{
movaps(xmm0, xmmword[edx + j * sizeof(GSVertexHW11)]);
movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW11)]);
}
if(color && (iip || j == n - 1))
@ -593,7 +593,7 @@ L("loop");
maxps(xmm7, xmm0);
}
movdqa(xmm0, xmmword[edx + j * sizeof(GSVertexHW11) + 16]);
movdqa(xmm0, ptr[edx + j * sizeof(GSVertexHW11) + 16]);
if(m_cpu.has(util::Cpu::tSSE41))
{
@ -648,8 +648,8 @@ L("loop");
punpcklwd(xmm3, xmm0);
}
movaps(xmmword[eax], xmm2);
movaps(xmmword[edx], xmm3);
movaps(ptr[eax], xmm2);
movaps(ptr[edx], xmm3);
}
// m_min.p = pmin.xyww();
@ -658,16 +658,16 @@ L("loop");
shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(xmmword[eax + 16], xmm4);
movaps(xmmword[edx + 16], xmm5);
movaps(ptr[eax + 16], xmm4);
movaps(ptr[edx + 16], xmm5);
if(tme)
{
// m_min.t = tmin;
// m_max.t = tmax;
movaps(xmmword[eax + 32], xmm6);
movaps(xmmword[edx + 32], xmm7);
movaps(ptr[eax + 32], xmm6);
movaps(ptr[edx + 32], xmm7);
}
ret();

View File

@ -31,7 +31,7 @@
class GSState;
__aligned16 class GSVertexTrace
__aligned32 class GSVertexTrace
{
struct Vertex {GSVector4i c; GSVector4 p, t;};
struct VertexAlpha {int min, max; bool valid;};
@ -41,14 +41,14 @@ __aligned16 class GSVertexTrace
class CGSW : public Xbyak::CodeGenerator
{
public:
CGSW(uint32 key, void* ptr, size_t maxsize);
CGSW(uint32 key, void* code, size_t maxsize);
};
class GSVertexTraceMapSW : public GSCodeGeneratorFunctionMap<CGSW, uint32, VertexTracePtr>
{
public:
GSVertexTraceMapSW() : GSCodeGeneratorFunctionMap("VertexTraceSW") {}
CGSW* Create(uint32 key, void* ptr, size_t maxsize) {return new CGSW(key, ptr, maxsize);}
CGSW* Create(uint32 key, void* code, size_t maxsize) {return new CGSW(key, code, maxsize);}
};
class CGHW9 : public Xbyak::CodeGenerator
@ -63,7 +63,7 @@ __aligned16 class GSVertexTrace
{
public:
GSVertexTraceMapHW9() : GSCodeGeneratorFunctionMap("VertexTraceHW9") {}
CGHW9* Create(uint32 key, void* ptr, size_t maxsize) {return new CGHW9(key, ptr, maxsize);}
CGHW9* Create(uint32 key, void* code, size_t maxsize) {return new CGHW9(key, code, maxsize);}
};
class CGHW11 : public Xbyak::CodeGenerator
@ -78,7 +78,7 @@ __aligned16 class GSVertexTrace
{
public:
GSVertexTraceMapHW11() : GSCodeGeneratorFunctionMap("VertexTraceHW11") {}
CGHW11* Create(uint32 key, void* ptr, size_t maxsize) {return new CGHW11(key, ptr, maxsize);}
CGHW11* Create(uint32 key, void* code, size_t maxsize) {return new CGHW11(key, code, maxsize);}
};
GSVertexTraceMapSW m_map_sw;

View File

@ -174,6 +174,7 @@ GSVector4i GSWnd::GetClientRect()
// Returns FALSE if the window has no title, or if the window title is under the strict
// management of the emulator.
bool GSWnd::SetWindowText(const char* title)
{
if( !m_IsManaged ) return false;

View File

@ -57,6 +57,7 @@
#include <algorithm>
// Let's take advantage of the work that's already been done on making things cross-platform by bringing this in.
#include "Pcsx2Defs.h"
using namespace std;
@ -126,7 +127,7 @@ typedef signed long long int64;
#define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA)
#define USE_UPSCALE_HACKS //Hacks intended to fix upscaling / rendering glitches in HW renderers
#define USE_UPSCALE_HACKS // Hacks intended to fix upscaling / rendering glitches in HW renderers
// dxsdk beta missing these:
#define D3D11_SHADER_MACRO D3D10_SHADER_MACRO

View File

@ -1,12 +1,12 @@
#ifndef XBYAK_H_
#define XBYAK_H_
#ifndef XBYAK_XBYAK_H_
#define XBYAK_XBYAK_H_
/*!
@file xbyak.h
@brief Xbyak ; JIT assembler for x86(IA32)/x64 by C++
@author herumi
@version $Revision: 1.157 $
@version $Revision: 1.238 $
@url http://homepage1.nifty.com/herumi/soft/xbyak.html
@date $Date: 2008/12/30 04:53:11 $
@date $Date: 2011/02/04 03:46:09 $
@note modified new BSD license
http://www.opensource.org/licenses/bsd-license.php
*/
@ -15,9 +15,12 @@
#include <assert.h>
#include <map>
#include <string>
#ifdef __GNUC__
#include <unistd.h>
#include <sys/mman.h>
#include <algorithm>
#ifdef _WIN32
#include <windows.h>
#elif defined(__GNUC__)
#include <unistd.h>
#include <sys/mman.h>
#endif
#ifdef __x86_64__
@ -45,13 +48,6 @@
#pragma warning(disable : 4127) /* condition is constant(for "if" trick) */
#endif
#endif
#include <windows.h>
#endif
#ifndef NUM_OF_ARRAY
// template<class T, int N>
// size_t num_of_array(const T (&)[N]) { return N; }
#define NUM_OF_ARRAY(x) (sizeof(x) / sizeof(*x))
#endif
namespace Xbyak {
@ -59,29 +55,35 @@ namespace Xbyak {
#include "xbyak_bin2hex.h"
enum {
DEFAULT_MAX_CODE_SIZE = 2048,
VERSION = 0x2070, /* 0xABCD = A.BC(D) */
DEFAULT_MAX_CODE_SIZE = 4096,
VERSION = 0x2990, /* 0xABCD = A.BC(D) */
};
/*
#ifndef MIE_DEFINED_UINT32
#define MIE_DEFINED_UINT32
#ifdef _MSC_VER
#ifndef MIE_INTEGER_TYPE_DEFINED
#define MIE_INTEGER_TYPE_DEFINED
#ifdef _MSC_VER
typedef unsigned __int64 uint64;
#else
typedef __int64 sint64;
#else
typedef unsigned long long uint64;
#endif
typedef unsigned int uint32;
typedef unsigned short uint16;
typedef unsigned char uint8;
#ifndef MIE_ALIGN
typedef long long sint64;
#endif
typedef unsigned int uint32;
typedef unsigned short uint16;
typedef unsigned char uint8;
#endif
*/
#ifndef MIE_ALIGN
#ifdef _MSC_VER
#define MIE_ALIGN(x) __declspec(align(x))
#else
#define MIE_ALIGN(x) __attribute__((aligned(x)))
#endif
#endif
#endif
*/
#ifndef MIE_PACK // for shufps
#define MIE_PACK(x, y, z, w) ((x) * 64 + (y) * 16 + (z) * 4 + (w))
#endif
enum Error {
ERR_NONE = 0,
ERR_BAD_ADDRESSING,
@ -101,6 +103,10 @@ enum Error {
ERR_CANT_USE_64BIT_DISP,
ERR_OFFSET_IS_TOO_BIG,
ERR_MEM_SIZE_IS_NOT_SPECIFIED,
ERR_BAD_MEM_SIZE,
ERR_BAD_ST_COMBINATION,
ERR_OVER_LOCAL_LABEL,
ERR_UNDER_LOCAL_LABEL,
ERR_INTERNAL
};
@ -125,6 +131,10 @@ static inline const char *ConvertErrorToString(Error err)
"can't use 64bit disp(use (void*))",
"offset is too big",
"MEM size is not specified",
"bad mem size",
"bad st combination",
"over local label",
"under local label",
"internal error",
};
if (err < 0 || err > ERR_INTERNAL) return 0;
@ -135,7 +145,7 @@ namespace inner {
enum { debug = 1 };
static inline uint32 GetPtrDist(const void *p1, const void *p2 = 0)
static inline uint32 GetPtrDist(const void *p1, const void *p2)
{
uint64 diff = static_cast<const char *>(p1) - static_cast<const char *>(p2);
#ifdef XBYAK64
@ -145,6 +155,7 @@ static inline uint32 GetPtrDist(const void *p1, const void *p2 = 0)
}
// True when x, read as a two's-complement 32-bit value, lies in the signed 8-bit range [-128, 127].
static inline bool IsInDisp8(uint32 x)
{
	const bool isSmallPositive = x <= 0x7F;       // 0 .. 127
	const bool isSmallNegative = 0xFFFFFF80 <= x; // -128 .. -1 as 32-bit two's complement
	return isSmallNegative || isSmallPositive;
}
// True when x, read as a two's-complement 64-bit value, lies in the signed 32-bit range [-2^31, 2^31 - 1].
static inline bool IsInInt32(uint64 x)
{
	const bool isSmallPositive = x <= 0x7FFFFFFFU;            // 0 .. 2^31 - 1
	const bool isSmallNegative = 0xFFFFFFFF80000000ULL <= x;  // -2^31 .. -1 as 64-bit two's complement
	return isSmallNegative || isSmallPositive;
}
}
@ -163,7 +174,8 @@ public:
REG = 1 << 3,
MMX = 1 << 4,
XMM = 1 << 5,
FPU = 1 << 6
FPU = 1 << 6,
YMM = 1 << 7
};
enum Code {
#ifdef XBYAK64
@ -191,10 +203,11 @@ public:
bool isNone() const { return kind_ == 0; }
bool isMMX() const { return is(MMX); }
bool isXMM() const { return is(XMM); }
bool isYMM() const { return is(YMM); }
bool isREG(int bit = 0) const { return is(REG, bit); }
bool isMEM(int bit = 0) const { return is(MEM, bit); }
bool isFPU() const { return is(FPU); }
bool isExt8bit() const { return ext8bit_ != 0; }
Operand changeBit(int bit) const { return Operand(idx_, static_cast<Kind>(kind_), bit, ext8bit_); }
// any bit is acceptable if bit == 0
bool is(int kind, uint32 bit = 0) const
{
@ -216,12 +229,18 @@ public:
{ "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" },
};
return tbl[bit_ == 8 ? 0 : bit_ == 16 ? 1 : bit_ == 32 ? 2 : 3][idx_];
} else if (isMMX()) {
static const char tbl[8][4] = { "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" };
} else if (isYMM()) {
static const char tbl[16][5] = { "ym0", "ym1", "ym2", "ym3", "ym4", "ym5", "ym6", "ym7", "ym8", "ym9", "ym10", "ym11", "ym12", "ym13", "ym14", "ym15" };
return tbl[idx_];
} else if (isXMM()) {
static const char tbl[16][5] = { "xm0", "xm1", "xm2", "xm3", "xm4", "xm5", "xm6", "xm7", "xm8", "xm9", "xm10", "xm11", "xm12", "xm13", "xm14", "xm15" };
return tbl[idx_];
} else if (isMMX()) {
static const char tbl[8][4] = { "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" };
return tbl[idx_];
} else if (isFPU()) {
static const char tbl[8][4] = { "st0", "st1", "st2", "st3", "st4", "st5", "st6", "st7" };
return tbl[idx_];
}
throw ERR_INTERNAL;
}
@ -229,14 +248,15 @@ public:
class Reg : public Operand {
void operator=(const Reg&);
bool hasRex() const { return isExt8bit() | isREG(64) | isExtIdx(); }
public:
Reg() { }
Reg(int idx, Kind kind, int bit = 0, int ext8bit = 0) : Operand(idx, kind, bit, ext8bit) { }
// reg = this
uint8 getRex(const Reg& index = Reg(), const Reg& base = Reg()) const
Reg changeBit(int bit) const { return Reg(getIdx(), getKind(), bit, isExt8bit()); }
bool isExtIdx() const { return getIdx() > 7; }
uint8 getRex(const Reg& base = Reg()) const
{
if ((!isExt8bit() && !index.isExt8bit() && !base.isExt8bit()) && (getIdx() | index.getIdx() | base.getIdx()) < 8) return 0;
return uint8(0x40 | ((getIdx() >> 3) << 2)| ((index.getIdx() >> 3) << 1) | (base.getIdx() >> 3));
return (hasRex() || base.hasRex()) ? uint8(0x40 | ((isREG(64) | base.isREG(64)) ? 8 : 0) | (isExtIdx() ? 4 : 0)| (base.isExtIdx() ? 1 : 0)) : 0;
}
};
@ -261,7 +281,19 @@ public:
class Xmm : public Mmx {
void operator=(const Xmm&);
public:
explicit Xmm(int idx) : Mmx(idx, Operand::XMM, 128) { }
explicit Xmm(int idx, Kind kind = Operand::XMM, int bit = 128) : Mmx(idx, kind, bit) { }
};
// 256-bit register operand (ymm0 .. ymm15): an Xmm whose kind is Operand::YMM and whose width is 256.
class Ymm : public Xmm {
    // declared only; assignment between register objects is not meant to be used
    void operator=(const Ymm&);
public:
    // idx : register number
    explicit Ymm(int idx)
        : Xmm(idx, Operand::YMM, 256)
    {
    }
};
// x87 stack register operand (st0 .. st7): a Reg whose kind is Operand::FPU, created with bit = 32.
class Fpu : public Reg {
    // declared only; assignment between register objects is not meant to be used
    void operator=(const Fpu&);
public:
    // idx : register number
    explicit Fpu(int idx)
        : Reg(idx, Operand::FPU, 32)
    {
    }
};
// register for addressing(32bit or 64bit)
@ -307,7 +339,7 @@ private:
{
return operator+(r, -static_cast<int>(disp));
}
void operator=(const Reg32e&); // don't call
void operator=(const Reg32e&);
public:
explicit Reg32e(int idx, int bit)
: Reg(idx, REG, bit)
@ -362,7 +394,7 @@ struct RegRip {
class CodeArray {
enum {
ALIGN_SIZE = 16,
ALIGN_PAGE_SIZE = 4096,
MAX_FIXED_BUF_SIZE = 8
};
enum Type {
@ -381,13 +413,12 @@ protected:
public:
CodeArray(size_t maxSize = MAX_FIXED_BUF_SIZE, void *userPtr = 0)
: type_(userPtr ? USER_BUF : maxSize <= MAX_FIXED_BUF_SIZE ? FIXED_BUF : ALLOC_BUF)
, allocPtr_(type_ == ALLOC_BUF ? new uint8[maxSize + ALIGN_SIZE] : 0)
, allocPtr_(type_ == ALLOC_BUF ? new uint8[maxSize + ALIGN_PAGE_SIZE] : 0)
, maxSize_(maxSize)
, top_(type_ == ALLOC_BUF ? getAlignedAddress(allocPtr_) : type_ == USER_BUF ? reinterpret_cast<uint8*>(userPtr) : buf_)
, top_(type_ == ALLOC_BUF ? getAlignedAddress(allocPtr_, ALIGN_PAGE_SIZE) : type_ == USER_BUF ? reinterpret_cast<uint8*>(userPtr) : buf_)
, size_(0)
{
if (type_ == ALLOC_BUF && !protect(top_, maxSize, true)) {
// fprintf(stderr, "can't protect (addr=%p, size=%u, canExec=%d)\n", addr, size, canExec);
throw ERR_CANT_PROTECT;
}
}
@ -452,19 +483,19 @@ public:
/*
@param data [in] address of jmp data
@param disp [in] offset from the next of jmp
@param isShort [in] true if short jmp
@param size [in] write size(1, 2, 4, 8)
*/
void rewrite(uint8 *data, uint32 disp, bool isShort)
void rewrite(uint8 *data, uint64 disp, size_t size)
{
if (isShort) {
data[0] = static_cast<uint8>(disp);
} else {
data[0] = static_cast<uint8>(disp);
data[1] = static_cast<uint8>(disp >> 8);
data[2] = static_cast<uint8>(disp >> 16);
data[3] = static_cast<uint8>(disp >> 24);
if (size != 1 && size != 2 && size != 4 && size != 8) throw ERR_BAD_PARAMETER;
for (size_t i = 0; i < size; i++) {
data[i] = static_cast<uint8>(disp >> (i * 8));
}
}
/*
    replace bits 3..5 of the byte at top_ with the low 3 bits of regIdx;
    the remaining bits of that byte are kept as they are
    @param regIdx [in] value for the 3-bit field (higher bits are masked off)
*/
void updateRegField(uint8 regIdx) const
{
    const uint8 kept = static_cast<uint8>(*top_ & B11000111);
    const uint8 field = static_cast<uint8>((regIdx << 3) & B00111000);
    *top_ = static_cast<uint8>(kept | field);
}
/**
change exec permission of memory
@param addr [in] buffer address
@ -474,15 +505,15 @@ public:
*/
static inline bool protect(const void *addr, size_t size, bool canExec)
{
#ifdef __GNUC__
#if defined(_WIN32)
DWORD oldProtect;
return VirtualProtect(const_cast<void*>(addr), size, canExec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldProtect) != 0;
#elif defined(__GNUC__)
size_t pageSize = sysconf(_SC_PAGESIZE);
size_t iaddr = reinterpret_cast<size_t>(addr);
size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1));
int mode = PROT_READ | PROT_WRITE | (canExec ? PROT_EXEC : 0);
return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0;
#elif defined(_WIN32)
DWORD oldProtect;
return VirtualProtect(const_cast<void*>(addr), size, canExec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldProtect) != 0;
#else
return true;
#endif
@ -493,7 +524,7 @@ public:
@param alignedSize [in] power of two
@return addr rounded up to a multiple of alignedSize
*/
static inline uint8 *getAlignedAddress(uint8 *addr, size_t alignedSize = ALIGN_SIZE)
static inline uint8 *getAlignedAddress(uint8 *addr, size_t alignedSize = 16)
{
return reinterpret_cast<uint8*>((reinterpret_cast<size_t>(addr) + alignedSize - 1) & ~(alignedSize - static_cast<size_t>(1)));
}
@ -521,11 +552,7 @@ public:
uint64 getDisp() const { return disp_; }
uint8 getRex() const { return rex_; }
bool is64bitDisp() const { return is64bitDisp_; } // for moffset
#ifdef XBYAK64
void setRex(uint8 rex) { rex_ = rex; }
#else
void setRex(uint8) { }
#endif
};
class AddressFrame {
@ -536,7 +563,11 @@ public:
explicit AddressFrame(uint32 bit) : bit_(bit) { }
Address operator[](const void *disp) const
{
Reg32e r(Reg(), Reg(), 0, inner::GetPtrDist(disp));
size_t adr = reinterpret_cast<size_t>(disp);
#ifdef XBYAK64
if (adr > 0xFFFFFFFFU) throw ERR_OFFSET_IS_TOO_BIG;
#endif
Reg32e r(Reg(), Reg(), 0, static_cast<uint32>(adr));
return operator[](r);
}
#ifdef XBYAK64
@ -587,7 +618,8 @@ public:
} else if (mod == mod10 || (mod == mod00 && r.isNone())) {
frame.dd(r.disp_);
}
frame.setRex(Reg().getRex(r.index_, r));
uint8 rex = ((r.getIdx() | r.index_.getIdx()) < 8) ? 0 : uint8(0x40 | ((r.index_.getIdx() >> 3) << 1) | (r.getIdx() >> 3));
frame.setRex(rex);
return frame;
}
};
@ -600,6 +632,12 @@ struct JmpLabel {
class Label {
CodeArray *base_;
int anonymousCount_; // for @@, @f, @b
enum {
maxStack = 10
};
int stack_[maxStack];
int stackPos_;
int usedCount_;
int localCount_; // for .***
typedef std::map<const std::string, const uint8*> DefinedList;
typedef std::multimap<const std::string, const JmpLabel> UndefinedList;
@ -628,15 +666,22 @@ public:
Label()
: base_(0)
, anonymousCount_(0)
, stackPos_(1)
, usedCount_(0)
, localCount_(0)
{
}
void incLocalCount() { localCount_++; }
void decLocalCount() { localCount_--; }
void set(CodeArray *base)
void enterLocal()
{
base_ = base;
if (stackPos_ == maxStack) throw ERR_OVER_LOCAL_LABEL;
localCount_ = stack_[stackPos_++] = ++usedCount_;
}
/*
    close the innermost local-label scope and make the enclosing scope's id current
    throws ERR_UNDER_LOCAL_LABEL if no scope opened by enterLocal() is active
*/
void leaveLocal()
{
    if (stackPos_ == 1) {
        throw ERR_UNDER_LOCAL_LABEL;
    }
    --stackPos_;
    // the slot just below the new top holds the id of the scope we return to
    localCount_ = stack_[stackPos_ - 1];
}
void set(CodeArray *base) { base_ = base; }
void define(const char *label, const uint8 *address)
{
std::string newLabel(label);
@ -657,8 +702,9 @@ public:
const JmpLabel *jmp = &itr->second;
uint32 disp = inner::GetPtrDist(address, jmp->endOfJmp);
if (jmp->isShort && !inner::IsInDisp8(disp)) throw ERR_LABEL_IS_TOO_FAR;
uint8 *data = jmp->endOfJmp - (jmp->isShort ? 1 : 4);
base_->rewrite(data, disp, jmp->isShort);
size_t jmpSize = jmp->isShort ? 1 : 4;
uint8 *data = jmp->endOfJmp - jmpSize;
base_->rewrite(data, disp, jmpSize);
undefinedList_.erase(itr);
}
}
@ -689,22 +735,22 @@ public:
static inline std::string toStr(int num)
{
char buf[16];
static const char fmt[] = ".%08x";
#ifdef _WIN32
#if _MSC_VER < 1400
_snprintf(buf, sizeof(buf), fmt, num);
_snprintf
#else
_snprintf_s(buf, sizeof(buf), fmt, num);
_snprintf_s
#endif
#else
snprintf(buf, sizeof(buf), fmt, num);
snprintf
#endif
(buf, sizeof(buf), ".%08x", num);
return buf;
}
};
class CodeGenerator : public CodeArray {
protected:
public:
enum LabelType {
T_SHORT,
T_NEAR,
@ -747,36 +793,44 @@ private:
{
return op1.isREG(i32e) && (op2.isXMM() || op2.isMEM());
}
void if16bit(const Operand& reg1, const Operand& reg2)
{
// except movsx(16bit, 32/64bit)
if ((reg1.isBit(16) && !reg2.isBit(i32e)) || (reg2.isBit(16) && !reg1.isBit(i32e))) db(0x66);
}
void rexAddr(const Address& addr, const Reg& reg = Reg())
{
#ifdef XBYAK64
if (addr.is32bit_) db(0x67);
#endif
if16bit(reg, addr);
uint32 rex = addr.getRex() | reg.getRex();
if (reg.isREG(64)) rex |= 0x48;
if (rex) db(rex);
}
void rex(const Operand& op1, const Operand& op2 = Operand())
{
if (op1.isMEM()) {
rexAddr(static_cast<const Address&>(op1), static_cast<const Reg&>(op2));
} else if (op2.isMEM()) {
rexAddr(static_cast<const Address&>(op2), static_cast<const Reg&>(op1));
uint8 rex = 0;
const Operand *p1 = &op1, *p2 = &op2;
if (p1->isMEM()) std::swap(p1, p2);
if (p1->isMEM()) throw ERR_BAD_COMBINATION;
if (p2->isMEM()) {
const Address& addr = static_cast<const Address&>(*p2);
if (BIT == 64 && addr.is32bit_) db(0x67);
rex = addr.getRex() | static_cast<const Reg&>(*p1).getRex();
} else {
const Reg& reg1 = static_cast<const Reg&>(op1);
const Reg& reg2 = static_cast<const Reg&>(op2);
// ModRM(reg, base);
if16bit(reg1, reg2);
uint8 rex = reg2.getRex(Reg(), reg1);
if (reg1.isREG(64) || reg2.isREG(64)) rex |= 0x48;
rex = static_cast<const Reg&>(op2).getRex(static_cast<const Reg&>(op1));
}
// except movsx(16bit, 32/64bit)
if ((op1.isBit(16) && !op2.isBit(i32e)) || (op2.isBit(16) && !op1.isBit(i32e))) db(0x66);
if (rex) db(rex);
}
// Bit flags combined into the `type` argument of vex().
// The PP_* flags select the value vex() writes into the 2-bit pp field,
// the MM_* flags select the value it writes into the mmmm field.
enum AVXtype {
PP_NONE = 1 << 0, // no PP_66/PP_F3/PP_F2 set: vex() writes pp = 0
PP_66 = 1 << 1, // vex() writes pp = 1
PP_F3 = 1 << 2, // vex() writes pp = 2
PP_F2 = 1 << 3, // vex() writes pp = 3
MM_RESERVED = 1 << 4, // not examined by vex()
MM_0F = 1 << 5, // vex() writes mmmm = 1; also required for the 0xC5 short form
MM_0F38 = 1 << 6, // vex() writes mmmm = 2
MM_0F3A = 1 << 7 // vex() writes mmmm = 3
};
void vex(bool r, int idx, bool is256, int type, bool x = false, bool b = false, int w = 1)
{
uint32 pp = (type & PP_66) ? 1 : (type & PP_F3) ? 2 : (type & PP_F2) ? 3 : 0;
uint32 vvvv = (((~idx) & 15) << 3) | (is256 ? 4 : 0) | pp;
if (!b && !x && !w && (type & MM_0F)) {
db(0xC5); db((r ? 0 : 0x80) | vvvv);
} else {
uint32 mmmm = (type & MM_0F) ? 1 : (type & MM_0F38) ? 2 : (type & MM_0F3A) ? 3 : 0;
db(0xC4); db((r ? 0 : 0x80) | (x ? 0 : 0x40) | (b ? 0 : 0x20) | mmmm); db((w << 7) | vvvv);
}
}
Label label_;
bool isInDisp16(uint32 x) const { return 0xFFFF8000 <= x || x <= 0x7FFF; }
@ -792,10 +846,8 @@ private:
if (addr.is64bitDisp()) throw ERR_CANT_USE_64BIT_DISP;
rex(addr, reg);
db(code0 | (reg.isBit(8) ? 0 : 1)); if (code1 != NONE) db(code1); if (code2 != NONE) db(code2);
uint8 t = *addr.getCode();
assert((t & ~0xC7) == 0); /* 0b11000111 */
db(t | ((reg.getIdx() & 7) << 3)); // update reg field
db(addr.getCode() + 1, static_cast<int>(addr.getSize()) - 1);
addr.updateRegField(static_cast<uint8>(reg.getIdx()));
db(addr.getCode(), static_cast<int>(addr.getSize()));
}
void opJmp(const char *label, LabelType type, uint8 shortCode, uint8 longCode, uint8 longPref)
{
@ -835,13 +887,13 @@ private:
if (type != T_NEAR && inner::IsInDisp8(disp - shortJmpSize)) {
db(shortCode);
db(0);
rewrite(top + shortHeaderSize, disp - shortJmpSize, true);
rewrite(top + shortHeaderSize, disp - shortJmpSize, 1);
} else {
if (type == T_SHORT) throw ERR_LABEL_IS_TOO_FAR;
if (longPref) db(longPref);
db(longCode);
dd(0);
rewrite(top + longHeaderSize, disp - longJmpSize, false);
rewrite(top + longHeaderSize, disp - longJmpSize, 4);
}
}
/* preCode is for SSSE3/SSE4 */
@ -864,8 +916,7 @@ private:
}
void opMMX(const Mmx& mmx, const Operand& op, int code, int pref = 0x66, int imm8 = NONE, int preCode = NONE)
{
pref = mmx.isXMM() ? pref : NONE;
opGen(mmx, op, code, pref, isXMMorMMX_MEM, imm8, preCode);
opGen(mmx, op, code, mmx.isXMM() ? pref : NONE, isXMMorMMX_MEM, imm8, preCode);
}
void opMovXMM(const Operand& op1, const Operand& op2, int code, int pref)
{
@ -887,14 +938,14 @@ private:
opGen(mmx, op, code, 0x66, isXMM_REG32orMEM, imm, B00111010);
}
}
void opR_ModM(const Operand& op, int bit, uint8 mod, int ext, int code0, int code1 = NONE, int code2 = NONE)
void opR_ModM(const Operand& op, int bit, int ext, int code0, int code1 = NONE, int code2 = NONE, bool disableRex = false)
{
int opBit = op.getBit();
if (disableRex && opBit == 64) opBit = 32;
if (op.isREG(bit)) {
rex(op);
db(code0 | (op.isBit(8) ? 0 : 1)); if (code1 != NONE) db(code1); if (code2 != NONE) db(code2);
db(getModRM(mod, ext, op.getIdx()));
opModR(Reg(ext, Operand::REG, opBit), static_cast<const Reg&>(op).changeBit(opBit), code0, code1, code2);
} else if (op.isMEM()) {
opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code0, code1, code2);
opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, opBit), code0, code1, code2);
} else {
throw ERR_BAD_COMBINATION;
}
@ -902,13 +953,13 @@ private:
void opShift(const Operand& op, int imm, int ext)
{
verifyMemHasSize(op);
opR_ModM(op, 0, 3, ext, (B11000000 | ((imm == 1 ? 1 : 0) << 4)));
opR_ModM(op, 0, ext, (B11000000 | ((imm == 1 ? 1 : 0) << 4)));
if (imm != 1) db(imm);
}
void opShift(const Operand& op, const Reg8& cl, int ext)
{
if (cl.getIdx() != Operand::CL) throw ERR_BAD_COMBINATION;
opR_ModM(op, 0, 3, ext, B11010010);
opR_ModM(op, 0, ext, B11010010);
}
void opModRM(const Operand& op1, const Operand& op2, bool condR, bool condM, int code0, int code1 = NONE, int code2 = NONE)
{
@ -941,20 +992,19 @@ private:
verifyMemHasSize(op);
uint32 immBit = inner::IsInDisp8(imm) ? 8 : isInDisp16(imm) ? 16 : 32;
if (op.getBit() < immBit) throw ERR_IMM_IS_TOO_BIG;
if (op.isREG()) {
if (immBit == 16 && op.isBit(32)) immBit = 32; /* don't use MEM16 if 32bit mode */
}
if (op.isREG(32|64) && immBit == 16) immBit = 32; /* don't use MEM16 if 32/64bit mode */
if (op.isREG() && op.getIdx() == 0 && (op.getBit() == immBit || (op.isBit(64) && immBit == 32))) { // rax, eax, ax, al
rex(op);
db(code | 4 | (immBit == 8 ? 0 : 1));
} else {
int tmp = (op.getBit() > immBit && 32 > immBit) ? 2 : 0;
opR_ModM(op, 0, 3, ext, B10000000 | tmp);
int tmp = immBit < (std::min)(op.getBit(), 32U) ? 2 : 0;
opR_ModM(op, 0, ext, B10000000 | tmp);
}
db(imm, immBit / 8);
}
void opIncDec(const Operand& op, int code, int ext)
{
verifyMemHasSize(op);
#ifndef XBYAK64
if (op.isREG() && !op.isBit(8)) {
rex(op); db(code | op.getIdx());
@ -964,21 +1014,15 @@ private:
code = B11111110;
if (op.isREG()) {
opModR(Reg(ext, Operand::REG, op.getBit()), static_cast<const Reg&>(op), code);
} else if (op.isMEM() && op.getBit() > 0) {
opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code);
} else {
throw ERR_BAD_COMBINATION;
opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code);
}
}
void opPushPop(const Operand& op, int code, int ext, int alt)
{
if (op.isREG()) {
#ifdef XBYAK64
if (op.isBit(16)) db(0x66);
if (static_cast<const Reg&>(op).getIdx() >= 8) db(0x41);
#else
rex(op);
#endif
db(alt | (op.getIdx() & 7));
} else if (op.isMEM()) {
opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code);
@ -990,16 +1034,51 @@ private:
{
if (op.isMEM() && op.getBit() == 0) throw ERR_MEM_SIZE_IS_NOT_SPECIFIED;
}
protected:
/*
    common emitter for the two-byte (0x0F, code) extending moves
    @param reg [in] destination; must be a general register wider than op
    @param op [in] source register or memory operand
    @param code [in] second opcode byte; its lowest bit is set when op is 16 bits wide
    The width/kind conditions are handed to opModRM, which decides whether
    the combination is accepted.
*/
void opMovxx(const Reg& reg, const Operand& op, uint8 code)
{
    const bool dstIsWider = reg.isREG() && (reg.getBit() > op.getBit());
    const bool regSrcOk = dstIsWider && op.isREG();
    const bool memSrcOk = dstIsWider && op.isMEM();
    const int wordSrc = op.isBit(16);
    opModRM(reg, op, regSrcOk, memSrcOk, 0x0F, code | wordSrc);
}
void opFpuMem(const Address& addr, uint8 m16, uint8 m32, uint8 m64, uint8 ext, uint8 m64ext)
{
if (addr.is64bitDisp()) throw ERR_CANT_USE_64BIT_DISP;
uint8 code = addr.isBit(16) ? m16 : addr.isBit(32) ? m32 : addr.isBit(64) ? m64 : 0;
if (!code) throw ERR_BAD_MEM_SIZE;
if (m64ext && addr.isBit(64)) ext = m64ext;
rex(addr, st0);
db(code);
addr.updateRegField(ext);
db(addr.getCode(), static_cast<int>(addr.getSize()));
}
// like yasm not nasm
// use code1 if reg1 == st0
// use code2 if reg1 != st0 && reg2 == st0
void opFpuFpu(const Fpu& reg1, const Fpu& reg2, uint32 code1, uint32 code2)
{
uint32 code = reg1.getIdx() == 0 ? code1 : reg2.getIdx() == 0 ? code2 : 0;
if (!code) throw ERR_BAD_ST_COMBINATION;
db(uint8(code >> 8));
db(uint8(code | (reg1.getIdx() | reg2.getIdx())));
}
/*
    emit a two-byte FPU instruction that takes one stack register
    @param reg [in] FPU register; its index is OR-ed into the second byte
    @param code1 [in] first byte
    @param code2 [in] base value of the second byte
*/
void opFpu(const Fpu& reg, uint8 code1, uint8 code2)
{
    db(code1);
    db(code2 | reg.getIdx());
}
public:
unsigned int getVersion() const { return VERSION; }
using CodeArray::db;
const Mmx mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
const Xmm xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
const Ymm ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
const Xmm &xm0, &xm1, &xm2, &xm3, &xm4, &xm5, &xm6, &xm7;
const Ymm &ym0, &ym1, &ym2, &ym3, &ym4, &ym5, &ym6, &ym7;
const Reg32 eax, ecx, edx, ebx, esp, ebp, esi, edi;
const Reg16 ax, cx, dx, bx, sp, bp, si, di;
const Reg8 al, cl, dl, bl, ah, ch, dh, bh;
const AddressFrame ptr, byte, word, dword, qword, xmmword;
const AddressFrame ptr, byte, word, dword, qword;
const Fpu st0, st1, st2, st3, st4, st5, st6, st7;
#ifdef XBYAK64
const Reg64 rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15;
const Reg32 r8d, r9d, r10d, r11d, r12d, r13d, r14d, r15d;
@ -1007,7 +1086,9 @@ protected:
const Reg8 r8b, r9b, r10b, r11b, r12b, r13b, r14b, r15b;
const Reg8 spl, bpl, sil, dil;
const Xmm xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
const Xmm &xm8, &xm9, &xm10, &xm11, &xm12, &xm13, &xm14, &xm15;
const Ymm ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
const Xmm &xm8, &xm9, &xm10, &xm11, &xm12, &xm13, &xm14, &xm15; // for my convenience
const Ymm &ym8, &ym9, &ym10, &ym11, &ym12, &ym13, &ym14, &ym15;
const RegRip rip;
#endif
@ -1015,8 +1096,8 @@ protected:
{
label_.define(label, getCurr());
}
void inLocalLabel() { label_.incLocalCount(); }
void outLocalLabel() { label_.decLocalCount(); }
void inLocalLabel() { label_.enterLocal(); }
void outLocalLabel() { label_.leaveLocal(); }
void jmp(const char *label, LabelType type = T_AUTO)
{
opJmp(label, type, B11101011, B11101001, 0);
@ -1027,7 +1108,11 @@ protected:
}
void jmp(const Operand& op)
{
opR_ModM(op, i32e, 3, 4, 0xFF);
opR_ModM(op, BIT, 4, 0xFF, NONE, NONE, true);
}
/*
    indirect call through a register or memory operand (opcode 0xFF, reg field 2)
    @param op [in] 16-bit or i32e-sized register, or a memory operand
    The last argument asks opR_ModM to disable the REX handling for 64-bit operands.
*/
void call(const Operand& op)
{
    const int acceptedRegBits = 16 | i32e;
    const int regFieldExt = 2;
    opR_ModM(op, acceptedRegBits, regFieldExt, 0xFF, NONE, NONE, true);
}
// (REG|MEM, REG)
void test(const Operand& op, const Reg& reg)
@ -1042,10 +1127,9 @@ protected:
rex(op);
db(B10101000 | (op.isBit(8) ? 0 : 1));
} else {
opR_ModM(op, 0, 3, 0, B11110110);
opR_ModM(op, 0, 0, B11110110);
}
int size = op.getBit() / 8; if (size > 4) size = 4;
db(imm, size);
db(imm, (std::min)(op.getBit() / 8, 4U));
}
void ret(int imm = 0)
{
@ -1134,24 +1218,39 @@ protected:
opRM_RM(reg1, reg2, B10001000);
}
}
void mov(const Operand& op, uint64 imm)
void mov(const Operand& op,
#ifdef XBYAK64
uint64
#else
uint32
#endif
imm)
{
verifyMemHasSize(op);
if (op.isREG()) {
int w = op.isBit(8) ? 0 : 1;
rex(op); db(B10110000 | (w << 3) | (op.getIdx() & 7));
rex(op);
int code, size;
#ifdef XBYAK64
if (op.isBit(64) && inner::IsInInt32(imm)) {
db(B11000111);
code = B11000000;
size = 4;
} else
#endif
{
code = B10110000 | ((op.isBit(8) ? 0 : 1) << 3);
size = op.getBit() / 8;
}
db(code | (op.getIdx() & 7));
db(imm, size);
} else if (op.isMEM()) {
opModM(static_cast<const Address&>(op), Reg(0, Operand::REG, op.getBit()), B11000110);
int size = op.getBit() / 8; if (size > 4) size = 4;
db(static_cast<uint32>(imm), size);
} else {
throw ERR_BAD_COMBINATION;
}
db(imm, op.getBit() / 8);
}
void opMovxx(const Reg& reg, const Operand& op, uint8 code)
{
int w = op.isBit(16);
bool cond = reg.isREG() && (reg.getBit() > op.getBit());
opModRM(reg, op, cond && op.isREG(), cond && op.isMEM(), 0x0F, code | w);
}
void cmpxchg8b(const Address& addr) { opModM(addr, Reg32(1), 0x0F, B11000111); }
#ifdef XBYAK64
@ -1180,20 +1279,17 @@ protected:
}
void call(const char *label)
{
opJmp(label, T_NEAR, 0, B10011010, 0);
opJmp(label, T_NEAR, 0, B11101000, 0);
}
void call(const void *addr)
{
opJmp(addr, T_NEAR, 0, B11101000, 0);
}
void call(const Operand& op)
{
opR_ModM(op, 16 | i32e, 3, 2, B11111111);
}
// special case
void movd(const Address& addr, const Mmx& mmx)
{
opModM(addr, Reg(mmx.getIdx(), Operand::REG, mmx.getBit() / 8), 0x0F, B01111110);
if (mmx.isXMM()) db(0x66);
opModM(addr, mmx, 0x0F, B01111110);
}
void movd(const Reg32& reg, const Mmx& mmx)
{
@ -1202,8 +1298,8 @@ protected:
}
void movd(const Mmx& mmx, const Address& addr)
{
ASSERT(!addr.isBit(32)); // don't use dword ptr, bogus, won't output 0x66 for xmm dest op
opModM(addr, Reg(mmx.getIdx(), Operand::REG, mmx.getBit() / 8), 0x0F, B01101110);
if (mmx.isXMM()) db(0x66);
opModM(addr, mmx, 0x0F, B01101110);
}
void movd(const Mmx& mmx, const Reg32& reg)
{
@ -1225,8 +1321,31 @@ protected:
}
void movq(const Address& addr, const Mmx& mmx)
{
opModM(addr, Reg(mmx.getIdx(), Operand::REG, mmx.getBit() / 8), 0x0F, mmx.isXMM() ? B11010110 : B01111111);
if (mmx.isXMM()) db(0x66);
opModM(addr, mmx, 0x0F, mmx.isXMM() ? B11010110 : B01111111);
}
#ifdef XBYAK64
/*
    movq r64, mm/xmm
    @param reg [in] 64-bit destination register
    @param mmx [in] source; a 0x66 prefix is written first when it is an XMM register
*/
void movq(const Reg64& reg, const Mmx& mmx)
{
    const bool srcIsXmm = mmx.isXMM();
    if (srcIsXmm) {
        db(0x66);
    }
    opModR(mmx, reg, 0x0F, B01111110);
}
/*
    movq mm/xmm, r64
    @param mmx [in] destination; a 0x66 prefix is written first when it is an XMM register
    @param reg [in] 64-bit source register
*/
void movq(const Mmx& mmx, const Reg64& reg)
{
    const bool dstIsXmm = mmx.isXMM();
    if (dstIsXmm) {
        db(0x66);
    }
    opModR(mmx, reg, 0x0F, B01101110);
}
/*
    pextrq r64/mem, xmm, imm
    @param op [in] destination: 64-bit register or memory operand
    @param xmm [in] source register
    @param imm [in] immediate byte
    throws ERR_BAD_COMBINATION for any other destination
*/
void pextrq(const Operand& op, const Xmm& xmm, uint8 imm)
{
    const bool dstAccepted = op.isREG(64) || op.isMEM();
    if (!dstAccepted) {
        throw ERR_BAD_COMBINATION;
    }
    // the xmm index is wrapped in a Reg64 so that opGen treats the operation as 64-bit
    opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, B00111010);
}
/*
    pinsrq xmm, r64/mem, imm
    @param xmm [in] destination register
    @param op [in] source: 64-bit register or memory operand
    @param imm [in] immediate byte
    throws ERR_BAD_COMBINATION for any other source
*/
void pinsrq(const Xmm& xmm, const Operand& op, uint8 imm)
{
    const bool srcAccepted = op.isREG(64) || op.isMEM();
    if (!srcAccepted) {
        throw ERR_BAD_COMBINATION;
    }
    // the xmm index is wrapped in a Reg64 so that opGen treats the operation as 64-bit
    opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, B00111010);
}
#endif
// MMX2 : pextrw : reg, mmx/xmm, imm
// SSE4 : pextrw, pextrb, pextrd, extractps : reg/mem, mmx/xmm, imm
void pextrw(const Operand& op, const Mmx& xmm, uint8 imm) { opExt(op, xmm, 0x15, imm, true); }
@ -1270,7 +1389,7 @@ protected:
bool is16bit = reg.isREG(16) && (op.isREG(16) || op.isMEM());
if (!is16bit && !(reg.isREG(i32e) && (op.isREG(i32e) || op.isMEM()))) throw ERR_BAD_COMBINATION;
if (is16bit) db(0x66);
db(0xF3); opModRM(Reg(reg.getIdx(), Operand::REG, i32e == 32 ? 32 : reg.getBit()), op, op.isREG(), true, 0x0F, 0xB8);
db(0xF3); opModRM(reg.changeBit(i32e == 32 ? 32 : reg.getBit()), op, op.isREG(), true, 0x0F, 0xB8);
}
void crc32(const Reg32e& reg, const Operand& op)
{
@ -1278,17 +1397,86 @@ protected:
db(0xF2);
opModRM(reg, op, op.isREG(), op.isMEM(), 0x0F, 0x38, 0xF0 | (op.isBit(8) ? 0 : 1));
}
public:
void vextractps(const Operand& op, const Xmm& xmm, uint8 imm)
{
if (!(op.isREG(32) || op.isMEM()) || xmm.isYMM()) throw ERR_BAD_COMBINATION;
opAVX_X_XM_IMM(xmm, cvtReg(op, op.isREG(), Operand::XMM), MM_0F3A | PP_66, 0x17, false, 0, imm);
}
// support (x, x, x/m), (y, y, y/m)
// Emits one VEX-encoded instruction: optional 0x67, the prefix written by vex(),
// the opcode byte code0, and finally the ModRM part (register form or the
// pre-encoded bytes of a memory operand).
// @param xm1 first register; its index goes into the ModRM reg field
// @param op1 second register of the three-operand form; when op2 is none,
//            op1 is the r/m operand instead and xm1 is used in its place
// @param op2 r/m operand (register or memory) of the three-operand form, or none
// @param type AVXtype flags forwarded to vex()
// @param code0 opcode byte written after the prefix
// @param supportYMM whether YMM registers are accepted
// @param w value forwarded to vex() as w; -1 means "not specified" (see below)
// throws ERR_BAD_COMBINATION when the register kinds do not match the supported forms
void opAVX_X_X_XM(const Xmm& xm1, const Operand& op1, const Operand& op2, int type, int code0, bool supportYMM, int w = -1)
{
const Xmm *xm2;
const Operand *op;
if (op2.isNone()) {
// two-operand call: xm1 doubles as the second register
xm2 = &xm1;
op = &op1;
} else {
if (!(op1.isXMM() || (supportYMM && op1.isYMM()))) throw ERR_BAD_COMBINATION;
// safe after the kind check above: op1 is an XMM or YMM register
xm2 = static_cast<const Xmm*>(&op1);
op = &op2;
}
// (xm1, xm2, op)
// both registers must be XMM, or both YMM when YMM is supported
if (!((xm1.isXMM() && xm2->isXMM()) || (supportYMM && xm1.isYMM() && xm2->isYMM()))) throw ERR_BAD_COMBINATION;
bool x, b;
if (op->isMEM()) {
const Address& addr = *static_cast<const Address*>(op);
// the extension flags come from the rex value stored with the address
uint8 rex = addr.getRex();
x = (rex & 2) != 0;
b = (rex & 1) != 0;
if (BIT == 64 && addr.is32bit_) db(0x67);
// w not given by the caller: in 64-bit builds take it from mask 4 of the address rex value
if (BIT == 64 && w == -1) w = (rex & 4) ? 1 : 0;
} else {
x = false;
b = static_cast<const Reg*>(op)->isExtIdx();
}
// still unspecified (register operand, or not a 64-bit build): use 0
if (w == -1) w = 0;
vex(xm1.isExtIdx(), xm2->getIdx(), xm1.isYMM(), type, x, b, w);
db(code0);
if (op->isMEM()) {
const Address& addr = *static_cast<const Address*>(op);
// put xm1 into the reg field of the address's first encoded byte, then copy the encoding
addr.updateRegField(static_cast<uint8>(xm1.getIdx()));
db(addr.getCode(), static_cast<int>(addr.getSize()));
} else {
// register-direct form (mod = 3)
db(getModRM(3, xm1.getIdx(), op->getIdx()));
}
}
/*
    if cvt then return the Xmm (or Ymm) register that has the same index as op,
    otherwise return op itself
    @param op [in] operand whose index is used
    @param cvt [in] false returns op unchanged
    @param kind [in] Operand::XMM selects xm0.., any other value selects ym0..
    @return op, or a reference to one of this generator's xmN / ymN members
    @note The lookup tables are intentionally NOT static. A function-local
          static would be filled only once, with the addresses of the members
          of whichever generator called first; every later generator would
          then receive references into that first object, which dangle as soon
          as it is destroyed.
*/
const Operand& cvtReg(const Operand& op, bool cvt, Operand::Kind kind) const
{
    if (!cvt) return op;
    if (kind == Operand::XMM) {
        const Xmm* const xmTbl[] = {
            &xm0, &xm1, &xm2, &xm3, &xm4, &xm5, &xm6, &xm7,
#ifdef XBYAK64
            &xm8, &xm9, &xm10, &xm11, &xm12, &xm13, &xm14, &xm15
#endif
        };
        return *xmTbl[op.getIdx()];
    }
    const Ymm* const ymTbl[] = {
        &ym0, &ym1, &ym2, &ym3, &ym4, &ym5, &ym6, &ym7,
#ifdef XBYAK64
        &ym8, &ym9, &ym10, &ym11, &ym12, &ym13, &ym14, &ym15
#endif
    };
    return *ymTbl[op.getIdx()];
}
// support (x, x/m, imm), (y, y/m, imm)
void opAVX_X_XM_IMM(const Xmm& xmm, const Operand& op, int type, int code, bool supportYMM, int w = -1, int imm = NONE)
{
opAVX_X_X_XM(xmm, xmm.isXMM() ? xm0 : ym0, op, type, code, supportYMM, w); if (imm != NONE) db((uint8)imm);
}
enum { NONE = 256 };
public:
CodeGenerator(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void *userPtr = 0)
: CodeArray(maxSize, userPtr)
, mm0(0), mm1(1), mm2(2), mm3(3), mm4(4), mm5(5), mm6(6), mm7(7)
, xmm0(0), xmm1(1), xmm2(2), xmm3(3), xmm4(4), xmm5(5), xmm6(6), xmm7(7)
, ymm0(0), ymm1(1), ymm2(2), ymm3(3), ymm4(4), ymm5(5), ymm6(6), ymm7(7)
, xm0(xmm0), xm1(xmm1), xm2(xmm2), xm3(xmm3), xm4(xmm4), xm5(xmm5), xm6(xmm6), xm7(xmm7) // for my convenience
, ym0(ymm0), ym1(ymm1), ym2(ymm2), ym3(ymm3), ym4(ymm4), ym5(ymm5), ym6(ymm6), ym7(ymm7) // for my convenience
, eax(Operand::EAX), ecx(Operand::ECX), edx(Operand::EDX), ebx(Operand::EBX), esp(Operand::ESP), ebp(Operand::EBP), esi(Operand::ESI), edi(Operand::EDI)
, ax(Operand::EAX), cx(Operand::ECX), dx(Operand::EDX), bx(Operand::EBX), sp(Operand::ESP), bp(Operand::EBP), si(Operand::ESI), di(Operand::EDI)
, al(Operand::AL), cl(Operand::CL), dl(Operand::DL), bl(Operand::BL), ah(Operand::AH), ch(Operand::CH), dh(Operand::DH), bh(Operand::BH)
, ptr(0), byte(8), word(16), dword(32), qword(64), xmmword(128)
, ptr(0), byte(8), word(16), dword(32), qword(64)
, st0(0), st1(1), st2(2), st3(3), st4(4), st5(5), st6(6), st7(7)
#ifdef XBYAK64
, rax(Operand::RAX), rcx(Operand::RCX), rdx(Operand::RDX), rbx(Operand::RBX), rsp(Operand::RSP), rbp(Operand::RBP), rsi(Operand::RSI), rdi(Operand::RDI), r8(Operand::R8), r9(Operand::R9), r10(Operand::R10), r11(Operand::R11), r12(Operand::R12), r13(Operand::R13), r14(Operand::R14), r15(Operand::R15)
, r8d(Operand::R8D), r9d(Operand::R9D), r10d(Operand::R10D), r11d(Operand::R11D), r12d(Operand::R12D), r13d(Operand::R13D), r14d(Operand::R14D), r15d(Operand::R15D)
@ -1296,7 +1484,9 @@ public:
, r8b(Operand::R8B), r9b(Operand::R9B), r10b(Operand::R10B), r11b(Operand::R11B), r12b(Operand::R12B), r13b(Operand::R13B), r14b(Operand::R14B), r15b(Operand::R15B)
, spl(Operand::SPL, 1), bpl(Operand::BPL, 1), sil(Operand::SIL, 1), dil(Operand::DIL, 1)
, xmm8(8), xmm9(9), xmm10(10), xmm11(11), xmm12(12), xmm13(13), xmm14(14), xmm15(15)
, ymm8(8), ymm9(9), ymm10(10), ymm11(11), ymm12(12), ymm13(13), ymm14(14), ymm15(15)
, xm8(xmm8), xm9(xmm9), xm10(xmm10), xm11(xmm11), xm12(xmm12), xm13(xmm13), xm14(xmm14), xm15(xmm15) // for my convenience
, ym8(ymm8), ym9(ymm9), ym10(ymm10), ym11(ymm11), ym12(ymm12), ym13(ymm13), ym14(ymm14), ym15(ymm15) // for my convenience
, rip()
#endif
{
@ -1309,7 +1499,7 @@ public:
// if (hasUndefinedLabel()) throw ERR_LABEL_IS_NOT_FOUND;
return top_;
}
#ifdef TEST_NM
#ifdef XBYAK_TEST
void dump(bool doClear = true)
{
CodeArray::dump();
@ -1322,7 +1512,7 @@ public:
void align(int x = 16)
{
if (x != 4 && x != 8 && x != 16 && x != 32) throw ERR_BAD_ALIGN;
while (inner::GetPtrDist(getCurr()) % x) {
while (size_t(getCurr()) % x) {
nop();
}
}
@ -1335,4 +1525,4 @@ public:
} // end of namespace
#endif // XBYAK_H_
#endif // XBYAK_XBYAK_H_

View File

@ -1,4 +1,4 @@
const char *getVersionString() const { return "2.07"; }
const char *getVersionString() const { return "2.99"; }
void packssdw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x6B); }
void packsswb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x63); }
void packuswb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x67); }
@ -184,88 +184,94 @@ void movhpd(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x16, 0
void movlpd(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x12, 0x66); }
void cmovo(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 0); }
void jo(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x70, 0x80, 0x0F); }
void seto(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 0); }
void seto(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 0); }
void cmovno(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 1); }
void jno(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x71, 0x81, 0x0F); }
void setno(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 1); }
void setno(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 1); }
void cmovb(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); }
void jb(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
void setb(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 2); }
void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 2); }
void cmovc(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); }
void jc(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
void setc(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 2); }
void cmovnae(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); }
void jnae(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
void setnae(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 2); }
void setnae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 2); }
void cmovnb(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); }
void jnb(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
void setnb(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 3); }
void setnb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 3); }
void cmovae(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); }
void jae(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
void setae(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 3); }
void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 3); }
void cmovnc(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); }
void jnc(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
void setnc(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 3); }
void cmove(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 4); }
void je(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }
void sete(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 4); }
void sete(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 4); }
void cmovz(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 4); }
void jz(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }
void setz(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 4); }
void setz(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 4); }
void cmovne(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 5); }
void jne(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }
void setne(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 5); }
void setne(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 5); }
void cmovnz(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 5); }
void jnz(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }
void setnz(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 5); }
void setnz(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 5); }
void cmovbe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 6); }
void jbe(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }
void setbe(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 6); }
void setbe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 6); }
void cmovna(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 6); }
void jna(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }
void setna(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 6); }
void setna(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 6); }
// Condition code 7 (nbe / a): both spellings pass identical arguments.
// cmovcc uses 0x0F, B01000000 | 7; jcc uses 0x77 / 0x87 with 0x0F; setcc uses
// 0x0F, B10010000 | 7.
// NOTE(review): each setcc appears twice (6-argument and 5-argument opR_ModM call)
// with the same signature -- apparently the old and new sides of a patch; only one
// can be kept.
void cmovnbe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 7); }
void jnbe(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }
void setnbe(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 7); }
void setnbe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 7); }
void cmova(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 7); }
void ja(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }
void seta(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 7); }
void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 7); }
// Condition codes 8 (s) and 9 (ns). These two have no alias spelling in this run.
// cmovcc uses 0x0F, B01000000 | cc; jcc uses 0x78 / 0x88 (s) and 0x79 / 0x89 (ns)
// with 0x0F; setcc uses 0x0F, B10010000 | cc.
// NOTE(review): each setcc appears twice (6-argument and 5-argument opR_ModM call)
// with the same signature -- apparently the old and new sides of a patch; only one
// can be kept.
void cmovs(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 8); }
void js(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x78, 0x88, 0x0F); }
void sets(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 8); }
void sets(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 8); }
void cmovns(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 9); }
void jns(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x79, 0x89, 0x0F); }
void setns(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 9); }
void setns(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 9); }
// Condition code 10 (p / pe): both spellings pass identical arguments.
// cmovcc uses 0x0F, B01000000 | 10; jcc uses 0x7A / 0x8A with 0x0F; setcc uses
// 0x0F, B10010000 | 10.
// NOTE(review): each setcc appears twice (6-argument and 5-argument opR_ModM call)
// with the same signature -- apparently the old and new sides of a patch; only one
// can be kept.
void cmovp(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 10); }
void jp(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }
void setp(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 10); }
void setp(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 10); }
void cmovpe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 10); }
void jpe(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }
void setpe(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 10); }
void setpe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 10); }
// Condition code 11 (np / po): both spellings pass identical arguments.
// cmovcc uses 0x0F, B01000000 | 11; jcc uses 0x7B / 0x8B with 0x0F; setcc uses
// 0x0F, B10010000 | 11.
// NOTE(review): each setcc appears twice (6-argument and 5-argument opR_ModM call)
// with the same signature -- apparently the old and new sides of a patch; only one
// can be kept.
void cmovnp(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 11); }
void jnp(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }
void setnp(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 11); }
void setnp(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 11); }
void cmovpo(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 11); }
void jpo(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }
void setpo(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 11); }
void setpo(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 11); }
// Condition code 12 (l / nge): both spellings pass identical arguments.
// cmovcc uses 0x0F, B01000000 | 12; jcc uses 0x7C / 0x8C with 0x0F; setcc uses
// 0x0F, B10010000 | 12.
// NOTE(review): each setcc appears twice (6-argument and 5-argument opR_ModM call)
// with the same signature -- apparently the old and new sides of a patch; only one
// can be kept.
void cmovl(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 12); }
void jl(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }
void setl(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 12); }
void setl(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 12); }
void cmovnge(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 12); }
void jnge(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }
void setnge(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 12); }
void setnge(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 12); }
// Condition code 13 (nl / ge): both spellings pass identical arguments.
// cmovcc uses 0x0F, B01000000 | 13; jcc uses 0x7D / 0x8D with 0x0F; setcc uses
// 0x0F, B10010000 | 13.
// NOTE(review): each setcc appears twice (6-argument and 5-argument opR_ModM call)
// with the same signature -- apparently the old and new sides of a patch; only one
// can be kept.
void cmovnl(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 13); }
void jnl(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }
void setnl(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 13); }
void setnl(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 13); }
void cmovge(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 13); }
void jge(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }
void setge(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 13); }
void setge(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 13); }
// Condition code 14 (le / ng): both spellings pass identical arguments.
// cmovcc uses 0x0F, B01000000 | 14; jcc uses 0x7E / 0x8E with 0x0F; setcc uses
// 0x0F, B10010000 | 14.
// NOTE(review): each setcc appears twice (6-argument and 5-argument opR_ModM call)
// with the same signature -- apparently the old and new sides of a patch; only one
// can be kept.
void cmovle(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 14); }
void jle(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }
void setle(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 14); }
void setle(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 14); }
void cmovng(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 14); }
void jng(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }
void setng(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 14); }
void setng(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 14); }
// Condition code 15 (nle / g): both spellings pass identical arguments.
// cmovcc uses 0x0F, B01000000 | 15; jcc uses 0x7F / 0x8F with 0x0F; setcc uses
// 0x0F, B10010000 | 15.
// NOTE(review): each setcc appears twice (6-argument and 5-argument opR_ModM call)
// with the same signature -- apparently the old and new sides of a patch; only one
// can be kept.
void cmovnle(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 15); }
void jnle(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }
void setnle(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 15); }
void setnle(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 15); }
void cmovg(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 15); }
void jg(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }
void setg(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 15); }
void setg(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 15); }
#ifdef XBYAK64
// Emits the two bytes 0x48 0x98. Only compiled when XBYAK64 is defined.
void cdqe() { db(0x48); db(0x98); }
#else
@ -308,12 +314,57 @@ void mwait() { db(0x0F); db(0x01); db(0xC9); }
// Operand-less instructions with a fixed encoding: each body writes its literal
// opcode bytes, in order, with db(). Nothing is validated or computed here.
void rdmsr() { db(0x0F); db(0x32); }
void rdpmc() { db(0x0F); db(0x33); }
void rdtsc() { db(0x0F); db(0x31); }
void rdtscp() { db(0x0F); db(0x01); db(0xF9); }
void wait() { db(0x9B); }
void wbinvd() { db(0x0F); db(0x09); }
void wrmsr() { db(0x0F); db(0x30); }
void xlatb() { db(0xD7); }
void popf() { db(0x9D); }
void pushf() { db(0x9C); }
// vzeroall / vzeroupper are written as raw bytes (0xC5-prefixed) rather than through
// the opAVX_* helpers used by the other v* mnemonics in this file.
void vzeroall() { db(0xC5); db(0xFC); db(0x77); }
void vzeroupper() { db(0xC5); db(0xF8); db(0x77); }
void xgetbv() { db(0x0F); db(0x01); db(0xD0); }
// x87 mnemonics that take no explicit operand: each emits a fixed two-byte encoding
// via db(); the first byte is always in the 0xD8..0xDE range.
// Several names here (fcom, fcomp, fucom, fucomp, fxch, faddp, fmulp, fsubp, fsubrp,
// fdivp, fdivrp) also have register-taking overloads elsewhere in this file. The
// no-argument form hard-codes the base opcode with 1 added to the second byte
// (e.g. fcom(): 0xD8 0xD1 versus base 0xD8 0xD0) -- presumably selecting st(1);
// confirm against opFpu / opFpuFpu.
void f2xm1() { db(0xD9); db(0xF0); }
void fabs() { db(0xD9); db(0xE1); }
void faddp() { db(0xDE); db(0xC1); }
void fchs() { db(0xD9); db(0xE0); }
void fcom() { db(0xD8); db(0xD1); }
void fcomp() { db(0xD8); db(0xD9); }
void fcompp() { db(0xDE); db(0xD9); }
void fcos() { db(0xD9); db(0xFF); }
void fdecstp() { db(0xD9); db(0xF6); }
void fdivp() { db(0xDE); db(0xF9); }
void fdivrp() { db(0xDE); db(0xF1); }
void fincstp() { db(0xD9); db(0xF7); }
void fld1() { db(0xD9); db(0xE8); }
void fldl2t() { db(0xD9); db(0xE9); }
void fldl2e() { db(0xD9); db(0xEA); }
void fldpi() { db(0xD9); db(0xEB); }
void fldlg2() { db(0xD9); db(0xEC); }
void fldln2() { db(0xD9); db(0xED); }
void fldz() { db(0xD9); db(0xEE); }
void fmulp() { db(0xDE); db(0xC9); }
void fnop() { db(0xD9); db(0xD0); }
void fpatan() { db(0xD9); db(0xF3); }
void fprem() { db(0xD9); db(0xF8); }
void fprem1() { db(0xD9); db(0xF5); }
void fptan() { db(0xD9); db(0xF2); }
void frndint() { db(0xD9); db(0xFC); }
void fscale() { db(0xD9); db(0xFD); }
void fsin() { db(0xD9); db(0xFE); }
void fsincos() { db(0xD9); db(0xFB); }
void fsqrt() { db(0xD9); db(0xFA); }
void fsubp() { db(0xDE); db(0xE9); }
void fsubrp() { db(0xDE); db(0xE1); }
void ftst() { db(0xD9); db(0xE4); }
void fucom() { db(0xDD); db(0xE1); }
void fucomp() { db(0xDD); db(0xE9); }
void fucompp() { db(0xDA); db(0xE9); }
void fxam() { db(0xD9); db(0xE5); }
void fxch() { db(0xD9); db(0xC9); }
void fxtract() { db(0xD9); db(0xF4); }
void fyl2x() { db(0xD9); db(0xF1); }
void fyl2xp1() { db(0xD9); db(0xF9); }
// Two-operand ALU forms. The operand/operand overload forwards to opRM_RM with the
// base opcode (0x10 for adc, 0x00 for add); the immediate overload forwards to
// opRM_I with the same base opcode plus a per-mnemonic number (2 for adc),
// presumably the ModRM /digit of the immediate form -- confirm against opRM_I.
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); }
void add(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x00); }
@ -332,12 +383,12 @@ void xor(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x30); }
// xor with an immediate: base opcode 0x30 and the number 6 are handed to opRM_I.
// NOTE(review): `xor` is an alternative operator token in ISO C++, so this method
// name only compiles where those tokens are not treated as keywords -- confirm the
// supported compilers / build flags.
void xor(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x30, 6); }
// dec / inc forward to opIncDec with a base opcode (0x48 / 0x40) and a number (1 / 0).
void dec(const Operand& op) { opIncDec(op, 0x48, 1); }
void inc(const Operand& op) { opIncDec(op, 0x40, 0); }
// Single-operand group sharing opcode 0xF6. The per-mnemonic number (div 6, idiv 7,
// imul 5, mul 4, neg 3, not 2) is presumably the ModRM /digit extension -- confirm
// against opR_ModM.
// NOTE(review): every mnemonic is listed twice with the same signature, first calling
// opR_ModM(op, 0, 3, ext, 0xF6) and then opR_ModM(op, 0, ext, 0xF6). Both sets cannot
// coexist; this appears to be the removed and added sides of a patch. Keep the set
// that matches the current opR_ModM declaration.
// NOTE(review): `not` is an alternative operator token in ISO C++; the method name
// only compiles where those tokens are disabled.
void div(const Operand& op) { opR_ModM(op, 0, 3, 6, 0xF6); }
void idiv(const Operand& op) { opR_ModM(op, 0, 3, 7, 0xF6); }
void imul(const Operand& op) { opR_ModM(op, 0, 3, 5, 0xF6); }
void mul(const Operand& op) { opR_ModM(op, 0, 3, 4, 0xF6); }
void neg(const Operand& op) { opR_ModM(op, 0, 3, 3, 0xF6); }
void not(const Operand& op) { opR_ModM(op, 0, 3, 2, 0xF6); }
void div(const Operand& op) { opR_ModM(op, 0, 6, 0xF6); }
void idiv(const Operand& op) { opR_ModM(op, 0, 7, 0xF6); }
void imul(const Operand& op) { opR_ModM(op, 0, 5, 0xF6); }
void mul(const Operand& op) { opR_ModM(op, 0, 4, 0xF6); }
void neg(const Operand& op) { opR_ModM(op, 0, 3, 0xF6); }
void not(const Operand& op) { opR_ModM(op, 0, 2, 0xF6); }
// Rotate-through-carry forms. Each forwards to opShift with either an immediate
// count or a Reg8 count operand (named cl), plus a per-mnemonic number (rcl 2, rcr 3).
void rcl(const Operand& op, int imm) { opShift(op, imm, 2); }
void rcl(const Operand& op, const Reg8& cl) { opShift(op, cl, 2); }
void rcr(const Operand& op, int imm) { opShift(op, imm, 3); }
@ -360,52 +411,57 @@ void shrd(const Operand& op, const Reg& reg, uint8 imm) { opShxd(op, reg, imm, 0
// shrd with the count in a Reg8 (named cl): passes 0 as the immediate, opcode 0xAC,
// and the address of the count register to opShxd.
void shrd(const Operand& op, const Reg& reg, const Reg8& cl) { opShxd(op, reg, 0, 0xAC, &cl); }
// Bit scans: opcode bytes 0x0F 0xBC (bsf) and 0x0F 0xBD (bsr); accepted when op is a
// 16-bit or 32e register, or a memory operand.
void bsf(const Reg&reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0xBC); }
void bsr(const Reg&reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0xBD); }
// Mmx-destination mnemonics routed through opMMX. Only the opcode byte (third
// argument) differs per mnemonic; 0x66 and 0x38 are passed unchanged -- presumably
// the operand-size prefix and the opcode-map byte; confirm against opMMX.
// This run passes the literal 256 in the slot where palignr passes its imm8.
// NOTE(review): the same fifteen mnemonics are listed again right after this run with
// NONE in place of 256 and identical signatures. The two runs cannot coexist; this
// one looks like the pre-change side of a patch.
void pshufb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x00, 0x66, 256, 0x38); }
void phaddw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x01, 0x66, 256, 0x38); }
void phaddd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x02, 0x66, 256, 0x38); }
void phaddsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x03, 0x66, 256, 0x38); }
void pmaddubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x04, 0x66, 256, 0x38); }
void phsubw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x05, 0x66, 256, 0x38); }
void phsubd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x06, 0x66, 256, 0x38); }
void phsubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x07, 0x66, 256, 0x38); }
void psignb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x08, 0x66, 256, 0x38); }
void psignw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x09, 0x66, 256, 0x38); }
void psignd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0A, 0x66, 256, 0x38); }
void pmulhrsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0B, 0x66, 256, 0x38); }
void pabsb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1C, 0x66, 256, 0x38); }
void pabsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1D, 0x66, 256, 0x38); }
void pabsd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1E, 0x66, 256, 0x38); }
// Mmx-destination mnemonics routed through opMMX, using the named constant NONE in
// the slot where palignr passes its imm8 (presumably meaning "no immediate byte" --
// confirm against opMMX). Only the opcode byte (third argument) differs per
// mnemonic; 0x66 and 0x38 are passed unchanged.
// NOTE(review): the same mnemonics also appear immediately before this run with the
// literal 256 instead of NONE; only one of the two runs can be kept.
void pshufb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x00, 0x66, NONE, 0x38); }
void phaddw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x01, 0x66, NONE, 0x38); }
void phaddd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x02, 0x66, NONE, 0x38); }
void phaddsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x03, 0x66, NONE, 0x38); }
void pmaddubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x04, 0x66, NONE, 0x38); }
void phsubw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x05, 0x66, NONE, 0x38); }
void phsubd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x06, 0x66, NONE, 0x38); }
void phsubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x07, 0x66, NONE, 0x38); }
void psignb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x08, 0x66, NONE, 0x38); }
void psignw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x09, 0x66, NONE, 0x38); }
void psignd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0A, 0x66, NONE, 0x38); }
void pmulhrsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0B, 0x66, NONE, 0x38); }
void pabsb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1C, 0x66, NONE, 0x38); }
void pabsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1D, 0x66, NONE, 0x38); }
void pabsd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1E, 0x66, NONE, 0x38); }
// palignr: opcode byte 0x0f with 0x3a in the last slot; the int immediate is
// truncated to uint8 before being handed to opMMX.
void palignr(const Mmx& mmx, const Operand& op, int imm) { opMMX(mmx, op, 0x0f, 0x66, static_cast<uint8>(imm), 0x3a); }
// Xmm-destination mnemonics routed through opGen with the isXMM_XMMorMEM operand
// check. Only the opcode byte (third argument) differs per mnemonic; 0x66 and 0x38
// are passed unchanged -- presumably the mandatory prefix and the opcode-map byte;
// confirm against opGen.
// This run passes the literal 256 in the immediate slot.
// NOTE(review): the same thirty mnemonics are listed again right after this run with
// NONE in place of 256 and identical signatures. The two runs cannot coexist; this
// one looks like the pre-change side of a patch.
void blendvpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void blendvps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void packusdw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2B, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pblendvb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x10, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pcmpeqq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x29, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void ptest(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x17, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmovsxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x20, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmovsxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x21, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmovsxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x22, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmovsxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x23, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmovsxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x24, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmovsxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x25, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmovzxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x30, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmovzxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x31, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmovzxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x32, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmovzxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x33, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmovzxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x34, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmovzxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x35, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pminsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x38, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pminsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x39, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pminuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3A, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pminud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3B, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmaxsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3C, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmaxsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3D, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmaxuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3E, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmaxud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3F, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmuldq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x28, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pmulld(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x40, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void phminposuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x41, 0x66, isXMM_XMMorMEM, 256, 0x38); }
void pcmpgtq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x37, 0x66, isXMM_XMMorMEM, 256, 0x38); }
// Xmm-destination mnemonics routed through opGen with the isXMM_XMMorMEM operand
// check, using the named constant NONE in the immediate slot (presumably meaning
// "no immediate byte" -- confirm against opGen). Only the opcode byte (third
// argument) differs per mnemonic; 0x66 and 0x38 are passed unchanged.
// NOTE(review): the same mnemonics also appear immediately before this run with the
// literal 256 instead of NONE; only one of the two runs can be kept.
void blendvpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void blendvps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void packusdw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2B, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pblendvb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x10, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pcmpeqq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x29, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void ptest(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x17, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovsxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x20, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovsxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x21, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovsxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x22, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovsxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x23, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovsxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x24, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovsxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x25, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovzxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x30, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovzxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x31, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovzxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x32, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovzxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x33, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovzxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x34, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovzxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x35, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pminsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x38, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pminsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x39, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pminuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3A, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pminud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3B, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmaxsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3C, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmaxsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3D, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmaxuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3E, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmaxud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3F, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmuldq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x28, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmulld(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x40, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void phminposuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x41, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pcmpgtq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x37, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
// AES mnemonics without an immediate: same opGen pattern as the other 0x66 / 0x38
// Xmm forms in this file, with opcode bytes 0xDB..0xDF and NONE in the immediate
// slot. Unlike the neighbouring runs, these appear only once (no 256 variant).
void aesdec(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDE, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void aesdeclast(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDF, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void aesenc(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDC, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void aesenclast(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDD, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void aesimc(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0xDB, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
// Xmm forms that carry an immediate: routed through opGen with 0x3A in the last slot
// and the int immediate truncated to uint8 (values outside 0..255 are silently
// reduced by the static_cast).
void blendpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
void blendps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
void dppd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x41, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
@ -420,6 +476,8 @@ void pcmpestrm(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x60
// String-compare, carry-less multiply and AES key-generation forms: all take an
// immediate, truncate it to uint8, and go through opGen with 0x66 and 0x3A. Only the
// opcode byte (third argument) differs.
void pcmpestri(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x61, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
void pcmpistrm(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x62, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
void pcmpistri(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x63, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
void pclmulqdq(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x44, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
void aeskeygenassist(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0xDF, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
// Memory-only forms sharing opcode bytes 0x0F 0xAE. The Reg32(n) argument (2, 3, 7)
// is what distinguishes them -- presumably it fills the ModRM reg field as an opcode
// extension; confirm against opModM.
void ldmxcsr(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0xAE); }
void stmxcsr(const Address& addr) { opModM(addr, Reg32(3), 0x0F, 0xAE); }
void clflush(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0xAE); }
@ -427,3 +485,540 @@ void movntpd(const Address& addr, const Xmm& reg) { opModM(addr, Reg16(reg.getId
// movntdq: store form with opcode bytes 0x0F 0xE7. The Xmm register is re-wrapped as
// Reg16(reg.getIdx()) before being passed to opModM; NOTE(review): presumably the
// 16-bit register type is what makes the encoder emit a 0x66 prefix -- confirm.
void movntdq(const Address& addr, const Xmm& reg) { opModM(addr, Reg16(reg.getIdx()), 0x0F, 0xE7); }
// Sign- and zero-extending moves share opMovxx and differ only in the opcode byte.
void movsx(const Reg& reg, const Operand& op) { opMovxx(reg, op, 0xBE); }
void movzx(const Reg& reg, const Operand& op) { opMovxx(reg, op, 0xB6); }
// x87 mnemonics taking a memory operand. Each forwards to opFpuMem with three opcode
// bytes followed by two small numbers.
// NOTE(review): presumably the three opcode bytes select the encoding by memory
// operand size (16 / 32 / 64 bit) with 0x00 marking a size the mnemonic does not
// support, and the two numbers are ModRM /digit extensions (the second one used for
// the 64-bit form when it differs, as in fild and fistp) -- confirm against opFpuMem.
void fadd(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 0, 0); }
void fiadd(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 0, 0); }
void fcom(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 2, 0); }
void fcomp(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 3, 0); }
void fdiv(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 6, 0); }
void fidiv(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 6, 0); }
void fdivr(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 7, 0); }
void fidivr(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 7, 0); }
void ficom(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 2, 0); }
void ficomp(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 3, 0); }
void fild(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0xDF, 0, 5); }
void fist(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0x00, 2, 0); }
void fistp(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0xDF, 3, 7); }
void fisttp(const Address& addr) { opFpuMem(addr, 0xDF, 0xDB, 0xDD, 1, 0); }
void fld(const Address& addr) { opFpuMem(addr, 0x00, 0xD9, 0xDD, 0, 0); }
void fmul(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 1, 0); }
void fimul(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 1, 0); }
void fst(const Address& addr) { opFpuMem(addr, 0x00, 0xD9, 0xDD, 2, 0); }
void fstp(const Address& addr) { opFpuMem(addr, 0x00, 0xD9, 0xDD, 3, 0); }
void fsub(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 4, 0); }
void fisub(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 4, 0); }
void fsubr(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 5, 0); }
void fisubr(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 5, 0); }
// x87 mnemonics taking two Fpu registers. Each forwards to opFpuFpu with two 16-bit
// opcode patterns.
// NOTE(review): presumably the first pattern is used when reg1 is st0 and the second
// when reg2 is st0, with a zero high byte (0x0000, 0x00C0, ...) marking a direction
// the mnemonic does not support -- confirm against opFpuFpu. Consistent with that
// reading, the popping forms (faddp, fmulp, fsubp, fsubrp, fdivp, fdivrp) only supply
// the second pattern and the fcmov* / f[u]comi[p] forms only supply the first.
// Note that the sub / div pairs swap their low bytes between the two patterns
// (e.g. fsub: 0xD8E0 vs 0xDCE8, fsubr: 0xD8E8 vs 0xDCE0).
void fadd(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8C0, 0xDCC0); }
void faddp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEC0); }
void fcmovb(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAC0, 0x00C0); }
void fcmove(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAC8, 0x00C8); }
void fcmovbe(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAD0, 0x00D0); }
void fcmovu(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDAD8, 0x00D8); }
void fcmovnb(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBC0, 0x00C0); }
void fcmovne(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBC8, 0x00C8); }
void fcmovnbe(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBD0, 0x00D0); }
void fcmovnu(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBD8, 0x00D8); }
void fcomi(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBF0, 0x00F0); }
void fcomip(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDFF0, 0x00F0); }
void fucomi(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDBE8, 0x00E8); }
void fucomip(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xDFE8, 0x00E8); }
void fdiv(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8F0, 0xDCF8); }
void fdivp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEF8); }
void fdivr(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8F8, 0xDCF0); }
void fdivrp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEF0); }
void fmul(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8C8, 0xDCC8); }
void fmulp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEC8); }
void fsub(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8E0, 0xDCE8); }
void fsubp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEE8); }
void fsubr(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0xD8E8, 0xDCE0); }
void fsubrp(const Fpu& reg1, const Fpu& reg2) { opFpuFpu(reg1, reg2, 0x0000, 0xDEE0); }
// x87 mnemonics taking a single Fpu register. Each forwards to opFpu with two bytes;
// the second byte always has its low three bits clear, so presumably opFpu adds the
// register index to it -- confirm against opFpu.
void fcom(const Fpu& reg) { opFpu(reg, 0xD8, 0xD0); }
void fcomp(const Fpu& reg) { opFpu(reg, 0xD8, 0xD8); }
void ffree(const Fpu& reg) { opFpu(reg, 0xDD, 0xC0); }
void fld(const Fpu& reg) { opFpu(reg, 0xD9, 0xC0); }
void fst(const Fpu& reg) { opFpu(reg, 0xDD, 0xD0); }
void fstp(const Fpu& reg) { opFpu(reg, 0xDD, 0xD8); }
void fucom(const Fpu& reg) { opFpu(reg, 0xDD, 0xE0); }
void fucomp(const Fpu& reg) { opFpu(reg, 0xDD, 0xE8); }
void fxch(const Fpu& reg) { opFpu(reg, 0xD9, 0xC8); }
// AVX arithmetic and bitwise forms, all routed through opAVX_X_X_XM.
//   - op2 defaults to an empty Operand(); presumably this lets callers write the
//     two-operand form and have the helper reuse the destination as first source --
//     confirm against opAVX_X_X_XM.
//   - The type argument is MM_0F combined with PP_66 (pd), PP_F2 (sd), PP_F3 (ss), or
//     nothing extra (ps).
//   - The opcode byte is shared by the four variants of an operation
//     (add 0x58, mul 0x59, sub 0x5C, min 0x5D, div 0x5E, max 0x5F, and 0x54,
//     andn 0x55, or 0x56, xor 0x57).
//   - The final bool is true for the packed (pd / ps) variants and false for the
//     scalar (sd / ss) variants; presumably it enables YMM operands.
void vaddpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x58, true); }
void vaddps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x58, true); }
void vaddsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x58, false); }
void vaddss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x58, false); }
void vsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5C, true); }
void vsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5C, true); }
void vsubsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5C, false); }
void vsubss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5C, false); }
void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x59, true); }
void vmulps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x59, true); }
void vmulsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x59, false); }
void vmulss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x59, false); }
void vdivpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5E, true); }
void vdivps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5E, true); }
void vdivsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5E, false); }
void vdivss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5E, false); }
void vmaxpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5F, true); }
void vmaxps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5F, true); }
void vmaxsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5F, false); }
void vmaxss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5F, false); }
void vminpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5D, true); }
void vminps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5D, true); }
void vminsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5D, false); }
void vminss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5D, false); }
void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x54, true); }
void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x54, true); }
void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x55, true); }
void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x55, true); }
void vorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x56, true); }
void vorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x56, true); }
void vxorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x57, true); }
void vxorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x57, true); }
// AVX forms with a trailing imm8, all using MM_0F3A | PP_66 and passing 0 as the last
// argument to opAVX_X_X_XM (presumably VEX.W -- confirm).
// Each mnemonic has two overloads:
//   - (xm1, xm2, op, imm): explicit destination and first source;
//   - (xmm, op, imm): passes xmm as both destination and first source.
// In both, the immediate byte is appended with db(imm) after the helper has emitted
// the instruction.
// The bool argument is true for vblendpd, vblendps and vdpps and false for the rest
// (presumably whether YMM operands are accepted).
void vblendpd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0D, true, 0); db(imm); }
void vblendpd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0D, true, 0); db(imm); }
void vblendps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0C, true, 0); db(imm); }
void vblendps(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0C, true, 0); db(imm); }
void vdppd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x41, false, 0); db(imm); }
void vdppd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x41, false, 0); db(imm); }
void vdpps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x40, true, 0); db(imm); }
void vdpps(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x40, true, 0); db(imm); }
void vmpsadbw(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x42, false, 0); db(imm); }
void vmpsadbw(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x42, false, 0); db(imm); }
void vpblendw(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0E, false, 0); db(imm); }
void vpblendw(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0E, false, 0); db(imm); }
void vroundsd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0B, false, 0); db(imm); }
void vroundsd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0B, false, 0); db(imm); }
void vroundss(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0A, false, 0); db(imm); }
void vroundss(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x0A, false, 0); db(imm); }
void vpclmulqdq(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x44, false, 0); db(imm); }
void vpclmulqdq(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F3A | PP_66, 0x44, false, 0); db(imm); }
// Three-operand vpermilps/vpermilpd: forwarded as-is to opAVX_X_X_XM, no immediate byte.
void vpermilps(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0C, true, 0);
}
void vpermilpd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0D, true, 0);
}
void vcmppd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xC2, true, -1); db(imm); }
void vcmppd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0xC2, true, -1); db(imm); }
void vcmpps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F, 0xC2, true, -1); db(imm); }
void vcmpps(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F, 0xC2, true, -1); db(imm); }
void vcmpsd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F2, 0xC2, false, -1); db(imm); }
void vcmpsd(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F2, 0xC2, false, -1); db(imm); }
void vcmpss(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0xC2, false, -1); db(imm); }
void vcmpss(const Xmm& xmm, const Operand& op, uint8 imm) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_F3, 0xC2, false, -1); db(imm); }
// vcvtsd2ss / vcvtss2sd: same code 0x5A, distinguished by PP_F2 vs PP_F3.
// Two-operand overloads forward with xmm used for both register arguments.
void vcvtsd2ss(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F2, 0x5A, false, -1); }
void vcvtsd2ss(const Xmm& xmm, const Operand& op) { vcvtsd2ss(xmm, xmm, op); }
void vcvtss2sd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0x5A, false, -1); }
void vcvtss2sd(const Xmm& xmm, const Operand& op) { vcvtss2sd(xmm, xmm, op); }
// vinsertps: helper call followed by the imm byte; the short form reuses xmm for both registers.
void vinsertps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x21, false, 0); db(imm); }
void vinsertps(const Xmm& xmm, const Operand& op, uint8 imm) { vinsertps(xmm, xmm, op, imm); }
void vpacksswb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x63, false, -1); }
void vpacksswb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x63, false, -1); }
void vpackssdw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x6B, false, -1); }
void vpackssdw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x6B, false, -1); }
void vpackuswb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x67, false, -1); }
void vpackuswb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x67, false, -1); }
void vpackusdw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x2B, false, -1); }
void vpackusdw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x2B, false, -1); }
// vpadd* family (MM_0F | PP_66). The three-operand overload owns the code byte;
// the two-operand overload forwards with xmm used for both register arguments.
void vpaddb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFC, false, -1); }
void vpaddb(const Xmm& xmm, const Operand& op) { vpaddb(xmm, xmm, op); }
void vpaddw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFD, false, -1); }
void vpaddw(const Xmm& xmm, const Operand& op) { vpaddw(xmm, xmm, op); }
void vpaddd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFE, false, -1); }
void vpaddd(const Xmm& xmm, const Operand& op) { vpaddd(xmm, xmm, op); }
void vpaddq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD4, false, -1); }
void vpaddq(const Xmm& xmm, const Operand& op) { vpaddq(xmm, xmm, op); }
void vpaddsb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEC, false, -1); }
void vpaddsb(const Xmm& xmm, const Operand& op) { vpaddsb(xmm, xmm, op); }
void vpaddsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xED, false, -1); }
void vpaddsw(const Xmm& xmm, const Operand& op) { vpaddsw(xmm, xmm, op); }
void vpaddusb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDC, false, -1); }
void vpaddusb(const Xmm& xmm, const Operand& op) { vpaddusb(xmm, xmm, op); }
void vpaddusw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDD, false, -1); }
void vpaddusw(const Xmm& xmm, const Operand& op) { vpaddusw(xmm, xmm, op); }
// vpalignr: helper call followed by the imm byte; the short form reuses xmm for both registers.
void vpalignr(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0F, false, -1); db(imm); }
void vpalignr(const Xmm& xmm, const Operand& op, uint8 imm) { vpalignr(xmm, xmm, op, imm); }
// vpand / vpandn / vpavgb / vpavgw (MM_0F | PP_66).
// Two-operand overloads forward with xmm used for both register arguments.
void vpand(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDB, false, -1); }
void vpand(const Xmm& xmm, const Operand& op) { vpand(xmm, xmm, op); }
void vpandn(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDF, false, -1); }
void vpandn(const Xmm& xmm, const Operand& op) { vpandn(xmm, xmm, op); }
void vpavgb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE0, false, -1); }
void vpavgb(const Xmm& xmm, const Operand& op) { vpavgb(xmm, xmm, op); }
void vpavgw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE3, false, -1); }
void vpavgw(const Xmm& xmm, const Operand& op) { vpavgw(xmm, xmm, op); }
void vpcmpeqb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x74, false, -1); }
void vpcmpeqb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x74, false, -1); }
void vpcmpeqw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x75, false, -1); }
void vpcmpeqw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x75, false, -1); }
void vpcmpeqd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x76, false, -1); }
void vpcmpeqd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x76, false, -1); }
void vpcmpeqq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x29, false, -1); }
void vpcmpeqq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x29, false, -1); }
void vpcmpgtb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x64, false, -1); }
void vpcmpgtb(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x64, false, -1); }
void vpcmpgtw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x65, false, -1); }
void vpcmpgtw(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x65, false, -1); }
void vpcmpgtd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x66, false, -1); }
void vpcmpgtd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x66, false, -1); }
void vpcmpgtq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x37, false, -1); }
void vpcmpgtq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F38 | PP_66, 0x37, false, -1); }
// vphadd* / vphsub* family (MM_0F38 | PP_66).
// Two-operand overloads forward with xmm used for both register arguments.
void vphaddw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x01, false, -1); }
void vphaddw(const Xmm& xmm, const Operand& op) { vphaddw(xmm, xmm, op); }
void vphaddd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x02, false, -1); }
void vphaddd(const Xmm& xmm, const Operand& op) { vphaddd(xmm, xmm, op); }
void vphaddsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x03, false, -1); }
void vphaddsw(const Xmm& xmm, const Operand& op) { vphaddsw(xmm, xmm, op); }
void vphsubw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x05, false, -1); }
void vphsubw(const Xmm& xmm, const Operand& op) { vphsubw(xmm, xmm, op); }
void vphsubd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x06, false, -1); }
void vphsubd(const Xmm& xmm, const Operand& op) { vphsubd(xmm, xmm, op); }
void vphsubsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x07, false, -1); }
void vphsubsw(const Xmm& xmm, const Operand& op) { vphsubsw(xmm, xmm, op); }
// vpmaddwd uses MM_0F, vpmaddubsw uses MM_0F38.
// Two-operand overloads forward with xmm used for both register arguments.
void vpmaddwd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF5, false, -1); }
void vpmaddwd(const Xmm& xmm, const Operand& op) { vpmaddwd(xmm, xmm, op); }
void vpmaddubsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x04, false, -1); }
void vpmaddubsw(const Xmm& xmm, const Operand& op) { vpmaddubsw(xmm, xmm, op); }
// vpmax* / vpmin* family. The sw and ub variants use MM_0F; all others use MM_0F38.
// Two-operand overloads forward with xmm used for both register arguments.
void vpmaxsb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3C, false, -1); }
void vpmaxsb(const Xmm& xmm, const Operand& op) { vpmaxsb(xmm, xmm, op); }
void vpmaxsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEE, false, -1); }
void vpmaxsw(const Xmm& xmm, const Operand& op) { vpmaxsw(xmm, xmm, op); }
void vpmaxsd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3D, false, -1); }
void vpmaxsd(const Xmm& xmm, const Operand& op) { vpmaxsd(xmm, xmm, op); }
void vpmaxub(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDE, false, -1); }
void vpmaxub(const Xmm& xmm, const Operand& op) { vpmaxub(xmm, xmm, op); }
void vpmaxuw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3E, false, -1); }
void vpmaxuw(const Xmm& xmm, const Operand& op) { vpmaxuw(xmm, xmm, op); }
void vpmaxud(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3F, false, -1); }
void vpmaxud(const Xmm& xmm, const Operand& op) { vpmaxud(xmm, xmm, op); }
void vpminsb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x38, false, -1); }
void vpminsb(const Xmm& xmm, const Operand& op) { vpminsb(xmm, xmm, op); }
void vpminsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEA, false, -1); }
void vpminsw(const Xmm& xmm, const Operand& op) { vpminsw(xmm, xmm, op); }
void vpminsd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x39, false, -1); }
void vpminsd(const Xmm& xmm, const Operand& op) { vpminsd(xmm, xmm, op); }
void vpminub(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDA, false, -1); }
void vpminub(const Xmm& xmm, const Operand& op) { vpminub(xmm, xmm, op); }
void vpminuw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3A, false, -1); }
void vpminuw(const Xmm& xmm, const Operand& op) { vpminuw(xmm, xmm, op); }
void vpminud(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3B, false, -1); }
void vpminud(const Xmm& xmm, const Operand& op) { vpminud(xmm, xmm, op); }
// vpmul* family. vpmulhrsw, vpmulld and vpmuldq use MM_0F38; the rest use MM_0F.
// Two-operand overloads forward with xmm used for both register arguments.
void vpmulhuw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE4, false, -1); }
void vpmulhuw(const Xmm& xmm, const Operand& op) { vpmulhuw(xmm, xmm, op); }
void vpmulhrsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0B, false, -1); }
void vpmulhrsw(const Xmm& xmm, const Operand& op) { vpmulhrsw(xmm, xmm, op); }
void vpmulhw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE5, false, -1); }
void vpmulhw(const Xmm& xmm, const Operand& op) { vpmulhw(xmm, xmm, op); }
void vpmullw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD5, false, -1); }
void vpmullw(const Xmm& xmm, const Operand& op) { vpmullw(xmm, xmm, op); }
void vpmulld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x40, false, -1); }
void vpmulld(const Xmm& xmm, const Operand& op) { vpmulld(xmm, xmm, op); }
void vpmuludq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF4, false, -1); }
void vpmuludq(const Xmm& xmm, const Operand& op) { vpmuludq(xmm, xmm, op); }
void vpmuldq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x28, false, -1); }
void vpmuldq(const Xmm& xmm, const Operand& op) { vpmuldq(xmm, xmm, op); }
// vpor / vpsadbw (MM_0F | PP_66).
// Two-operand overloads forward with xmm used for both register arguments.
void vpor(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEB, false, -1); }
void vpor(const Xmm& xmm, const Operand& op) { vpor(xmm, xmm, op); }
void vpsadbw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF6, false, -1); }
void vpsadbw(const Xmm& xmm, const Operand& op) { vpsadbw(xmm, xmm, op); }
// vpshufb, three-operand form only.
// NOTE(review): unlike the neighbouring mnemonics, no two-operand overload is
// defined next to this one; confirm whether that omission is intentional.
void vpshufb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x00, false, -1);
}
// vpsign{b,w,d} (MM_0F38 | PP_66).
// Two-operand overloads forward with xmm used for both register arguments.
void vpsignb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x08, false, -1); }
void vpsignb(const Xmm& xmm, const Operand& op) { vpsignb(xmm, xmm, op); }
void vpsignw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x09, false, -1); }
void vpsignw(const Xmm& xmm, const Operand& op) { vpsignw(xmm, xmm, op); }
void vpsignd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0A, false, -1); }
void vpsignd(const Xmm& xmm, const Operand& op) { vpsignd(xmm, xmm, op); }
// vpsll* / vpsra* / vpsrl* taking the count as an Operand (MM_0F | PP_66).
// Two-operand overloads forward with xmm used for both register arguments.
void vpsllw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF1, false, -1); }
void vpsllw(const Xmm& xmm, const Operand& op) { vpsllw(xmm, xmm, op); }
void vpslld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF2, false, -1); }
void vpslld(const Xmm& xmm, const Operand& op) { vpslld(xmm, xmm, op); }
void vpsllq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF3, false, -1); }
void vpsllq(const Xmm& xmm, const Operand& op) { vpsllq(xmm, xmm, op); }
void vpsraw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE1, false, -1); }
void vpsraw(const Xmm& xmm, const Operand& op) { vpsraw(xmm, xmm, op); }
void vpsrad(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE2, false, -1); }
void vpsrad(const Xmm& xmm, const Operand& op) { vpsrad(xmm, xmm, op); }
void vpsrlw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD1, false, -1); }
void vpsrlw(const Xmm& xmm, const Operand& op) { vpsrlw(xmm, xmm, op); }
void vpsrld(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD2, false, -1); }
void vpsrld(const Xmm& xmm, const Operand& op) { vpsrld(xmm, xmm, op); }
void vpsrlq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD3, false, -1); }
void vpsrlq(const Xmm& xmm, const Operand& op) { vpsrlq(xmm, xmm, op); }
// vpsub* family (MM_0F | PP_66).
// Two-operand overloads forward with xmm used for both register arguments.
void vpsubb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF8, false, -1); }
void vpsubb(const Xmm& xmm, const Operand& op) { vpsubb(xmm, xmm, op); }
void vpsubw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF9, false, -1); }
void vpsubw(const Xmm& xmm, const Operand& op) { vpsubw(xmm, xmm, op); }
void vpsubd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFA, false, -1); }
void vpsubd(const Xmm& xmm, const Operand& op) { vpsubd(xmm, xmm, op); }
void vpsubq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFB, false, -1); }
void vpsubq(const Xmm& xmm, const Operand& op) { vpsubq(xmm, xmm, op); }
void vpsubsb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE8, false, -1); }
void vpsubsb(const Xmm& xmm, const Operand& op) { vpsubsb(xmm, xmm, op); }
void vpsubsw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE9, false, -1); }
void vpsubsw(const Xmm& xmm, const Operand& op) { vpsubsw(xmm, xmm, op); }
void vpsubusb(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD8, false, -1); }
void vpsubusb(const Xmm& xmm, const Operand& op) { vpsubusb(xmm, xmm, op); }
void vpsubusw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD9, false, -1); }
void vpsubusw(const Xmm& xmm, const Operand& op) { vpsubusw(xmm, xmm, op); }
// vpunpckh* / vpunpckl* family (MM_0F | PP_66).
// Two-operand overloads forward with xmm used for both register arguments.
void vpunpckhbw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x68, false, -1); }
void vpunpckhbw(const Xmm& xmm, const Operand& op) { vpunpckhbw(xmm, xmm, op); }
void vpunpckhwd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x69, false, -1); }
void vpunpckhwd(const Xmm& xmm, const Operand& op) { vpunpckhwd(xmm, xmm, op); }
void vpunpckhdq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x6A, false, -1); }
void vpunpckhdq(const Xmm& xmm, const Operand& op) { vpunpckhdq(xmm, xmm, op); }
void vpunpckhqdq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x6D, false, -1); }
void vpunpckhqdq(const Xmm& xmm, const Operand& op) { vpunpckhqdq(xmm, xmm, op); }
void vpunpcklbw(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x60, false, -1); }
void vpunpcklbw(const Xmm& xmm, const Operand& op) { vpunpcklbw(xmm, xmm, op); }
void vpunpcklwd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x61, false, -1); }
void vpunpcklwd(const Xmm& xmm, const Operand& op) { vpunpcklwd(xmm, xmm, op); }
void vpunpckldq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x62, false, -1); }
void vpunpckldq(const Xmm& xmm, const Operand& op) { vpunpckldq(xmm, xmm, op); }
void vpunpcklqdq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x6C, false, -1); }
void vpunpcklqdq(const Xmm& xmm, const Operand& op) { vpunpcklqdq(xmm, xmm, op); }
// vpxor: the short form forwards with xmm used for both register arguments.
void vpxor(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEF, false, -1); }
void vpxor(const Xmm& xmm, const Operand& op) { vpxor(xmm, xmm, op); }
// vrcpss / vrsqrtss (MM_0F | PP_F3).
// Two-operand overloads forward with xmm used for both register arguments.
void vrcpss(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0x53, false, -1); }
void vrcpss(const Xmm& xmm, const Operand& op) { vrcpss(xmm, xmm, op); }
void vrsqrtss(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0x52, false, -1); }
void vrsqrtss(const Xmm& xmm, const Operand& op) { vrsqrtss(xmm, xmm, op); }
// vshufpd / vshufps: code 0xC6, imm appended with db() after the helper call.
// Two-operand overloads forward with xmm used for both register arguments.
void vshufpd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xC6, true, -1); db(imm); }
void vshufpd(const Xmm& xmm, const Operand& op, uint8 imm) { vshufpd(xmm, xmm, op, imm); }
void vshufps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm) { opAVX_X_X_XM(xm1, xm2, op, MM_0F, 0xC6, true, -1); db(imm); }
void vshufps(const Xmm& xmm, const Operand& op, uint8 imm) { vshufps(xmm, xmm, op, imm); }
// vsqrtsd / vsqrtss: same code 0x51, distinguished by PP_F2 vs PP_F3.
// Two-operand overloads forward with xmm used for both register arguments.
void vsqrtsd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F2, 0x51, false, -1); }
void vsqrtsd(const Xmm& xmm, const Operand& op) { vsqrtsd(xmm, xmm, op); }
void vsqrtss(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0x51, false, -1); }
void vsqrtss(const Xmm& xmm, const Operand& op) { vsqrtss(xmm, xmm, op); }
// vunpckh* / vunpckl*: the pd variants add PP_66, the ps variants pass MM_0F alone.
// Two-operand overloads forward with xmm used for both register arguments.
void vunpckhpd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x15, true, -1); }
void vunpckhpd(const Xmm& xmm, const Operand& op) { vunpckhpd(xmm, xmm, op); }
void vunpckhps(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F, 0x15, true, -1); }
void vunpckhps(const Xmm& xmm, const Operand& op) { vunpckhps(xmm, xmm, op); }
void vunpcklpd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x14, true, -1); }
void vunpcklpd(const Xmm& xmm, const Operand& op) { vunpcklpd(xmm, xmm, op); }
void vunpcklps(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F, 0x14, true, -1); }
void vunpcklps(const Xmm& xmm, const Operand& op) { vunpcklps(xmm, xmm, op); }
// Register + operand + imm8 encoders (MM_0F3A | PP_66). Here imm is handed to
// opAVX_X_XM_IMM as its last argument instead of being emitted by the caller.
void vaeskeygenassist(const Xmm& xm, const Operand& op, uint8 imm)
{
	opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0xDF, false, 0, imm);
}
void vroundpd(const Xmm& xm, const Operand& op, uint8 imm)
{
	opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x09, true, 0, imm);
}
void vroundps(const Xmm& xm, const Operand& op, uint8 imm)
{
	opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x08, true, 0, imm);
}
void vpermilpd(const Xmm& xm, const Operand& op, uint8 imm)
{
	opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x05, true, 0, imm);
}
void vpermilps(const Xmm& xm, const Operand& op, uint8 imm)
{
	opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x04, true, 0, imm);
}
void vpcmpestri(const Xmm& xm, const Operand& op, uint8 imm)
{
	opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x61, false, 0, imm);
}
void vpcmpestrm(const Xmm& xm, const Operand& op, uint8 imm)
{
	opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x60, false, 0, imm);
}
void vpcmpistri(const Xmm& xm, const Operand& op, uint8 imm)
{
	opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x63, false, 0, imm);
}
void vpcmpistrm(const Xmm& xm, const Operand& op, uint8 imm)
{
	opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x62, false, 0, imm);
}
// vtestps / vtestpd: opAVX_X_XM_IMM is called without an imm argument.
void vtestps(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x0E, true, 0);
}
void vtestpd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x0F, true, 0);
}
// vcomisd / vcomiss: code 0x2F; the sd variant adds PP_66.
void vcomisd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x2F, false, -1);
}
void vcomiss(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F, 0x2F, false, -1);
}
// vcvtdq2ps / vcvtps2dq / vcvttps2dq: all code 0x5B, selected by prefix flag
// (none, PP_66, PP_F3 respectively).
void vcvtdq2ps(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F, 0x5B, true, -1);
}
void vcvtps2dq(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x5B, true, -1);
}
void vcvttps2dq(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x5B, true, -1);
}
// vmov* overloads taking (Xmm, Operand); the Address-first overloads of the
// same names are defined separately.
void vmovapd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x28, true, -1);
}
void vmovaps(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F, 0x28, true, -1);
}
void vmovddup(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_F2, 0x12, true, -1);
}
void vmovdqa(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x6F, true, -1);
}
void vmovdqu(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x6F, true, -1);
}
void vmovshdup(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x16, true, -1);
}
void vmovsldup(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x12, true, -1);
}
void vmovupd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x10, true, -1);
}
void vmovups(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F, 0x10, true, -1);
}
// vpabs{b,w,d} and vphminposuw (MM_0F38 | PP_66), register + operand forms.
void vpabsb(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x1C, false, -1);
}
void vpabsw(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x1D, false, -1);
}
void vpabsd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x1E, false, -1);
}
void vphminposuw(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x41, false, -1);
}
// vpmovsx* use codes 0x20-0x25 and vpmovzx* use 0x30-0x35, all MM_0F38 | PP_66.
void vpmovsxbw(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x20, false, -1);
}
void vpmovsxbd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x21, false, -1);
}
void vpmovsxbq(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x22, false, -1);
}
void vpmovsxwd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x23, false, -1);
}
void vpmovsxwq(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x24, false, -1);
}
void vpmovsxdq(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x25, false, -1);
}
void vpmovzxbw(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x30, false, -1);
}
void vpmovzxbd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x31, false, -1);
}
void vpmovzxbq(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x32, false, -1);
}
void vpmovzxwd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x33, false, -1);
}
void vpmovzxwq(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x34, false, -1);
}
void vpmovzxdq(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x35, false, -1);
}
// vpshufd / vpshufhw / vpshuflw: all code 0x70, selected by PP_66 / PP_F3 / PP_F2.
// imm is passed through to opAVX_X_XM_IMM.
void vpshufd(const Xmm& xm, const Operand& op, uint8 imm)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x70, false, -1, imm);
}
void vpshufhw(const Xmm& xm, const Operand& op, uint8 imm)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x70, false, -1, imm);
}
void vpshuflw(const Xmm& xm, const Operand& op, uint8 imm)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_F2, 0x70, false, -1, imm);
}
// vptest: register + operand form, no immediate.
void vptest(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x17, false, -1);
}
// vrcpps / vrsqrtps / vsqrtpd / vsqrtps: register + operand forms.
void vrcpps(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F, 0x53, true, -1);
}
void vrsqrtps(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F, 0x52, true, -1);
}
void vsqrtpd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x51, true, -1);
}
void vsqrtps(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F, 0x51, true, -1);
}
// vucomisd / vucomiss: code 0x2E; the sd variant adds PP_66.
void vucomisd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x2E, false, -1);
}
void vucomiss(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F, 0x2E, false, -1);
}
// vmov* overloads taking (Address, Xmm). The arguments are swapped when handed
// to opAVX_X_XM_IMM (register first, address second), and each code is one
// higher than in the matching (Xmm, Operand) overload.
void vmovapd(const Address& addr, const Xmm& xmm)
{
	opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_66, 0x29, true, -1);
}
void vmovaps(const Address& addr, const Xmm& xmm)
{
	opAVX_X_XM_IMM(xmm, addr, MM_0F, 0x29, true, -1);
}
void vmovdqa(const Address& addr, const Xmm& xmm)
{
	opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_66, 0x7F, true, -1);
}
void vmovdqu(const Address& addr, const Xmm& xmm)
{
	opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_F3, 0x7F, true, -1);
}
void vmovupd(const Address& addr, const Xmm& xmm)
{
	opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_66, 0x11, true, -1);
}
void vmovups(const Address& addr, const Xmm& xmm)
{
	opAVX_X_XM_IMM(xmm, addr, MM_0F, 0x11, true, -1);
}
// vaddsub* / vhadd* / vhsub*: a single signature with op2 defaulting to an
// empty Operand(); both operands are forwarded unchanged to opAVX_X_X_XM.
// The pd variants use PP_66, the ps variants PP_F2.
void vaddsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0xD0, true, -1);
}
void vaddsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0xD0, true, -1);
}
void vhaddpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x7C, true, -1);
}
void vhaddps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x7C, true, -1);
}
void vhsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x7D, true, -1);
}
void vhsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x7D, true, -1);
}
// vaes* (MM_0F38 | PP_66, codes 0xDC-0xDF): op2 defaults to an empty Operand()
// and both operands are forwarded unchanged to opAVX_X_X_XM.
void vaesenc(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDC, false, 0);
}
void vaesenclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDD, false, 0);
}
void vaesdec(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDE, false, 0);
}
void vaesdeclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDF, false, 0);
}
// Masked moves. The load overloads (register destination) use 0x2C / 0x2D.
// The store overloads (address first) use 0x2E / 0x2F and pass xm2 ahead of
// xm1 to the helper.
void vmaskmovps(const Xmm& xm1, const Xmm& xm2, const Address& addr)
{
	opAVX_X_X_XM(xm1, xm2, addr, MM_0F38 | PP_66, 0x2C, true, 0);
}
void vmaskmovps(const Address& addr, const Xmm& xm1, const Xmm& xm2)
{
	opAVX_X_X_XM(xm2, xm1, addr, MM_0F38 | PP_66, 0x2E, true, 0);
}
void vmaskmovpd(const Xmm& xm1, const Xmm& xm2, const Address& addr)
{
	opAVX_X_X_XM(xm1, xm2, addr, MM_0F38 | PP_66, 0x2D, true, 0);
}
void vmaskmovpd(const Address& addr, const Xmm& xm1, const Xmm& xm2)
{
	opAVX_X_X_XM(xm2, xm1, addr, MM_0F38 | PP_66, 0x2F, true, 0);
}
// High/low half moves. The load overloads accept an optional third operand,
// which must be a memory operand when present; anything else throws
// ERR_BAD_COMBINATION. The store overloads pass xm0 as the middle operand.
void vmovhpd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand())
{
	if (!(op2.isNone() || op2.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, op1, op2, MM_0F | PP_66, 0x16, false);
}
void vmovhpd(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x17, false);
}
void vmovhps(const Xmm& x, const Operand& op1, const Operand& op2 = Operand())
{
	if (!(op2.isNone() || op2.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, op1, op2, MM_0F, 0x16, false);
}
void vmovhps(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F, 0x17, false);
}
void vmovlpd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand())
{
	if (!(op2.isNone() || op2.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, op1, op2, MM_0F | PP_66, 0x12, false);
}
void vmovlpd(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x13, false);
}
void vmovlps(const Xmm& x, const Operand& op1, const Operand& op2 = Operand())
{
	if (!(op2.isNone() || op2.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, op1, op2, MM_0F, 0x12, false);
}
void vmovlps(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F, 0x13, false);
}
// Fused multiply-add family. Packed forms use 0x98/0xA8/0xB8 with the bool
// flag true; scalar forms use 0x99/0xA9/0xB9 with it false. The last argument
// is 1 for the double-precision forms and 0 for single precision.
void vfmadd132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x98, true, 1);
}
void vfmadd213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA8, true, 1);
}
void vfmadd231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB8, true, 1);
}
void vfmadd132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x98, true, 0);
}
void vfmadd213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA8, true, 0);
}
void vfmadd231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB8, true, 0);
}
void vfmadd132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x99, false, 1);
}
void vfmadd213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA9, false, 1);
}
void vfmadd231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB9, false, 1);
}
void vfmadd132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x99, false, 0);
}
void vfmadd213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA9, false, 0);
}
void vfmadd231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB9, false, 0);
}
// Fused multiply with alternating add/subtract (packed only):
// opcodes 0x96/0xA6/0xB6; last argument 1 for pd, 0 for ps.
void vfmaddsub132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x96, true, 1);
}
void vfmaddsub213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA6, true, 1);
}
void vfmaddsub231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB6, true, 1);
}
void vfmaddsub132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x96, true, 0);
}
void vfmaddsub213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA6, true, 0);
}
void vfmaddsub231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB6, true, 0);
}
// Fused multiply with alternating subtract/add (packed only):
// opcodes 0x97/0xA7/0xB7; last argument 1 for pd, 0 for ps.
void vfmsubadd132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x97, true, 1);
}
void vfmsubadd213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA7, true, 1);
}
void vfmsubadd231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB7, true, 1);
}
void vfmsubadd132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x97, true, 0);
}
void vfmsubadd213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA7, true, 0);
}
void vfmsubadd231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB7, true, 0);
}
// Fused multiply-subtract family. Packed forms use 0x9A/0xAA/0xBA with the
// bool flag true; scalar forms use 0x9B/0xAB/0xBB with it false. The last
// argument is 1 for double precision and 0 for single precision.
void vfmsub132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9A, true, 1);
}
void vfmsub213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAA, true, 1);
}
void vfmsub231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBA, true, 1);
}
void vfmsub132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9A, true, 0);
}
void vfmsub213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAA, true, 0);
}
void vfmsub231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBA, true, 0);
}
void vfmsub132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9B, false, 1);
}
void vfmsub213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAB, false, 1);
}
void vfmsub231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBB, false, 1);
}
void vfmsub132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9B, false, 0);
}
void vfmsub213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAB, false, 0);
}
void vfmsub231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBB, false, 0);
}
// Fused negated multiply-add family. Packed forms use 0x9C/0xAC/0xBC with the
// bool flag true; scalar forms use 0x9D/0xAD/0xBD with it false. The last
// argument is 1 for double precision and 0 for single precision.
void vfnmadd132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9C, true, 1);
}
void vfnmadd213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAC, true, 1);
}
void vfnmadd231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBC, true, 1);
}
void vfnmadd132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9C, true, 0);
}
void vfnmadd213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAC, true, 0);
}
void vfnmadd231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBC, true, 0);
}
void vfnmadd132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9D, false, 1);
}
void vfnmadd213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAD, false, 1);
}
void vfnmadd231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBD, false, 1);
}
void vfnmadd132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9D, false, 0);
}
void vfnmadd213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAD, false, 0);
}
void vfnmadd231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBD, false, 0);
}
// Fused negated multiply-subtract family. Packed forms use 0x9E/0xAE/0xBE
// with the bool flag true; scalar forms use 0x9F/0xAF/0xBF with it false. The
// last argument is 1 for double precision and 0 for single precision.
void vfnmsub132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9E, true, 1);
}
void vfnmsub213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAE, true, 1);
}
void vfnmsub231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBE, true, 1);
}
void vfnmsub132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9E, true, 0);
}
void vfnmsub213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAE, true, 0);
}
void vfnmsub231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBE, true, 0);
}
void vfnmsub132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9F, false, 1);
}
void vfnmsub213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAF, false, 1);
}
void vfnmsub231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBF, false, 1);
}
void vfnmsub132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9F, false, 0);
}
void vfnmsub213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAF, false, 0);
}
void vfnmsub231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBF, false, 0);
}
// vaesimc: two-operand AES instruction, opcode 0xDB with MM_0F38 | PP_66.
void vaesimc(const Xmm& x, const Operand& op)
{
	const int type = MM_0F38 | PP_66;
	opAVX_X_XM_IMM(x, op, type, 0xDB, false, 0);
}
// Broadcast loads from memory. The f128 and sd forms only accept a Ymm
// destination; the ss form takes any Xmm-typed register.
void vbroadcastf128(const Ymm& y, const Address& addr)
{
	opAVX_X_XM_IMM(y, addr, MM_0F38 | PP_66, 0x1A, true, 0);
}
void vbroadcastsd(const Ymm& y, const Address& addr)
{
	opAVX_X_XM_IMM(y, addr, MM_0F38 | PP_66, 0x19, true, 0);
}
void vbroadcastss(const Xmm& x, const Address& addr)
{
	opAVX_X_XM_IMM(x, addr, MM_0F38 | PP_66, 0x18, true, 0);
}
// 128-bit lane extract / insert / permute on Ymm registers.
// vextractf128 hands its immediate to the helper; the other two emit it
// themselves with db() after the instruction. An XMM operand is passed
// through cvtReg(..., Operand::YMM) before it reaches the helper.
void vextractf128(const Operand& op, const Ymm& y, uint8 imm)
{
	opAVX_X_XM_IMM(y, cvtReg(op, op.isXMM(), Operand::YMM), MM_0F3A | PP_66, 0x19, true, 0, imm);
}
void vinsertf128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(y1, y2, cvtReg(op, op.isXMM(), Operand::YMM), MM_0F3A | PP_66, 0x18, true, 0);
	db(imm);
}
void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(y1, y2, op, MM_0F3A | PP_66, 0x06, true, 0);
	db(imm);
}
// vlddqu: unaligned load; the middle operand is xm0 or ym0 to match the
// destination's width.
void vlddqu(const Xmm& x, const Address& addr)
{
	opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F | PP_F2, 0xF0, true, 0);
}
// vldmxcsr / vstmxcsr share opcode 0xAE; they differ only in the first
// register argument handed to the helper (xm2 vs xm3).
void vldmxcsr(const Address& addr)
{
	opAVX_X_X_XM(xm2, xm0, addr, MM_0F, 0xAE, false, -1);
}
void vstmxcsr(const Address& addr)
{
	opAVX_X_X_XM(xm3, xm0, addr, MM_0F, 0xAE, false, -1);
}
// vmaskmovdqu: register-only form, opcode 0xF7.
void vmaskmovdqu(const Xmm& x1, const Xmm& x2)
{
	opAVX_X_X_XM(x1, xm0, x2, MM_0F | PP_66, 0xF7, false, -1);
}
// vpextrb: the destination must be an i32e-sized register or memory, otherwise
// ERR_BAD_COMBINATION is thrown. The immediate byte follows the instruction.
void vpextrb(const Operand& op, const Xmm& x, uint8 imm)
{
	if (!(op.isREG(i32e) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, xm0, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x14, false);
	db(imm);
}
// vpextrw reg, xmm, imm8 (opcode 0F C5 /r ib).
// For this opcode the destination general-purpose register is encoded in
// ModRM.reg and the source XMM register in ModRM.rm, so the GPR index goes in
// the helper's first slot and the XMM register in the last one. That is the
// same arrangement vpmovmskb uses; the previous order swapped the two register
// numbers in the emitted instruction.
void vpextrw(const Reg& r, const Xmm& x, uint8 imm)
{
	opAVX_X_X_XM(Xmm(r.getIdx()), xm0, x, MM_0F | PP_66, 0xC5, false);
	db(imm); // trailing imm8: word selector
}
// vpextrw to memory: opcode 0x15 with MM_0F3A | PP_66, followed by the
// immediate byte.
void vpextrw(const Address& addr, const Xmm& x, uint8 imm)
{
	const int type = MM_0F3A | PP_66;
	opAVX_X_X_XM(x, xm0, addr, type, 0x15, false);
	db(imm);
}
// vpextrd: the destination must be a 32-bit register or memory, otherwise
// ERR_BAD_COMBINATION is thrown. The immediate byte follows the instruction.
void vpextrd(const Operand& op, const Xmm& x, uint8 imm)
{
	if (!(op.isREG(32) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, xm0, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x16, false, 0);
	db(imm);
}
// Element inserts. The source must be a 32-bit register or memory, otherwise
// ERR_BAD_COMBINATION is thrown. The overloads without x2 reuse the
// destination as the second register. The immediate byte is emitted last.
void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm)
{
	if (!(op.isREG(32) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x1, x2, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x20, false);
	db(imm);
}
void vpinsrb(const Xmm& x, const Operand& op, uint8 imm)
{
	if (!(op.isREG(32) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, x, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x20, false);
	db(imm);
}
void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm)
{
	if (!(op.isREG(32) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x1, x2, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F | PP_66, 0xC4, false);
	db(imm);
}
void vpinsrw(const Xmm& x, const Operand& op, uint8 imm)
{
	if (!(op.isREG(32) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, x, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F | PP_66, 0xC4, false);
	db(imm);
}
void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm)
{
	if (!(op.isREG(32) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x1, x2, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x22, false, 0);
	db(imm);
}
void vpinsrd(const Xmm& x, const Operand& op, uint8 imm)
{
	if (!(op.isREG(32) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, x, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x22, false, 0);
	db(imm);
}
// vpmovmskb: the general-purpose destination is wrapped in an Xmm carrying the
// same index so that it can be passed as the helper's first register.
void vpmovmskb(const Reg32e& r, const Xmm& x)
{
	const int type = MM_0F | PP_66;
	opAVX_X_X_XM(Xmm(r.getIdx()), xm0, x, type, 0xD7, false);
}
// Whole-register byte shifts, opcode 0x73. The first helper argument is a
// fixed register (xm7 for the left shift, xm3 for the right shift) rather than
// a real operand. The shift count is emitted as a trailing byte.
void vpslldq(const Xmm& x1, const Xmm& x2, uint8 imm)
{
	opAVX_X_X_XM(xm7, x1, x2, MM_0F | PP_66, 0x73, false);
	db(imm);
}
void vpslldq(const Xmm& x, uint8 imm)
{
	opAVX_X_X_XM(xm7, x, x, MM_0F | PP_66, 0x73, false);
	db(imm);
}
void vpsrldq(const Xmm& x1, const Xmm& x2, uint8 imm)
{
	opAVX_X_X_XM(xm3, x1, x2, MM_0F | PP_66, 0x73, false);
	db(imm);
}
void vpsrldq(const Xmm& x, uint8 imm)
{
	opAVX_X_X_XM(xm3, x, x, MM_0F | PP_66, 0x73, false);
	db(imm);
}
// Shifts by an immediate count. Opcode 0x71 is the word form, 0x72 the dword
// form and 0x73 the qword form. The fixed first register selects the
// operation: xm6 for logical left, xm4 for arithmetic right, xm2 for logical
// right. The one-register overloads shift in place. The count is the trailing
// byte.
void vpsllw(const Xmm& x1, const Xmm& x2, uint8 imm)
{
	opAVX_X_X_XM(xm6, x1, x2, MM_0F | PP_66, 0x71, false);
	db(imm);
}
void vpsllw(const Xmm& x, uint8 imm)
{
	opAVX_X_X_XM(xm6, x, x, MM_0F | PP_66, 0x71, false);
	db(imm);
}
void vpslld(const Xmm& x1, const Xmm& x2, uint8 imm)
{
	opAVX_X_X_XM(xm6, x1, x2, MM_0F | PP_66, 0x72, false);
	db(imm);
}
void vpslld(const Xmm& x, uint8 imm)
{
	opAVX_X_X_XM(xm6, x, x, MM_0F | PP_66, 0x72, false);
	db(imm);
}
void vpsllq(const Xmm& x1, const Xmm& x2, uint8 imm)
{
	opAVX_X_X_XM(xm6, x1, x2, MM_0F | PP_66, 0x73, false);
	db(imm);
}
void vpsllq(const Xmm& x, uint8 imm)
{
	opAVX_X_X_XM(xm6, x, x, MM_0F | PP_66, 0x73, false);
	db(imm);
}
void vpsraw(const Xmm& x1, const Xmm& x2, uint8 imm)
{
	opAVX_X_X_XM(xm4, x1, x2, MM_0F | PP_66, 0x71, false);
	db(imm);
}
void vpsraw(const Xmm& x, uint8 imm)
{
	opAVX_X_X_XM(xm4, x, x, MM_0F | PP_66, 0x71, false);
	db(imm);
}
void vpsrad(const Xmm& x1, const Xmm& x2, uint8 imm)
{
	opAVX_X_X_XM(xm4, x1, x2, MM_0F | PP_66, 0x72, false);
	db(imm);
}
void vpsrad(const Xmm& x, uint8 imm)
{
	opAVX_X_X_XM(xm4, x, x, MM_0F | PP_66, 0x72, false);
	db(imm);
}
void vpsrlw(const Xmm& x1, const Xmm& x2, uint8 imm)
{
	opAVX_X_X_XM(xm2, x1, x2, MM_0F | PP_66, 0x71, false);
	db(imm);
}
void vpsrlw(const Xmm& x, uint8 imm)
{
	opAVX_X_X_XM(xm2, x, x, MM_0F | PP_66, 0x71, false);
	db(imm);
}
void vpsrld(const Xmm& x1, const Xmm& x2, uint8 imm)
{
	opAVX_X_X_XM(xm2, x1, x2, MM_0F | PP_66, 0x72, false);
	db(imm);
}
void vpsrld(const Xmm& x, uint8 imm)
{
	opAVX_X_X_XM(xm2, x, x, MM_0F | PP_66, 0x72, false);
	db(imm);
}
void vpsrlq(const Xmm& x1, const Xmm& x2, uint8 imm)
{
	opAVX_X_X_XM(xm2, x1, x2, MM_0F | PP_66, 0x73, false);
	db(imm);
}
void vpsrlq(const Xmm& x, uint8 imm)
{
	opAVX_X_X_XM(xm2, x, x, MM_0F | PP_66, 0x73, false);
	db(imm);
}
// Variable blends. The selector register x4 is not an operand of the helper;
// its index, shifted left by four, is emitted as the trailing byte. The
// three-argument overloads use x1 as both destination and first source.
void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4)
{
	opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4B, true);
	db(x4.getIdx() << 4);
}
void vblendvpd(const Xmm& x1, const Operand& op, const Xmm& x4)
{
	opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4B, true);
	db(x4.getIdx() << 4);
}
void vblendvps(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4)
{
	opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4A, true);
	db(x4.getIdx() << 4);
}
void vblendvps(const Xmm& x1, const Operand& op, const Xmm& x4)
{
	opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4A, true);
	db(x4.getIdx() << 4);
}
void vpblendvb(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4)
{
	opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4C, false);
	db(x4.getIdx() << 4);
}
void vpblendvb(const Xmm& x1, const Operand& op, const Xmm& x4)
{
	opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4C, false);
	db(x4.getIdx() << 4);
}
// vmovd: 0x6E moves into the vector register, 0x7E moves out of it. In all
// four overloads the vector register is the helper's first argument; a
// general-purpose register is wrapped in an Xmm carrying the same index.
void vmovd(const Xmm& x, const Reg32& reg)
{
	opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x6E, false, 0);
}
void vmovd(const Xmm& x, const Address& addr)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x6E, false, 0);
}
void vmovd(const Reg32& reg, const Xmm& x)
{
	opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x7E, false, 0);
}
void vmovd(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x7E, false, 0);
}
// vmovhlps / vmovlhps: register-only moves. The optional third operand, when
// given, must be an XMM register; anything else throws ERR_BAD_COMBINATION.
void vmovhlps(const Xmm& x1, const Xmm& x2, const Operand& op = Operand())
{
	if (!(op.isNone() || op.isXMM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x1, x2, op, MM_0F, 0x12, false);
}
void vmovlhps(const Xmm& x1, const Xmm& x2, const Operand& op = Operand())
{
	if (!(op.isNone() || op.isXMM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x1, x2, op, MM_0F, 0x16, false);
}
// vmovmskpd / vmovmskps: the destination must be an i32e-sized register,
// otherwise ERR_BAD_COMBINATION is thrown. Its index is wrapped in an Xmm or
// Ymm to match the width of the source, and the middle operand is xm0 or ym0
// accordingly.
void vmovmskpd(const Reg& r, const Xmm& x)
{
	if (!r.isBit(i32e)) throw ERR_BAD_COMBINATION;
	const bool is128 = x.isXMM();
	opAVX_X_X_XM(is128 ? Xmm(r.getIdx()) : Ymm(r.getIdx()), is128 ? xm0 : ym0, x, MM_0F | PP_66, 0x50, true, 0);
}
void vmovmskps(const Reg& r, const Xmm& x)
{
	if (!r.isBit(i32e)) throw ERR_BAD_COMBINATION;
	const bool is128 = x.isXMM();
	opAVX_X_X_XM(is128 ? Xmm(r.getIdx()) : Ymm(r.getIdx()), is128 ? xm0 : ym0, x, MM_0F, 0x50, true, 0);
}
// vmovnt* moves. The three store forms pick xm0 or ym0 as the middle operand
// to match the source register's width. The vmovntdqa load passes xm0 and
// `false` for the helper's bool flag.
void vmovntdq(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F | PP_66, 0xE7, true);
}
void vmovntpd(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F | PP_66, 0x2B, true);
}
void vmovntps(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F, 0x2B, true);
}
void vmovntdqa(const Xmm& x, const Address& addr)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F38 | PP_66, 0x2A, false);
}
void vmovsd(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F2, 0x10, false); }
void vmovsd(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F2, 0x10, false); }
void vmovsd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F2, 0x11, false); }
void vmovss(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F3, 0x10, false); }
void vmovss(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F3, 0x10, false); }
void vmovss(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F3, 0x11, false); }
// Scalar float -> 32-bit integer conversions. 0x2D is the plain conversion and
// 0x2C the "t" variant; ss uses PP_F3 and sd uses PP_F2. The destination
// register's index is wrapped in an Xmm for the helper, and the last argument
// is 0 in these Reg32 overloads.
void vcvtss2si(const Reg32& r, const Operand& op)
{
	opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2D, false, 0);
}
void vcvttss2si(const Reg32& r, const Operand& op)
{
	opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2C, false, 0);
}
void vcvtsd2si(const Reg32& r, const Operand& op)
{
	opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2D, false, 0);
}
void vcvttsd2si(const Reg32& r, const Operand& op)
{
	opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2C, false, 0);
}
void vcvtsi2ss(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !(op2.isREG(i32e) || op2.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, op1, cvtReg(op2, op2.isREG(), Operand::XMM), MM_0F | PP_F3, 0x2A, false, (op1.isMEM() || op2.isMEM()) ? -1 : (op1.isREG(32) || op2.isREG(32)) ? 0 : 1); }
void vcvtsi2sd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !(op2.isREG(i32e) || op2.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, op1, cvtReg(op2, op2.isREG(), Operand::XMM), MM_0F | PP_F2, 0x2A, false, (op1.isMEM() || op2.isMEM()) ? -1 : (op1.isREG(32) || op2.isREG(32)) ? 0 : 1); }
void vcvtps2pd(const Xmm& x, const Operand& op) { if (!op.isMEM() && !op.isXMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, cvtReg(op, !op.isMEM(), x.isXMM() ? Operand::XMM : Operand::YMM), MM_0F, 0x5A, true); }
void vcvtdq2pd(const Xmm& x, const Operand& op) { if (!op.isMEM() && !op.isXMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, cvtReg(op, !op.isMEM(), x.isXMM() ? Operand::XMM : Operand::YMM), MM_0F | PP_F3, 0xE6, true); }
void vcvtpd2ps(const Xmm& x, const Operand& op) { if (x.isYMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(op.isYMM() ? Ymm(x.getIdx()) : x, op.isYMM() ? ym0 : xm0, op, MM_0F | PP_66, 0x5A, true); }
void vcvtpd2dq(const Xmm& x, const Operand& op) { if (x.isYMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(op.isYMM() ? Ymm(x.getIdx()) : x, op.isYMM() ? ym0 : xm0, op, MM_0F | PP_F2, 0xE6, true); }
void vcvttpd2dq(const Xmm& x, const Operand& op) { if (x.isYMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(op.isYMM() ? Ymm(x.getIdx()) : x, op.isYMM() ? ym0 : xm0, op, MM_0F | PP_66, 0xE6, true); }
#ifdef XBYAK64
// vmovq (only built when XBYAK64 is defined).
// The Reg64 overloads use 0x6E / 0x7E with PP_66 and pass 1 as the last
// argument. The memory-load and register-to-register forms use PP_F3 with
// 0x7E, and the memory store uses PP_66 with 0xD6.
void vmovq(const Xmm& x, const Reg64& reg)
{
	opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x6E, false, 1);
}
void vmovq(const Xmm& x, const Address& addr)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F3, 0x7E, false, -1);
}
void vmovq(const Reg64& reg, const Xmm& x)
{
	opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x7E, false, 1);
}
void vmovq(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0xD6, false, -1);
}
void vmovq(const Xmm& x1, const Xmm& x2)
{
	opAVX_X_X_XM(x1, xm0, x2, MM_0F | PP_F3, 0x7E, false, -1);
}
// 64-bit element extract / insert. These use the same opcodes as the dword
// forms (0x16 / 0x22) but pass 1 as the helper's last argument. The
// general-purpose operand must be a 64-bit register or memory, otherwise
// ERR_BAD_COMBINATION is thrown. The immediate byte is emitted last.
void vpextrq(const Operand& op, const Xmm& x, uint8 imm)
{
	if (!(op.isREG(64) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, xm0, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x16, false, 1);
	db(imm);
}
void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm)
{
	if (!(op.isREG(64) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x1, x2, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x22, false, 1);
	db(imm);
}
void vpinsrq(const Xmm& x, const Operand& op, uint8 imm)
{
	if (!(op.isREG(64) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, x, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x22, false, 1);
	db(imm);
}
// Scalar float -> 64-bit integer conversions. These use the same opcodes and
// flags as the Reg32 overloads, but the helper's last argument is 1.
void vcvtss2si(const Reg64& r, const Operand& op)
{
	opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2D, false, 1);
}
void vcvttss2si(const Reg64& r, const Operand& op)
{
	opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2C, false, 1);
}
void vcvtsd2si(const Reg64& r, const Operand& op)
{
	opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2D, false, 1);
}
void vcvttsd2si(const Reg64& r, const Operand& op)
{
	opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2C, false, 1);
}
#endif

View File

@ -2,9 +2,10 @@
#define XBYAK_XBYAK_UTIL_H_
/**
utility class for Xbyak
@note this header is under construction
utility class and functions for Xbyak
@note this header is UNDER CONSTRUCTION!
*/
#include "xbyak/xbyak.h"
#ifdef _WIN32
#if (_MSC_VER < 1400) && defined(XBYAK32)
@ -29,10 +30,17 @@
#include <intrin.h> // for __cpuid
#endif
#else
#if __GNUC_PREREQ(4, 3)
#ifndef __GNUC_PREREQ
// Fallback for C libraries that do not provide __GNUC_PREREQ.
// Evaluates to true when the compiler reports a GCC-compatible version of at
// least major.minor. The previous fallback expanded to a constant non-zero
// value, so every "#if __GNUC_PREREQ(...)" test succeeded regardless of the
// compiler version.
#define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
#endif
#if __GNUC_PREREQ(4, 3) && !defined(__APPLE__)
#include <cpuid.h>
#else
#define __cpuid(eaxIn, a, b, c, d) __asm__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
#if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm'
// ebx cannot be named as an output here (see the error above), so it is saved
// and restored around cpuid and its value is returned through esi ("=S").
// The copy must therefore read ebx; the previous text copied ebp, which handed
// the frame pointer back as the b output.
#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
#else
// generic form: cpuid outputs are bound directly to eax/ebx/ecx/edx
#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
#endif
#endif
#endif
@ -43,6 +51,10 @@ namespace Xbyak { namespace util {
*/
class Cpu {
unsigned int type_;
/**
	pack the four bytes x[0..3] into one 32-bit value
	@param x pointer to at least four readable bytes
	@return x[0] | x[1] << 8 | x[2] << 16 | x[3] << 24
	@note despite the "BE" in the name, x[0] ends up in the least significant byte
*/
unsigned int get32bitAsBE(const char *x) const
{
	// Go through unsigned char so that a byte >= 0x80 is neither sign-extended
	// by integer promotion nor shifted into the sign bit of int.
	unsigned int v = 0;
	for (int i = 3; i >= 0; i--) {
		v = (v << 8) | static_cast<unsigned int>(static_cast<unsigned char>(x[i]));
	}
	return v;
}
public:
static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
{
@ -64,11 +76,17 @@ public:
tSSE41 = 1 << 7,
tSSE42 = 1 << 8,
tPOPCNT = 1 << 9,
tAESNI = 1 << 10,
tSSE5 = 1 << 11,
tOSXSACE = 1 << 12,
tPCLMULQDQ = 1 << 13,
tAVX = 1 << 14,
tFMA = 1 << 15,
t3DN = 1 << 16,
tE3DN = 1 << 17,
tSSE4a = 1 << 18,
tSSE5 = 1 << 11,
tRDTSCP = 1 << 19,
tINTEL = 1 << 24,
tAMD = 1 << 25
@ -80,28 +98,39 @@ public:
getCpuid(0, data);
static const char intel[] = "ntel";
static const char amd[] = "cAMD";
if (data[2] == *reinterpret_cast<const unsigned int*>(amd)) {
if (data[2] == get32bitAsBE(amd)) {
type_ |= tAMD;
getCpuid(0x80000001, data);
if (data[3] & (1 << 31)) type_ |= t3DN;
if (data[3] & (1 << 15)) type_ |= tCMOV;
if (data[3] & (1 << 30)) type_ |= tE3DN;
if (data[3] & (1 << 22)) type_ |= tMMX2;
if (data[3] & (1U << 31)) type_ |= t3DN;
if (data[3] & (1U << 15)) type_ |= tCMOV;
if (data[3] & (1U << 30)) type_ |= tE3DN;
if (data[3] & (1U << 22)) type_ |= tMMX2;
if (data[3] & (1U << 27)) type_ |= tRDTSCP;
}
if (data[2] == *reinterpret_cast<const unsigned int*>(intel)) {
if (data[2] == get32bitAsBE(intel)) {
type_ |= tINTEL;
getCpuid(0x80000001, data);
if (data[3] & (1U << 27)) type_ |= tRDTSCP;
}
getCpuid(1, data);
if (data[2] & (1 << 0)) type_ |= tSSE3;
if (data[2] & (1 << 9)) type_ |= tSSSE3;
if (data[2] & (1 << 19)) type_ |= tSSE41;
if (data[2] & (1 << 20)) type_ |= tSSE42;
if (data[2] & (1 << 23)) type_ |= tPOPCNT;
if (data[3] & (1 << 15)) type_ |= tCMOV;
if (data[3] & (1 << 23)) type_ |= tMMX;
if (data[3] & (1 << 25)) type_ |= tMMX2 | tSSE;
if (data[3] & (1 << 26)) type_ |= tSSE2;
if (data[2] & (1U << 0)) type_ |= tSSE3;
if (data[2] & (1U << 9)) type_ |= tSSSE3;
if (data[2] & (1U << 19)) type_ |= tSSE41;
if (data[2] & (1U << 20)) type_ |= tSSE42;
if (data[2] & (1U << 23)) type_ |= tPOPCNT;
if (data[2] & (1U << 25)) type_ |= tAESNI;
if (data[2] & (1U << 1)) type_ |= tPCLMULQDQ;
if (data[2] & (1U << 27)) type_ |= tOSXSACE;
#if _M_SSE >= 0x500
// QQQ
// should check XFEATURE_ENABLED_MASK[2:1] = '11b' by xgetbv
if (data[2] & (1U << 28)) type_ |= tAVX;
if (data[2] & (1U << 12)) type_ |= tFMA;
#endif
if (data[3] & (1U << 15)) type_ |= tCMOV;
if (data[3] & (1U << 23)) type_ |= tMMX;
if (data[3] & (1U << 25)) type_ |= tMMX2 | tSSE;
if (data[3] & (1U << 26)) type_ |= tSSE2;
}
bool has(Type type) const
{
@ -109,6 +138,40 @@ public:
}
};
class Clock {
public:
static inline uint64 getRdtsc()
{
#ifdef _MSC_VER
return __rdtsc();
#else
unsigned int eax, edx;
__asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
return ((uint64)edx << 32) | eax;
#endif
}
Clock()
: clock_(0)
, count_(0)
{
}
void begin()
{
clock_ -= getRdtsc();
}
void end()
{
clock_ += getRdtsc();
count_++;
}
int getCount() const { return count_; }
uint64 getClock() const { return clock_; }
void clear() { count_ = 0; clock_ = 0; }
private:
uint64 clock_;
int count_;
};
#ifdef XBYAK32
namespace local {
@ -133,53 +196,47 @@ XBYAK_LOCAL_DEFINE_SET_EIP_TO_REG(ebp)
#undef XBYAK_LOCAL_DEFINE_SET_EIP_TO_REG
} // end of local
template<class Gen>
struct EnableSetEip : public Gen {
EnableSetEip(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void *userPtr = 0)
: Gen(maxSize, userPtr)
{
}
/**
get pid to out register
@note out = eax or ecx or edx
*/
void setEipTo(const Xbyak::Reg32& out)
{
/**
get eip to out register
@note out is not esp
*/
template<class T>
void setEipTo(T *self, const Xbyak::Reg32& out)
{
#if 0
Gen::call(Gen::getCurr() + 5);
Gen::pop(out);
self->call("@f");
self->L("@@");
self->pop(out);
#else
int idx = out.getIdx();
switch (idx) {
case Xbyak::Operand::EAX:
Gen::call((void*)local::set_eip_to_eax);
self->call((void*)local::set_eip_to_eax);
break;
case Xbyak::Operand::ECX:
Gen::call((void*)local::set_eip_to_ecx);
self->call((void*)local::set_eip_to_ecx);
break;
case Xbyak::Operand::EDX:
Gen::call((void*)local::set_eip_to_edx);
self->call((void*)local::set_eip_to_edx);
break;
case Xbyak::Operand::EBX:
Gen::call((void*)local::set_eip_to_ebx);
self->call((void*)local::set_eip_to_ebx);
break;
case Xbyak::Operand::ESI:
Gen::call((void*)local::set_eip_to_esi);
self->call((void*)local::set_eip_to_esi);
break;
case Xbyak::Operand::EDI:
Gen::call((void*)local::set_eip_to_edi);
self->call((void*)local::set_eip_to_edi);
break;
case Xbyak::Operand::EBP:
Gen::call((void*)local::set_eip_to_ebp);
self->call((void*)local::set_eip_to_ebp);
break;
default:
assert(0);
}
#endif
}
};
}
#endif
} } // end of util
#endif