Mostly code cleanups, XBYAK 2.99, VEX conversion for the sw renderer (3-5% faster), GSState::Move fix for dark cloud 2 invention crash.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4287 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2011-02-07 01:59:05 +00:00
parent e2d36a53a4
commit ca7abd983a
56 changed files with 6404 additions and 3150 deletions

View File

@ -193,6 +193,7 @@ static const int __pagesize = PCSX2_PAGESIZE;
# define __aligned(alig) __declspec(align(alig)) # define __aligned(alig) __declspec(align(alig))
# define __aligned16 __declspec(align(16)) # define __aligned16 __declspec(align(16))
# define __aligned32 __declspec(align(32))
# define __pagealigned __declspec(align(PCSX2_PAGESIZE)) # define __pagealigned __declspec(align(PCSX2_PAGESIZE))
// Deprecated; use __align instead. // Deprecated; use __align instead.

View File

@ -153,7 +153,7 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
{ {
GSDevice* dev = NULL; GSDevice* dev = NULL;
if( renderer == -1 ) if(renderer == -1)
{ {
renderer = theApp.GetConfig("renderer", 0); renderer = theApp.GetConfig("renderer", 0);
} }
@ -167,6 +167,7 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
// GSopen call then they'll get corrupted graphics, but that's not my problem. // GSopen call then they'll get corrupted graphics, but that's not my problem.
delete s_gs; delete s_gs;
s_gs = NULL; s_gs = NULL;
} }
@ -178,20 +179,25 @@ static INT32 _GSopen(void* dsp, char* title, int renderer)
case 12: case 13: new GSDeviceNull(); break; case 12: case 13: new GSDeviceNull(); break;
} }
if( !dev ) return -1; if(!dev) return -1;
if( !s_gs ) if(!s_gs)
{ {
switch(renderer) switch(renderer)
{ {
default: default:
case 0: s_gs = new GSRendererDX9(); break; case 0:
case 3: s_gs = new GSRendererDX11(); break; s_gs = new GSRendererDX9();
break;
case 3:
s_gs = new GSRendererDX11();
break;
case 2: case 5: case 8: case 11: case 13: case 2: case 5: case 8: case 11: case 13:
s_gs = new GSRendererNull(); break; s_gs = new GSRendererNull();
break;
case 1: case 4: case 7: case 10: case 12: case 1: case 4: case 7: case 10: case 12:
s_gs = new GSRendererSW(); break; s_gs = new GSRendererSW();
break;
} }
s_renderer = renderer; s_renderer = renderer;
@ -519,72 +525,6 @@ EXPORT_C GSsetFrameLimit(int limit)
#ifdef _WINDOWS #ifdef _WINDOWS
// Replays one packet from an open GS dump file.
//
// Packet forms, as parsed below (a leading type byte selects the form):
//   0 - GIF transfer: one path index byte, a 4-byte size, then size bytes of data
//       (index 0 -> GSgifTransfer1, 1 -> GSgifTransfer2, 2 -> GSgifTransfer3)
//   1 - vsync: one byte handed to GSvsync
//   2 - FIFO read: a 4-byte size
//   3 - register block: 0x2000 bytes read into regs
// Reaching EOF on the type byte seeks back to start so the dump repeats.
//
// hWnd  - window polled with IsWindowVisible at EOF and after each vsync
// regs  - destination of type 3 packets
// buff  - scratch buffer reused between calls; it is only ever grown here
// fp    - dump file, positioned at a packet boundary
// start - file offset to seek back to at EOF
//
// Returns false if the window's been closed or an invalid packet was encountered
// (unknown type or path index, truncated header/payload, oversized path 1 transfer).
static __forceinline bool LoopDatPacket_Thingamajig(HWND hWnd, uint8 (&regs)[0x2000], vector<uint8>& buff, FILE* fp, long start)
{
	switch(fgetc(fp))
	{
	case EOF:
		fseek(fp, start, SEEK_SET);
		return !!IsWindowVisible(hWnd);

	case 0:
		{
			int index = fgetc(fp);
			uint32 size = 0;

			// A header cut short by EOF must not be interpreted with an uninitialised size.
			if(index == EOF || fread(&size, 4, 1, fp) != 1)
			{
				return false;
			}

			switch(index)
			{
			case 0:
				{
					// The data is stored at the tail of a 0x4000 byte buffer. A larger size
					// would wrap the unsigned subtraction and make fread write out of bounds.
					if(size > 0x4000) return false;

					if(buff.size() < 0x4000) buff.resize(0x4000);

					uint32 addr = 0x4000 - size;

					if(size > 0 && fread(&buff[0] + addr, size, 1, fp) != 1) return false;

					GSgifTransfer1(&buff[0], addr);
				}
				break;

			case 1:
				if(buff.size() < size) buff.resize(size);
				if(size > 0 && fread(&buff[0], size, 1, fp) != 1) return false;
				GSgifTransfer2(&buff[0], size / 16);
				break;

			case 2:
				if(buff.size() < size) buff.resize(size);
				if(size > 0 && fread(&buff[0], size, 1, fp) != 1) return false;
				GSgifTransfer3(&buff[0], size / 16);
				break;

			default:
				// The payload of an unknown path would be left unread and every
				// following byte misparsed, so treat it as an invalid packet.
				return false;
			}
		}
		break;

	case 1:
		{
			int field = fgetc(fp);

			if(field == EOF) return false;

			GSvsync(field);
		}
		return !!IsWindowVisible(hWnd);

	case 2:
		{
			uint32 size = 0;

			if(fread(&size, 4, 1, fp) != 1) return false;

			if(buff.size() < size) buff.resize(size);

			GSreadFIFO2(&buff[0], size / 16);
		}
		break;

	case 3:
		if(fread(regs, 0x2000, 1, fp) != 1) return false;
		break;

	default:
		return false;
	}

	return true;
}
// lpszCmdLine: // lpszCmdLine:
// First parameter is the renderer. // First parameter is the renderer.
// Second parameter is the gs file to load and run. // Second parameter is the gs file to load and run.
@ -634,7 +574,73 @@ EXPORT_C GSReplay(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow)
GSvsync(1); GSvsync(1);
while( LoopDatPacket_Thingamajig(hWnd, regs, buff, fp, start) ) ; bool exit = false;
while(!exit)
{
uint32 index;
uint32 size;
uint32 addr;
int pos;
switch(fgetc(fp))
{
case EOF:
fseek(fp, start, 0);
exit = !IsWindowVisible(hWnd);
break;
case 0:
index = fgetc(fp);
fread(&size, 4, 1, fp);
switch(index)
{
case 0:
if(buff.size() < 0x4000) buff.resize(0x4000);
addr = 0x4000 - size;
fread(buff.data() + addr, size, 1, fp);
GSgifTransfer1(buff.data(), addr);
break;
case 1:
if(buff.size() < size) buff.resize(size);
fread(buff.data(), size, 1, fp);
GSgifTransfer2(buff.data(), size / 16);
break;
case 2:
if(buff.size() < size) buff.resize(size);
fread(buff.data(), size, 1, fp);
GSgifTransfer3(buff.data(), size / 16);
break;
case 3:
if(buff.size() < size) buff.resize(size);
fread(buff.data(), size, 1, fp);
GSgifTransfer(buff.data(), size / 16);
break;
}
break;
case 1:
GSvsync(fgetc(fp));
exit = !IsWindowVisible(hWnd);
break;
case 2:
fread(&size, 4, 1, fp);
if(buff.size() < size) buff.resize(size);
GSreadFIFO2(&buff[0], size / 16);
break;
case 3:
fread(regs, 0x2000, 1, fp);
break;
}
}
GSclose(); GSclose();
GSshutdown(); GSshutdown();
@ -672,7 +678,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
{PSM_PSMZ16S, "16ZS"}, {PSM_PSMZ16S, "16ZS"},
}; };
uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 16); uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);
for(int i = 0; i < 1024 * 1024 * 4; i++) ptr[i] = (uint8)i; for(int i = 0; i < 1024 * 1024 * 4; i++) ptr[i] = (uint8)i;
@ -809,7 +815,7 @@ EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow
{ {
GSLocalMemory mem; GSLocalMemory mem;
uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 16); uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);
for(int i = 0; i < 1024 * 1024 * 4; i++) ptr[i] = (uint8)i; for(int i = 0; i < 1024 * 1024 * 4; i++) ptr[i] = (uint8)i;

View File

@ -77,6 +77,7 @@ enum GIF_REG
GIF_REG_CLAMP_1 = 0x08, GIF_REG_CLAMP_1 = 0x08,
GIF_REG_CLAMP_2 = 0x09, GIF_REG_CLAMP_2 = 0x09,
GIF_REG_FOG = 0x0a, GIF_REG_FOG = 0x0a,
GIF_REG_INVALID = 0x0b,
GIF_REG_XYZF3 = 0x0c, GIF_REG_XYZF3 = 0x0c,
GIF_REG_XYZ3 = 0x0d, GIF_REG_XYZ3 = 0x0d,
GIF_REG_A_D = 0x0e, GIF_REG_A_D = 0x0e,
@ -1077,7 +1078,7 @@ REG128_SET(GIFPackedReg)
GIFPackedNOP NOP; GIFPackedNOP NOP;
REG_SET_END REG_SET_END
__aligned16 struct GIFPath __aligned32 struct GIFPath
{ {
GIFTag tag; GIFTag tag;
uint32 reg; uint32 reg;
@ -1107,8 +1108,11 @@ __aligned16 struct GIFPath
if((++reg & 0xf) == nreg) if((++reg & 0xf) == nreg)
{ {
reg = 0; reg = 0;
if(--nloop == 0) if(--nloop == 0)
{
return false; return false;
}
} }
return true; return true;

View File

@ -1201,7 +1201,7 @@ public:
#else #else
/* /*
__aligned16 uint32 block[8 * 8]; __aligned32 uint32 block[8 * 8];
UnpackBlock4HL(src, srcpitch, block); UnpackBlock4HL(src, srcpitch, block);
@ -1316,7 +1316,7 @@ public:
#else #else
/* /*
__aligned16 uint32 block[8 * 8]; __aligned32 uint32 block[8 * 8];
UnpackBlock4HH(src, srcpitch, block); UnpackBlock4HH(src, srcpitch, block);
@ -1467,7 +1467,7 @@ public:
#else #else
__aligned16 uint8 block[16 * 16]; __aligned32 uint8 block[16 * 16];
ReadBlock8<true>(src, (uint8*)block, sizeof(block) / 16); ReadBlock8<true>(src, (uint8*)block, sizeof(block) / 16);
@ -1542,7 +1542,7 @@ public:
#else #else
__aligned16 uint8 block[(32 / 2) * 16]; __aligned32 uint8 block[(32 / 2) * 16];
ReadBlock4<true>(src, (uint8*)block, sizeof(block) / 16); ReadBlock4<true>(src, (uint8*)block, sizeof(block) / 16);
@ -1583,7 +1583,7 @@ public:
#else #else
__aligned16 uint32 block[8 * 8]; __aligned32 uint32 block[8 * 8];
ReadBlock32<true>(src, (uint8*)block, sizeof(block) / 8); ReadBlock32<true>(src, (uint8*)block, sizeof(block) / 8);
@ -1624,7 +1624,7 @@ public:
#else #else
__aligned16 uint32 block[8 * 8]; __aligned32 uint32 block[8 * 8];
ReadBlock32<true>(src, (uint8*)block, sizeof(block) / 8); ReadBlock32<true>(src, (uint8*)block, sizeof(block) / 8);
@ -1665,7 +1665,7 @@ public:
#else #else
__aligned16 uint32 block[8 * 8]; __aligned32 uint32 block[8 * 8];
ReadBlock32<true>(src, (uint8*)block, sizeof(block) / 8); ReadBlock32<true>(src, (uint8*)block, sizeof(block) / 8);

View File

@ -68,7 +68,8 @@ void GSCaptureDlg::OnInit()
ComboBoxAppend(IDC_CODECS, "Uncompressed", 0, true); ComboBoxAppend(IDC_CODECS, "Uncompressed", 0, true);
CoInitialize(0); CoInitialize(0); // this is obviously wrong here, each thread should call this on start, and where is CoUninitialize?
BeginEnumSysDev(CLSID_VideoCompressorCategory, moniker) BeginEnumSysDev(CLSID_VideoCompressorCategory, moniker)
{ {
Codec c; Codec c;
@ -195,6 +196,7 @@ bool GSCaptureDlg::OnCommand(HWND hWnd, UINT id, UINT code)
if (ris != 2) if (ris != 2)
{ {
wstring s = wstring(c.DisplayName.m_str); wstring s = wstring(c.DisplayName.m_str);
theApp.SetConfig("CaptureVideoCodecDisplayName", string(s.begin(), s.end()).c_str()); theApp.SetConfig("CaptureVideoCodecDisplayName", string(s.begin(), s.end()).c_str());
} }
else else

View File

@ -126,7 +126,7 @@ void GSClut::Write(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
void GSClut::WriteCLUT32_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT) void GSClut::WriteCLUT32_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
ASSERT(TEX0.CSA == 0); ASSERT(TEX0.CSA == 0);
@ -135,7 +135,7 @@ void GSClut::WriteCLUT32_I8_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TE
void GSClut::WriteCLUT32_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT) void GSClut::WriteCLUT32_I4_CSM1(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
ASSERT(TEX0.CSA < 16); ASSERT(TEX0.CSA < 16);

View File

@ -28,7 +28,7 @@
class GSLocalMemory; class GSLocalMemory;
__aligned16 class GSClut : public GSAlignedClass<16> __aligned32 class GSClut : public GSAlignedClass<32>
{ {
GSLocalMemory* m_mem; GSLocalMemory* m_mem;
@ -37,7 +37,7 @@ __aligned16 class GSClut : public GSAlignedClass<16>
uint32* m_buff32; uint32* m_buff32;
uint64* m_buff64; uint64* m_buff64;
__aligned16 struct WriteState __aligned32 struct WriteState
{ {
GIFRegTEX0 TEX0; GIFRegTEX0 TEX0;
GIFRegTEXCLUT TEXCLUT; GIFRegTEXCLUT TEXCLUT;
@ -45,7 +45,7 @@ __aligned16 class GSClut : public GSAlignedClass<16>
bool IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT); bool IsDirty(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT);
} m_write; } m_write;
__aligned16 struct ReadState __aligned32 struct ReadState
{ {
GIFRegTEX0 TEX0; GIFRegTEX0 TEX0;
GIFRegTEXA TEXA; GIFRegTEXA TEXA;

View File

@ -145,8 +145,11 @@ void GSDevice::Recycle(GSTexture* t)
if(t) if(t)
{ {
t->last_frame_used = m_frame; t->last_frame_used = m_frame;
m_pool.push_front(t); m_pool.push_front(t);
//printf("%d\n",m_pool.size()); //printf("%d\n",m_pool.size());
while(m_pool.size() > 300) while(m_pool.size() > 300)
{ {
delete m_pool.back(); delete m_pool.back();
@ -159,9 +162,11 @@ void GSDevice::Recycle(GSTexture* t)
void GSDevice::AgePool() void GSDevice::AgePool()
{ {
m_frame++; m_frame++;
while (m_pool.size() > 20 && m_frame - m_pool.back()->last_frame_used > 10)
while(m_pool.size() > 20 && m_frame - m_pool.back()->last_frame_used > 10)
{ {
delete m_pool.back(); delete m_pool.back();
m_pool.pop_back(); m_pool.pop_back();
} }
} }

View File

@ -46,7 +46,7 @@ struct InterlaceConstantBuffer
#pragma pack(pop) #pragma pack(pop)
class GSDevice : public GSAlignedClass<16> class GSDevice : public GSAlignedClass<32>
{ {
list<GSTexture*> m_pool; list<GSTexture*> m_pool;
@ -66,7 +66,7 @@ protected:
struct {size_t stride, start, count, limit;} m_vertices; struct {size_t stride, start, count, limit;} m_vertices;
uint32 m_msaa; uint32 m_msaa;
DXGI_SAMPLE_DESC m_msaa_desc; DXGI_SAMPLE_DESC m_msaa_desc;
unsigned m_frame; // for ageing the pool unsigned int m_frame; // for ageing the pool
virtual GSTexture* Create(int type, int w, int h, bool msaa, int format) = 0; virtual GSTexture* Create(int type, int w, int h, bool msaa, int format) = 0;

View File

@ -229,8 +229,10 @@ bool GSDevice11::Create(GSWnd* wnd)
} }
} }
if (m_msaa_desc.Count == 1) if(m_msaa_desc.Count == 1)
{
m_msaa = 0; m_msaa = 0;
}
// convert // convert
@ -378,7 +380,7 @@ bool GSDevice11::Create(GSWnd* wnd)
if(m_wnd->IsManaged()) if(m_wnd->IsManaged())
{ {
SetExclusive( !theApp.GetConfig("windowed", 1) ); SetExclusive(!theApp.GetConfig("windowed", 1));
} }
return true; return true;
@ -392,11 +394,14 @@ bool GSDevice11::Reset(int w, int h)
if(m_swapchain) if(m_swapchain)
{ {
DXGI_SWAP_CHAIN_DESC scd; DXGI_SWAP_CHAIN_DESC scd;
memset(&scd, 0, sizeof(scd)); memset(&scd, 0, sizeof(scd));
m_swapchain->GetDesc(&scd); m_swapchain->GetDesc(&scd);
m_swapchain->ResizeBuffers(scd.BufferCount, w, h, scd.BufferDesc.Format, 0); m_swapchain->ResizeBuffers(scd.BufferCount, w, h, scd.BufferDesc.Format, 0);
CComPtr<ID3D11Texture2D> backbuffer; CComPtr<ID3D11Texture2D> backbuffer;
if(FAILED(m_swapchain->GetBuffer(0, __uuidof(ID3D11Texture2D), (void**)&backbuffer))) if(FAILED(m_swapchain->GetBuffer(0, __uuidof(ID3D11Texture2D), (void**)&backbuffer)))
{ {
return false; return false;
@ -422,9 +427,12 @@ void GSDevice11::SetExclusive(bool isExcl)
m_swapchain->ResizeTarget(&desc); m_swapchain->ResizeTarget(&desc);
*/ */
HRESULT hr = m_swapchain->SetFullscreenState( isExcl, NULL ); HRESULT hr = m_swapchain->SetFullscreenState(isExcl, NULL);
if(hr == DXGI_ERROR_NOT_CURRENTLY_AVAILABLE) if(hr == DXGI_ERROR_NOT_CURRENTLY_AVAILABLE)
{
fprintf(stderr, "(GSdx10) SetExclusive(%s) failed; request unavailable.", isExcl ? "true" : "false"); fprintf(stderr, "(GSdx10) SetExclusive(%s) failed; request unavailable.", isExcl ? "true" : "false");
}
} }
void GSDevice11::Flip() void GSDevice11::Flip()
@ -885,10 +893,13 @@ void GSDevice11::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1)
void GSDevice11::PSSetShaderResource(int i, GSTexture* sr) void GSDevice11::PSSetShaderResource(int i, GSTexture* sr)
{ {
ID3D11ShaderResourceView* srv = NULL; ID3D11ShaderResourceView* srv = NULL;
if (sr) srv = *(GSTexture11*)sr;
if (m_state.ps_srv[i] != srv) { if(sr) srv = *(GSTexture11*)sr;
if(m_state.ps_srv[i] != srv)
{
m_state.ps_srv[i] = srv; m_state.ps_srv[i] = srv;
m_srv_changed = true; m_srv_changed = true;
} }
} }
@ -914,13 +925,17 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb)
m_ctx->PSSetShader(ps, NULL, 0); m_ctx->PSSetShader(ps, NULL, 0);
} }
if (m_srv_changed) { if (m_srv_changed)
{
m_ctx->PSSetShaderResources(0, 3, m_state.ps_srv); m_ctx->PSSetShaderResources(0, 3, m_state.ps_srv);
m_srv_changed = false; m_srv_changed = false;
} }
if (m_ss_changed) { if(m_ss_changed)
{
m_ctx->PSSetSamplers(0, 3, m_state.ps_ss); m_ctx->PSSetSamplers(0, 3, m_state.ps_ss);
m_ss_changed = false; m_ss_changed = false;
} }
@ -982,8 +997,8 @@ void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector
vp.TopLeftX = 0; vp.TopLeftX = 0;
vp.TopLeftY = 0; vp.TopLeftY = 0;
vp.Width = (FLOAT)rt->GetWidth(); vp.Width = (float)rt->GetWidth();
vp.Height = (FLOAT)rt->GetHeight(); vp.Height = (float)rt->GetHeight();
vp.MinDepth = 0.0f; vp.MinDepth = 0.0f;
vp.MaxDepth = 1.0f; vp.MaxDepth = 1.0f;

View File

@ -31,7 +31,6 @@ GSDevice9::GSDevice9()
memset(&m_pp, 0, sizeof(m_pp)); memset(&m_pp, 0, sizeof(m_pp));
memset(&m_d3dcaps, 0, sizeof(m_d3dcaps)); memset(&m_d3dcaps, 0, sizeof(m_d3dcaps));
memset(&m_state, 0, sizeof(m_state)); memset(&m_state, 0, sizeof(m_state));
m_state.bf = 0xffffffff; m_state.bf = 0xffffffff;
@ -39,81 +38,109 @@ GSDevice9::GSDevice9()
GSDevice9::~GSDevice9() GSDevice9::~GSDevice9()
{ {
for_each(m_mskfix.begin(), m_mskfix.end(), delete_second());
for_each(m_om_bs.begin(), m_om_bs.end(), delete_second()); for_each(m_om_bs.begin(), m_om_bs.end(), delete_second());
for_each(m_om_dss.begin(), m_om_dss.end(), delete_second()); for_each(m_om_dss.begin(), m_om_dss.end(), delete_second());
for_each(m_ps_ss.begin(), m_ps_ss.end(), delete_second()); for_each(m_ps_ss.begin(), m_ps_ss.end(), delete_second());
for_each(m_mskfix.begin(), m_mskfix.end(), delete_second());
if(m_state.vs_cb) _aligned_free(m_state.vs_cb); if(m_state.vs_cb) _aligned_free(m_state.vs_cb);
if(m_state.ps_cb) _aligned_free(m_state.ps_cb); if(m_state.ps_cb) _aligned_free(m_state.ps_cb);
} }
// if supported and null != msaa_desc, msaa_desc will contain requested Count and Quality
static bool IsMsaaSupported(IDirect3D9* d3d, D3DFORMAT depth_format, uint msaaCount, DXGI_SAMPLE_DESC* msaa_desc = NULL)
{
if(msaaCount > 16) return false;
//if supported and null!=msaa_desc, msaa_desc will contain requested Count and Quality
static bool IsMsaaSupported(CComPtr<IDirect3D9>& d3d, D3DFORMAT depth_format, uint msaaCount, OUT DXGI_SAMPLE_DESC* msaa_desc=NULL){
D3DCAPS9 d3dcaps; D3DCAPS9 d3dcaps;
if (msaaCount>16) return false;
memset(&d3dcaps, 0, sizeof(d3dcaps)); memset(&d3dcaps, 0, sizeof(d3dcaps));
d3d->GetDeviceCaps(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, &d3dcaps); d3d->GetDeviceCaps(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, &d3dcaps);
DWORD quality[2] = {0, 0}; DWORD quality[2] = {0, 0};
if(SUCCEEDED(d3d->CheckDeviceMultiSampleType(d3dcaps.AdapterOrdinal, d3dcaps.DeviceType, D3DFMT_A8R8G8B8, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &quality[0])) && quality[0] >0 if(SUCCEEDED(d3d->CheckDeviceMultiSampleType(d3dcaps.AdapterOrdinal, d3dcaps.DeviceType, D3DFMT_A8R8G8B8, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &quality[0])) && quality[0] > 0
&& SUCCEEDED(d3d->CheckDeviceMultiSampleType(d3dcaps.AdapterOrdinal, d3dcaps.DeviceType, depth_format, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &quality[1])) && quality[1] >0 && SUCCEEDED(d3d->CheckDeviceMultiSampleType(d3dcaps.AdapterOrdinal, d3dcaps.DeviceType, depth_format, TRUE, (D3DMULTISAMPLE_TYPE)msaaCount, &quality[1])) && quality[1] > 0)
){ {
if (msaa_desc){ if(msaa_desc)
msaa_desc->Count = msaaCount; {
msaa_desc->Quality = std::min<DWORD>(quality[0] - 1, quality[1] - 1); msaa_desc->Count = msaaCount;
msaa_desc->Quality = std::min<DWORD>(quality[0] - 1, quality[1] - 1);
} }
return true; return true;
} }
return false; return false;
} }
static bool TestDepthFormat(CComPtr<IDirect3D9> &d3d, D3DFORMAT format) static bool TestDepthFormat(IDirect3D9* d3d, D3DFORMAT format)
{ {
if (FAILED(d3d->CheckDeviceFormat(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, D3DFMT_X8R8G8B8, D3DUSAGE_DEPTHSTENCIL, D3DRTYPE_SURFACE, format))) if(FAILED(d3d->CheckDeviceFormat(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, D3DFMT_X8R8G8B8, D3DUSAGE_DEPTHSTENCIL, D3DRTYPE_SURFACE, format)))
{
return false; return false;
if (FAILED(d3d->CheckDepthStencilMatch(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, D3DFMT_X8R8G8B8, D3DFMT_X8R8G8B8, format))) }
if(FAILED(d3d->CheckDepthStencilMatch(D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, D3DFMT_X8R8G8B8, D3DFMT_X8R8G8B8, format)))
{
return false; return false;
}
return true; return true;
} }
// Picks the most preferred depth format that passes TestDepthFormat and, when
// multisampling is requested, IsMsaaSupported.
//
// d3d       - Direct3D 9 interface used for the capability queries; may be NULL
// msaaCount - requested sample count; 1 is treated the same as 0 (no msaa check)
// msaa_desc - optional, forwarded unchanged to IsMsaaSupported
//
// Returns the first acceptable entry of the table below, or D3DFMT_UNKNOWN when
// d3d is NULL or no entry is acceptable.
static D3DFORMAT BestD3dFormat(IDirect3D9* d3d, int msaaCount = 0, DXGI_SAMPLE_DESC* msaa_desc = NULL)
{
	// In descending order of preference

	static D3DFORMAT fmts[] =
	{
		D3DFMT_D32,
		D3DFMT_D32F_LOCKABLE,
		D3DFMT_D24S8
	};

	// The helpers dereference d3d without checking it.
	if(!d3d) return D3DFMT_UNKNOWN;

	if(1 == msaaCount) msaaCount = 0;

	// sizeof(fmts) alone is the table size in bytes; divide by the element size so
	// the loop stops at the last entry instead of reading past the array.
	for(size_t i = 0; i < sizeof(fmts) / sizeof(fmts[0]); i++)
	{
		if(TestDepthFormat(d3d, fmts[i]) && (!msaaCount || IsMsaaSupported(d3d, fmts[i], msaaCount, msaa_desc)))
		{
			return fmts[i];
		}
	}

	return D3DFMT_UNKNOWN;
}
//return: 32, 24, or 0 if not supported. if 1==msaa, considered as msaa=0 // return: 32, 24, or 0 if not supported. if 1==msaa, considered as msaa=0
uint GSDevice9::GetMaxDepth(uint msaa=0){
uint GSDevice9::GetMaxDepth(uint msaa = 0)
{
CComPtr<IDirect3D9> d3d; CComPtr<IDirect3D9> d3d;
d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION)); d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION));
D3DFORMAT f=BestD3dFormat(d3d, msaa); switch(BestD3dFormat(d3d, msaa))
switch (f){ {
case D3DFMT_D32: case D3DFMT_D32F_LOCKABLE: return 32; case D3DFMT_D32:
case D3DFMT_D24S8: return 24; case D3DFMT_D32F_LOCKABLE:
return 32;
case D3DFMT_D24S8:
return 24;
} }
return 0; return 0;
} }
void GSDevice9::ForceValidMsaaConfig(){ void GSDevice9::ForceValidMsaaConfig()
if (0==GetMaxDepth(theApp.GetConfig("msaa", 0))) {
theApp.SetConfig("msaa", 0);//replace invalid msaa value in ini file with 0. if(0 == GetMaxDepth(theApp.GetConfig("msaa", 0)))
{
theApp.SetConfig("msaa", 0); // replace invalid msaa value in ini file with 0.
}
}; };
bool GSDevice9::Create(GSWnd* wnd) bool GSDevice9::Create(GSWnd* wnd)
@ -128,17 +155,26 @@ bool GSDevice9::Create(GSWnd* wnd)
m_d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION)); m_d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION));
if(!m_d3d) return false; if(!m_d3d) return false;
ForceValidMsaaConfig(); ForceValidMsaaConfig();
//Get best format/depth for msaa. Assumption is that if the resulting depth is 24 instead of possible 32,
// the user was already warned when she selected it. (Lower res z buffer without warning is unacceptable). // Get best format/depth for msaa. Assumption is that if the resulting depth is 24 instead of possible 32,
m_depth_format=BestD3dFormat(m_d3d, m_msaa, &m_msaa_desc); // the user was already warned when she selected it. (Lower res z buffer without warning is unacceptable).
if (D3DFMT_UNKNOWN == m_depth_format){
//can't find a format with requested msaa, try without. m_depth_format = BestD3dFormat(m_d3d, m_msaa, &m_msaa_desc);
m_depth_format = BestD3dFormat(m_d3d, 0);
if (D3DFMT_UNKNOWN == m_depth_format) if(D3DFMT_UNKNOWN == m_depth_format)
return false; {
// can't find a format with requested msaa, try without.
m_msaa=0; m_depth_format = BestD3dFormat(m_d3d, 0);
if(D3DFMT_UNKNOWN == m_depth_format)
{
return false;
}
m_msaa = 0;
} }
memset(&m_d3dcaps, 0, sizeof(m_d3dcaps)); memset(&m_d3dcaps, 0, sizeof(m_d3dcaps));
@ -180,7 +216,6 @@ bool GSDevice9::Create(GSWnd* wnd)
return false; return false;
} }
if(!Reset(1, 1)) if(!Reset(1, 1))
{ {
return false; return false;
@ -274,7 +309,8 @@ bool GSDevice9::Create(GSWnd* wnd)
void GSDevice9::SetVsync(bool enable) void GSDevice9::SetVsync(bool enable)
{ {
if( m_vsync == enable ) return; if(m_vsync == enable) return;
__super::SetVsync(enable); __super::SetVsync(enable);
// Clever trick: Delete the backbuffer, so that the next Present will fail and // Clever trick: Delete the backbuffer, so that the next Present will fail and
@ -282,6 +318,7 @@ void GSDevice9::SetVsync(bool enable)
// vsync settings. :) // vsync settings. :)
delete m_backbuffer; delete m_backbuffer;
m_backbuffer = NULL; m_backbuffer = NULL;
} }
@ -293,6 +330,7 @@ bool GSDevice9::Reset(int w, int h)
HRESULT hr; HRESULT hr;
int mode = (!m_wnd->IsManaged() || theApp.GetConfig("windowed", 1)) ? Windowed : Fullscreen; int mode = (!m_wnd->IsManaged() || theApp.GetConfig("windowed", 1)) ? Windowed : Fullscreen;
if(mode == DontCare) if(mode == DontCare)
{ {
mode = m_pp.Windowed ? Windowed : Fullscreen; mode = m_pp.Windowed ? Windowed : Fullscreen;
@ -707,11 +745,11 @@ void GSDevice9::StretchRect(GSTexture* st, const GSVector4& sr, GSTexture* dt, c
IASetVertexBuffer(vertices, sizeof(vertices[0]), countof(vertices)); IASetVertexBuffer(vertices, sizeof(vertices[0]), countof(vertices));
IASetPrimitiveTopology(D3DPT_TRIANGLESTRIP); IASetPrimitiveTopology(D3DPT_TRIANGLESTRIP);
IASetInputLayout(m_convert.il);
// vs // vs
VSSetShader(m_convert.vs, NULL, 0); VSSetShader(m_convert.vs, NULL, 0);
IASetInputLayout(m_convert.il);
// ps // ps
@ -904,7 +942,7 @@ void GSDevice9::VSSetShader(IDirect3DVertexShader9* vs, const float* vs_cb, int
{ {
if(m_state.vs_cb) _aligned_free(m_state.vs_cb); if(m_state.vs_cb) _aligned_free(m_state.vs_cb);
m_state.vs_cb = (float*)_aligned_malloc(size, 16); m_state.vs_cb = (float*)_aligned_malloc(size, 32);
} }
m_state.vs_cb_len = vs_cb_len; m_state.vs_cb_len = vs_cb_len;
@ -926,10 +964,13 @@ void GSDevice9::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1)
void GSDevice9::PSSetShaderResource(int i, GSTexture* sr) void GSDevice9::PSSetShaderResource(int i, GSTexture* sr)
{ {
IDirect3DTexture9* srv = NULL; IDirect3DTexture9* srv = NULL;
if (sr) srv = *(GSTexture9*)sr;
if (m_state.ps_srvs[i] != srv) { if(sr) srv = *(GSTexture9*)sr;
if(m_state.ps_srvs[i] != srv)
{
m_state.ps_srvs[i] = srv; m_state.ps_srvs[i] = srv;
m_dev->SetTexture(i, srv); m_dev->SetTexture(i, srv);
} }
} }
@ -953,7 +994,7 @@ void GSDevice9::PSSetShader(IDirect3DPixelShader9* ps, const float* ps_cb, int p
{ {
if(m_state.ps_cb) _aligned_free(m_state.ps_cb); if(m_state.ps_cb) _aligned_free(m_state.ps_cb);
m_state.ps_cb = (float*)_aligned_malloc(size, 16); m_state.ps_cb = (float*)_aligned_malloc(size, 32);
} }
m_state.ps_cb_len = ps_cb_len; m_state.ps_cb_len = ps_cb_len;

View File

@ -30,7 +30,7 @@ class GSDeviceDX : public GSDevice
public: public:
#pragma pack(push, 1) #pragma pack(push, 1)
__aligned16 struct VSConstantBuffer __aligned32 struct VSConstantBuffer
{ {
GSVector4 VertexScale; GSVector4 VertexScale;
GSVector4 VertexOffset; GSVector4 VertexOffset;
@ -86,7 +86,7 @@ public:
VSSelector() : key(0) {} VSSelector() : key(0) {}
}; };
__aligned16 struct PSConstantBuffer __aligned32 struct PSConstantBuffer
{ {
GSVector4 FogColor_AREF; GSVector4 FogColor_AREF;
GSVector4 HalfTexel; GSVector4 HalfTexel;

File diff suppressed because it is too large Load Diff

View File

@ -67,10 +67,10 @@ class GSDrawScanlineCodeGenerator : public CodeGenerator
void mix16(const Xmm& a, const Xmm& b, const Xmm& temp); void mix16(const Xmm& a, const Xmm& b, const Xmm& temp);
void clamp16(const Xmm& a, const Xmm& temp); void clamp16(const Xmm& a, const Xmm& temp);
void alltrue(); void alltrue();
void blend8(const Xmm& a, const Xmm& b);
void blend(const Xmm& a, const Xmm& b, const Xmm& mask); void blend(const Xmm& a, const Xmm& b, const Xmm& mask);
void blend8r(const Xmm& b, const Xmm& a);
void blendr(const Xmm& b, const Xmm& a, const Xmm& mask); void blendr(const Xmm& b, const Xmm& a, const Xmm& mask);
void blend8(const Xmm& a, const Xmm& b);
void blend8r(const Xmm& b, const Xmm& a);
public: public:
GSDrawScanlineCodeGenerator(GSScanlineEnvironment& env, uint64 key, void* ptr, size_t maxsize); GSDrawScanlineCodeGenerator(GSScanlineEnvironment& env, uint64 key, void* ptr, size_t maxsize);

View File

@ -26,7 +26,7 @@
#pragma pack(push, 1) #pragma pack(push, 1)
__aligned16 class GSDrawingContext __aligned32 class GSDrawingContext
{ {
public: public:
GIFRegXYOFFSET XYOFFSET; GIFRegXYOFFSET XYOFFSET;
@ -43,7 +43,7 @@ public:
GIFRegFRAME FRAME; GIFRegFRAME FRAME;
GIFRegZBUF ZBUF; GIFRegZBUF ZBUF;
__aligned16 struct __aligned32 struct
{ {
GSVector4i dx10; GSVector4i dx10;
GSVector4 dx9; GSVector4 dx9;

View File

@ -25,7 +25,7 @@
#pragma pack(push, 1) #pragma pack(push, 1)
__aligned16 class GSDrawingEnvironment __aligned32 class GSDrawingEnvironment
{ {
public: public:
GIFRegPRIM PRIM; GIFRegPRIM PRIM;

View File

@ -56,14 +56,14 @@ uint32 GSLocalMemory::pageOffset16SZ[32][64][64];
uint32 GSLocalMemory::pageOffset8[32][64][128]; uint32 GSLocalMemory::pageOffset8[32][64][128];
uint32 GSLocalMemory::pageOffset4[32][128][128]; uint32 GSLocalMemory::pageOffset4[32][128][128];
int GSLocalMemory::rowOffset32[2048]; int GSLocalMemory::rowOffset32[4096];
int GSLocalMemory::rowOffset32Z[2048]; int GSLocalMemory::rowOffset32Z[4096];
int GSLocalMemory::rowOffset16[2048]; int GSLocalMemory::rowOffset16[4096];
int GSLocalMemory::rowOffset16S[2048]; int GSLocalMemory::rowOffset16S[4096];
int GSLocalMemory::rowOffset16Z[2048]; int GSLocalMemory::rowOffset16Z[4096];
int GSLocalMemory::rowOffset16SZ[2048]; int GSLocalMemory::rowOffset16SZ[4096];
int GSLocalMemory::rowOffset8[2][2048]; int GSLocalMemory::rowOffset8[2][4096];
int GSLocalMemory::rowOffset4[2][2048]; int GSLocalMemory::rowOffset4[2][4096];
short GSLocalMemory::blockOffset32[256]; short GSLocalMemory::blockOffset32[256];
short GSLocalMemory::blockOffset32Z[256]; short GSLocalMemory::blockOffset32Z[256];
@ -116,44 +116,44 @@ GSLocalMemory::GSLocalMemory()
for(int x = 0; x < countof(rowOffset32); x++) for(int x = 0; x < countof(rowOffset32); x++)
{ {
rowOffset32[x] = (int)PixelAddress32(x, 0, 0, 32) - (int)PixelAddress32(0, 0, 0, 32); rowOffset32[x] = (int)PixelAddress32(x & 0x7ff, 0, 0, 32) - (int)PixelAddress32(0, 0, 0, 32);
} }
for(int x = 0; x < countof(rowOffset32Z); x++) for(int x = 0; x < countof(rowOffset32Z); x++)
{ {
rowOffset32Z[x] = (int)PixelAddress32Z(x, 0, 0, 32) - (int)PixelAddress32Z(0, 0, 0, 32); rowOffset32Z[x] = (int)PixelAddress32Z(x & 0x7ff, 0, 0, 32) - (int)PixelAddress32Z(0, 0, 0, 32);
} }
for(int x = 0; x < countof(rowOffset16); x++) for(int x = 0; x < countof(rowOffset16); x++)
{ {
rowOffset16[x] = (int)PixelAddress16(x, 0, 0, 32) - (int)PixelAddress16(0, 0, 0, 32); rowOffset16[x] = (int)PixelAddress16(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16(0, 0, 0, 32);
} }
for(int x = 0; x < countof(rowOffset16S); x++) for(int x = 0; x < countof(rowOffset16S); x++)
{ {
rowOffset16S[x] = (int)PixelAddress16S(x, 0, 0, 32) - (int)PixelAddress16S(0, 0, 0, 32); rowOffset16S[x] = (int)PixelAddress16S(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16S(0, 0, 0, 32);
} }
for(int x = 0; x < countof(rowOffset16Z); x++) for(int x = 0; x < countof(rowOffset16Z); x++)
{ {
rowOffset16Z[x] = (int)PixelAddress16Z(x, 0, 0, 32) - (int)PixelAddress16Z(0, 0, 0, 32); rowOffset16Z[x] = (int)PixelAddress16Z(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16Z(0, 0, 0, 32);
} }
for(int x = 0; x < countof(rowOffset16SZ); x++) for(int x = 0; x < countof(rowOffset16SZ); x++)
{ {
rowOffset16SZ[x] = (int)PixelAddress16SZ(x, 0, 0, 32) - (int)PixelAddress16SZ(0, 0, 0, 32); rowOffset16SZ[x] = (int)PixelAddress16SZ(x & 0x7ff, 0, 0, 32) - (int)PixelAddress16SZ(0, 0, 0, 32);
} }
for(int x = 0; x < countof(rowOffset8[0]); x++) for(int x = 0; x < countof(rowOffset8[0]); x++)
{ {
rowOffset8[0][x] = (int)PixelAddress8(x, 0, 0, 32) - (int)PixelAddress8(0, 0, 0, 32); rowOffset8[0][x] = (int)PixelAddress8(x & 0x7ff, 0, 0, 32) - (int)PixelAddress8(0, 0, 0, 32);
rowOffset8[1][x] = (int)PixelAddress8(x, 2, 0, 32) - (int)PixelAddress8(0, 2, 0, 32); rowOffset8[1][x] = (int)PixelAddress8(x & 0x7ff, 2, 0, 32) - (int)PixelAddress8(0, 2, 0, 32);
} }
for(int x = 0; x < countof(rowOffset4[0]); x++) for(int x = 0; x < countof(rowOffset4[0]); x++)
{ {
rowOffset4[0][x] = (int)PixelAddress4(x, 0, 0, 32) - (int)PixelAddress4(0, 0, 0, 32); rowOffset4[0][x] = (int)PixelAddress4(x & 0x7ff, 0, 0, 32) - (int)PixelAddress4(0, 0, 0, 32);
rowOffset4[1][x] = (int)PixelAddress4(x, 2, 0, 32) - (int)PixelAddress4(0, 2, 0, 32); rowOffset4[1][x] = (int)PixelAddress4(x & 0x7ff, 2, 0, 32) - (int)PixelAddress4(0, 2, 0, 32);
} }
for(int x = 0; x < countof(blockOffset32); x++) for(int x = 0; x < countof(blockOffset32); x++)
@ -459,7 +459,7 @@ GSOffset* GSLocalMemory::GetOffset(uint32 bp, uint32 bw, uint32 psm)
return i->second; return i->second;
} }
GSOffset* o = (GSOffset*)_aligned_malloc(sizeof(GSOffset), 16); GSOffset* o = (GSOffset*)_aligned_malloc(sizeof(GSOffset), 32);
o->hash = hash; o->hash = hash;
@ -474,9 +474,9 @@ GSOffset* GSLocalMemory::GetOffset(uint32 bp, uint32 bw, uint32 psm)
pixelAddress pa = m_psm[psm].pa; pixelAddress pa = m_psm[psm].pa;
for(int i = 0; i < 2048; i++) for(int i = 0; i < 4096; i++)
{ {
o->pixel.row[i] = (int)pa(0, i, bp, bw); o->pixel.row[i] = (int)pa(0, i & 0x7ff, bp, bw);
} }
for(int i = 0; i < 8; i++) for(int i = 0; i < 8; i++)
@ -513,7 +513,7 @@ GSPixelOffset4* GSLocalMemory::GetPixelOffset4(const GIFRegFRAME& FRAME, const G
return i->second; return i->second;
} }
GSPixelOffset4* o = (GSPixelOffset4*)_aligned_malloc(sizeof(GSPixelOffset4), 16); GSPixelOffset4* o = (GSPixelOffset4*)_aligned_malloc(sizeof(GSPixelOffset4), 32);
o->hash = hash; o->hash = hash;
@ -628,7 +628,7 @@ void GSLocalMemory::WriteImageLeftRight(int l, int r, int y, int h, const uint8*
template<int psm, int bsx, int bsy, int trbpp> template<int psm, int bsx, int bsy, int trbpp>
void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF) void GSLocalMemory::WriteImageTopBottom(int l, int r, int y, int h, const uint8* src, int srcpitch, const GIFRegBITBLTBUF& BITBLTBUF)
{ {
__aligned16 uint8 buff[64]; // merge buffer for one column __aligned32 uint8 buff[64]; // merge buffer for one column
uint32 bp = BITBLTBUF.DBP; uint32 bp = BITBLTBUF.DBP;
uint32 bw = BITBLTBUF.DBW; uint32 bw = BITBLTBUF.DBW;
@ -1438,7 +1438,7 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i&
void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{ {
__aligned16 uint16 block[16 * 8]; __aligned32 uint16 block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32) FOREACH_BLOCK_START(r, 16, 8, 32)
{ {
@ -1451,7 +1451,7 @@ void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i&
void GSLocalMemory::ReadTexture16S(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) void GSLocalMemory::ReadTexture16S(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{ {
__aligned16 uint16 block[16 * 8]; __aligned32 uint16 block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32) FOREACH_BLOCK_START(r, 16, 8, 32)
{ {
@ -1548,7 +1548,7 @@ void GSLocalMemory::ReadTexture24Z(const GSOffset* RESTRICT o, const GSVector4i&
void GSLocalMemory::ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) void GSLocalMemory::ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{ {
__aligned16 uint16 block[16 * 8]; __aligned32 uint16 block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32) FOREACH_BLOCK_START(r, 16, 8, 32)
{ {
@ -1561,7 +1561,7 @@ void GSLocalMemory::ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i&
void GSLocalMemory::ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) void GSLocalMemory::ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{ {
__aligned16 uint16 block[16 * 8]; __aligned32 uint16 block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32) FOREACH_BLOCK_START(r, 16, 8, 32)
{ {
@ -1576,14 +1576,14 @@ void GSLocalMemory::ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i
void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
ReadBlock32<true>(BlockPtr(bp), dst, dstpitch); ReadBlock32<true>(BlockPtr(bp), dst, dstpitch);
} }
void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
if(TEXA.AEM) if(TEXA.AEM)
{ {
@ -1597,7 +1597,7 @@ void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, cons
void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
__aligned16 uint16 block[16 * 8]; __aligned32 uint16 block[16 * 8];
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8); ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
@ -1606,7 +1606,7 @@ void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, cons
void GSLocalMemory::ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
__aligned16 uint16 block[16 * 8]; __aligned32 uint16 block[16 * 8];
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8); ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
@ -1615,49 +1615,49 @@ void GSLocalMemory::ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, con
void GSLocalMemory::ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
ReadAndExpandBlock8_32(BlockPtr(bp), dst, dstpitch, m_clut); ReadAndExpandBlock8_32(BlockPtr(bp), dst, dstpitch, m_clut);
} }
void GSLocalMemory::ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
ReadAndExpandBlock4_32(BlockPtr(bp), dst, dstpitch, m_clut); ReadAndExpandBlock4_32(BlockPtr(bp), dst, dstpitch, m_clut);
} }
void GSLocalMemory::ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut); ReadAndExpandBlock8H_32(BlockPtr(bp), dst, dstpitch, m_clut);
} }
void GSLocalMemory::ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
ReadAndExpandBlock4HL_32(BlockPtr(bp), dst, dstpitch, m_clut); ReadAndExpandBlock4HL_32(BlockPtr(bp), dst, dstpitch, m_clut);
} }
void GSLocalMemory::ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut); ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut);
} }
void GSLocalMemory::ReadTextureBlock32Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock32Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
ReadBlock32<true>(BlockPtr(bp), dst, dstpitch); ReadBlock32<true>(BlockPtr(bp), dst, dstpitch);
} }
void GSLocalMemory::ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
if(TEXA.AEM) if(TEXA.AEM)
{ {
@ -1671,7 +1671,7 @@ void GSLocalMemory::ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, con
void GSLocalMemory::ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
__aligned16 uint16 block[16 * 8]; __aligned32 uint16 block[16 * 8];
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8); ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
@ -1680,7 +1680,7 @@ void GSLocalMemory::ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, con
void GSLocalMemory::ReadTextureBlock16SZ(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock16SZ(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
__aligned16 uint16 block[16 * 8]; __aligned32 uint16 block[16 * 8];
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8); ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
@ -1823,28 +1823,28 @@ void GSLocalMemory::ReadTextureBlock8P(uint32 bp, uint8* dst, int dstpitch, cons
void GSLocalMemory::ReadTextureBlock4P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock4P(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
ReadBlock4P(BlockPtr(bp), dst, dstpitch); ReadBlock4P(BlockPtr(bp), dst, dstpitch);
} }
void GSLocalMemory::ReadTextureBlock8HP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock8HP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
ReadBlock8HP(BlockPtr(bp), dst, dstpitch); ReadBlock8HP(BlockPtr(bp), dst, dstpitch);
} }
void GSLocalMemory::ReadTextureBlock4HLP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock4HLP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
ReadBlock4HLP(BlockPtr(bp), dst, dstpitch); ReadBlock4HLP(BlockPtr(bp), dst, dstpitch);
} }
void GSLocalMemory::ReadTextureBlock4HHP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const void GSLocalMemory::ReadTextureBlock4HHP(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{ {
ALIGN_STACK(16); ALIGN_STACK(32);
ReadBlock4HHP(BlockPtr(bp), dst, dstpitch); ReadBlock4HHP(BlockPtr(bp), dst, dstpitch);
} }
@ -1855,7 +1855,7 @@ HRESULT GSLocalMemory::SaveBMP(const string& fn, uint32 bp, uint32 bw, uint32 ps
{ {
int pitch = w * 4; int pitch = w * 4;
int size = pitch * h; int size = pitch * h;
void* bits = ::_aligned_malloc(size, 16); void* bits = _aligned_malloc(size, 32);
GIFRegTEX0 TEX0; GIFRegTEX0 TEX0;

View File

@ -39,7 +39,7 @@ struct GSOffset
struct struct
{ {
int row[2048]; // yn (n = 0 1 2 ...) int row[4096]; // yn (n = 0 1 2 ...) NOTE: this wraps around above 2048, only transfers should address the upper half (dark cloud 2 inventing)
int* col[8]; // rowOffset* int* col[8]; // rowOffset*
} pixel; } pixel;
@ -116,14 +116,14 @@ protected:
static uint32 pageOffset8[32][64][128]; static uint32 pageOffset8[32][64][128];
static uint32 pageOffset4[32][128][128]; static uint32 pageOffset4[32][128][128];
static int rowOffset32[2048]; static int rowOffset32[4096];
static int rowOffset32Z[2048]; static int rowOffset32Z[4096];
static int rowOffset16[2048]; static int rowOffset16[4096];
static int rowOffset16S[2048]; static int rowOffset16S[4096];
static int rowOffset16Z[2048]; static int rowOffset16Z[4096];
static int rowOffset16SZ[2048]; static int rowOffset16SZ[4096];
static int rowOffset8[2][2048]; static int rowOffset8[2][4096];
static int rowOffset4[2][2048]; static int rowOffset4[2][4096];
static short blockOffset32[256]; static short blockOffset32[256];
static short blockOffset32Z[256]; static short blockOffset32Z[256];

View File

@ -29,18 +29,20 @@
// Using a spinning finish on the main (MTGS) thread is apparently a big win still, over trying // Using a spinning finish on the main (MTGS) thread is apparently a big win still, over trying
// to wait out all the pending m_finished semaphores. It leaves one spinwait in the rasterizer, // to wait out all the pending m_finished semaphores. It leaves one spinwait in the rasterizer,
// but that's still worlds better than 2-6 spinning threads like before. // but that's still worlds better than 2-6 spinning threads like before.
#define UseSpinningFinish 1
#define UseSpinningFinish
// Set this to 1 to remove a lot of non-const div/modulus ops from the rasterization process. // Set this to 1 to remove a lot of non-const div/modulus ops from the rasterization process.
// Might likely be a measurable speedup but limits threading to 1, 2, 4, and 8 threads. // Might likely be a measurable speedup but limits threading to 1, 2, 4, and 8 threads.
// note by rama: Speedup is around 5% on average. // note by rama: Speedup is around 5% on average.
#define UseConstThreadCount 0
#if UseConstThreadCount // #define UseConstThreadCount
#ifdef UseConstThreadCount
// ThreadsConst - const number of threads. User-configured threads (in GSdx panel) must match // ThreadsConst - const number of threads. User-configured threads (in GSdx panel) must match
// this value if UseConstThreadCount is enabled. [yeah, it's hacky for now] // this value if UseConstThreadCount is enabled. [yeah, it's hacky for now]
static const int ThreadsConst = 2; static const int ThreadsConst = 2;
static const int ThreadMaskConst = ThreadsConst-1; static const int ThreadMaskConst = ThreadsConst - 1;
#endif #endif
GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads) GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads)
@ -57,11 +59,15 @@ GSRasterizer::~GSRasterizer()
__forceinline bool GSRasterizer::IsOneOfMyScanlines(int scanline) const __forceinline bool GSRasterizer::IsOneOfMyScanlines(int scanline) const
{ {
#if UseConstThreadCount #ifdef UseConstThreadCount
return (ThreadMaskConst==0) || ((scanline & ThreadMaskConst) == m_id);
#else return ThreadMaskConst == 0 || (scanline & ThreadMaskConst) == m_id;
#else
return (scanline % m_threads) == m_id; return (scanline % m_threads) == m_id;
#endif
#endif
} }
void GSRasterizer::Draw(const GSRasterizerData* data) void GSRasterizer::Draw(const GSRasterizerData* data)
@ -871,7 +877,7 @@ void GSRasterizerMT::ThreadProc()
{ {
// _mm_setcsr(MXCSR); // _mm_setcsr(MXCSR);
while( true ) while(true)
{ {
sem_wait(&m_semaphore); sem_wait(&m_semaphore);
@ -879,10 +885,15 @@ void GSRasterizerMT::ThreadProc()
__super::Draw(m_data); __super::Draw(m_data);
if( UseSpinningFinish ) #ifdef UseSpinningFinish
_interlockedbittestandreset( &m_sync, m_id );
else _interlockedbittestandreset(&m_sync, m_id);
sem_post(&m_finished);
#else
sem_post(&m_finished);
#endif
} }
sem_post(&m_stopped); sem_post(&m_stopped);
@ -917,33 +928,36 @@ void GSRasterizerList::Draw(const GSRasterizerData* data)
m_sync = m_syncstart; m_sync = m_syncstart;
for(unsigned i=1; i<size(); ++i) for(size_t i = 1; i < size(); i++)
{ {
(*this)[i]->Draw(data); (*this)[i]->Draw(data);
} }
(*this)[0]->Draw(data); (*this)[0]->Draw(data);
if( UseSpinningFinish ) #ifdef UseSpinningFinish
while(m_sync) _mm_pause();
#else
for(size_t i = 1; i < size(); i++)
{ {
while(m_sync) _mm_pause(); sem_wait(&m_finished);
}
else
{
for(unsigned i=1; i<size(); ++i )
sem_wait(&m_finished);
} }
#endif
m_stats.ticks = __rdtsc() - start; m_stats.ticks = __rdtsc() - start;
for(unsigned i=0; i<size(); ++i) for(size_t i = 0; i < size(); i++)
{ {
GSRasterizerStats s; GSRasterizerStats s;
(*this)[i]->GetStats(s); (*this)[i]->GetStats(s);
m_stats.pixels += s.pixels; m_stats.pixels += s.pixels;
m_stats.prims = max(m_stats.prims, s.prims); m_stats.prims = std::max<int>(m_stats.prims, s.prims);
} }
} }

View File

@ -30,7 +30,7 @@
#include "pthread.h" #include "pthread.h"
#include "semaphore.h" #include "semaphore.h"
__aligned16 class GSRasterizerData __aligned32 class GSRasterizerData
{ {
public: public:
GSVector4i scissor; GSVector4i scissor;
@ -50,7 +50,7 @@ public:
virtual void PrintStats() = 0; virtual void PrintStats() = 0;
}; };
class IDrawScanline : public GSAlignedClass<16> class IDrawScanline : public GSAlignedClass<32>
{ {
public: public:
typedef void (__fastcall *DrawScanlineStaticPtr)(int right, int left, int top, const GSVertexSW& v); typedef void (__fastcall *DrawScanlineStaticPtr)(int right, int left, int top, const GSVertexSW& v);
@ -153,9 +153,11 @@ public:
push_back(new GSRasterizer(new DS(parent, 0), 0, threads)); push_back(new GSRasterizer(new DS(parent, 0), 0, threads));
m_syncstart = 0; m_syncstart = 0;
for(int i = 1; i < threads; i++) for(int i = 1; i < threads; i++)
{ {
push_back(new GSRasterizerMT(new DS(parent, i), i, threads, m_finished, m_sync)); push_back(new GSRasterizerMT(new DS(parent, i), i, threads, m_finished, m_sync));
_interlockedbittestandset(&m_syncstart, i); _interlockedbittestandset(&m_syncstart, i);
} }
} }

View File

@ -24,7 +24,7 @@
GSRenderer::GSRenderer() GSRenderer::GSRenderer()
: GSState() : GSState()
, m_tex_buff( (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 16) ) , m_tex_buff((uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32))
, m_vt(this) , m_vt(this)
, m_dev(NULL) , m_dev(NULL)
, m_shader(0) , m_shader(0)
@ -61,9 +61,10 @@ GSRenderer::~GSRenderer()
m_dev->Reset(1, 1, GSDevice::Windowed); m_dev->Reset(1, 1, GSDevice::Windowed);
}*/ }*/
_aligned_free( m_tex_buff ); _aligned_free(m_tex_buff);
delete m_dev; delete m_dev;
DeleteCriticalSection(&m_pGSsetTitle_Crit); DeleteCriticalSection(&m_pGSsetTitle_Crit);
} }
@ -220,13 +221,6 @@ bool GSRenderer::Merge(int field)
r.bottom = r.top + y; r.bottom = r.top + y;
} }
// Breaks the blur filter, and actually makes games blurry again.
// This might have to do with earlier changes to device size detection.
/*if(blurdetected && i == 1)
{
r += GSVector4i(0, 1).xyxy();
}*/
GSVector4 scale = GSVector4(tex[i]->GetScale()).xyxy(); GSVector4 scale = GSVector4(tex[i]->GetScale()).xyxy();
src[i] = GSVector4(r) * scale / GSVector4(tex[i]->GetSize()).xyxy(); src[i] = GSVector4(r) * scale / GSVector4(tex[i]->GetSize()).xyxy();
@ -380,8 +374,8 @@ void GSRenderer::VSync(int field)
EnterCriticalSection(&m_pGSsetTitle_Crit); EnterCriticalSection(&m_pGSsetTitle_Crit);
strncpy(m_GStitleInfoBuffer, s.c_str(), ArraySize(m_GStitleInfoBuffer)-1); strncpy(m_GStitleInfoBuffer, s.c_str(), countof(m_GStitleInfoBuffer) - 1);
m_GStitleInfoBuffer[sizeof(m_GStitleInfoBuffer)-1] = 0;// make sure null terminated even if text overflows m_GStitleInfoBuffer[sizeof(m_GStitleInfoBuffer) - 1] = 0;// make sure null terminated even if text overflows
LeaveCriticalSection(&m_pGSsetTitle_Crit); LeaveCriticalSection(&m_pGSsetTitle_Crit);
} }

View File

@ -158,12 +158,13 @@ protected:
void GrowVertexBuffer() void GrowVertexBuffer()
{ {
m_maxcount = max(10000, m_maxcount * 3/2); m_maxcount = max(10000, m_maxcount * 3/2);
m_vertices = (Vertex*)_aligned_realloc(m_vertices, sizeof(Vertex) * m_maxcount, 16); m_vertices = (Vertex*)_aligned_realloc(m_vertices, sizeof(Vertex) * m_maxcount, 32);
m_maxcount -= 100; m_maxcount -= 100;
} }
// Returns a pointer to the drawing vertex. Can return NULL! // Returns a pointer to the drawing vertex. Can return NULL!
template<uint32 prim> __fi Vertex* BaseDrawingKick(int& count)
template<uint32 prim> __forceinline Vertex* DrawingKick(bool skip, int& count)
{ {
switch(prim) switch(prim)
{ {
@ -237,7 +238,7 @@ protected:
__assume(0); __assume(0);
} }
return v; return !skip ? v : NULL;
} }
virtual void Draw() = 0; virtual void Draw() = 0;

View File

@ -249,7 +249,9 @@ public:
ps_sel.clr1 = om_bsel.IsCLR1(); ps_sel.clr1 = om_bsel.IsCLR1();
ps_sel.fba = context->FBA.FBA; ps_sel.fba = context->FBA.FBA;
ps_sel.aout = context->FRAME.PSM == PSM_PSMCT16 || context->FRAME.PSM == PSM_PSMCT16S || (context->FRAME.FBMSK & 0xff000000) == 0x7f000000 ? 1 : 0; ps_sel.aout = context->FRAME.PSM == PSM_PSMCT16 || context->FRAME.PSM == PSM_PSMCT16S || (context->FRAME.FBMSK & 0xff000000) == 0x7f000000 ? 1 : 0;
if (UserHacks_AlphaHack) ps_sel.aout = 1; if (UserHacks_AlphaHack) ps_sel.aout = 1;
if(PRIM->FGE) if(PRIM->FGE)
{ {
ps_sel.fog = 1; ps_sel.fog = 1;

View File

@ -38,20 +38,20 @@ bool GSRendererDX11::CreateDevice(GSDevice* dev)
return true; return true;
} }
void GSRendererDX11::DoVertexKick() template<uint32 prim, uint32 tme, uint32 fst>
void GSRendererDX11::VertexKick(bool skip)
{ {
const bool tme = PRIM->TME;
const bool fst = PRIM->FST;
GSVertexHW11& dst = m_vl.AddTail(); GSVertexHW11& dst = m_vl.AddTail();
dst.vi[0] = m_v.vi[0]; dst.vi[0] = m_v.vi[0];
dst.vi[1] = m_v.vi[1]; dst.vi[1] = m_v.vi[1];
#ifdef USE_UPSCALE_HACKS #ifdef USE_UPSCALE_HACKS
if(tme && fst) if(tme && fst)
{ {
//GSVector4::storel(&dst.ST, m_v.GetUV()); //GSVector4::storel(&dst.ST, m_v.GetUV());
int Udiff = 0; int Udiff = 0;
int Vdiff = 0; int Vdiff = 0;
int Uadjust = 0; int Uadjust = 0;
@ -95,6 +95,7 @@ void GSRendererDX11::DoVertexKick()
else if (Vdiff <= 1) { Vadjust = 1; } else if (Vdiff <= 1) { Vadjust = 1; }
} }
} }
dst.ST.S = (float)m_v.UV.U - Uadjust; dst.ST.S = (float)m_v.UV.U - Uadjust;
dst.ST.T = (float)m_v.UV.V - Vadjust; dst.ST.T = (float)m_v.UV.V - Vadjust;
} }
@ -104,104 +105,103 @@ void GSRendererDX11::DoVertexKick()
//dst.XYZ.X += 5; //dst.XYZ.X += 5;
//dst.XYZ.Y += 5; //dst.XYZ.Y += 5;
} }
#else #else
if(tme && fst) if(tme && fst)
{ {
GSVector4::storel(&dst.ST, m_v.GetUV()); GSVector4::storel(&dst.ST, m_v.GetUV());
} }
#endif #endif
}
template< uint32 prim > int count = 0;
void GSRendererDX11::DrawingKick( bool skip )
{ if(GSVertexHW11* v = DrawingKick<prim>(skip, count))
int count;
GSVertexHW11* v = BaseDrawingKick<prim>(count);
if (skip || !v) return;
GSVector4i scissor = m_context->scissor.dx10;
GSVector4i pmin, pmax;
#if _M_SSE >= 0x401
GSVector4i v0, v1, v2;
switch(prim)
{ {
case GS_POINTLIST: GSVector4i scissor = m_context->scissor.dx10;
v0 = GSVector4i::load((int)v[0].p.xy).upl16();
pmin = v0; GSVector4i pmin, pmax;
pmax = v0;
break; #if _M_SSE >= 0x401
case GS_LINELIST:
case GS_LINESTRIP: GSVector4i v0, v1, v2;
case GS_SPRITE:
v0 = GSVector4i::load((int)v[0].p.xy); switch(prim)
v1 = GSVector4i::load((int)v[1].p.xy); {
pmin = v0.min_u16(v1).upl16(); case GS_POINTLIST:
pmax = v0.max_u16(v1).upl16(); v0 = GSVector4i::load((int)v[0].p.xy).upl16();
break; pmin = v0;
case GS_TRIANGLELIST: pmax = v0;
case GS_TRIANGLESTRIP: break;
case GS_TRIANGLEFAN: case GS_LINELIST:
v0 = GSVector4i::load((int)v[0].p.xy); case GS_LINESTRIP:
v1 = GSVector4i::load((int)v[1].p.xy); case GS_SPRITE:
v2 = GSVector4i::load((int)v[2].p.xy); v0 = GSVector4i::load((int)v[0].p.xy);
pmin = v0.min_u16(v1).min_u16(v2).upl16(); v1 = GSVector4i::load((int)v[1].p.xy);
pmax = v0.max_u16(v1).max_u16(v2).upl16(); pmin = v0.min_u16(v1).upl16();
break; pmax = v0.max_u16(v1).upl16();
break;
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
v0 = GSVector4i::load((int)v[0].p.xy);
v1 = GSVector4i::load((int)v[1].p.xy);
v2 = GSVector4i::load((int)v[2].p.xy);
pmin = v0.min_u16(v1).min_u16(v2).upl16();
pmax = v0.max_u16(v1).max_u16(v2).upl16();
break;
}
#else
switch(prim)
{
case GS_POINTLIST:
pmin.x = v[0].p.x;
pmin.y = v[0].p.y;
pmax.x = v[0].p.x;
pmax.y = v[0].p.y;
break;
case GS_LINELIST:
case GS_LINESTRIP:
case GS_SPRITE:
pmin.x = std::min<uint16>(v[0].p.x, v[1].p.x);
pmin.y = std::min<uint16>(v[0].p.y, v[1].p.y);
pmax.x = std::max<uint16>(v[0].p.x, v[1].p.x);
pmax.y = std::max<uint16>(v[0].p.y, v[1].p.y);
break;
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
pmin.x = std::min<uint16>(std::min<uint16>(v[0].p.x, v[1].p.x), v[2].p.x);
pmin.y = std::min<uint16>(std::min<uint16>(v[0].p.y, v[1].p.y), v[2].p.y);
pmax.x = std::max<uint16>(std::max<uint16>(v[0].p.x, v[1].p.x), v[2].p.x);
pmax.y = std::max<uint16>(std::max<uint16>(v[0].p.y, v[1].p.y), v[2].p.y);
break;
}
#endif
GSVector4i test = (pmax < scissor) | (pmin > scissor.zwxy());
switch(prim)
{
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
case GS_SPRITE:
test |= pmin == pmax;
break;
}
if(test.mask() & 0xff)
{
return;
}
m_count += count;
} }
#else
switch(prim)
{
case GS_POINTLIST:
pmin.x = v[0].p.x;
pmin.y = v[0].p.y;
pmax.x = v[0].p.x;
pmax.y = v[0].p.y;
break;
case GS_LINELIST:
case GS_LINESTRIP:
case GS_SPRITE:
pmin.x = std::min<uint16>(v[0].p.x, v[1].p.x);
pmin.y = std::min<uint16>(v[0].p.y, v[1].p.y);
pmax.x = std::max<uint16>(v[0].p.x, v[1].p.x);
pmax.y = std::max<uint16>(v[0].p.y, v[1].p.y);
break;
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
pmin.x = std::min<uint16>(std::min<uint16>(v[0].p.x, v[1].p.x), v[2].p.x);
pmin.y = std::min<uint16>(std::min<uint16>(v[0].p.y, v[1].p.y), v[2].p.y);
pmax.x = std::max<uint16>(std::max<uint16>(v[0].p.x, v[1].p.x), v[2].p.x);
pmax.y = std::max<uint16>(std::max<uint16>(v[0].p.y, v[1].p.y), v[2].p.y);
break;
}
#endif
GSVector4i test = (pmax < scissor) | (pmin > scissor.zwxy());
switch(prim)
{
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
case GS_SPRITE:
test |= pmin == pmax;
break;
}
if(test.mask() & 0xff)
{
return;
}
m_count += count;
} }
void GSRendererDX11::Draw(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex) void GSRendererDX11::Draw(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex)

View File

@ -36,8 +36,5 @@ public:
bool CreateDevice(GSDevice* dev); bool CreateDevice(GSDevice* dev);
template<uint32 prim> template<uint32 prim, uint32 tme, uint32 fst> void VertexKick(bool skip);
void DrawingKick( bool skip );
void DoVertexKick();
}; };

View File

@ -57,11 +57,9 @@ bool GSRendererDX9::CreateDevice(GSDevice* dev)
return true; return true;
} }
void GSRendererDX9::DoVertexKick() template<uint32 prim, uint32 tme, uint32 fst>
void GSRendererDX9::VertexKick(bool skip)
{ {
const bool tme = PRIM->TME;
const bool fst = PRIM->FST;
GSVertexHW9& dst = m_vl.AddTail(); GSVertexHW9& dst = m_vl.AddTail();
dst.p = GSVector4(((GSVector4i)m_v.XYZ).upl16()); dst.p = GSVector4(((GSVector4i)m_v.XYZ).upl16());
@ -142,92 +140,90 @@ void GSRendererDX9::DoVertexKick()
dst.c0 = m_v.RGBAQ.u32[0]; dst.c0 = m_v.RGBAQ.u32[0];
dst.c1 = m_v.FOG.u32[1]; dst.c1 = m_v.FOG.u32[1];
}
template< uint32 prim > //
void GSRendererDX9::DrawingKick( bool skip )
{
int count;
// BaseDrawingKick can never return NULL here because the DrawingKick function // BaseDrawingKick can never return NULL here because the DrawingKick function
// tables route to DrawingKickNull for GS_INVLALID prim types (and that's the only // tables route to DrawingKickNull for GS_INVLALID prim types (and that's the only
// condition where this function would return NULL). // condition where this function would return NULL).
GSVertexHW9* v = BaseDrawingKick<prim>(count); int count = 0;
if (skip || !v) return;
if(GSVertexHW9* v = DrawingKick<prim>(skip, count))
GSVector4 scissor = m_context->scissor.dx9;
GSVector4 pmin, pmax;
switch(prim)
{ {
case GS_POINTLIST: GSVector4 scissor = m_context->scissor.dx9;
pmin = v[0].p;
pmax = v[0].p; GSVector4 pmin, pmax;
break;
case GS_LINELIST: switch(prim)
case GS_LINESTRIP: {
case GS_SPRITE: case GS_POINTLIST:
pmin = v[0].p.min(v[1].p); pmin = v[0].p;
pmax = v[0].p.max(v[1].p); pmax = v[0].p;
break; break;
case GS_TRIANGLELIST: case GS_LINELIST:
case GS_TRIANGLESTRIP: case GS_LINESTRIP:
case GS_TRIANGLEFAN: case GS_SPRITE:
pmin = v[0].p.min(v[1].p).min(v[2].p); pmin = v[0].p.min(v[1].p);
pmax = v[0].p.max(v[1].p).max(v[2].p); pmax = v[0].p.max(v[1].p);
break; break;
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
pmin = v[0].p.min(v[1].p).min(v[2].p);
pmax = v[0].p.max(v[1].p).max(v[2].p);
break;
}
GSVector4 test = (pmax < scissor) | (pmin > scissor.zwxy());
switch(prim)
{
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
case GS_SPRITE:
test |= pmin == pmax;
break;
}
if(test.mask() & 3)
{
return;
}
switch(prim)
{
case GS_POINTLIST:
break;
case GS_LINELIST:
case GS_LINESTRIP:
if(PRIM->IIP == 0) {v[0].c0 = v[1].c0;}
break;
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
if(PRIM->IIP == 0) {v[0].c0 = v[1].c0 = v[2].c0;}
break;
case GS_SPRITE:
if(PRIM->IIP == 0) {v[0].c0 = v[1].c0;}
v[0].p.z = v[1].p.z;
v[0].p.w = v[1].p.w;
v[0].c1 = v[1].c1;
v[2] = v[1];
v[3] = v[1];
v[1].p.y = v[0].p.y;
v[1].t.y = v[0].t.y;
v[2].p.x = v[0].p.x;
v[2].t.x = v[0].t.x;
v[4] = v[1];
v[5] = v[2];
count += 4;
break;
}
m_count += count;
} }
GSVector4 test = (pmax < scissor) | (pmin > scissor.zwxy());
switch(prim)
{
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
case GS_SPRITE:
test |= pmin == pmax;
break;
}
if(test.mask() & 3)
{
return;
}
switch(prim)
{
case GS_POINTLIST:
break;
case GS_LINELIST:
case GS_LINESTRIP:
if(PRIM->IIP == 0) {v[0].c0 = v[1].c0;}
break;
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
if(PRIM->IIP == 0) {v[0].c0 = v[1].c0 = v[2].c0;}
break;
case GS_SPRITE:
if(PRIM->IIP == 0) {v[0].c0 = v[1].c0;}
v[0].p.z = v[1].p.z;
v[0].p.w = v[1].p.w;
v[0].c1 = v[1].c1;
v[2] = v[1];
v[3] = v[1];
v[1].p.y = v[0].p.y;
v[1].t.y = v[0].t.y;
v[2].p.x = v[0].p.x;
v[2].t.x = v[0].t.x;
v[4] = v[1];
v[5] = v[2];
count += 4;
break;
}
m_count += count;
} }
void GSRendererDX9::Draw(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex) void GSRendererDX9::Draw(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex)

View File

@ -43,8 +43,5 @@ public:
bool CreateDevice(GSDevice* dev); bool CreateDevice(GSDevice* dev);
template<uint32 prim> template<uint32 prim, uint32 tme, uint32 fst> void VertexKick(bool skip);
void DrawingKick( bool skip );
void DoVertexKick();
}; };

View File

@ -43,10 +43,7 @@ public:
InitVertexKick<GSRendererNull>(); InitVertexKick<GSRendererNull>();
} }
virtual ~GSRendererNull() {} template<uint32 prim, uint32 tme, uint32 fst> void VertexKick(bool skip)
{
template<uint32 prim> }
void DrawingKick( bool skip ) {}
void DoVertexKick() {}
}; };

View File

@ -94,6 +94,7 @@ GSTexture* GSRendererSW::GetOutput(int i)
if(m_dev->ResizeTexture(&m_texture[i], w, h)) if(m_dev->ResizeTexture(&m_texture[i], w, h))
{ {
uint8* buff = GetTextureBufferLock(); uint8* buff = GetTextureBufferLock();
static int pitch = 1024 * 4; static int pitch = 1024 * 4;
GSVector4i r(0, 0, w, h); GSVector4i r(0, 0, w, h);
@ -113,6 +114,7 @@ GSTexture* GSRendererSW::GetOutput(int i)
s_n++; s_n++;
} }
ReleaseTextureBufferLock(); ReleaseTextureBufferLock();
} }
@ -427,24 +429,22 @@ void GSRendererSW::GetScanlineParam(GSScanlineParam& p, GS_PRIM_CLASS primclass)
} }
} }
void GSRendererSW::DoVertexKick() template<uint32 prim, uint32 tme, uint32 fst>
void GSRendererSW::VertexKick(bool skip)
{ {
const bool tme = PRIM->TME; const GSDrawingContext* context = m_context;
const bool fst = PRIM->FST;
const GSDrawingContext& context = *m_context;
GSVector4i xy = GSVector4i::load((int)m_v.XYZ.u32[0]); GSVector4i xy = GSVector4i::load((int)m_v.XYZ.u32[0]);
xy = xy.insert16<3>(m_v.FOG.F); xy = xy.insert16<3>(m_v.FOG.F);
xy = xy.upl16(); xy = xy.upl16();
xy -= context.XYOFFSET; xy -= context->XYOFFSET;
GSVertexSW& dst = m_vl.AddTail(); GSVertexSW v;
dst.p = GSVector4(xy) * g_pos_scale; v.p = GSVector4(xy) * g_pos_scale;
dst.c = GSVector4(GSVector4i::load((int)m_v.RGBAQ.u32[0]).u8to32() << 7); v.c = GSVector4(GSVector4i::load((int)m_v.RGBAQ.u32[0]).u8to32() << 7);
if(tme) if(tme)
{ {
@ -452,37 +452,31 @@ void GSRendererSW::DoVertexKick()
if(fst) if(fst)
{ {
dst.t = GSVector4(((GSVector4i)m_v.UV).upl16() << (16 - 4)); v.t = GSVector4(((GSVector4i)m_v.UV).upl16() << (16 - 4));
q = 1.0f; q = 1.0f;
} }
else else
{ {
dst.t = GSVector4(m_v.ST.S, m_v.ST.T); v.t = GSVector4(m_v.ST.S, m_v.ST.T);
dst.t *= GSVector4(0x10000 << context.TEX0.TW, 0x10000 << context.TEX0.TH); v.t *= GSVector4(0x10000 << context->TEX0.TW, 0x10000 << context->TEX0.TH);
q = m_v.RGBAQ.Q; q = m_v.RGBAQ.Q;
} }
dst.t = dst.t.xyxy(GSVector4::load(q)); v.t = v.t.xyxy(GSVector4::load(q));
} }
GSVertexSW& dst = m_vl.AddTail();
dst = v;
dst.p.z = (float)min(m_v.XYZ.Z, 0xffffff00); // max value which can survive the uint32 => float => uint32 conversion dst.p.z = (float)min(m_v.XYZ.Z, 0xffffff00); // max value which can survive the uint32 => float => uint32 conversion
}
int count = 0;
template< uint32 prim >
void GSRendererSW::DrawingKick( bool skip ) if(GSVertexSW* v = DrawingKick<prim>(skip, count))
{
int count;
// BaseDrawingKick can never return NULL here because the DrawingKick function
// tables route to DrawingKickNull for GS_INVLALID prim types (and that's the only
// condition where this function would return NULL).
GSVertexSW* v = BaseDrawingKick<prim>(count);
if (skip || !v) return;
if(!m_dump)
{ {
if(!m_dump)
{
GSVector4 pmin, pmax; GSVector4 pmin, pmax;
switch(prim) switch(prim)
@ -505,7 +499,7 @@ void GSRendererSW::DrawingKick( bool skip )
break; break;
} }
GSVector4 scissor = m_context->scissor.ex; GSVector4 scissor = context->scissor.ex;
GSVector4 test = (pmax < scissor) | (pmin > scissor.zwxy()); GSVector4 test = (pmax < scissor) | (pmin > scissor.zwxy());
@ -529,77 +523,77 @@ void GSRendererSW::DrawingKick( bool skip )
test |= tmp == tmp.yxwz(); test |= tmp == tmp.yxwz();
break; break;
} }
if(test.mask() & 3) if(test.mask() & 3)
{ {
return; return;
} }
} }
switch(prim)
{
case GS_POINTLIST:
break;
case GS_LINELIST:
case GS_LINESTRIP:
if(PRIM->IIP == 0) {v[0].c = v[1].c;}
break;
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
if(PRIM->IIP == 0) {v[0].c = v[2].c; v[1].c = v[2].c;}
break;
case GS_SPRITE:
break;
}
if(m_count < 30 && m_count >= 3)
{
GSVertexSW* v = &m_vertices[m_count - 3];
int tl = 0;
int br = 0;
bool isquad = false;
switch(prim) switch(prim)
{ {
case GS_POINTLIST:
break;
case GS_LINELIST:
case GS_LINESTRIP:
if(PRIM->IIP == 0) {v[0].c = v[1].c;}
break;
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP: case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN: case GS_TRIANGLEFAN:
case GS_TRIANGLELIST: if(PRIM->IIP == 0) {v[0].c = v[2].c; v[1].c = v[2].c;}
isquad = GSVertexSW::IsQuad(v, tl, br); break;
case GS_SPRITE:
break; break;
} }
if(isquad) if(m_count < 30 && m_count >= 3)
{ {
m_count -= 3; GSVertexSW* v = &m_vertices[m_count - 3];
if(m_count > 0) int tl = 0;
int br = 0;
bool isquad = false;
switch(prim)
{ {
tl += m_count; case GS_TRIANGLESTRIP:
br += m_count; case GS_TRIANGLEFAN:
case GS_TRIANGLELIST:
Flush(); isquad = GSVertexSW::IsQuad(v, tl, br);
break;
} }
if(tl != 0) m_vertices[0] = m_vertices[tl]; if(isquad)
if(br != 1) m_vertices[1] = m_vertices[br]; {
m_count -= 3;
m_count = 2; if(m_count > 0)
{
tl += m_count;
br += m_count;
uint32 tmp = PRIM->PRIM; Flush();
PRIM->PRIM = GS_SPRITE; }
Flush(); if(tl != 0) m_vertices[0] = m_vertices[tl];
if(br != 1) m_vertices[1] = m_vertices[br];
PRIM->PRIM = tmp; m_count = 2;
m_perfmon.Put(GSPerfMon::Quad, 1); uint32 tmp = PRIM->PRIM;
PRIM->PRIM = GS_SPRITE;
return; Flush();
PRIM->PRIM = tmp;
m_perfmon.Put(GSPerfMon::Quad, 1);
return;
}
} }
}
m_count += count; m_count += count;
}
} }

View File

@ -47,13 +47,6 @@ public:
GSRendererSW(); GSRendererSW();
virtual ~GSRendererSW(); virtual ~GSRendererSW();
template<uint32 prim> template<uint32 prim, uint32 tme, uint32 fst>
void DrawingKick( bool skip ); void VertexKick(bool skip);
void DoVertexKick();
void InvalidateTextureCache()
{
m_tc->RemoveAll();
}
}; };

View File

@ -99,7 +99,7 @@ union GSScanlineSelector
} }
}; };
__aligned16 struct GSScanlineParam __aligned32 struct GSScanlineParam
{ {
GSScanlineSelector sel; GSScanlineSelector sel;
@ -115,7 +115,7 @@ __aligned16 struct GSScanlineParam
uint32 fm, zm; uint32 fm, zm;
}; };
__aligned16 struct GSScanlineEnvironment __aligned32 struct GSScanlineEnvironment
{ {
void* vm; void* vm;
const void* tex; const void* tex;

View File

@ -88,7 +88,9 @@ void GSSettingsDlg::OnInit()
ComboBoxAppend(IDC_RESOLUTION, "Please select...", (LPARAM)&m_modes.back(), true); ComboBoxAppend(IDC_RESOLUTION, "Please select...", (LPARAM)&m_modes.back(), true);
CComPtr<IDirect3D9> d3d; CComPtr<IDirect3D9> d3d;
d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION)); d3d.Attach(Direct3DCreate9(D3D_SDK_VERSION));
if(d3d) if(d3d)
{ {
uint32 w = theApp.GetConfig("ModeWidth", 0); uint32 w = theApp.GetConfig("ModeWidth", 0);
@ -151,10 +153,13 @@ void GSSettingsDlg::OnInit()
SendMessage(GetDlgItem(m_hWnd, IDC_RESY), UDM_SETRANGE, 0, MAKELPARAM(8192, 256)); SendMessage(GetDlgItem(m_hWnd, IDC_RESY), UDM_SETRANGE, 0, MAKELPARAM(8192, 256));
SendMessage(GetDlgItem(m_hWnd, IDC_RESY), UDM_SETPOS, 0, MAKELPARAM(theApp.GetConfig("resy", 1024), 0)); SendMessage(GetDlgItem(m_hWnd, IDC_RESY), UDM_SETPOS, 0, MAKELPARAM(theApp.GetConfig("resy", 1024), 0));
int r=theApp.GetConfig("Renderer", 0); int r = theApp.GetConfig("Renderer", 0);
if (r>=0 && r<=2){//DX9
if(r >= 0 && r <= 2) // DX9
{
GSDevice9::ForceValidMsaaConfig(); GSDevice9::ForceValidMsaaConfig();
m_lastValidMsaa=theApp.GetConfig("msaa", 0);
m_lastValidMsaa = theApp.GetConfig("msaa", 0);
} }
SendMessage(GetDlgItem(m_hWnd, IDC_MSAA), UDM_SETRANGE, 0, MAKELPARAM(16, 0)); SendMessage(GetDlgItem(m_hWnd, IDC_MSAA), UDM_SETRANGE, 0, MAKELPARAM(16, 0));

View File

@ -48,7 +48,14 @@ void GSSetupPrimCodeGenerator::Generate()
{ {
for(int i = 0; i < 5; i++) for(int i = 0; i < 5; i++)
{ {
movaps(Xmm(3 + i), xmmword[&m_shift[i]]); if(m_cpu.has(util::Cpu::tAVX))
{
vmovaps(Xmm(3 + i), ptr[&m_shift[i]]);
}
else
{
movaps(Xmm(3 + i), ptr[&m_shift[i]]);
}
} }
} }
@ -68,113 +75,221 @@ void GSSetupPrimCodeGenerator::Depth()
return; return;
} }
if(!m_sel.sprite) if(m_cpu.has(util::Cpu::tAVX))
{ {
// GSVector4 t = dscan.p; if(!m_sel.sprite)
movaps(xmm0, xmmword[edx + 16]);
if(m_en.f)
{ {
// GSVector4 df = p.wwww(); // GSVector4 t = dscan.p;
movaps(xmm1, xmm0); vmovaps(xmm0, ptr[edx + 16]);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
// m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh(); if(m_en.f)
movaps(xmm2, xmm1);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(xmmword[&m_env.d4.f], xmm2);
for(int i = 0; i < 4; i++)
{ {
// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); // GSVector4 df = p.wwww();
movaps(xmm2, xmm1); vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2); // m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vmulps(xmm2, xmm1, xmm3);
movdqa(xmmword[&m_env.d[i].f], xmm2); vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_env.d4.f], xmm2);
for(int i = 0; i < 4; i++)
{
// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
vmulps(xmm2, xmm1, Xmm(4 + i));
vcvttps2dq(xmm2, xmm2);
vpshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_env.d[i].f], xmm2);
}
}
if(m_en.z)
{
// GSVector4 dz = p.zzzz();
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_env.d4.z = dz * 4.0f;
vmulps(xmm1, xmm0, xmm3);
vmovdqa(ptr[&m_env.d4.z], xmm1);
for(int i = 0; i < 4; i++)
{
// m_env.d[i].z = dz * m_shift[i];
vmulps(xmm1, xmm0, Xmm(4 + i));
vmovdqa(ptr[&m_env.d[i].z], xmm1);
}
} }
} }
else
if(m_en.z)
{ {
// GSVector4 dz = p.zzzz(); // GSVector4 p = vertices[0].p;
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); vmovaps(xmm0, ptr[ecx + 16]);
// m_env.d4.z = dz * 4.0f; if(m_en.f)
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
movdqa(xmmword[&m_env.d4.z], xmm1);
for(int i = 0; i < 4; i++)
{ {
// m_env.d[i].z = dz * m_shift[i]; // m_env.p.f = GSVector4i(p).zzzzh().zzzz();
movaps(xmm1, xmm0); vcvttps2dq(xmm1, xmm0);
mulps(xmm1, Xmm(4 + i)); vpshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(xmmword[&m_env.d[i].z], xmm1); vpshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[&m_env.p.f], xmm1);
}
if(m_en.z)
{
// GSVector4 z = p.zzzz();
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
if(m_sel.zoverflow)
{
// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
static const float half = 0.5f;
vmovss(xmm1, dword[&half]);
vshufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
vmulps(xmm1, xmm0);
vcvttps2dq(xmm1, xmm1);
vpslld(xmm1, 1);
vcvttps2dq(xmm0, xmm0);
vpcmpeqd(xmm2, xmm2);
vpsrld(xmm2, 31);
vpand(xmm0, xmm2);
vpor(xmm0, xmm1);
}
else
{
// m_env.p.z = GSVector4i(z);
vcvttps2dq(xmm0, xmm0);
}
vmovdqa(ptr[&m_env.p.z], xmm0);
} }
} }
} }
else else
{ {
// GSVector4 p = vertices[0].p; if(!m_sel.sprite)
movaps(xmm0, xmmword[ecx + 16]);
if(m_en.f)
{ {
// m_env.p.f = GSVector4i(p).zzzzh().zzzz(); // GSVector4 t = dscan.p;
movaps(xmm1, xmm0); movaps(xmm0, ptr[edx + 16]);
cvttps2dq(xmm1, xmm1);
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); if(m_en.f)
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); {
movdqa(xmmword[&m_env.p.f], xmm1); // GSVector4 df = p.wwww();
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
// m_env.d4.f = GSVector4i(df * 4.0f).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[&m_env.d4.f], xmm2);
for(int i = 0; i < 4; i++)
{
// m_env.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
pshuflw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[&m_env.d[i].f], xmm2);
}
}
if(m_en.z)
{
// GSVector4 dz = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
// m_env.d4.z = dz * 4.0f;
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
movdqa(ptr[&m_env.d4.z], xmm1);
for(int i = 0; i < 4; i++)
{
// m_env.d[i].z = dz * m_shift[i];
movaps(xmm1, xmm0);
mulps(xmm1, Xmm(4 + i));
movdqa(ptr[&m_env.d[i].z], xmm1);
}
}
} }
else
if(m_en.z)
{ {
// GSVector4 z = p.zzzz(); // GSVector4 p = vertices[0].p;
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); movaps(xmm0, ptr[ecx + 16]);
if(m_sel.zoverflow) if(m_en.f)
{ {
// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); // m_env.p.f = GSVector4i(p).zzzzh().zzzz();
static const float half = 0.5f; cvttps2dq(xmm1, xmm0);
pshufhw(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
movss(xmm1, dword[&half]); pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); movdqa(ptr[&m_env.p.f], xmm1);
mulps(xmm1, xmm0);
cvttps2dq(xmm1, xmm1);
pslld(xmm1, 1);
cvttps2dq(xmm0, xmm0);
pcmpeqd(xmm2, xmm2);
psrld(xmm2, 31);
pand(xmm0, xmm2);
por(xmm0, xmm1);
}
else
{
// m_env.p.z = GSVector4i(z);
cvttps2dq(xmm0, xmm0);
} }
movdqa(xmmword[&m_env.p.z], xmm0); if(m_en.z)
{
// GSVector4 z = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
if(m_sel.zoverflow)
{
// m_env.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
static const float half = 0.5f;
movss(xmm1, dword[&half]);
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
mulps(xmm1, xmm0);
cvttps2dq(xmm1, xmm1);
pslld(xmm1, 1);
cvttps2dq(xmm0, xmm0);
pcmpeqd(xmm2, xmm2);
psrld(xmm2, 31);
pand(xmm0, xmm2);
por(xmm0, xmm1);
}
else
{
// m_env.p.z = GSVector4i(z);
cvttps2dq(xmm0, xmm0);
}
movdqa(ptr[&m_env.p.z], xmm0);
}
} }
} }
} }
@ -186,64 +301,129 @@ void GSSetupPrimCodeGenerator::Texture()
return; return;
} }
// GSVector4 t = dscan.t; if(m_cpu.has(util::Cpu::tAVX))
movaps(xmm0, xmmword[edx + 32]);
movaps(xmm1, xmm0);
mulps(xmm1, xmm3);
if(m_sel.fst)
{ {
// m_env.d4.st = GSVector4i(t * 4.0f); // GSVector4 t = dscan.t;
cvttps2dq(xmm1, xmm1); vmovaps(xmm0, ptr[edx + 32]);
movdqa(xmmword[&m_env.d4.st], xmm1);
vmulps(xmm1, xmm0, xmm3);
if(m_sel.fst)
{
// m_env.d4.st = GSVector4i(t * 4.0f);
vcvttps2dq(xmm1, xmm1);
vmovdqa(ptr[&m_env.d4.st], xmm1);
}
else
{
// m_env.d4.stq = t * 4.0f;
vmovaps(ptr[&m_env.d4.stq], xmm1);
}
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
// GSVector4 dq = t.zzzz();
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < 4; i++)
{
// GSVector4 v = ds/dt * m_shift[i];
vmulps(xmm2, xmm1, Xmm(4 + i));
if(m_sel.fst)
{
// m_env.d[i].si/ti = GSVector4i(v);
vcvttps2dq(xmm2, xmm2);
switch(j)
{
case 0: vmovdqa(ptr[&m_env.d[i].si], xmm2); break;
case 1: vmovdqa(ptr[&m_env.d[i].ti], xmm2); break;
}
}
else
{
// m_env.d[i].s/t/q = v;
switch(j)
{
case 0: vmovaps(ptr[&m_env.d[i].s], xmm2); break;
case 1: vmovaps(ptr[&m_env.d[i].t], xmm2); break;
case 2: vmovaps(ptr[&m_env.d[i].q], xmm2); break;
}
}
}
}
} }
else else
{ {
// m_env.d4.stq = t * 4.0f; // GSVector4 t = dscan.t;
movaps(xmmword[&m_env.d4.stq], xmm1); movaps(xmm0, ptr[edx + 32]);
}
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
// GSVector4 dq = t.zzzz();
movaps(xmm1, xmm0); movaps(xmm1, xmm0);
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j)); mulps(xmm1, xmm3);
for(int i = 0; i < 4; i++) if(m_sel.fst)
{ {
// GSVector4 v = ds/dt * m_shift[i]; // m_env.d4.st = GSVector4i(t * 4.0f);
movaps(xmm2, xmm1); cvttps2dq(xmm1, xmm1);
mulps(xmm2, Xmm(4 + i)); movdqa(ptr[&m_env.d4.st], xmm1);
}
else
{
// m_env.d4.stq = t * 4.0f;
if(m_sel.fst) movaps(ptr[&m_env.d4.stq], xmm1);
}
for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{
// GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy();
// GSVector4 dq = t.zzzz();
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < 4; i++)
{ {
// m_env.d[i].si/ti = GSVector4i(v); // GSVector4 v = ds/dt * m_shift[i];
cvttps2dq(xmm2, xmm2); movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i));
switch(j) if(m_sel.fst)
{ {
case 0: movdqa(xmmword[&m_env.d[i].si], xmm2); break; // m_env.d[i].si/ti = GSVector4i(v);
case 1: movdqa(xmmword[&m_env.d[i].ti], xmm2); break;
cvttps2dq(xmm2, xmm2);
switch(j)
{
case 0: movdqa(ptr[&m_env.d[i].si], xmm2); break;
case 1: movdqa(ptr[&m_env.d[i].ti], xmm2); break;
}
} }
} else
else
{
// m_env.d[i].s/t/q = v;
switch(j)
{ {
case 0: movaps(xmmword[&m_env.d[i].s], xmm2); break; // m_env.d[i].s/t/q = v;
case 1: movaps(xmmword[&m_env.d[i].t], xmm2); break;
case 2: movaps(xmmword[&m_env.d[i].q], xmm2); break; switch(j)
{
case 0: movaps(ptr[&m_env.d[i].s], xmm2); break;
case 1: movaps(ptr[&m_env.d[i].t], xmm2); break;
case 2: movaps(ptr[&m_env.d[i].q], xmm2); break;
}
} }
} }
} }
@ -257,113 +437,217 @@ void GSSetupPrimCodeGenerator::Color()
return; return;
} }
if(m_sel.iip) if(m_cpu.has(util::Cpu::tAVX))
{ {
// GSVector4 c = dscan.c; if(m_sel.iip)
movaps(xmm0, xmmword[edx]);
movaps(xmm1, xmm0);
// m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
movaps(xmm2, xmm0);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
packssdw(xmm2, xmm2);
movdqa(xmmword[&m_env.d4.c], xmm2);
// xmm3 is not needed anymore
// GSVector4 dr = c.xxxx();
// GSVector4 db = c.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < 4; i++)
{ {
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); // GSVector4 c = dscan.c;
movaps(xmm2, xmm0); vmovaps(xmm0, ptr[edx]);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
packssdw(xmm2, xmm2);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32(); // m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
movaps(xmm3, xmm1); vmulps(xmm1, xmm0, xmm3);
mulps(xmm3, Xmm(4 + i)); vcvttps2dq(xmm1, xmm1);
cvttps2dq(xmm3, xmm3); vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 1, 2, 0));
packssdw(xmm3, xmm3); vpackssdw(xmm1, xmm1);
vmovdqa(ptr[&m_env.d4.c], xmm1);
// m_env.d[i].rb = r.upl16(b); // xmm3 is not needed anymore
punpcklwd(xmm2, xmm3); // GSVector4 dr = c.xxxx();
movdqa(xmmword[&m_env.d[i].rb], xmm2); // GSVector4 db = c.zzzz();
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < 4; i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
vmulps(xmm0, xmm2, Xmm(4 + i));
vcvttps2dq(xmm0, xmm0);
vpackssdw(xmm0, xmm0);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
vmulps(xmm1, xmm3, Xmm(4 + i));
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_env.d[i].rb = r.upl16(b);
vpunpcklwd(xmm0, xmm1);
vmovdqa(ptr[&m_env.d[i].rb], xmm0);
}
// GSVector4 c = dscan.c;
vmovaps(xmm0, ptr[edx]); // not enough regs, have to reload it
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
vmulps(xmm0, xmm2, Xmm(4 + i));
vcvttps2dq(xmm0, xmm0);
vpackssdw(xmm0, xmm0);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
vmulps(xmm1, xmm3, Xmm(4 + i));
vcvttps2dq(xmm1, xmm1);
vpackssdw(xmm1, xmm1);
// m_env.d[i].ga = g.upl16(a);
vpunpcklwd(xmm0, xmm1);
vmovdqa(ptr[&m_env.d[i].ga], xmm0);
}
} }
else
// GSVector4 c = dscan.c;
movaps(xmm0, xmmword[edx]); // not enough regs, have to reload it
movaps(xmm1, xmm0);
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++)
{ {
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); // GSVector4i c = GSVector4i(vertices[0].c);
movaps(xmm2, xmm0); vcvttps2dq(xmm0, ptr[ecx]);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
packssdw(xmm2, xmm2);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32(); // c = c.upl16(c.zwxy());
movaps(xmm3, xmm1); vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
mulps(xmm3, Xmm(4 + i)); vpunpcklwd(xmm0, xmm1);
cvttps2dq(xmm3, xmm3);
packssdw(xmm3, xmm3);
// m_env.d[i].ga = g.upl16(a); // if(!tme) c = c.srl16(7);
punpcklwd(xmm2, xmm3); if(m_sel.tfx == TFX_NONE)
movdqa(xmmword[&m_env.d[i].ga], xmm2); {
vpsrlw(xmm0, 7);
}
// m_env.c.rb = c.xxxx();
// m_env.c.ga = c.zzzz();
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vpshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vmovdqa(ptr[&m_env.c.rb], xmm1);
vmovdqa(ptr[&m_env.c.ga], xmm2);
} }
} }
else else
{ {
// GSVector4i c = GSVector4i(vertices[0].c); if(m_sel.iip)
movaps(xmm0, xmmword[ecx]);
cvttps2dq(xmm0, xmm0);
// c = c.upl16(c.zwxy());
movdqa(xmm1, xmm0);
pshufd(xmm1, xmm1, _MM_SHUFFLE(1, 0, 3, 2));
punpcklwd(xmm0, xmm1);
// if(!tme) c = c.srl16(7);
if(m_sel.tfx == TFX_NONE)
{ {
psrlw(xmm0, 7); // GSVector4 c = dscan.c;
movaps(xmm0, ptr[edx]);
movaps(xmm1, xmm0);
// m_env.d4.c = GSVector4i(c * 4.0f).xzyw().ps32();
movaps(xmm2, xmm0);
mulps(xmm2, xmm3);
cvttps2dq(xmm2, xmm2);
pshufd(xmm2, xmm2, _MM_SHUFFLE(3, 1, 2, 0));
packssdw(xmm2, xmm2);
movdqa(ptr[&m_env.d4.c], xmm2);
// xmm3 is not needed anymore
// GSVector4 dr = c.xxxx();
// GSVector4 db = c.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < 4; i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
movaps(xmm2, xmm0);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
packssdw(xmm2, xmm2);
// GSVector4i b = GSVector4i(db * m_shift[i]).ps32();
movaps(xmm3, xmm1);
mulps(xmm3, Xmm(4 + i));
cvttps2dq(xmm3, xmm3);
packssdw(xmm3, xmm3);
// m_env.d[i].rb = r.upl16(b);
punpcklwd(xmm2, xmm3);
movdqa(ptr[&m_env.d[i].rb], xmm2);
}
// GSVector4 c = dscan.c;
movaps(xmm0, ptr[edx]); // not enough regs, have to reload it
movaps(xmm1, xmm0);
// GSVector4 dg = c.yyyy();
// GSVector4 da = c.wwww();
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();
movaps(xmm2, xmm0);
mulps(xmm2, Xmm(4 + i));
cvttps2dq(xmm2, xmm2);
packssdw(xmm2, xmm2);
// GSVector4i a = GSVector4i(da * m_shift[i]).ps32();
movaps(xmm3, xmm1);
mulps(xmm3, Xmm(4 + i));
cvttps2dq(xmm3, xmm3);
packssdw(xmm3, xmm3);
// m_env.d[i].ga = g.upl16(a);
punpcklwd(xmm2, xmm3);
movdqa(ptr[&m_env.d[i].ga], xmm2);
}
} }
else
{
// GSVector4i c = GSVector4i(vertices[0].c);
// m_env.c.rb = c.xxxx(); movaps(xmm0, ptr[ecx]);
// m_env.c.ga = c.zzzz(); cvttps2dq(xmm0, xmm0);
movdqa(xmm1, xmm0); // c = c.upl16(c.zwxy());
pshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); pshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
movdqa(xmmword[&m_env.c.rb], xmm0); punpcklwd(xmm0, xmm1);
movdqa(xmmword[&m_env.c.ga], xmm1);
// if(!tme) c = c.srl16(7);
if(m_sel.tfx == TFX_NONE)
{
psrlw(xmm0, 7);
}
// m_env.c.rb = c.xxxx();
// m_env.c.ga = c.zzzz();
pshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
pshufd(xmm2, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
movdqa(ptr[&m_env.c.rb], xmm1);
movdqa(ptr[&m_env.c.ga], xmm2);
}
} }
} }

View File

@ -84,7 +84,7 @@ GSState::GSState()
m_sssize += sizeof(m_tr.x); m_sssize += sizeof(m_tr.x);
m_sssize += sizeof(m_tr.y); m_sssize += sizeof(m_tr.y);
m_sssize += m_mem.m_vmsize; m_sssize += m_mem.m_vmsize;
m_sssize += (sizeof(m_path[0].tag) + sizeof(m_path[0].reg)) * ArraySize(m_path); m_sssize += (sizeof(m_path[0].tag) + sizeof(m_path[0].reg)) * countof(m_path);
m_sssize += sizeof(m_q); m_sssize += sizeof(m_q);
PRIM = &m_env.PRIM; PRIM = &m_env.PRIM;
@ -103,6 +103,7 @@ GSState::~GSState()
void GSState::SetRegsMem(uint8* basemem) void GSState::SetRegsMem(uint8* basemem)
{ {
ASSERT(basemem); ASSERT(basemem);
m_regs = (GSPrivRegSet*)basemem; m_regs = (GSPrivRegSet*)basemem;
} }
@ -111,84 +112,82 @@ void GSState::SetIrqCallback(void (*irq)())
m_irq = irq; m_irq = irq;
} }
void GSState::SetMultithreaded( bool isMT ) void GSState::SetMultithreaded(bool mt)
{ {
// Some older versions of PCSX2 didn't properly set the irq callback to NULL // Some older versions of PCSX2 didn't properly set the irq callback to NULL
// in multithreaded mode (possibly because ZeroGS itself would assert in such // in multithreaded mode (possibly because ZeroGS itself would assert in such
// cases), and didn't bind them to a dummy callback either. PCSX2 handles all // cases), and didn't bind them to a dummy callback either. PCSX2 handles all
// IRQs internally when multithreaded anyway -- so let's ignore them here: // IRQs internally when multithreaded anyway -- so let's ignore them here:
m_mt = isMT; m_mt = mt;
if( isMT )
if(mt)
{ {
m_fpGIFRegHandlers[GIF_A_D_REG_SIGNAL] = &GSState::GIFRegHandlerNull; m_fpGIFRegHandlers[GIF_A_D_REG_SIGNAL] = &GSState::GIFRegHandlerNull;
m_fpGIFRegHandlers[GIF_A_D_REG_FINISH] = &GSState::GIFRegHandlerNull; m_fpGIFRegHandlers[GIF_A_D_REG_FINISH] = &GSState::GIFRegHandlerNull;
m_fpGIFRegHandlers[GIF_A_D_REG_LABEL] = &GSState::GIFRegHandlerNull; m_fpGIFRegHandlers[GIF_A_D_REG_LABEL] = &GSState::GIFRegHandlerNull;
} }
else else
{ {
m_fpGIFRegHandlers[GIF_A_D_REG_SIGNAL] = &GSState::GIFRegHandlerSIGNAL; m_fpGIFRegHandlers[GIF_A_D_REG_SIGNAL] = &GSState::GIFRegHandlerSIGNAL;
m_fpGIFRegHandlers[GIF_A_D_REG_FINISH] = &GSState::GIFRegHandlerFINISH; m_fpGIFRegHandlers[GIF_A_D_REG_FINISH] = &GSState::GIFRegHandlerFINISH;
m_fpGIFRegHandlers[GIF_A_D_REG_LABEL] = &GSState::GIFRegHandlerLABEL; m_fpGIFRegHandlers[GIF_A_D_REG_LABEL] = &GSState::GIFRegHandlerLABEL;
} }
} }
void GSState::SetFrameSkip(int skip) void GSState::SetFrameSkip(int skip)
{ {
if(m_frameskip == skip) return; if(m_frameskip == skip) return;
m_frameskip = skip; m_frameskip = skip;
if(skip) if(skip)
{ {
#if !UsePackedRegSwitch m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerNOP; m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerNOP; m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = &GSState::GIFPackedRegHandlerNOP; m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = &GSState::GIFPackedRegHandlerNOP; m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerNOP; m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = &GSState::GIFPackedRegHandlerNOP; m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = &GSState::GIFPackedRegHandlerNOP;
#endif
m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_UV] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_UV] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerNOP;
} }
else else
{ {
#if !UsePackedRegSwitch m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerXYZF2;
m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerXYZF2; m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerXYZ2;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerXYZ2; m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<0>;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<0>; m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<1>;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<1>; m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerFOG;
m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerFOG; m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZF3;
m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZF3; m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZ3;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZ3;
#endif m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerPRIM;
m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerRGBAQ;
m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerPRIM; m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerST;
m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerRGBAQ; m_fpGIFRegHandlers[GIF_A_D_REG_UV] = &GSState::GIFRegHandlerUV;
m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerST; m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerXYZF2;
m_fpGIFRegHandlers[GIF_A_D_REG_UV] = &GSState::GIFRegHandlerUV; m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerXYZ2;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerXYZF2; m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerXYZF3;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerXYZ2; m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerXYZ3;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerXYZF3; m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerPRMODECONT;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerXYZ3; m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerPRMODE;
m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerPRMODECONT;
m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerPRMODE;
} }
} }
void GSState::Reset() void GSState::Reset()
{ {
memset(&m_path[0], 0, sizeof(m_path[0]) * ArraySize(m_path)); memset(&m_path[0], 0, sizeof(m_path[0]) * countof(m_path));
memset(&m_v, 0, sizeof(m_v)); memset(&m_v, 0, sizeof(m_v));
// PRIM = &m_env.PRIM; // PRIM = &m_env.PRIM;
@ -203,88 +202,86 @@ void GSState::Reset()
void GSState::ResetHandlers() void GSState::ResetHandlers()
{ {
#if !UsePackedRegSwitch
for(int i = 0; i < countof(m_fpGIFPackedRegHandlers); i++) for(int i = 0; i < countof(m_fpGIFPackedRegHandlers); i++)
{ {
m_fpGIFPackedRegHandlers[i] = &GSState::GIFPackedRegHandlerNull; m_fpGIFPackedRegHandlers[i] = &GSState::GIFPackedRegHandlerNull;
} }
m_fpGIFPackedRegHandlers[GIF_REG_PRIM] = (GIFPackedRegHandler)&GSState::GIFRegHandlerPRIM; m_fpGIFPackedRegHandlers[GIF_REG_PRIM] = (GIFPackedRegHandler)&GSState::GIFRegHandlerPRIM;
m_fpGIFPackedRegHandlers[GIF_REG_RGBA] = &GSState::GIFPackedRegHandlerRGBA; m_fpGIFPackedRegHandlers[GIF_REG_RGBA] = &GSState::GIFPackedRegHandlerRGBA;
m_fpGIFPackedRegHandlers[GIF_REG_STQ] = &GSState::GIFPackedRegHandlerSTQ; m_fpGIFPackedRegHandlers[GIF_REG_STQ] = &GSState::GIFPackedRegHandlerSTQ;
m_fpGIFPackedRegHandlers[GIF_REG_UV] = &GSState::GIFPackedRegHandlerUV; m_fpGIFPackedRegHandlers[GIF_REG_UV] = &GSState::GIFPackedRegHandlerUV;
m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerXYZF2; m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerXYZF2;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerXYZ2; m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerXYZ2;
m_fpGIFPackedRegHandlers[GIF_REG_TEX0_1] = (GIFPackedRegHandler)&GSState::GIFRegHandlerTEX0<0>; m_fpGIFPackedRegHandlers[GIF_REG_TEX0_1] = (GIFPackedRegHandler)&GSState::GIFRegHandlerTEX0<0>;
m_fpGIFPackedRegHandlers[GIF_REG_TEX0_2] = (GIFPackedRegHandler)&GSState::GIFRegHandlerTEX0<1>; m_fpGIFPackedRegHandlers[GIF_REG_TEX0_2] = (GIFPackedRegHandler)&GSState::GIFRegHandlerTEX0<1>;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<0>; m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<0>;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<1>; m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = (GIFPackedRegHandler)&GSState::GIFRegHandlerCLAMP<1>;
m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerFOG; m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerFOG;
m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZF3; m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZF3;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZ3; m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = (GIFPackedRegHandler)&GSState::GIFRegHandlerXYZ3;
m_fpGIFPackedRegHandlers[GIF_REG_A_D] = &GSState::GIFPackedRegHandlerA_D; m_fpGIFPackedRegHandlers[GIF_REG_A_D] = &GSState::GIFPackedRegHandlerA_D;
m_fpGIFPackedRegHandlers[GIF_REG_NOP] = &GSState::GIFPackedRegHandlerNOP; m_fpGIFPackedRegHandlers[GIF_REG_NOP] = &GSState::GIFPackedRegHandlerNOP;
#endif
for(int i = 0; i < countof(m_fpGIFRegHandlers); i++) for(int i = 0; i < countof(m_fpGIFRegHandlers); i++)
{ {
m_fpGIFRegHandlers[i] = &GSState::GIFRegHandlerNull; m_fpGIFRegHandlers[i] = &GSState::GIFRegHandlerNull;
} }
m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerPRIM; m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerPRIM;
m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerRGBAQ; m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerRGBAQ;
m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerST; m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerST;
m_fpGIFRegHandlers[GIF_A_D_REG_UV] = &GSState::GIFRegHandlerUV; m_fpGIFRegHandlers[GIF_A_D_REG_UV] = &GSState::GIFRegHandlerUV;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerXYZF2; m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerXYZF2;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerXYZ2; m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerXYZ2;
m_fpGIFRegHandlers[GIF_A_D_REG_TEX0_1] = &GSState::GIFRegHandlerTEX0<0>; m_fpGIFRegHandlers[GIF_A_D_REG_TEX0_1] = &GSState::GIFRegHandlerTEX0<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_TEX0_2] = &GSState::GIFRegHandlerTEX0<1>; m_fpGIFRegHandlers[GIF_A_D_REG_TEX0_2] = &GSState::GIFRegHandlerTEX0<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_CLAMP_1] = &GSState::GIFRegHandlerCLAMP<0>; m_fpGIFRegHandlers[GIF_A_D_REG_CLAMP_1] = &GSState::GIFRegHandlerCLAMP<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_CLAMP_2] = &GSState::GIFRegHandlerCLAMP<1>; m_fpGIFRegHandlers[GIF_A_D_REG_CLAMP_2] = &GSState::GIFRegHandlerCLAMP<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_FOG] = &GSState::GIFRegHandlerFOG; m_fpGIFRegHandlers[GIF_A_D_REG_FOG] = &GSState::GIFRegHandlerFOG;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerXYZF3; m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerXYZF3;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerXYZ3; m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerXYZ3;
m_fpGIFRegHandlers[GIF_A_D_REG_NOP] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_NOP] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_TEX1_1] = &GSState::GIFRegHandlerTEX1<0>; m_fpGIFRegHandlers[GIF_A_D_REG_TEX1_1] = &GSState::GIFRegHandlerTEX1<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_TEX1_2] = &GSState::GIFRegHandlerTEX1<1>; m_fpGIFRegHandlers[GIF_A_D_REG_TEX1_2] = &GSState::GIFRegHandlerTEX1<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_TEX2_1] = &GSState::GIFRegHandlerTEX2<0>; m_fpGIFRegHandlers[GIF_A_D_REG_TEX2_1] = &GSState::GIFRegHandlerTEX2<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_TEX2_2] = &GSState::GIFRegHandlerTEX2<1>; m_fpGIFRegHandlers[GIF_A_D_REG_TEX2_2] = &GSState::GIFRegHandlerTEX2<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_XYOFFSET_1] = &GSState::GIFRegHandlerXYOFFSET<0>; m_fpGIFRegHandlers[GIF_A_D_REG_XYOFFSET_1] = &GSState::GIFRegHandlerXYOFFSET<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_XYOFFSET_2] = &GSState::GIFRegHandlerXYOFFSET<1>; m_fpGIFRegHandlers[GIF_A_D_REG_XYOFFSET_2] = &GSState::GIFRegHandlerXYOFFSET<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerPRMODECONT; m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerPRMODECONT;
m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerPRMODE; m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerPRMODE;
m_fpGIFRegHandlers[GIF_A_D_REG_TEXCLUT] = &GSState::GIFRegHandlerTEXCLUT; m_fpGIFRegHandlers[GIF_A_D_REG_TEXCLUT] = &GSState::GIFRegHandlerTEXCLUT;
m_fpGIFRegHandlers[GIF_A_D_REG_SCANMSK] = &GSState::GIFRegHandlerSCANMSK; m_fpGIFRegHandlers[GIF_A_D_REG_SCANMSK] = &GSState::GIFRegHandlerSCANMSK;
m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP1_1] = &GSState::GIFRegHandlerMIPTBP1<0>; m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP1_1] = &GSState::GIFRegHandlerMIPTBP1<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP1_2] = &GSState::GIFRegHandlerMIPTBP1<1>; m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP1_2] = &GSState::GIFRegHandlerMIPTBP1<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP2_1] = &GSState::GIFRegHandlerMIPTBP2<0>; m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP2_1] = &GSState::GIFRegHandlerMIPTBP2<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP2_2] = &GSState::GIFRegHandlerMIPTBP2<1>; m_fpGIFRegHandlers[GIF_A_D_REG_MIPTBP2_2] = &GSState::GIFRegHandlerMIPTBP2<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_TEXA] = &GSState::GIFRegHandlerTEXA; m_fpGIFRegHandlers[GIF_A_D_REG_TEXA] = &GSState::GIFRegHandlerTEXA;
m_fpGIFRegHandlers[GIF_A_D_REG_FOGCOL] = &GSState::GIFRegHandlerFOGCOL; m_fpGIFRegHandlers[GIF_A_D_REG_FOGCOL] = &GSState::GIFRegHandlerFOGCOL;
m_fpGIFRegHandlers[GIF_A_D_REG_TEXFLUSH] = &GSState::GIFRegHandlerTEXFLUSH; m_fpGIFRegHandlers[GIF_A_D_REG_TEXFLUSH] = &GSState::GIFRegHandlerTEXFLUSH;
m_fpGIFRegHandlers[GIF_A_D_REG_SCISSOR_1] = &GSState::GIFRegHandlerSCISSOR<0>; m_fpGIFRegHandlers[GIF_A_D_REG_SCISSOR_1] = &GSState::GIFRegHandlerSCISSOR<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_SCISSOR_2] = &GSState::GIFRegHandlerSCISSOR<1>; m_fpGIFRegHandlers[GIF_A_D_REG_SCISSOR_2] = &GSState::GIFRegHandlerSCISSOR<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_ALPHA_1] = &GSState::GIFRegHandlerALPHA<0>; m_fpGIFRegHandlers[GIF_A_D_REG_ALPHA_1] = &GSState::GIFRegHandlerALPHA<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_ALPHA_2] = &GSState::GIFRegHandlerALPHA<1>; m_fpGIFRegHandlers[GIF_A_D_REG_ALPHA_2] = &GSState::GIFRegHandlerALPHA<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_DIMX] = &GSState::GIFRegHandlerDIMX; m_fpGIFRegHandlers[GIF_A_D_REG_DIMX] = &GSState::GIFRegHandlerDIMX;
m_fpGIFRegHandlers[GIF_A_D_REG_DTHE] = &GSState::GIFRegHandlerDTHE; m_fpGIFRegHandlers[GIF_A_D_REG_DTHE] = &GSState::GIFRegHandlerDTHE;
m_fpGIFRegHandlers[GIF_A_D_REG_COLCLAMP] = &GSState::GIFRegHandlerCOLCLAMP; m_fpGIFRegHandlers[GIF_A_D_REG_COLCLAMP] = &GSState::GIFRegHandlerCOLCLAMP;
m_fpGIFRegHandlers[GIF_A_D_REG_TEST_1] = &GSState::GIFRegHandlerTEST<0>; m_fpGIFRegHandlers[GIF_A_D_REG_TEST_1] = &GSState::GIFRegHandlerTEST<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_TEST_2] = &GSState::GIFRegHandlerTEST<1>; m_fpGIFRegHandlers[GIF_A_D_REG_TEST_2] = &GSState::GIFRegHandlerTEST<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_PABE] = &GSState::GIFRegHandlerPABE; m_fpGIFRegHandlers[GIF_A_D_REG_PABE] = &GSState::GIFRegHandlerPABE;
m_fpGIFRegHandlers[GIF_A_D_REG_FBA_1] = &GSState::GIFRegHandlerFBA<0>; m_fpGIFRegHandlers[GIF_A_D_REG_FBA_1] = &GSState::GIFRegHandlerFBA<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_FBA_2] = &GSState::GIFRegHandlerFBA<1>; m_fpGIFRegHandlers[GIF_A_D_REG_FBA_2] = &GSState::GIFRegHandlerFBA<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_FRAME_1] = &GSState::GIFRegHandlerFRAME<0>; m_fpGIFRegHandlers[GIF_A_D_REG_FRAME_1] = &GSState::GIFRegHandlerFRAME<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_FRAME_2] = &GSState::GIFRegHandlerFRAME<1>; m_fpGIFRegHandlers[GIF_A_D_REG_FRAME_2] = &GSState::GIFRegHandlerFRAME<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_ZBUF_1] = &GSState::GIFRegHandlerZBUF<0>; m_fpGIFRegHandlers[GIF_A_D_REG_ZBUF_1] = &GSState::GIFRegHandlerZBUF<0>;
m_fpGIFRegHandlers[GIF_A_D_REG_ZBUF_2] = &GSState::GIFRegHandlerZBUF<1>; m_fpGIFRegHandlers[GIF_A_D_REG_ZBUF_2] = &GSState::GIFRegHandlerZBUF<1>;
m_fpGIFRegHandlers[GIF_A_D_REG_BITBLTBUF] = &GSState::GIFRegHandlerBITBLTBUF; m_fpGIFRegHandlers[GIF_A_D_REG_BITBLTBUF] = &GSState::GIFRegHandlerBITBLTBUF;
m_fpGIFRegHandlers[GIF_A_D_REG_TRXPOS] = &GSState::GIFRegHandlerTRXPOS; m_fpGIFRegHandlers[GIF_A_D_REG_TRXPOS] = &GSState::GIFRegHandlerTRXPOS;
m_fpGIFRegHandlers[GIF_A_D_REG_TRXREG] = &GSState::GIFRegHandlerTRXREG; m_fpGIFRegHandlers[GIF_A_D_REG_TRXREG] = &GSState::GIFRegHandlerTRXREG;
m_fpGIFRegHandlers[GIF_A_D_REG_TRXDIR] = &GSState::GIFRegHandlerTRXDIR; m_fpGIFRegHandlers[GIF_A_D_REG_TRXDIR] = &GSState::GIFRegHandlerTRXDIR;
m_fpGIFRegHandlers[GIF_A_D_REG_HWREG] = &GSState::GIFRegHandlerHWREG; m_fpGIFRegHandlers[GIF_A_D_REG_HWREG] = &GSState::GIFRegHandlerHWREG;
SetMultithreaded( m_mt ); SetMultithreaded(m_mt);
} }
GSVector4i GSState::GetDisplayRect(int i) GSVector4i GSState::GetDisplayRect(int i)
@ -375,22 +372,24 @@ int GSState::GetFPS()
// GIFPackedRegHandler* // GIFPackedRegHandler*
void GSState::GIFPackedRegHandlerNull(const GIFPackedReg* r) __forceinline void GSState::GIFPackedRegHandlerNull(const GIFPackedReg* r)
{ {
// ASSERT(0); // ASSERT(0);
} }
void __fi GSState::GIFPackedRegHandlerRGBA(const GIFPackedReg* r) __forceinline void GSState::GIFPackedRegHandlerRGBA(const GIFPackedReg* r)
{ {
#if _M_SSE >= 0x301 #if _M_SSE >= 0x301
GSVector4i mask = GSVector4i::load(0x0c080400); GSVector4i mask = GSVector4i::load(0x0c080400);
GSVector4i v = GSVector4i::load<false>(r).shuffle8(mask); GSVector4i v = GSVector4i::load<false>(r).shuffle8(mask);
m_v.RGBAQ.u32[0] = (uint32)GSVector4i::store(v); m_v.RGBAQ.u32[0] = (uint32)GSVector4i::store(v);
#elif _M_SSE >= 0x200 #elif _M_SSE >= 0x200
GSVector4i v = GSVector4i::load<false>(r) & GSVector4i::x000000ff(); GSVector4i v = GSVector4i::load<false>(r) & GSVector4i::x000000ff();
m_v.RGBAQ.u32[0] = v.rgba32(); m_v.RGBAQ.u32[0] = v.rgba32();
#else #else
@ -405,7 +404,7 @@ void __fi GSState::GIFPackedRegHandlerRGBA(const GIFPackedReg* r)
m_v.RGBAQ.Q = m_q; m_v.RGBAQ.Q = m_q;
} }
void __fi GSState::GIFPackedRegHandlerSTQ(const GIFPackedReg* r) __forceinline void GSState::GIFPackedRegHandlerSTQ(const GIFPackedReg* r)
{ {
#if defined(_M_AMD64) #if defined(_M_AMD64)
@ -426,7 +425,7 @@ void __fi GSState::GIFPackedRegHandlerSTQ(const GIFPackedReg* r)
m_q = r->STQ.Q; m_q = r->STQ.Q;
} }
void __fi GSState::GIFPackedRegHandlerUV(const GIFPackedReg* r) __forceinline void GSState::GIFPackedRegHandlerUV(const GIFPackedReg* r)
{ {
#if _M_SSE >= 0x200 #if _M_SSE >= 0x200
@ -441,7 +440,7 @@ void __fi GSState::GIFPackedRegHandlerUV(const GIFPackedReg* r)
#endif #endif
} }
void __fi GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* r) __forceinline void GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* r)
{ {
m_v.XYZ.X = r->XYZF2.X; m_v.XYZ.X = r->XYZF2.X;
m_v.XYZ.Y = r->XYZF2.Y; m_v.XYZ.Y = r->XYZF2.Y;
@ -451,7 +450,7 @@ void __fi GSState::GIFPackedRegHandlerXYZF2(const GIFPackedReg* r)
VertexKick(r->XYZF2.ADC); VertexKick(r->XYZF2.ADC);
} }
void __fi GSState::GIFPackedRegHandlerXYZ2(const GIFPackedReg* r) __forceinline void GSState::GIFPackedRegHandlerXYZ2(const GIFPackedReg* r)
{ {
m_v.XYZ.X = r->XYZ2.X; m_v.XYZ.X = r->XYZ2.X;
m_v.XYZ.Y = r->XYZ2.Y; m_v.XYZ.Y = r->XYZ2.Y;
@ -460,17 +459,17 @@ void __fi GSState::GIFPackedRegHandlerXYZ2(const GIFPackedReg* r)
VertexKick(r->XYZ2.ADC); VertexKick(r->XYZ2.ADC);
} }
void __fi GSState::GIFPackedRegHandlerFOG(const GIFPackedReg* r) __forceinline void GSState::GIFPackedRegHandlerFOG(const GIFPackedReg* r)
{ {
m_v.FOG.F = r->FOG.F; m_v.FOG.F = r->FOG.F;
} }
void __fi GSState::GIFPackedRegHandlerA_D(const GIFPackedReg* r) __forceinline void GSState::GIFPackedRegHandlerA_D(const GIFPackedReg* r)
{ {
(this->*m_fpGIFRegHandlers[r->A_D.ADDR])(&r->r); (this->*m_fpGIFRegHandlers[r->A_D.ADDR])(&r->r);
} }
void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* r) __forceinline void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* r)
{ {
} }
@ -502,6 +501,8 @@ __forceinline void GSState::ApplyPRIM(const GIFRegPRIM& prim)
m_context = &m_env.CTXT[PRIM->CTXT]; m_context = &m_env.CTXT[PRIM->CTXT];
UpdateVertexKick();
ResetPrim(); ResetPrim();
} }
@ -510,22 +511,22 @@ void GSState::GIFRegHandlerPRIM(const GIFReg* r)
ApplyPRIM(r->PRIM); ApplyPRIM(r->PRIM);
} }
void GSState::GIFRegHandlerRGBAQ(const GIFReg* r) __forceinline void GSState::GIFRegHandlerRGBAQ(const GIFReg* r)
{ {
m_v.RGBAQ = (GSVector4i)r->RGBAQ; m_v.RGBAQ = (GSVector4i)r->RGBAQ;
} }
void GSState::GIFRegHandlerST(const GIFReg* r) __forceinline void GSState::GIFRegHandlerST(const GIFReg* r)
{ {
m_v.ST = (GSVector4i)r->ST; m_v.ST = (GSVector4i)r->ST;
} }
void GSState::GIFRegHandlerUV(const GIFReg* r) __forceinline void GSState::GIFRegHandlerUV(const GIFReg* r)
{ {
m_v.UV.u32[0] = r->UV.u32[0] & 0x3fff3fff; m_v.UV.u32[0] = r->UV.u32[0] & 0x3fff3fff;
} }
__fi void GSState::GIFRegHandlerXYZF2(const GIFReg* r) void GSState::GIFRegHandlerXYZF2(const GIFReg* r)
{ {
/* /*
m_v.XYZ.X = r->XYZF.X; m_v.XYZ.X = r->XYZF.X;
@ -540,14 +541,14 @@ __fi void GSState::GIFRegHandlerXYZF2(const GIFReg* r)
VertexKick(false); VertexKick(false);
} }
__fi void GSState::GIFRegHandlerXYZ2(const GIFReg* r) void GSState::GIFRegHandlerXYZ2(const GIFReg* r)
{ {
m_v.XYZ = (GSVector4i)r->XYZ; m_v.XYZ = (GSVector4i)r->XYZ;
VertexKick(false); VertexKick(false);
} }
__fi void GSState::ApplyTEX0( uint i, GIFRegTEX0& TEX0 ) void GSState::ApplyTEX0(uint i, GIFRegTEX0& TEX0)
{ {
// even if TEX0 did not change, a new palette may have been uploaded and will overwrite the currently queued for drawing // even if TEX0 did not change, a new palette may have been uploaded and will overwrite the currently queued for drawing
@ -578,7 +579,7 @@ __fi void GSState::ApplyTEX0( uint i, GIFRegTEX0& TEX0 )
} }
} }
template<int i> __fi void GSState::GIFRegHandlerTEX0(const GIFReg* r) template<int i> void GSState::GIFRegHandlerTEX0(const GIFReg* r)
{ {
GIFRegTEX0 TEX0 = r->TEX0; GIFRegTEX0 TEX0 = r->TEX0;
@ -588,7 +589,7 @@ template<int i> __fi void GSState::GIFRegHandlerTEX0(const GIFReg* r)
ApplyTEX0( i, TEX0 ); ApplyTEX0( i, TEX0 );
} }
template<int i> __fi void GSState::GIFRegHandlerCLAMP(const GIFReg* r) template<int i> void GSState::GIFRegHandlerCLAMP(const GIFReg* r)
{ {
if(PRIM->CTXT == i && r->CLAMP != m_env.CTXT[i].CLAMP) if(PRIM->CTXT == i && r->CLAMP != m_env.CTXT[i].CLAMP)
{ {
@ -603,7 +604,7 @@ void GSState::GIFRegHandlerFOG(const GIFReg* r)
m_v.FOG = (GSVector4i)r->FOG; m_v.FOG = (GSVector4i)r->FOG;
} }
__fi void GSState::GIFRegHandlerXYZF3(const GIFReg* r) void GSState::GIFRegHandlerXYZF3(const GIFReg* r)
{ {
/* /*
m_v.XYZ.X = r->XYZF.X; m_v.XYZ.X = r->XYZF.X;
@ -618,7 +619,7 @@ __fi void GSState::GIFRegHandlerXYZF3(const GIFReg* r)
VertexKick(true); VertexKick(true);
} }
__fi void GSState::GIFRegHandlerXYZ3(const GIFReg* r) void GSState::GIFRegHandlerXYZ3(const GIFReg* r)
{ {
m_v.XYZ = (GSVector4i)r->XYZ; m_v.XYZ = (GSVector4i)r->XYZ;
@ -629,7 +630,7 @@ void GSState::GIFRegHandlerNOP(const GIFReg* r)
{ {
} }
template<int i> __fi void GSState::GIFRegHandlerTEX1(const GIFReg* r) template<int i> void GSState::GIFRegHandlerTEX1(const GIFReg* r)
{ {
if(PRIM->CTXT == i && r->TEX1 != m_env.CTXT[i].TEX1) if(PRIM->CTXT == i && r->TEX1 != m_env.CTXT[i].TEX1)
{ {
@ -639,7 +640,7 @@ template<int i> __fi void GSState::GIFRegHandlerTEX1(const GIFReg* r)
m_env.CTXT[i].TEX1 = (GSVector4i)r->TEX1; m_env.CTXT[i].TEX1 = (GSVector4i)r->TEX1;
} }
template<int i> __fi void GSState::GIFRegHandlerTEX2(const GIFReg* r) template<int i> void GSState::GIFRegHandlerTEX2(const GIFReg* r)
{ {
// m_env.CTXT[i].TEX2 = r->TEX2; // not used // m_env.CTXT[i].TEX2 = r->TEX2; // not used
@ -656,7 +657,7 @@ template<int i> __fi void GSState::GIFRegHandlerTEX2(const GIFReg* r)
ApplyTEX0(i, TEX0); ApplyTEX0(i, TEX0);
} }
template<int i> __fi void GSState::GIFRegHandlerXYOFFSET(const GIFReg* r) template<int i> void GSState::GIFRegHandlerXYOFFSET(const GIFReg* r)
{ {
GSVector4i o = (GSVector4i)r->XYOFFSET & GSVector4i::x0000ffff(); GSVector4i o = (GSVector4i)r->XYOFFSET & GSVector4i::x0000ffff();
@ -670,7 +671,7 @@ template<int i> __fi void GSState::GIFRegHandlerXYOFFSET(const GIFReg* r)
m_env.CTXT[i].UpdateScissor(); m_env.CTXT[i].UpdateScissor();
} }
__fi void GSState::GIFRegHandlerPRMODECONT(const GIFReg* r) void GSState::GIFRegHandlerPRMODECONT(const GIFReg* r)
{ {
if(r->PRMODECONT != m_env.PRMODECONT) if(r->PRMODECONT != m_env.PRMODECONT)
{ {
@ -684,9 +685,11 @@ __fi void GSState::GIFRegHandlerPRMODECONT(const GIFReg* r)
// if(PRIM->PRIM == 7) printf("Invalid PRMODECONT/PRIM\n"); // if(PRIM->PRIM == 7) printf("Invalid PRMODECONT/PRIM\n");
m_context = &m_env.CTXT[PRIM->CTXT]; m_context = &m_env.CTXT[PRIM->CTXT];
UpdateVertexKick();
} }
__fi void GSState::GIFRegHandlerPRMODE(const GIFReg* r) void GSState::GIFRegHandlerPRMODE(const GIFReg* r)
{ {
if(!m_env.PRMODECONT.AC) if(!m_env.PRMODECONT.AC)
{ {
@ -698,9 +701,11 @@ __fi void GSState::GIFRegHandlerPRMODE(const GIFReg* r)
m_env.PRMODE._PRIM = _PRIM; m_env.PRMODE._PRIM = _PRIM;
m_context = &m_env.CTXT[PRIM->CTXT]; m_context = &m_env.CTXT[PRIM->CTXT];
UpdateVertexKick();
} }
__fi void GSState::GIFRegHandlerTEXCLUT(const GIFReg* r) void GSState::GIFRegHandlerTEXCLUT(const GIFReg* r)
{ {
if(r->TEXCLUT != m_env.TEXCLUT) if(r->TEXCLUT != m_env.TEXCLUT)
{ {
@ -730,7 +735,7 @@ template<int i> void GSState::GIFRegHandlerMIPTBP1(const GIFReg* r)
m_env.CTXT[i].MIPTBP1 = (GSVector4i)r->MIPTBP1; m_env.CTXT[i].MIPTBP1 = (GSVector4i)r->MIPTBP1;
} }
template<int i> __fi void GSState::GIFRegHandlerMIPTBP2(const GIFReg* r) template<int i> void GSState::GIFRegHandlerMIPTBP2(const GIFReg* r)
{ {
if(PRIM->CTXT == i && r->MIPTBP2 != m_env.CTXT[i].MIPTBP2) if(PRIM->CTXT == i && r->MIPTBP2 != m_env.CTXT[i].MIPTBP2)
{ {
@ -767,7 +772,7 @@ void GSState::GIFRegHandlerTEXFLUSH(const GIFReg* r)
// InvalidateTextureCache(); // InvalidateTextureCache();
} }
template<int i> __fi void GSState::GIFRegHandlerSCISSOR(const GIFReg* r) template<int i> void GSState::GIFRegHandlerSCISSOR(const GIFReg* r)
{ {
if(PRIM->CTXT == i && r->SCISSOR != m_env.CTXT[i].SCISSOR) if(PRIM->CTXT == i && r->SCISSOR != m_env.CTXT[i].SCISSOR)
{ {
@ -779,7 +784,7 @@ template<int i> __fi void GSState::GIFRegHandlerSCISSOR(const GIFReg* r)
m_env.CTXT[i].UpdateScissor(); m_env.CTXT[i].UpdateScissor();
} }
template<int i> __fi void GSState::GIFRegHandlerALPHA(const GIFReg* r) template<int i> void GSState::GIFRegHandlerALPHA(const GIFReg* r)
{ {
ASSERT(r->ALPHA.A != 3); ASSERT(r->ALPHA.A != 3);
ASSERT(r->ALPHA.B != 3); ASSERT(r->ALPHA.B != 3);
@ -1142,66 +1147,6 @@ void GSState::Read(uint8* mem, int len)
m_mem.ReadImageX(m_tr.x, m_tr.y, mem, len, m_env.BITBLTBUF, m_env.TRXPOS, m_env.TRXREG); m_mem.ReadImageX(m_tr.x, m_tr.y, mem, len, m_env.BITBLTBUF, m_env.TRXPOS, m_env.TRXREG);
} }
// Use version 1 of the optimized local > local transfer, as per revision 887.
// Later (more optimized?) versions cause a crash in Dark Cloud 2.
#if 1
void GSState::Move()
{
// ffxii uses this to move the top/bottom of the scrolling menus offscreen and then blends them back over the text to create a shading effect
// guitar hero copies the far end of the board to do a similar blend too
int sx = m_env.TRXPOS.SSAX;
int dx = m_env.TRXPOS.DSAX;
int sy = m_env.TRXPOS.SSAY;
int dy = m_env.TRXPOS.DSAY;
int w = m_env.TRXREG.RRW;
int h = m_env.TRXREG.RRH;
int xinc = 1;
int yinc = 1;
InvalidateLocalMem(m_env.BITBLTBUF, GSVector4i(sx, sy, sx + w, sy + h));
InvalidateVideoMem(m_env.BITBLTBUF, GSVector4i(dx, dy, dx + w, dy + h));
if(sx < dx) sx += w-1, dx += w-1, xinc = -1;
if(sy < dy) sy += h-1, dy += h-1, yinc = -1;
const GSLocalMemory::psm_t& spsm = GSLocalMemory::m_psm[m_env.BITBLTBUF.SPSM];
const GSLocalMemory::psm_t& dpsm = GSLocalMemory::m_psm[m_env.BITBLTBUF.DPSM];
if(m_env.BITBLTBUF.SPSM == PSM_PSMCT32 && m_env.BITBLTBUF.DPSM == PSM_PSMCT32)
{
for(int y = 0; y < h; y++, sy += yinc, dy += yinc, sx -= xinc*w, dx -= xinc*w)
{
DWORD sbase = spsm.pa(0, sy, m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW);
int* soffset = spsm.rowOffset[sy & 7];
DWORD dbase = dpsm.pa(0, dy, m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW);
int* doffset = dpsm.rowOffset[dy & 7];
for(int x = 0; x < w; x++, sx += xinc, dx += xinc)
{
m_mem.WritePixel32(dbase + doffset[dx], m_mem.ReadPixel32(sbase + soffset[sx]));
}
}
}
else
{
for(int y = 0; y < h; y++, sy += yinc, dy += yinc, sx -= xinc*w, dx -= xinc*w)
{
DWORD sbase = spsm.pa(0, sy, m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW);
int* soffset = spsm.rowOffset[sy & 7];
DWORD dbase = dpsm.pa(0, dy, m_env.BITBLTBUF.DBP, m_env.BITBLTBUF.DBW);
int* doffset = dpsm.rowOffset[dy & 7];
for(int x = 0; x < w; x++, sx += xinc, dx += xinc)
{
(m_mem.*dpsm.wpa)(dbase + doffset[dx], (m_mem.*spsm.rpa)(sbase + soffset[sx]));
}
}
}
}
#else
void GSState::Move() void GSState::Move()
{ {
// ffxii uses this to move the top/bottom of the scrolling menus offscreen and then blends them back over the text to create a shading effect // ffxii uses this to move the top/bottom of the scrolling menus offscreen and then blends them back over the text to create a shading effect
@ -1346,10 +1291,7 @@ void GSState::Move()
int* RESTRICT scol = &spo->pixel.col[sy & 7][sx]; int* RESTRICT scol = &spo->pixel.col[sy & 7][sx];
int* RESTRICT dcol = &dpo->pixel.col[dy & 7][dx]; int* RESTRICT dcol = &dpo->pixel.col[dy & 7][dx];
for(int x = 0; x > -w; x--) { for(int x = 0; x > -w; x--) d[dcol[x]] = s[scol[x]];
printf("%d",x); //Dark Cloud 2 crashes at x = -63
d[dcol[x]] = s[scol[x]];
}
} }
} }
} }
@ -1412,7 +1354,7 @@ void GSState::Move()
} }
} }
} }
#endif
void GSState::SoftReset(uint32 mask) void GSState::SoftReset(uint32 mask)
{ {
if(mask & 1) if(mask & 1)
@ -1508,91 +1450,7 @@ template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
{ {
do do
{ {
uint32 reg = path.GetReg(); (this->*m_fpGIFPackedRegHandlers[path.GetReg()])((GIFPackedReg*)mem);
#if 0
// I assume this was some sort of debugging code? Why intercept and perform
// special handling for the first three entries in the table, and then do
// a LUT for the rest? Either do a switch for the whole table (best idea)
// or do a LUT for the whole table.
switch(reg)
{
case GIF_REG_RGBA:
GIFPackedRegHandlerRGBA((GIFPackedReg*)mem);
break;
case GIF_REG_STQ:
GIFPackedRegHandlerSTQ((GIFPackedReg*)mem);
break;
case GIF_REG_UV:
GIFPackedRegHandlerUV((GIFPackedReg*)mem);
break;
default:
(this->*m_fpGIFPackedRegHandlers[reg])((GIFPackedReg*)mem);
break;
}
#endif
#if UsePackedRegSwitch
// This is a switch statement version of the LUT above. Since there are only
// 16 entries, this is almost certainly ideal, since the compiler can inline
// all the handlers, and PGO will further optimize the switch dispatcher.
if (FrameSkipIt)
{
// When skipping frames it looks like we only need to bother with the A_D handler
// and the TEX handlers. (and I'm thinking the TEX handlers might not be necessary
// if the PCSX2 side of the frameskipper is smart enough anyway).
switch(reg)
{
case GIF_REG_A_D: GIFPackedRegHandlerA_D ((GIFPackedReg*)mem); break;
case GIF_REG_TEX0_1: GIFRegHandlerTEX0<0> ((GIFReg*)mem); break;
case GIF_REG_TEX0_2: GIFRegHandlerTEX0<1> ((GIFReg*)mem); break;
// Should RGBA/STQ/UV be NOPs when skipping frames? I think so, but maybe the original
// switch() (above) was some hack to enable them in frameskipping mode. --air
case GIF_REG_RGBA: //GIFPackedRegHandlerRGBA ((GIFPackedReg*)mem); break;
case GIF_REG_STQ: //GIFPackedRegHandlerSTQ ((GIFPackedReg*)mem); break;
case GIF_REG_UV: //GIFPackedRegHandlerUV ((GIFPackedReg*)mem); break;
case GIF_REG_XYZF2: //GIFPackedRegHandlerXYZF2((GIFPackedReg*)mem); break;
case GIF_REG_XYZ2: //GIFPackedRegHandlerXYZ2 ((GIFPackedReg*)mem); break;
case GIF_REG_CLAMP_1: //GIFRegHandlerCLAMP<0> ((GIFReg*)mem); break;
case GIF_REG_CLAMP_2: //GIFRegHandlerCLAMP<1> ((GIFReg*)mem); break;
case GIF_REG_FOG: //GIFPackedRegHandlerFOG ((GIFPackedReg*)mem); break;
case GIF_REG_XYZF3: //GIFRegHandlerXYZF3 ((GIFReg*)mem); break;
case GIF_REG_XYZ3: //GIFRegHandlerXYZ3 ((GIFReg*)mem); break;
case GIF_REG_NOP: break;
}
}
else
{
switch(reg)
{
case GIF_REG_RGBA: GIFPackedRegHandlerRGBA ((GIFPackedReg*)mem); break;
case GIF_REG_STQ: GIFPackedRegHandlerSTQ ((GIFPackedReg*)mem); break;
case GIF_REG_UV: GIFPackedRegHandlerUV ((GIFPackedReg*)mem); break;
case GIF_REG_XYZF2: GIFPackedRegHandlerXYZF2((GIFPackedReg*)mem); break;
case GIF_REG_XYZ2: GIFPackedRegHandlerXYZ2 ((GIFPackedReg*)mem); break;
case GIF_REG_TEX0_1: GIFRegHandlerTEX0<0> ((GIFReg*)mem); break;
case GIF_REG_TEX0_2: GIFRegHandlerTEX0<1> ((GIFReg*)mem); break;
case GIF_REG_CLAMP_1: GIFRegHandlerCLAMP<0> ((GIFReg*)mem); break;
case GIF_REG_CLAMP_2: GIFRegHandlerCLAMP<1> ((GIFReg*)mem); break;
case GIF_REG_FOG: GIFPackedRegHandlerFOG ((GIFPackedReg*)mem); break;
case GIF_REG_XYZF3: GIFRegHandlerXYZF3 ((GIFReg*)mem); break;
case GIF_REG_XYZ3: GIFRegHandlerXYZ3 ((GIFReg*)mem); break;
case GIF_REG_A_D: GIFPackedRegHandlerA_D ((GIFPackedReg*)mem); break;
case GIF_REG_NOP: break;
}
}
#else
// This is the original LUT implementation of the packed reg dispatcher.
// Simple and clean, but the switch system below is probably more efficient.
(this->*m_fpGIFPackedRegHandlers[reg])((GIFPackedReg*)mem);
#endif
mem += sizeof(GIFPackedReg); mem += sizeof(GIFPackedReg);
size--; size--;
@ -1779,7 +1637,7 @@ int GSState::Freeze(GSFreezeData* fd, bool sizeonly)
WriteState(data, &m_tr.y); WriteState(data, &m_tr.y);
WriteState(data, m_mem.m_vm8, m_mem.m_vmsize); WriteState(data, m_mem.m_vm8, m_mem.m_vmsize);
for(int i = 0; i < ArraySize(m_path); i++) for(int i = 0; i < countof(m_path); i++)
{ {
m_path[i].tag.NREG = m_path[i].nreg; m_path[i].tag.NREG = m_path[i].nreg;
m_path[i].tag.NLOOP = m_path[i].nloop; m_path[i].tag.NLOOP = m_path[i].nloop;
@ -1874,7 +1732,7 @@ int GSState::Defrost(const GSFreezeData* fd)
m_tr.total = 0; // TODO: restore transfer state m_tr.total = 0; // TODO: restore transfer state
for(int i = 0; i < ArraySize(m_path); i++) for(int i = 0; i < countof(m_path); i++)
{ {
ReadState(&m_path[i].tag, data); ReadState(&m_path[i].tag, data);
ReadState(&m_path[i].reg, data); ReadState(&m_path[i].reg, data);
@ -1888,6 +1746,8 @@ int GSState::Defrost(const GSFreezeData* fd)
m_context = &m_env.CTXT[PRIM->CTXT]; m_context = &m_env.CTXT[PRIM->CTXT];
UpdateVertexKick();
m_env.UpdateDIMX(); m_env.UpdateDIMX();
for(int i = 0; i < 2; i++) for(int i = 0; i < 2; i++)
@ -1918,7 +1778,7 @@ GSState::GSTransferBuffer::GSTransferBuffer()
{ {
x = y = 0; x = y = 0;
start = end = total = 0; start = end = total = 0;
buff = (uint8*)_aligned_malloc(1024 * 1024 * 4, 16); buff = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);
} }
GSState::GSTransferBuffer::~GSTransferBuffer() GSState::GSTransferBuffer::~GSTransferBuffer()

View File

@ -36,17 +36,11 @@
#include "GSAlignedClass.h" #include "GSAlignedClass.h"
#include "GSDump.h" #include "GSDump.h"
// Set this to 1 to enable a switch statement instead of a LUT for the packed register handler class GSState : public GSAlignedClass<32>
// in the GifTransfer code. Switch statement is probably faster, but it isn't fully implemented
// yet (not properly supporting frameskipping).
#define UsePackedRegSwitch 0
class GSState : public GSAlignedClass<16>
{ {
#if !UsePackedRegSwitch
typedef void (GSState::*GIFPackedRegHandler)(const GIFPackedReg* r); typedef void (GSState::*GIFPackedRegHandler)(const GIFPackedReg* r);
GIFPackedRegHandler m_fpGIFPackedRegHandlers[16]; GIFPackedRegHandler m_fpGIFPackedRegHandlers[16];
#endif
void GIFPackedRegHandlerNull(const GIFPackedReg* r); void GIFPackedRegHandlerNull(const GIFPackedReg* r);
void GIFPackedRegHandlerRGBA(const GIFPackedReg* r); void GIFPackedRegHandlerRGBA(const GIFPackedReg* r);
@ -62,7 +56,7 @@ class GSState : public GSAlignedClass<16>
GIFRegHandler m_fpGIFRegHandlers[256]; GIFRegHandler m_fpGIFRegHandlers[256];
void ApplyTEX0( uint i, GIFRegTEX0& TEX0 ); void ApplyTEX0(uint i, GIFRegTEX0& TEX0);
void ApplyPRIM(const GIFRegPRIM& PRIM); void ApplyPRIM(const GIFRegPRIM& PRIM);
void GIFRegHandlerNull(const GIFReg* r); void GIFRegHandlerNull(const GIFReg* r);
@ -136,33 +130,67 @@ class GSState : public GSAlignedClass<16>
protected: protected:
bool IsBadFrame(int& skip, int UserHacks_SkipDraw); bool IsBadFrame(int& skip, int UserHacks_SkipDraw);
typedef void (GSState::*DrawingKickPtr)(bool skip); typedef void (GSState::*VertexKickPtr)(bool skip);
DrawingKickPtr m_dk[8]; VertexKickPtr m_vk[8][2][2];
VertexKickPtr m_vkf;
template<class T> void InitVertexKick() template<class T> void InitVertexKick()
{ {
m_dk[GS_POINTLIST] = (DrawingKickPtr)&T::DrawingKick<GS_POINTLIST>; m_vk[GS_POINTLIST][0][0] = (VertexKickPtr)&T::VertexKick<GS_POINTLIST, 0, 0>;
m_dk[GS_LINELIST] = (DrawingKickPtr)&T::DrawingKick<GS_LINELIST>; m_vk[GS_POINTLIST][0][1] = (VertexKickPtr)&T::VertexKick<GS_POINTLIST, 0, 0>;
m_dk[GS_LINESTRIP] = (DrawingKickPtr)&T::DrawingKick<GS_LINESTRIP>; m_vk[GS_POINTLIST][1][0] = (VertexKickPtr)&T::VertexKick<GS_POINTLIST, 1, 0>;
m_dk[GS_TRIANGLELIST] = (DrawingKickPtr)&T::DrawingKick<GS_TRIANGLELIST>; m_vk[GS_POINTLIST][1][1] = (VertexKickPtr)&T::VertexKick<GS_POINTLIST, 1, 1>;
m_dk[GS_TRIANGLESTRIP] = (DrawingKickPtr)&T::DrawingKick<GS_TRIANGLESTRIP>;
m_dk[GS_TRIANGLEFAN] = (DrawingKickPtr)&T::DrawingKick<GS_TRIANGLEFAN>; m_vk[GS_LINELIST][0][0] = (VertexKickPtr)&T::VertexKick<GS_LINELIST, 0, 0>;
m_dk[GS_SPRITE] = (DrawingKickPtr)&T::DrawingKick<GS_SPRITE>; m_vk[GS_LINELIST][0][1] = (VertexKickPtr)&T::VertexKick<GS_LINELIST, 0, 0>;
m_dk[GS_INVALID] = &GSState::DrawingKickNull; m_vk[GS_LINELIST][1][0] = (VertexKickPtr)&T::VertexKick<GS_LINELIST, 1, 0>;
m_vk[GS_LINELIST][1][1] = (VertexKickPtr)&T::VertexKick<GS_LINELIST, 1, 1>;
m_vk[GS_LINESTRIP][0][0] = (VertexKickPtr)&T::VertexKick<GS_LINESTRIP, 0, 0>;
m_vk[GS_LINESTRIP][0][1] = (VertexKickPtr)&T::VertexKick<GS_LINESTRIP, 0, 0>;
m_vk[GS_LINESTRIP][1][0] = (VertexKickPtr)&T::VertexKick<GS_LINESTRIP, 1, 0>;
m_vk[GS_LINESTRIP][1][1] = (VertexKickPtr)&T::VertexKick<GS_LINESTRIP, 1, 1>;
m_vk[GS_TRIANGLELIST][0][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLELIST, 0, 0>;
m_vk[GS_TRIANGLELIST][0][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLELIST, 0, 0>;
m_vk[GS_TRIANGLELIST][1][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLELIST, 1, 0>;
m_vk[GS_TRIANGLELIST][1][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLELIST, 1, 1>;
m_vk[GS_TRIANGLESTRIP][0][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLESTRIP, 0, 0>;
m_vk[GS_TRIANGLESTRIP][0][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLESTRIP, 0, 0>;
m_vk[GS_TRIANGLESTRIP][1][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLESTRIP, 1, 0>;
m_vk[GS_TRIANGLESTRIP][1][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLESTRIP, 1, 1>;
m_vk[GS_TRIANGLEFAN][0][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLEFAN, 0, 0>;
m_vk[GS_TRIANGLEFAN][0][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLEFAN, 0, 0>;
m_vk[GS_TRIANGLEFAN][1][0] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLEFAN, 1, 0>;
m_vk[GS_TRIANGLEFAN][1][1] = (VertexKickPtr)&T::VertexKick<GS_TRIANGLEFAN, 1, 1>;
m_vk[GS_SPRITE][0][0] = (VertexKickPtr)&T::VertexKick<GS_SPRITE, 0, 0>;
m_vk[GS_SPRITE][0][1] = (VertexKickPtr)&T::VertexKick<GS_SPRITE, 0, 0>;
m_vk[GS_SPRITE][1][0] = (VertexKickPtr)&T::VertexKick<GS_SPRITE, 1, 0>;
m_vk[GS_SPRITE][1][1] = (VertexKickPtr)&T::VertexKick<GS_SPRITE, 1, 1>;
m_vk[GS_INVALID][0][0] = &GSState::VertexKickNull;
m_vk[GS_INVALID][0][1] = &GSState::VertexKickNull;
m_vk[GS_INVALID][1][0] = &GSState::VertexKickNull;
m_vk[GS_INVALID][1][1] = &GSState::VertexKickNull;
} }
void DrawingKickNull(bool skip) void UpdateVertexKick()
{
m_vkf = m_vk[PRIM->PRIM][PRIM->TME][PRIM->FST];
}
void VertexKickNull(bool skip)
{ {
ASSERT(0); ASSERT(0);
} }
virtual void DoVertexKick()=0; void VertexKick(bool skip)
__fi void VertexKick(bool skip)
{ {
DoVertexKick(); (this->*m_vkf)(skip);
(this->*m_dk[PRIM->PRIM])(skip);
} }
public: public:
@ -221,6 +249,6 @@ public:
void SetFrameSkip(int skip); void SetFrameSkip(int skip);
void SetRegsMem(uint8* basemem); void SetRegsMem(uint8* basemem);
void SetIrqCallback(void (*irq)()); void SetIrqCallback(void (*irq)());
void SetMultithreaded(bool isMT=true); void SetMultithreaded(bool mt = true);
}; };

View File

@ -37,9 +37,12 @@ extern const uint8 clutTableT32I8[128];
extern const uint8 clutTableT32I4[16]; extern const uint8 clutTableT32I4[16];
extern const uint8 clutTableT16I8[32]; extern const uint8 clutTableT16I8[32];
extern const uint8 clutTableT16I4[16]; extern const uint8 clutTableT16I4[16];
struct D3D9Blend {
struct D3D9Blend
{
int bogus; int bogus;
D3DBLENDOP op; D3DBLENDOP op;
D3DBLEND src, dst; D3DBLEND src, dst;
}; };
extern const D3D9Blend blendMapD3D9[3*3*3*3]; extern const D3D9Blend blendMapD3D9[3*3*3*3];

View File

@ -27,6 +27,6 @@ GSTexture::GSTexture()
, m_size(0, 0) , m_size(0, 0)
, m_type(None) , m_type(None)
, m_msaa(false) , m_msaa(false)
, LikelyOffset (false) , LikelyOffset(false)
{ {
} }

View File

@ -836,11 +836,11 @@ GSTextureCache::Source::Source(GSRenderer* r)
{ {
memset(m_valid, 0, sizeof(m_valid)); memset(m_valid, 0, sizeof(m_valid));
m_clut = (uint32*)_aligned_malloc(256 * sizeof(uint32), 16); m_clut = (uint32*)_aligned_malloc(256 * sizeof(uint32), 32);
memset(m_clut, 0, sizeof(m_clut)); memset(m_clut, 0, sizeof(m_clut));
m_write.rect = (GSVector4i*)_aligned_malloc(3 * sizeof(GSVector4i), 16); m_write.rect = (GSVector4i*)_aligned_malloc(3 * sizeof(GSVector4i), 32);
m_write.count = 0; m_write.count = 0;
} }
@ -1082,7 +1082,7 @@ void GSTextureCache::Target::Update()
} }
else else
{ {
static uint8* buff = (uint8*)::_aligned_malloc(1024 * 1024 * 4, 16); static uint8* buff = (uint8*)::_aligned_malloc(1024 * 1024 * 4, 32);
int pitch = ((w + 3) & ~3) * 4; int pitch = ((w + 3) & ~3) * 4;

View File

@ -39,7 +39,7 @@ public:
FMT_8, FMT_8,
}; };
class Surface : public GSAlignedClass<16> class Surface : public GSAlignedClass<32>
{ {
protected: protected:
GSRenderer* m_renderer; GSRenderer* m_renderer;

View File

@ -253,7 +253,7 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX
if(m_buff == NULL) if(m_buff == NULL)
{ {
m_buff = _aligned_malloc(tw * th * sizeof(uint32), 16); m_buff = _aligned_malloc(tw * th * sizeof(uint32), 32);
if(m_buff == NULL) if(m_buff == NULL)
{ {

View File

@ -137,6 +137,7 @@ void GSDevice11::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
} }
VSSetShader(i->second.vs, m_vs_cb); VSSetShader(i->second.vs, m_vs_cb);
IASetInputLayout(i->second.il); IASetInputLayout(i->second.il);
} }

View File

@ -69,7 +69,7 @@ void GSDevice9::SetupIA(const void* vertices, int count, int prim)
void GSDevice9::SetupVS(VSSelector sel, const VSConstantBuffer* cb) void GSDevice9::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
{ {
hash_map< uint32, GSVertexShader9 >::const_iterator i = m_vs.find(sel); hash_map<uint32, GSVertexShader9>::const_iterator i = m_vs.find(sel);
if(i == m_vs.end()) if(i == m_vs.end())
{ {
@ -110,6 +110,7 @@ void GSDevice9::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
} }
VSSetShader(i->second.vs, (const float*)cb, sizeof(*cb) / sizeof(GSVector4)); VSSetShader(i->second.vs, (const float*)cb, sizeof(*cb) / sizeof(GSVector4));
IASetInputLayout(i->second.il); IASetInputLayout(i->second.il);
} }

View File

@ -27,26 +27,6 @@ const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f);
const GSVector4 GSVector4::m_x3f800000(_mm_castsi128_ps(_mm_set1_epi32(0x3f800000))); const GSVector4 GSVector4::m_x3f800000(_mm_castsi128_ps(_mm_set1_epi32(0x3f800000)));
const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000))); const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000)));
GSVector4i::GSVector4i(const GSVector4& v)
{
m = _mm_cvttps_epi32(v);
}
GSVector4::GSVector4(const GSVector4i& v)
{
m = _mm_cvtepi32_ps(v);
}
GSVector4i GSVector4i::cast(const GSVector4& v)
{
return GSVector4i(_mm_castps_si128(v.m));
}
GSVector4 GSVector4::cast(const GSVector4i& v)
{
return GSVector4(_mm_castsi128_ps(v.m));
}
GSVector4i GSVector4i::fit(int arx, int ary) const GSVector4i GSVector4i::fit(int arx, int ary) const
{ {
GSVector4i r = *this; GSVector4i r = *this;

File diff suppressed because it is too large Load Diff

View File

@ -28,7 +28,7 @@
#pragma pack(push, 1) #pragma pack(push, 1)
__aligned16 struct GSVertex __aligned32 struct GSVertex
{ {
union union
{ {

View File

@ -26,7 +26,7 @@
#pragma pack(push, 1) #pragma pack(push, 1)
__aligned16 union GSVertexHW9 __aligned32 union GSVertexHW9
{ {
struct struct
{ {
@ -56,7 +56,7 @@ __aligned16 union GSVertexHW9
float GetQ() {return p.w;} float GetQ() {return p.w;}
}; };
__aligned16 union GSVertexHW11 __aligned32 union GSVertexHW11
{ {
struct struct
{ {

View File

@ -31,7 +31,7 @@ public:
GSVertexList() GSVertexList()
: m_count(0) : m_count(0)
{ {
m_base = _aligned_malloc(sizeof(Vertex) * countof(m_v), 16); m_base = _aligned_malloc(sizeof(Vertex) * countof(m_v), 32);
for(int i = 0; i < countof(m_v); i++) for(int i = 0; i < countof(m_v); i++)
{ {

View File

@ -23,12 +23,16 @@
#include "GSVector.h" #include "GSVector.h"
__aligned16 union GSVertexSW __aligned32 union GSVertexSW
{ {
struct {GSVector4 c, p, t;}; struct {GSVector4 c, p, t;};
struct {GSVector4 v[3];}; struct {GSVector4 v[3];};
struct {float f[12];}; struct {float f[12];};
#if _M_SSE >= 0x500
struct {GSVector8 cp, t_;};
#endif
GSVertexSW() {} GSVertexSW() {}
GSVertexSW(const GSVertexSW& v) {*this = v;} GSVertexSW(const GSVertexSW& v) {*this = v;}
@ -213,4 +217,3 @@ __forceinline GSVertexSW operator / (const GSVertexSW& v, float f)
v0.t = v.t / vf; v0.t = v.t / vf;
return v0; return v0;
} }

View File

@ -120,8 +120,8 @@ void GSVertexTrace::Update(const GSVertexHW11* v, int count, GS_PRIM_CLASS primc
using namespace Xbyak; using namespace Xbyak;
GSVertexTrace::CGSW::CGSW(uint32 key, void* ptr, size_t maxsize) GSVertexTrace::CGSW::CGSW(uint32 key, void* code, size_t maxsize)
: CodeGenerator(maxsize, ptr) : CodeGenerator(maxsize, code)
{ {
#if _M_AMD64 #if _M_AMD64
#error TODO #error TODO
@ -161,10 +161,10 @@ GSVertexTrace::CGSW::CGSW(uint32 key, void* ptr, size_t maxsize)
static const float fmin = -FLT_MAX; static const float fmin = -FLT_MAX;
static const float fmax = FLT_MAX; static const float fmax = FLT_MAX;
movss(xmm0, xmmword[&fmax]); movss(xmm0, ptr[&fmax]);
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
movss(xmm1, xmmword[&fmin]); movss(xmm1, ptr[&fmin]);
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
if(color) if(color)
@ -202,7 +202,7 @@ L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS) if(tme && !fst && primclass == GS_SPRITE_CLASS)
{ {
movaps(xmm1, xmmword[edx + 1 * sizeof(GSVertexSW) + 32]); movaps(xmm1, ptr[edx + 1 * sizeof(GSVertexSW) + 32]);
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
} }
@ -213,7 +213,7 @@ L("loop");
// min.c = min.c.minv(v[i + j].c); // min.c = min.c.minv(v[i + j].c);
// max.c = max.c.maxv(v[i + j].c); // max.c = max.c.maxv(v[i + j].c);
movaps(xmm0, xmmword[edx + j * sizeof(GSVertexSW)]); movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW)]);
minps(xmm2, xmm0); minps(xmm2, xmm0);
maxps(xmm3, xmm0); maxps(xmm3, xmm0);
@ -222,7 +222,7 @@ L("loop");
// min.p = min.p.minv(v[i + j].p); // min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p); // max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, xmmword[edx + j * sizeof(GSVertexSW) + 16]); movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 16]);
minps(xmm4, xmm0); minps(xmm4, xmm0);
maxps(xmm5, xmm0); maxps(xmm5, xmm0);
@ -232,7 +232,7 @@ L("loop");
// min.t = min.t.minv(v[i + j].t); // min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t); // max.t = max.t.maxv(v[i + j].t);
movaps(xmm0, xmmword[edx + j * sizeof(GSVertexSW) + 32]); movaps(xmm0, ptr[edx + j * sizeof(GSVertexSW) + 32]);
if(!fst) if(!fst)
{ {
@ -265,27 +265,27 @@ L("loop");
{ {
cvttps2dq(xmm2, xmm2); cvttps2dq(xmm2, xmm2);
psrld(xmm2, 7); psrld(xmm2, 7);
movaps(xmmword[eax], xmm2); movaps(ptr[eax], xmm2);
cvttps2dq(xmm3, xmm3); cvttps2dq(xmm3, xmm3);
psrld(xmm3, 7); psrld(xmm3, 7);
movaps(xmmword[edx], xmm3); movaps(ptr[edx], xmm3);
} }
movaps(xmmword[eax + 16], xmm4); movaps(ptr[eax + 16], xmm4);
movaps(xmmword[edx + 16], xmm5); movaps(ptr[edx + 16], xmm5);
if(tme) if(tme)
{ {
movaps(xmmword[eax + 32], xmm6); movaps(ptr[eax + 32], xmm6);
movaps(xmmword[edx + 32], xmm7); movaps(ptr[edx + 32], xmm7);
} }
ret(); ret();
} }
GSVertexTrace::CGHW9::CGHW9(uint32 key, void* ptr, size_t maxsize) GSVertexTrace::CGHW9::CGHW9(uint32 key, void* code, size_t maxsize)
: CodeGenerator(maxsize, ptr) : CodeGenerator(maxsize, code)
{ {
#if _M_AMD64 #if _M_AMD64
#error TODO #error TODO
@ -327,10 +327,10 @@ GSVertexTrace::CGHW9::CGHW9(uint32 key, void* ptr, size_t maxsize)
static const float fmin = -FLT_MAX; static const float fmin = -FLT_MAX;
static const float fmax = FLT_MAX; static const float fmax = FLT_MAX;
movss(xmm0, xmmword[&fmax]); movss(xmm0, ptr[&fmax]);
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
movss(xmm1, xmmword[&fmin]); movss(xmm1, ptr[&fmin]);
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
if(color) if(color)
@ -368,7 +368,7 @@ L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS) if(tme && !fst && primclass == GS_SPRITE_CLASS)
{ {
movaps(xmm1, xmmword[edx + 5 * sizeof(GSVertexHW9) + 16]); movaps(xmm1, ptr[edx + 5 * sizeof(GSVertexHW9) + 16]);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
} }
@ -377,7 +377,7 @@ L("loop");
// min.p = min.p.minv(v[i + j].p); // min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p); // max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, xmmword[edx + j * sizeof(GSVertexHW9) + 16]); movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9) + 16]);
minps(xmm4, xmm0); minps(xmm4, xmm0);
maxps(xmm5, xmm0); maxps(xmm5, xmm0);
@ -390,7 +390,7 @@ L("loop");
if(color && (iip || j == n - 1) || tme) if(color && (iip || j == n - 1) || tme)
{ {
movaps(xmm0, xmmword[edx + j * sizeof(GSVertexHW9)]); movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW9)]);
} }
if(color && (iip || j == n - 1)) if(color && (iip || j == n - 1))
@ -455,15 +455,15 @@ L("loop");
punpcklwd(xmm3, xmm0); punpcklwd(xmm3, xmm0);
} }
movaps(xmmword[eax], xmm2); movaps(ptr[eax], xmm2);
movaps(xmmword[edx], xmm3); movaps(ptr[edx], xmm3);
} }
// m_min.p = pmin; // m_min.p = pmin;
// m_max.p = pmax; // m_max.p = pmax;
movaps(xmmword[eax + 16], xmm4); movaps(ptr[eax + 16], xmm4);
movaps(xmmword[edx + 16], xmm5); movaps(ptr[edx + 16], xmm5);
if(tme) if(tme)
{ {
@ -473,15 +473,15 @@ L("loop");
shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(xmmword[eax + 32], xmm6); movaps(ptr[eax + 32], xmm6);
movaps(xmmword[edx + 32], xmm7); movaps(ptr[edx + 32], xmm7);
} }
ret(); ret();
} }
GSVertexTrace::CGHW11::CGHW11(uint32 key, void* ptr, size_t maxsize) GSVertexTrace::CGHW11::CGHW11(uint32 key, void* code, size_t maxsize)
: CodeGenerator(maxsize, ptr) : CodeGenerator(maxsize, code)
{ {
#if _M_AMD64 #if _M_AMD64
#error TODO #error TODO
@ -521,10 +521,10 @@ GSVertexTrace::CGHW11::CGHW11(uint32 key, void* ptr, size_t maxsize)
static const float fmin = -FLT_MAX; static const float fmin = -FLT_MAX;
static const float fmax = FLT_MAX; static const float fmax = FLT_MAX;
movss(xmm0, xmmword[&fmax]); movss(xmm0, ptr[&fmax]);
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
movss(xmm1, xmmword[&fmin]); movss(xmm1, ptr[&fmin]);
shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
if(color) if(color)
@ -564,7 +564,7 @@ L("loop");
{ {
if(color && (iip || j == n - 1) || tme) if(color && (iip || j == n - 1) || tme)
{ {
movaps(xmm0, xmmword[edx + j * sizeof(GSVertexHW11)]); movaps(xmm0, ptr[edx + j * sizeof(GSVertexHW11)]);
} }
if(color && (iip || j == n - 1)) if(color && (iip || j == n - 1))
@ -593,7 +593,7 @@ L("loop");
maxps(xmm7, xmm0); maxps(xmm7, xmm0);
} }
movdqa(xmm0, xmmword[edx + j * sizeof(GSVertexHW11) + 16]); movdqa(xmm0, ptr[edx + j * sizeof(GSVertexHW11) + 16]);
if(m_cpu.has(util::Cpu::tSSE41)) if(m_cpu.has(util::Cpu::tSSE41))
{ {
@ -648,8 +648,8 @@ L("loop");
punpcklwd(xmm3, xmm0); punpcklwd(xmm3, xmm0);
} }
movaps(xmmword[eax], xmm2); movaps(ptr[eax], xmm2);
movaps(xmmword[edx], xmm3); movaps(ptr[edx], xmm3);
} }
// m_min.p = pmin.xyww(); // m_min.p = pmin.xyww();
@ -658,16 +658,16 @@ L("loop");
shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(xmmword[eax + 16], xmm4); movaps(ptr[eax + 16], xmm4);
movaps(xmmword[edx + 16], xmm5); movaps(ptr[edx + 16], xmm5);
if(tme) if(tme)
{ {
// m_min.t = tmin; // m_min.t = tmin;
// m_max.t = tmax; // m_max.t = tmax;
movaps(xmmword[eax + 32], xmm6); movaps(ptr[eax + 32], xmm6);
movaps(xmmword[edx + 32], xmm7); movaps(ptr[edx + 32], xmm7);
} }
ret(); ret();

View File

@ -31,7 +31,7 @@
class GSState; class GSState;
__aligned16 class GSVertexTrace __aligned32 class GSVertexTrace
{ {
struct Vertex {GSVector4i c; GSVector4 p, t;}; struct Vertex {GSVector4i c; GSVector4 p, t;};
struct VertexAlpha {int min, max; bool valid;}; struct VertexAlpha {int min, max; bool valid;};
@ -41,14 +41,14 @@ __aligned16 class GSVertexTrace
class CGSW : public Xbyak::CodeGenerator class CGSW : public Xbyak::CodeGenerator
{ {
public: public:
CGSW(uint32 key, void* ptr, size_t maxsize); CGSW(uint32 key, void* code, size_t maxsize);
}; };
class GSVertexTraceMapSW : public GSCodeGeneratorFunctionMap<CGSW, uint32, VertexTracePtr> class GSVertexTraceMapSW : public GSCodeGeneratorFunctionMap<CGSW, uint32, VertexTracePtr>
{ {
public: public:
GSVertexTraceMapSW() : GSCodeGeneratorFunctionMap("VertexTraceSW") {} GSVertexTraceMapSW() : GSCodeGeneratorFunctionMap("VertexTraceSW") {}
CGSW* Create(uint32 key, void* ptr, size_t maxsize) {return new CGSW(key, ptr, maxsize);} CGSW* Create(uint32 key, void* code, size_t maxsize) {return new CGSW(key, code, maxsize);}
}; };
class CGHW9 : public Xbyak::CodeGenerator class CGHW9 : public Xbyak::CodeGenerator
@ -63,7 +63,7 @@ __aligned16 class GSVertexTrace
{ {
public: public:
GSVertexTraceMapHW9() : GSCodeGeneratorFunctionMap("VertexTraceHW9") {} GSVertexTraceMapHW9() : GSCodeGeneratorFunctionMap("VertexTraceHW9") {}
CGHW9* Create(uint32 key, void* ptr, size_t maxsize) {return new CGHW9(key, ptr, maxsize);} CGHW9* Create(uint32 key, void* code, size_t maxsize) {return new CGHW9(key, code, maxsize);}
}; };
class CGHW11 : public Xbyak::CodeGenerator class CGHW11 : public Xbyak::CodeGenerator
@ -78,7 +78,7 @@ __aligned16 class GSVertexTrace
{ {
public: public:
GSVertexTraceMapHW11() : GSCodeGeneratorFunctionMap("VertexTraceHW11") {} GSVertexTraceMapHW11() : GSCodeGeneratorFunctionMap("VertexTraceHW11") {}
CGHW11* Create(uint32 key, void* ptr, size_t maxsize) {return new CGHW11(key, ptr, maxsize);} CGHW11* Create(uint32 key, void* code, size_t maxsize) {return new CGHW11(key, code, maxsize);}
}; };
GSVertexTraceMapSW m_map_sw; GSVertexTraceMapSW m_map_sw;

View File

@ -174,6 +174,7 @@ GSVector4i GSWnd::GetClientRect()
// Returns FALSE if the window has no title, or if the window title is under the strict // Returns FALSE if the window has no title, or if the window title is under the strict
// management of the emulator. // management of the emulator.
bool GSWnd::SetWindowText(const char* title) bool GSWnd::SetWindowText(const char* title)
{ {
if( !m_IsManaged ) return false; if( !m_IsManaged ) return false;

View File

@ -40,4 +40,4 @@ EXPORTS
GSgetLastTag GSgetLastTag
GSReplay GSReplay
GSBenchmark GSBenchmark
GSgetTitleInfo2 GSgetTitleInfo2

View File

@ -57,6 +57,7 @@
#include <algorithm> #include <algorithm>
// Let's take advantage of the work that's already been done on making things cross-platform by bringing this in. // Let's take advantage of the work that's already been done on making things cross-platform by bringing this in.
#include "Pcsx2Defs.h" #include "Pcsx2Defs.h"
using namespace std; using namespace std;
@ -126,7 +127,7 @@ typedef signed long long int64;
#define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA) #define D3DCOLORWRITEENABLE_RGBA (D3DCOLORWRITEENABLE_RED | D3DCOLORWRITEENABLE_GREEN | D3DCOLORWRITEENABLE_BLUE | D3DCOLORWRITEENABLE_ALPHA)
#define USE_UPSCALE_HACKS //Hacks intended to fix upscaling / rendering glitches in HW renderers #define USE_UPSCALE_HACKS // Hacks intended to fix upscaling / rendering glitches in HW renderers
// dxsdk beta missing these: // dxsdk beta missing these:
#define D3D11_SHADER_MACRO D3D10_SHADER_MACRO #define D3D11_SHADER_MACRO D3D10_SHADER_MACRO

View File

@ -1,12 +1,12 @@
#ifndef XBYAK_H_ #ifndef XBYAK_XBYAK_H_
#define XBYAK_H_ #define XBYAK_XBYAK_H_
/*! /*!
@file xbyak.h @file xbyak.h
@brief Xbyak ; JIT assembler for x86(IA32)/x64 by C++ @brief Xbyak ; JIT assembler for x86(IA32)/x64 by C++
@author herumi @author herumi
@version $Revision: 1.157 $ @version $Revision: 1.238 $
@url http://homepage1.nifty.com/herumi/soft/xbyak.html @url http://homepage1.nifty.com/herumi/soft/xbyak.html
@date $Date: 2008/12/30 04:53:11 $ @date $Date: 2011/02/04 03:46:09 $
@note modified new BSD license @note modified new BSD license
http://www.opensource.org/licenses/bsd-license.php http://www.opensource.org/licenses/bsd-license.php
*/ */
@ -15,9 +15,12 @@
#include <assert.h> #include <assert.h>
#include <map> #include <map>
#include <string> #include <string>
#ifdef __GNUC__ #include <algorithm>
#include <unistd.h> #ifdef _WIN32
#include <sys/mman.h> #include <windows.h>
#elif defined(__GNUC__)
#include <unistd.h>
#include <sys/mman.h>
#endif #endif
#ifdef __x86_64__ #ifdef __x86_64__
@ -45,13 +48,6 @@
#pragma warning(disable : 4127) /* condition is constant(for "if" trick) */ #pragma warning(disable : 4127) /* condition is constant(for "if" trick) */
#endif #endif
#endif #endif
#include <windows.h>
#endif
#ifndef NUM_OF_ARRAY
// template<class T, int N>
// size_t num_of_array(const T (&)[N]) { return N; }
#define NUM_OF_ARRAY(x) (sizeof(x) / sizeof(*x))
#endif #endif
namespace Xbyak { namespace Xbyak {
@ -59,29 +55,35 @@ namespace Xbyak {
#include "xbyak_bin2hex.h" #include "xbyak_bin2hex.h"
enum { enum {
DEFAULT_MAX_CODE_SIZE = 2048, DEFAULT_MAX_CODE_SIZE = 4096,
VERSION = 0x2070, /* 0xABCD = A.BC(D) */ VERSION = 0x2990, /* 0xABCD = A.BC(D) */
}; };
/* /*
#ifndef MIE_DEFINED_UINT32 #ifndef MIE_INTEGER_TYPE_DEFINED
#define MIE_DEFINED_UINT32 #define MIE_INTEGER_TYPE_DEFINED
#ifdef _MSC_VER #ifdef _MSC_VER
typedef unsigned __int64 uint64; typedef unsigned __int64 uint64;
#else typedef __int64 sint64;
typedef unsigned long long uint64; #else
#endif typedef unsigned long long uint64;
typedef unsigned int uint32; typedef long long sint64;
typedef unsigned short uint16; #endif
typedef unsigned char uint8; typedef unsigned int uint32;
#ifndef MIE_ALIGN typedef unsigned short uint16;
#ifdef _MSC_VER typedef unsigned char uint8;
#define MIE_ALIGN(x) __declspec(align(x))
#else
#define MIE_ALIGN(x) __attribute__((aligned(x)))
#endif
#endif
#endif #endif
*/ */
#ifndef MIE_ALIGN
#ifdef _MSC_VER
#define MIE_ALIGN(x) __declspec(align(x))
#else
#define MIE_ALIGN(x) __attribute__((aligned(x)))
#endif
#endif
#ifndef MIE_PACK // for shufps
#define MIE_PACK(x, y, z, w) ((x) * 64 + (y) * 16 + (z) * 4 + (w))
#endif
enum Error { enum Error {
ERR_NONE = 0, ERR_NONE = 0,
ERR_BAD_ADDRESSING, ERR_BAD_ADDRESSING,
@ -101,6 +103,10 @@ enum Error {
ERR_CANT_USE_64BIT_DISP, ERR_CANT_USE_64BIT_DISP,
ERR_OFFSET_IS_TOO_BIG, ERR_OFFSET_IS_TOO_BIG,
ERR_MEM_SIZE_IS_NOT_SPECIFIED, ERR_MEM_SIZE_IS_NOT_SPECIFIED,
ERR_BAD_MEM_SIZE,
ERR_BAD_ST_COMBINATION,
ERR_OVER_LOCAL_LABEL,
ERR_UNDER_LOCAL_LABEL,
ERR_INTERNAL ERR_INTERNAL
}; };
@ -125,6 +131,10 @@ static inline const char *ConvertErrorToString(Error err)
"can't use 64bit disp(use (void*))", "can't use 64bit disp(use (void*))",
"offset is too big", "offset is too big",
"MEM size is not specified", "MEM size is not specified",
"bad mem size",
"bad st combination",
"over local label",
"under local label",
"internal error", "internal error",
}; };
if (err < 0 || err > ERR_INTERNAL) return 0; if (err < 0 || err > ERR_INTERNAL) return 0;
@ -135,7 +145,7 @@ namespace inner {
enum { debug = 1 }; enum { debug = 1 };
static inline uint32 GetPtrDist(const void *p1, const void *p2 = 0) static inline uint32 GetPtrDist(const void *p1, const void *p2)
{ {
uint64 diff = static_cast<const char *>(p1) - static_cast<const char *>(p2); uint64 diff = static_cast<const char *>(p1) - static_cast<const char *>(p2);
#ifdef XBYAK64 #ifdef XBYAK64
@ -145,6 +155,7 @@ static inline uint32 GetPtrDist(const void *p1, const void *p2 = 0)
} }
static inline bool IsInDisp8(uint32 x) { return 0xFFFFFF80 <= x || x <= 0x7F; } static inline bool IsInDisp8(uint32 x) { return 0xFFFFFF80 <= x || x <= 0x7F; }
static inline bool IsInInt32(uint64 x) { return 0xFFFFFFFF80000000ULL <= x || x <= 0x7FFFFFFFU; }
} }
@ -163,7 +174,8 @@ public:
REG = 1 << 3, REG = 1 << 3,
MMX = 1 << 4, MMX = 1 << 4,
XMM = 1 << 5, XMM = 1 << 5,
FPU = 1 << 6 FPU = 1 << 6,
YMM = 1 << 7
}; };
enum Code { enum Code {
#ifdef XBYAK64 #ifdef XBYAK64
@ -191,10 +203,11 @@ public:
bool isNone() const { return kind_ == 0; } bool isNone() const { return kind_ == 0; }
bool isMMX() const { return is(MMX); } bool isMMX() const { return is(MMX); }
bool isXMM() const { return is(XMM); } bool isXMM() const { return is(XMM); }
bool isYMM() const { return is(YMM); }
bool isREG(int bit = 0) const { return is(REG, bit); } bool isREG(int bit = 0) const { return is(REG, bit); }
bool isMEM(int bit = 0) const { return is(MEM, bit); } bool isMEM(int bit = 0) const { return is(MEM, bit); }
bool isFPU() const { return is(FPU); }
bool isExt8bit() const { return ext8bit_ != 0; } bool isExt8bit() const { return ext8bit_ != 0; }
Operand changeBit(int bit) const { return Operand(idx_, static_cast<Kind>(kind_), bit, ext8bit_); }
// any bit is acceptable if bit == 0 // any bit is acceptable if bit == 0
bool is(int kind, uint32 bit = 0) const bool is(int kind, uint32 bit = 0) const
{ {
@ -216,12 +229,18 @@ public:
{ "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" }, { "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" },
}; };
return tbl[bit_ == 8 ? 0 : bit_ == 16 ? 1 : bit_ == 32 ? 2 : 3][idx_]; return tbl[bit_ == 8 ? 0 : bit_ == 16 ? 1 : bit_ == 32 ? 2 : 3][idx_];
} else if (isMMX()) { } else if (isYMM()) {
static const char tbl[8][4] = { "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" }; static const char tbl[16][5] = { "ym0", "ym1", "ym2", "ym3", "ym4", "ym5", "ym6", "ym7", "ym8", "ym9", "ym10", "ym11", "ym12", "ym13", "ym14", "ym15" };
return tbl[idx_]; return tbl[idx_];
} else if (isXMM()) { } else if (isXMM()) {
static const char tbl[16][5] = { "xm0", "xm1", "xm2", "xm3", "xm4", "xm5", "xm6", "xm7", "xm8", "xm9", "xm10", "xm11", "xm12", "xm13", "xm14", "xm15" }; static const char tbl[16][5] = { "xm0", "xm1", "xm2", "xm3", "xm4", "xm5", "xm6", "xm7", "xm8", "xm9", "xm10", "xm11", "xm12", "xm13", "xm14", "xm15" };
return tbl[idx_]; return tbl[idx_];
} else if (isMMX()) {
static const char tbl[8][4] = { "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" };
return tbl[idx_];
} else if (isFPU()) {
static const char tbl[8][4] = { "st0", "st1", "st2", "st3", "st4", "st5", "st6", "st7" };
return tbl[idx_];
} }
throw ERR_INTERNAL; throw ERR_INTERNAL;
} }
@ -229,14 +248,15 @@ public:
class Reg : public Operand { class Reg : public Operand {
void operator=(const Reg&); void operator=(const Reg&);
bool hasRex() const { return isExt8bit() | isREG(64) | isExtIdx(); }
public: public:
Reg() { } Reg() { }
Reg(int idx, Kind kind, int bit = 0, int ext8bit = 0) : Operand(idx, kind, bit, ext8bit) { } Reg(int idx, Kind kind, int bit = 0, int ext8bit = 0) : Operand(idx, kind, bit, ext8bit) { }
// reg = this Reg changeBit(int bit) const { return Reg(getIdx(), getKind(), bit, isExt8bit()); }
uint8 getRex(const Reg& index = Reg(), const Reg& base = Reg()) const bool isExtIdx() const { return getIdx() > 7; }
uint8 getRex(const Reg& base = Reg()) const
{ {
if ((!isExt8bit() && !index.isExt8bit() && !base.isExt8bit()) && (getIdx() | index.getIdx() | base.getIdx()) < 8) return 0; return (hasRex() || base.hasRex()) ? uint8(0x40 | ((isREG(64) | base.isREG(64)) ? 8 : 0) | (isExtIdx() ? 4 : 0)| (base.isExtIdx() ? 1 : 0)) : 0;
return uint8(0x40 | ((getIdx() >> 3) << 2)| ((index.getIdx() >> 3) << 1) | (base.getIdx() >> 3));
} }
}; };
@ -261,7 +281,19 @@ public:
class Xmm : public Mmx { class Xmm : public Mmx {
void operator=(const Xmm&); void operator=(const Xmm&);
public: public:
explicit Xmm(int idx) : Mmx(idx, Operand::XMM, 128) { } explicit Xmm(int idx, Kind kind = Operand::XMM, int bit = 128) : Mmx(idx, kind, bit) { }
};
class Ymm : public Xmm {
void operator=(const Ymm&);
public:
explicit Ymm(int idx) : Xmm(idx, Operand::YMM, 256) { }
};
class Fpu : public Reg {
void operator=(const Fpu&);
public:
explicit Fpu(int idx) : Reg(idx, Operand::FPU, 32) { }
}; };
// register for addressing(32bit or 64bit) // register for addressing(32bit or 64bit)
@ -307,7 +339,7 @@ private:
{ {
return operator+(r, -static_cast<int>(disp)); return operator+(r, -static_cast<int>(disp));
} }
void operator=(const Reg32e&); // don't call void operator=(const Reg32e&);
public: public:
explicit Reg32e(int idx, int bit) explicit Reg32e(int idx, int bit)
: Reg(idx, REG, bit) : Reg(idx, REG, bit)
@ -362,7 +394,7 @@ struct RegRip {
class CodeArray { class CodeArray {
enum { enum {
ALIGN_SIZE = 16, ALIGN_PAGE_SIZE = 4096,
MAX_FIXED_BUF_SIZE = 8 MAX_FIXED_BUF_SIZE = 8
}; };
enum Type { enum Type {
@ -381,13 +413,12 @@ protected:
public: public:
CodeArray(size_t maxSize = MAX_FIXED_BUF_SIZE, void *userPtr = 0) CodeArray(size_t maxSize = MAX_FIXED_BUF_SIZE, void *userPtr = 0)
: type_(userPtr ? USER_BUF : maxSize <= MAX_FIXED_BUF_SIZE ? FIXED_BUF : ALLOC_BUF) : type_(userPtr ? USER_BUF : maxSize <= MAX_FIXED_BUF_SIZE ? FIXED_BUF : ALLOC_BUF)
, allocPtr_(type_ == ALLOC_BUF ? new uint8[maxSize + ALIGN_SIZE] : 0) , allocPtr_(type_ == ALLOC_BUF ? new uint8[maxSize + ALIGN_PAGE_SIZE] : 0)
, maxSize_(maxSize) , maxSize_(maxSize)
, top_(type_ == ALLOC_BUF ? getAlignedAddress(allocPtr_) : type_ == USER_BUF ? reinterpret_cast<uint8*>(userPtr) : buf_) , top_(type_ == ALLOC_BUF ? getAlignedAddress(allocPtr_, ALIGN_PAGE_SIZE) : type_ == USER_BUF ? reinterpret_cast<uint8*>(userPtr) : buf_)
, size_(0) , size_(0)
{ {
if (type_ == ALLOC_BUF && !protect(top_, maxSize, true)) { if (type_ == ALLOC_BUF && !protect(top_, maxSize, true)) {
// fprintf(stderr, "can't protect (addr=%p, size=%u, canExec=%d)\n", addr, size, canExec);
throw ERR_CANT_PROTECT; throw ERR_CANT_PROTECT;
} }
} }
@ -452,19 +483,19 @@ public:
/* /*
@param data [in] address of jmp data @param data [in] address of jmp data
@param disp [in] offset from the next of jmp @param disp [in] offset from the next of jmp
@param isShort [in] true if short jmp @param size [in] write size(1, 2, 4, 8)
*/ */
void rewrite(uint8 *data, uint32 disp, bool isShort) void rewrite(uint8 *data, uint64 disp, size_t size)
{ {
if (isShort) { if (size != 1 && size != 2 && size != 4 && size != 8) throw ERR_BAD_PARAMETER;
data[0] = static_cast<uint8>(disp); for (size_t i = 0; i < size; i++) {
} else { data[i] = static_cast<uint8>(disp >> (i * 8));
data[0] = static_cast<uint8>(disp);
data[1] = static_cast<uint8>(disp >> 8);
data[2] = static_cast<uint8>(disp >> 16);
data[3] = static_cast<uint8>(disp >> 24);
} }
} }
void updateRegField(uint8 regIdx) const
{
*top_ = (*top_ & B11000111) | ((regIdx << 3) & B00111000);
}
/** /**
change exec permission of memory change exec permission of memory
@param addr [in] buffer address @param addr [in] buffer address
@ -474,15 +505,15 @@ public:
*/ */
static inline bool protect(const void *addr, size_t size, bool canExec) static inline bool protect(const void *addr, size_t size, bool canExec)
{ {
#ifdef __GNUC__ #if defined(_WIN32)
DWORD oldProtect;
return VirtualProtect(const_cast<void*>(addr), size, canExec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldProtect) != 0;
#elif defined(__GNUC__)
size_t pageSize = sysconf(_SC_PAGESIZE); size_t pageSize = sysconf(_SC_PAGESIZE);
size_t iaddr = reinterpret_cast<size_t>(addr); size_t iaddr = reinterpret_cast<size_t>(addr);
size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1)); size_t roundAddr = iaddr & ~(pageSize - static_cast<size_t>(1));
int mode = PROT_READ | PROT_WRITE | (canExec ? PROT_EXEC : 0); int mode = PROT_READ | PROT_WRITE | (canExec ? PROT_EXEC : 0);
return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0; return mprotect(reinterpret_cast<void*>(roundAddr), size + (iaddr - roundAddr), mode) == 0;
#elif defined(_WIN32)
DWORD oldProtect;
return VirtualProtect(const_cast<void*>(addr), size, canExec ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE, &oldProtect) != 0;
#else #else
return true; return true;
#endif #endif
@ -493,7 +524,7 @@ public:
@param alignedSize [in] power of two @param alignedSize [in] power of two
@return aligned addr by alignedSize @return aligned addr by alignedSize
*/ */
static inline uint8 *getAlignedAddress(uint8 *addr, size_t alignedSize = ALIGN_SIZE) static inline uint8 *getAlignedAddress(uint8 *addr, size_t alignedSize = 16)
{ {
return reinterpret_cast<uint8*>((reinterpret_cast<size_t>(addr) + alignedSize - 1) & ~(alignedSize - static_cast<size_t>(1))); return reinterpret_cast<uint8*>((reinterpret_cast<size_t>(addr) + alignedSize - 1) & ~(alignedSize - static_cast<size_t>(1)));
} }
@ -521,11 +552,7 @@ public:
uint64 getDisp() const { return disp_; } uint64 getDisp() const { return disp_; }
uint8 getRex() const { return rex_; } uint8 getRex() const { return rex_; }
bool is64bitDisp() const { return is64bitDisp_; } // for moffset bool is64bitDisp() const { return is64bitDisp_; } // for moffset
#ifdef XBYAK64
void setRex(uint8 rex) { rex_ = rex; } void setRex(uint8 rex) { rex_ = rex; }
#else
void setRex(uint8) { }
#endif
}; };
class AddressFrame { class AddressFrame {
@ -536,7 +563,11 @@ public:
explicit AddressFrame(uint32 bit) : bit_(bit) { } explicit AddressFrame(uint32 bit) : bit_(bit) { }
Address operator[](const void *disp) const Address operator[](const void *disp) const
{ {
Reg32e r(Reg(), Reg(), 0, inner::GetPtrDist(disp)); size_t adr = reinterpret_cast<size_t>(disp);
#ifdef XBYAK64
if (adr > 0xFFFFFFFFU) throw ERR_OFFSET_IS_TOO_BIG;
#endif
Reg32e r(Reg(), Reg(), 0, static_cast<uint32>(adr));
return operator[](r); return operator[](r);
} }
#ifdef XBYAK64 #ifdef XBYAK64
@ -587,7 +618,8 @@ public:
} else if (mod == mod10 || (mod == mod00 && r.isNone())) { } else if (mod == mod10 || (mod == mod00 && r.isNone())) {
frame.dd(r.disp_); frame.dd(r.disp_);
} }
frame.setRex(Reg().getRex(r.index_, r)); uint8 rex = ((r.getIdx() | r.index_.getIdx()) < 8) ? 0 : uint8(0x40 | ((r.index_.getIdx() >> 3) << 1) | (r.getIdx() >> 3));
frame.setRex(rex);
return frame; return frame;
} }
}; };
@ -600,6 +632,12 @@ struct JmpLabel {
class Label { class Label {
CodeArray *base_; CodeArray *base_;
int anonymousCount_; // for @@, @f, @b int anonymousCount_; // for @@, @f, @b
enum {
maxStack = 10
};
int stack_[maxStack];
int stackPos_;
int usedCount_;
int localCount_; // for .*** int localCount_; // for .***
typedef std::map<const std::string, const uint8*> DefinedList; typedef std::map<const std::string, const uint8*> DefinedList;
typedef std::multimap<const std::string, const JmpLabel> UndefinedList; typedef std::multimap<const std::string, const JmpLabel> UndefinedList;
@ -628,15 +666,22 @@ public:
Label() Label()
: base_(0) : base_(0)
, anonymousCount_(0) , anonymousCount_(0)
, stackPos_(1)
, usedCount_(0)
, localCount_(0) , localCount_(0)
{ {
} }
void incLocalCount() { localCount_++; } void enterLocal()
void decLocalCount() { localCount_--; }
void set(CodeArray *base)
{ {
base_ = base; if (stackPos_ == maxStack) throw ERR_OVER_LOCAL_LABEL;
localCount_ = stack_[stackPos_++] = ++usedCount_;
} }
void leaveLocal()
{
if (stackPos_ == 1) throw ERR_UNDER_LOCAL_LABEL;
localCount_ = stack_[--stackPos_ - 1];
}
void set(CodeArray *base) { base_ = base; }
void define(const char *label, const uint8 *address) void define(const char *label, const uint8 *address)
{ {
std::string newLabel(label); std::string newLabel(label);
@ -657,8 +702,9 @@ public:
const JmpLabel *jmp = &itr->second; const JmpLabel *jmp = &itr->second;
uint32 disp = inner::GetPtrDist(address, jmp->endOfJmp); uint32 disp = inner::GetPtrDist(address, jmp->endOfJmp);
if (jmp->isShort && !inner::IsInDisp8(disp)) throw ERR_LABEL_IS_TOO_FAR; if (jmp->isShort && !inner::IsInDisp8(disp)) throw ERR_LABEL_IS_TOO_FAR;
uint8 *data = jmp->endOfJmp - (jmp->isShort ? 1 : 4); size_t jmpSize = jmp->isShort ? 1 : 4;
base_->rewrite(data, disp, jmp->isShort); uint8 *data = jmp->endOfJmp - jmpSize;
base_->rewrite(data, disp, jmpSize);
undefinedList_.erase(itr); undefinedList_.erase(itr);
} }
} }
@ -689,22 +735,22 @@ public:
static inline std::string toStr(int num) static inline std::string toStr(int num)
{ {
char buf[16]; char buf[16];
static const char fmt[] = ".%08x";
#ifdef _WIN32 #ifdef _WIN32
#if _MSC_VER < 1400 #if _MSC_VER < 1400
_snprintf(buf, sizeof(buf), fmt, num); _snprintf
#else #else
_snprintf_s(buf, sizeof(buf), fmt, num); _snprintf_s
#endif #endif
#else #else
snprintf(buf, sizeof(buf), fmt, num); snprintf
#endif #endif
(buf, sizeof(buf), ".%08x", num);
return buf; return buf;
} }
}; };
class CodeGenerator : public CodeArray { class CodeGenerator : public CodeArray {
protected: public:
enum LabelType { enum LabelType {
T_SHORT, T_SHORT,
T_NEAR, T_NEAR,
@ -747,35 +793,43 @@ private:
{ {
return op1.isREG(i32e) && (op2.isXMM() || op2.isMEM()); return op1.isREG(i32e) && (op2.isXMM() || op2.isMEM());
} }
// Emit the 0x66 prefix when one operand is 16 bit wide and the other one is
// not an i32e-sized operand.
void if16bit(const Operand& reg1, const Operand& reg2)
{
	// a 16 bit operand paired with a 32/64 bit one (movsx style) gets no prefix
	const bool firstIsPlain16 = reg1.isBit(16) && !reg2.isBit(i32e);
	if (firstIsPlain16 || (reg2.isBit(16) && !reg1.isBit(i32e))) {
		db(0x66);
	}
}
void rexAddr(const Address& addr, const Reg& reg = Reg())
{
#ifdef XBYAK64
if (addr.is32bit_) db(0x67);
#endif
if16bit(reg, addr);
uint32 rex = addr.getRex() | reg.getRex();
if (reg.isREG(64)) rex |= 0x48;
if (rex) db(rex);
}
void rex(const Operand& op1, const Operand& op2 = Operand()) void rex(const Operand& op1, const Operand& op2 = Operand())
{ {
if (op1.isMEM()) { uint8 rex = 0;
rexAddr(static_cast<const Address&>(op1), static_cast<const Reg&>(op2)); const Operand *p1 = &op1, *p2 = &op2;
} else if (op2.isMEM()) { if (p1->isMEM()) std::swap(p1, p2);
rexAddr(static_cast<const Address&>(op2), static_cast<const Reg&>(op1)); if (p1->isMEM()) throw ERR_BAD_COMBINATION;
if (p2->isMEM()) {
const Address& addr = static_cast<const Address&>(*p2);
if (BIT == 64 && addr.is32bit_) db(0x67);
rex = addr.getRex() | static_cast<const Reg&>(*p1).getRex();
} else { } else {
const Reg& reg1 = static_cast<const Reg&>(op1);
const Reg& reg2 = static_cast<const Reg&>(op2);
// ModRM(reg, base); // ModRM(reg, base);
if16bit(reg1, reg2); rex = static_cast<const Reg&>(op2).getRex(static_cast<const Reg&>(op1));
uint8 rex = reg2.getRex(Reg(), reg1); }
if (reg1.isREG(64) || reg2.isREG(64)) rex |= 0x48; // except movsx(16bit, 32/64bit)
if (rex) db(rex); if ((op1.isBit(16) && !op2.isBit(i32e)) || (op2.isBit(16) && !op1.isBit(i32e))) db(0x66);
if (rex) db(rex);
}
// Bit flags that are OR-ed together and passed as the `type` argument of vex().
// PP_* select the value of the 2-bit pp field, MM_* select the value of the
// mmmm field; vex() tests them in the order listed in each group.
enum AVXtype {
PP_NONE = 1 << 0, // no PP_66/PP_F3/PP_F2 flag: vex() uses pp = 0
PP_66 = 1 << 1, // pp = 1
PP_F3 = 1 << 2, // pp = 2
PP_F2 = 1 << 3, // pp = 3
MM_RESERVED = 1 << 4, // not tested by vex(); mmmm stays 0 unless an MM_0F* flag is set
MM_0F = 1 << 5, // mmmm = 1; the only map for which vex() may pick the 2-byte form
MM_0F38 = 1 << 6, // mmmm = 2
MM_0F3A = 1 << 7 // mmmm = 3
};
// Emit a VEX prefix: the 2-byte form (0xC5) when possible, otherwise the
// 3-byte form (0xC4).
//   r, x, b : extension flags; each is stored inverted (flag set -> bit cleared)
//   idx     : register index, stored inverted in a 4-bit field
//   is256   : sets bit 2 of the last prefix byte
//   type    : OR of AVXtype flags (PP_* -> pp field, MM_* -> mmmm field)
//   w       : bit 7 of the last byte in the 3-byte form
// NOTE: w defaults to 1, so the 2-byte form is only chosen when the caller
// passes w = 0 explicitly (together with x == b == false and MM_0F).
void vex(bool r, int idx, bool is256, int type, bool x = false, bool b = false, int w = 1)
{
uint32 pp = (type & PP_66) ? 1 : (type & PP_F3) ? 2 : (type & PP_F2) ? 3 : 0;
// despite its name this holds the whole low part of the last byte:
// inverted idx in bits 3-6, the 256-bit flag in bit 2 and pp in bits 0-1
uint32 vvvv = (((~idx) & 15) << 3) | (is256 ? 4 : 0) | pp;
if (!b && !x && !w && (type & MM_0F)) {
// 2-byte form: inverted r in bit 7, followed by the packed field above
db(0xC5); db((r ? 0 : 0x80) | vvvv);
} else {
uint32 mmmm = (type & MM_0F) ? 1 : (type & MM_0F38) ? 2 : (type & MM_0F3A) ? 3 : 0;
// 3-byte form: inverted r/x/b in bits 7/6/5 plus mmmm, then w in bit 7 plus the packed field
db(0xC4); db((r ? 0 : 0x80) | (x ? 0 : 0x40) | (b ? 0 : 0x20) | mmmm); db((w << 7) | vvvv);
} }
} }
Label label_; Label label_;
@ -792,10 +846,8 @@ private:
if (addr.is64bitDisp()) throw ERR_CANT_USE_64BIT_DISP; if (addr.is64bitDisp()) throw ERR_CANT_USE_64BIT_DISP;
rex(addr, reg); rex(addr, reg);
db(code0 | (reg.isBit(8) ? 0 : 1)); if (code1 != NONE) db(code1); if (code2 != NONE) db(code2); db(code0 | (reg.isBit(8) ? 0 : 1)); if (code1 != NONE) db(code1); if (code2 != NONE) db(code2);
uint8 t = *addr.getCode(); addr.updateRegField(static_cast<uint8>(reg.getIdx()));
assert((t & ~0xC7) == 0); /* 0b11000111 */ db(addr.getCode(), static_cast<int>(addr.getSize()));
db(t | ((reg.getIdx() & 7) << 3)); // update reg field
db(addr.getCode() + 1, static_cast<int>(addr.getSize()) - 1);
} }
void opJmp(const char *label, LabelType type, uint8 shortCode, uint8 longCode, uint8 longPref) void opJmp(const char *label, LabelType type, uint8 shortCode, uint8 longCode, uint8 longPref)
{ {
@ -835,13 +887,13 @@ private:
if (type != T_NEAR && inner::IsInDisp8(disp - shortJmpSize)) { if (type != T_NEAR && inner::IsInDisp8(disp - shortJmpSize)) {
db(shortCode); db(shortCode);
db(0); db(0);
rewrite(top + shortHeaderSize, disp - shortJmpSize, true); rewrite(top + shortHeaderSize, disp - shortJmpSize, 1);
} else { } else {
if (type == T_SHORT) throw ERR_LABEL_IS_TOO_FAR; if (type == T_SHORT) throw ERR_LABEL_IS_TOO_FAR;
if (longPref) db(longPref); if (longPref) db(longPref);
db(longCode); db(longCode);
dd(0); dd(0);
rewrite(top + longHeaderSize, disp - longJmpSize, false); rewrite(top + longHeaderSize, disp - longJmpSize, 4);
} }
} }
/* preCode is for SSSE3/SSE4 */ /* preCode is for SSSE3/SSE4 */
@ -864,8 +916,7 @@ private:
} }
void opMMX(const Mmx& mmx, const Operand& op, int code, int pref = 0x66, int imm8 = NONE, int preCode = NONE) void opMMX(const Mmx& mmx, const Operand& op, int code, int pref = 0x66, int imm8 = NONE, int preCode = NONE)
{ {
pref = mmx.isXMM() ? pref : NONE; opGen(mmx, op, code, mmx.isXMM() ? pref : NONE, isXMMorMMX_MEM, imm8, preCode);
opGen(mmx, op, code, pref, isXMMorMMX_MEM, imm8, preCode);
} }
void opMovXMM(const Operand& op1, const Operand& op2, int code, int pref) void opMovXMM(const Operand& op1, const Operand& op2, int code, int pref)
{ {
@ -887,14 +938,14 @@ private:
opGen(mmx, op, code, 0x66, isXMM_REG32orMEM, imm, B00111010); opGen(mmx, op, code, 0x66, isXMM_REG32orMEM, imm, B00111010);
} }
} }
void opR_ModM(const Operand& op, int bit, uint8 mod, int ext, int code0, int code1 = NONE, int code2 = NONE) void opR_ModM(const Operand& op, int bit, int ext, int code0, int code1 = NONE, int code2 = NONE, bool disableRex = false)
{ {
int opBit = op.getBit();
if (disableRex && opBit == 64) opBit = 32;
if (op.isREG(bit)) { if (op.isREG(bit)) {
rex(op); opModR(Reg(ext, Operand::REG, opBit), static_cast<const Reg&>(op).changeBit(opBit), code0, code1, code2);
db(code0 | (op.isBit(8) ? 0 : 1)); if (code1 != NONE) db(code1); if (code2 != NONE) db(code2);
db(getModRM(mod, ext, op.getIdx()));
} else if (op.isMEM()) { } else if (op.isMEM()) {
opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code0, code1, code2); opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, opBit), code0, code1, code2);
} else { } else {
throw ERR_BAD_COMBINATION; throw ERR_BAD_COMBINATION;
} }
@ -902,13 +953,13 @@ private:
void opShift(const Operand& op, int imm, int ext) void opShift(const Operand& op, int imm, int ext)
{ {
verifyMemHasSize(op); verifyMemHasSize(op);
opR_ModM(op, 0, 3, ext, (B11000000 | ((imm == 1 ? 1 : 0) << 4))); opR_ModM(op, 0, ext, (B11000000 | ((imm == 1 ? 1 : 0) << 4)));
if (imm != 1) db(imm); if (imm != 1) db(imm);
} }
void opShift(const Operand& op, const Reg8& cl, int ext) void opShift(const Operand& op, const Reg8& cl, int ext)
{ {
if (cl.getIdx() != Operand::CL) throw ERR_BAD_COMBINATION; if (cl.getIdx() != Operand::CL) throw ERR_BAD_COMBINATION;
opR_ModM(op, 0, 3, ext, B11010010); opR_ModM(op, 0, ext, B11010010);
} }
void opModRM(const Operand& op1, const Operand& op2, bool condR, bool condM, int code0, int code1 = NONE, int code2 = NONE) void opModRM(const Operand& op1, const Operand& op2, bool condR, bool condM, int code0, int code1 = NONE, int code2 = NONE)
{ {
@ -941,20 +992,19 @@ private:
verifyMemHasSize(op); verifyMemHasSize(op);
uint32 immBit = inner::IsInDisp8(imm) ? 8 : isInDisp16(imm) ? 16 : 32; uint32 immBit = inner::IsInDisp8(imm) ? 8 : isInDisp16(imm) ? 16 : 32;
if (op.getBit() < immBit) throw ERR_IMM_IS_TOO_BIG; if (op.getBit() < immBit) throw ERR_IMM_IS_TOO_BIG;
if (op.isREG()) { if (op.isREG(32|64) && immBit == 16) immBit = 32; /* don't use MEM16 if 32/64bit mode */
if (immBit == 16 && op.isBit(32)) immBit = 32; /* don't use MEM16 if 32bit mode */
}
if (op.isREG() && op.getIdx() == 0 && (op.getBit() == immBit || (op.isBit(64) && immBit == 32))) { // rax, eax, ax, al if (op.isREG() && op.getIdx() == 0 && (op.getBit() == immBit || (op.isBit(64) && immBit == 32))) { // rax, eax, ax, al
rex(op); rex(op);
db(code | 4 | (immBit == 8 ? 0 : 1)); db(code | 4 | (immBit == 8 ? 0 : 1));
} else { } else {
int tmp = (op.getBit() > immBit && 32 > immBit) ? 2 : 0; int tmp = immBit < (std::min)(op.getBit(), 32U) ? 2 : 0;
opR_ModM(op, 0, 3, ext, B10000000 | tmp); opR_ModM(op, 0, ext, B10000000 | tmp);
} }
db(imm, immBit / 8); db(imm, immBit / 8);
} }
void opIncDec(const Operand& op, int code, int ext) void opIncDec(const Operand& op, int code, int ext)
{ {
verifyMemHasSize(op);
#ifndef XBYAK64 #ifndef XBYAK64
if (op.isREG() && !op.isBit(8)) { if (op.isREG() && !op.isBit(8)) {
rex(op); db(code | op.getIdx()); rex(op); db(code | op.getIdx());
@ -964,21 +1014,15 @@ private:
code = B11111110; code = B11111110;
if (op.isREG()) { if (op.isREG()) {
opModR(Reg(ext, Operand::REG, op.getBit()), static_cast<const Reg&>(op), code); opModR(Reg(ext, Operand::REG, op.getBit()), static_cast<const Reg&>(op), code);
} else if (op.isMEM() && op.getBit() > 0) {
opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code);
} else { } else {
throw ERR_BAD_COMBINATION; opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code);
} }
} }
void opPushPop(const Operand& op, int code, int ext, int alt) void opPushPop(const Operand& op, int code, int ext, int alt)
{ {
if (op.isREG()) { if (op.isREG()) {
#ifdef XBYAK64
if (op.isBit(16)) db(0x66); if (op.isBit(16)) db(0x66);
if (static_cast<const Reg&>(op).getIdx() >= 8) db(0x41); if (static_cast<const Reg&>(op).getIdx() >= 8) db(0x41);
#else
rex(op);
#endif
db(alt | (op.getIdx() & 7)); db(alt | (op.getIdx() & 7));
} else if (op.isMEM()) { } else if (op.isMEM()) {
opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code); opModM(static_cast<const Address&>(op), Reg(ext, Operand::REG, op.getBit()), code);
@ -990,16 +1034,51 @@ private:
{ {
if (op.isMEM() && op.getBit() == 0) throw ERR_MEM_SIZE_IS_NOT_SPECIFIED; if (op.isMEM() && op.getBit() == 0) throw ERR_MEM_SIZE_IS_NOT_SPECIFIED;
} }
protected: void opMovxx(const Reg& reg, const Operand& op, uint8 code)
{
int w = op.isBit(16);
bool cond = reg.isREG() && (reg.getBit() > op.getBit());
opModRM(reg, op, cond && op.isREG(), cond && op.isMEM(), 0x0F, code | w);
}
void opFpuMem(const Address& addr, uint8 m16, uint8 m32, uint8 m64, uint8 ext, uint8 m64ext)
{
if (addr.is64bitDisp()) throw ERR_CANT_USE_64BIT_DISP;
uint8 code = addr.isBit(16) ? m16 : addr.isBit(32) ? m32 : addr.isBit(64) ? m64 : 0;
if (!code) throw ERR_BAD_MEM_SIZE;
if (m64ext && addr.isBit(64)) ext = m64ext;
rex(addr, st0);
db(code);
addr.updateRegField(ext);
db(addr.getCode(), static_cast<int>(addr.getSize()));
}
// like yasm not nasm
// use code1 if reg1 == st0
// use code2 if reg1 != st0 && reg2 == st0
void opFpuFpu(const Fpu& reg1, const Fpu& reg2, uint32 code1, uint32 code2)
{
uint32 code = reg1.getIdx() == 0 ? code1 : reg2.getIdx() == 0 ? code2 : 0;
if (!code) throw ERR_BAD_ST_COMBINATION;
db(uint8(code >> 8));
db(uint8(code | (reg1.getIdx() | reg2.getIdx())));
}
// Emit a two-byte FPU instruction with a single stack-register operand:
// code1 unchanged, then code2 with the register index OR-ed in.
void opFpu(const Fpu& reg, uint8 code1, uint8 code2)
{
	db(code1);
	db(code2 | reg.getIdx());
}
public:
unsigned int getVersion() const { return VERSION; } unsigned int getVersion() const { return VERSION; }
using CodeArray::db; using CodeArray::db;
const Mmx mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; const Mmx mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
const Xmm xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; const Xmm xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
const Ymm ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7;
const Xmm &xm0, &xm1, &xm2, &xm3, &xm4, &xm5, &xm6, &xm7; const Xmm &xm0, &xm1, &xm2, &xm3, &xm4, &xm5, &xm6, &xm7;
const Ymm &ym0, &ym1, &ym2, &ym3, &ym4, &ym5, &ym6, &ym7;
const Reg32 eax, ecx, edx, ebx, esp, ebp, esi, edi; const Reg32 eax, ecx, edx, ebx, esp, ebp, esi, edi;
const Reg16 ax, cx, dx, bx, sp, bp, si, di; const Reg16 ax, cx, dx, bx, sp, bp, si, di;
const Reg8 al, cl, dl, bl, ah, ch, dh, bh; const Reg8 al, cl, dl, bl, ah, ch, dh, bh;
const AddressFrame ptr, byte, word, dword, qword, xmmword; const AddressFrame ptr, byte, word, dword, qword;
const Fpu st0, st1, st2, st3, st4, st5, st6, st7;
#ifdef XBYAK64 #ifdef XBYAK64
const Reg64 rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15; const Reg64 rax, rcx, rdx, rbx, rsp, rbp, rsi, rdi, r8, r9, r10, r11, r12, r13, r14, r15;
const Reg32 r8d, r9d, r10d, r11d, r12d, r13d, r14d, r15d; const Reg32 r8d, r9d, r10d, r11d, r12d, r13d, r14d, r15d;
@ -1007,7 +1086,9 @@ protected:
const Reg8 r8b, r9b, r10b, r11b, r12b, r13b, r14b, r15b; const Reg8 r8b, r9b, r10b, r11b, r12b, r13b, r14b, r15b;
const Reg8 spl, bpl, sil, dil; const Reg8 spl, bpl, sil, dil;
const Xmm xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; const Xmm xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
const Xmm &xm8, &xm9, &xm10, &xm11, &xm12, &xm13, &xm14, &xm15; const Ymm ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
const Xmm &xm8, &xm9, &xm10, &xm11, &xm12, &xm13, &xm14, &xm15; // for my convenience
const Ymm &ym8, &ym9, &ym10, &ym11, &ym12, &ym13, &ym14, &ym15;
const RegRip rip; const RegRip rip;
#endif #endif
@ -1015,8 +1096,8 @@ protected:
{ {
label_.define(label, getCurr()); label_.define(label, getCurr());
} }
void inLocalLabel() { label_.incLocalCount(); } void inLocalLabel() { label_.enterLocal(); }
void outLocalLabel() { label_.decLocalCount(); } void outLocalLabel() { label_.leaveLocal(); }
void jmp(const char *label, LabelType type = T_AUTO) void jmp(const char *label, LabelType type = T_AUTO)
{ {
opJmp(label, type, B11101011, B11101001, 0); opJmp(label, type, B11101011, B11101001, 0);
@ -1027,7 +1108,11 @@ protected:
} }
void jmp(const Operand& op) void jmp(const Operand& op)
{ {
opR_ModM(op, i32e, 3, 4, 0xFF); opR_ModM(op, BIT, 4, 0xFF, NONE, NONE, true);
}
void call(const Operand& op)
{
opR_ModM(op, 16 | i32e, 2, 0xFF, NONE, NONE, true);
} }
// (REG|MEM, REG) // (REG|MEM, REG)
void test(const Operand& op, const Reg& reg) void test(const Operand& op, const Reg& reg)
@ -1042,10 +1127,9 @@ protected:
rex(op); rex(op);
db(B10101000 | (op.isBit(8) ? 0 : 1)); db(B10101000 | (op.isBit(8) ? 0 : 1));
} else { } else {
opR_ModM(op, 0, 3, 0, B11110110); opR_ModM(op, 0, 0, B11110110);
} }
int size = op.getBit() / 8; if (size > 4) size = 4; db(imm, (std::min)(op.getBit() / 8, 4U));
db(imm, size);
} }
void ret(int imm = 0) void ret(int imm = 0)
{ {
@ -1134,24 +1218,39 @@ protected:
opRM_RM(reg1, reg2, B10001000); opRM_RM(reg1, reg2, B10001000);
} }
} }
void mov(const Operand& op, uint64 imm) void mov(const Operand& op,
#ifdef XBYAK64
uint64
#else
uint32
#endif
imm)
{ {
verifyMemHasSize(op); verifyMemHasSize(op);
if (op.isREG()) { if (op.isREG()) {
int w = op.isBit(8) ? 0 : 1; rex(op);
rex(op); db(B10110000 | (w << 3) | (op.getIdx() & 7)); int code, size;
#ifdef XBYAK64
if (op.isBit(64) && inner::IsInInt32(imm)) {
db(B11000111);
code = B11000000;
size = 4;
} else
#endif
{
code = B10110000 | ((op.isBit(8) ? 0 : 1) << 3);
size = op.getBit() / 8;
}
db(code | (op.getIdx() & 7));
db(imm, size);
} else if (op.isMEM()) { } else if (op.isMEM()) {
opModM(static_cast<const Address&>(op), Reg(0, Operand::REG, op.getBit()), B11000110); opModM(static_cast<const Address&>(op), Reg(0, Operand::REG, op.getBit()), B11000110);
int size = op.getBit() / 8; if (size > 4) size = 4;
db(static_cast<uint32>(imm), size);
} else { } else {
throw ERR_BAD_COMBINATION; throw ERR_BAD_COMBINATION;
} }
db(imm, op.getBit() / 8);
}
void opMovxx(const Reg& reg, const Operand& op, uint8 code)
{
int w = op.isBit(16);
bool cond = reg.isREG() && (reg.getBit() > op.getBit());
opModRM(reg, op, cond && op.isREG(), cond && op.isMEM(), 0x0F, code | w);
} }
void cmpxchg8b(const Address& addr) { opModM(addr, Reg32(1), 0x0F, B11000111); } void cmpxchg8b(const Address& addr) { opModM(addr, Reg32(1), 0x0F, B11000111); }
#ifdef XBYAK64 #ifdef XBYAK64
@ -1180,20 +1279,17 @@ protected:
} }
void call(const char *label) void call(const char *label)
{ {
opJmp(label, T_NEAR, 0, B10011010, 0); opJmp(label, T_NEAR, 0, B11101000, 0);
} }
void call(const void *addr) void call(const void *addr)
{ {
opJmp(addr, T_NEAR, 0, B11101000, 0); opJmp(addr, T_NEAR, 0, B11101000, 0);
} }
// Indirect call through a register or memory operand:
// opcode byte B11111111 with extension field 2.
void call(const Operand& op)
{
	const int acceptedBits = 16 | i32e; // register widths accepted for op
	opR_ModM(op, acceptedBits, 3, 2, B11111111);
}
// special case // special case
void movd(const Address& addr, const Mmx& mmx) void movd(const Address& addr, const Mmx& mmx)
{ {
opModM(addr, Reg(mmx.getIdx(), Operand::REG, mmx.getBit() / 8), 0x0F, B01111110); if (mmx.isXMM()) db(0x66);
opModM(addr, mmx, 0x0F, B01111110);
} }
void movd(const Reg32& reg, const Mmx& mmx) void movd(const Reg32& reg, const Mmx& mmx)
{ {
@ -1202,8 +1298,8 @@ protected:
} }
void movd(const Mmx& mmx, const Address& addr) void movd(const Mmx& mmx, const Address& addr)
{ {
ASSERT(!addr.isBit(32)); // don't use dword ptr, bogus, won't output 0x66 for xmm dest op if (mmx.isXMM()) db(0x66);
opModM(addr, Reg(mmx.getIdx(), Operand::REG, mmx.getBit() / 8), 0x0F, B01101110); opModM(addr, mmx, 0x0F, B01101110);
} }
void movd(const Mmx& mmx, const Reg32& reg) void movd(const Mmx& mmx, const Reg32& reg)
{ {
@ -1225,8 +1321,31 @@ protected:
} }
void movq(const Address& addr, const Mmx& mmx) void movq(const Address& addr, const Mmx& mmx)
{ {
opModM(addr, Reg(mmx.getIdx(), Operand::REG, mmx.getBit() / 8), 0x0F, mmx.isXMM() ? B11010110 : B01111111); if (mmx.isXMM()) db(0x66);
opModM(addr, mmx, 0x0F, mmx.isXMM() ? B11010110 : B01111111);
} }
#ifdef XBYAK64
// movq reg64, mm/xmm : opcode bytes 0x0F, B01111110; a 0x66 prefix is emitted
// first when the source is an xmm register.
void movq(const Reg64& reg, const Mmx& mmx)
{
	const bool sourceIsXmm = mmx.isXMM();
	if (sourceIsXmm) {
		db(0x66);
	}
	opModR(mmx, reg, 0x0F, B01111110);
}
// movq mm/xmm, reg64 : opcode bytes 0x0F, B01101110; a 0x66 prefix is emitted
// first when the destination is an xmm register.
void movq(const Mmx& mmx, const Reg64& reg)
{
	const bool destIsXmm = mmx.isXMM();
	if (destIsXmm) {
		db(0x66);
	}
	opModR(mmx, reg, 0x0F, B01101110);
}
void pextrq(const Operand& op, const Xmm& xmm, uint8 imm)
{
if (!op.isREG(64) && !op.isMEM()) throw ERR_BAD_COMBINATION;
opGen(Reg64(xmm.getIdx()), op, 0x16, 0x66, 0, imm, B00111010); // force to 64bit
}
void pinsrq(const Xmm& xmm, const Operand& op, uint8 imm)
{
if (!op.isREG(64) && !op.isMEM()) throw ERR_BAD_COMBINATION;
opGen(Reg64(xmm.getIdx()), op, 0x22, 0x66, 0, imm, B00111010); // force to 64bit
}
#endif
// MMX2 : pextrw : reg, mmx/xmm, imm // MMX2 : pextrw : reg, mmx/xmm, imm
// SSE4 : pextrw, pextrb, pextrd, extractps : reg/mem, mmx/xmm, imm // SSE4 : pextrw, pextrb, pextrd, extractps : reg/mem, mmx/xmm, imm
void pextrw(const Operand& op, const Mmx& xmm, uint8 imm) { opExt(op, xmm, 0x15, imm, true); } void pextrw(const Operand& op, const Mmx& xmm, uint8 imm) { opExt(op, xmm, 0x15, imm, true); }
@ -1270,7 +1389,7 @@ protected:
bool is16bit = reg.isREG(16) && (op.isREG(16) || op.isMEM()); bool is16bit = reg.isREG(16) && (op.isREG(16) || op.isMEM());
if (!is16bit && !(reg.isREG(i32e) && (op.isREG(i32e) || op.isMEM()))) throw ERR_BAD_COMBINATION; if (!is16bit && !(reg.isREG(i32e) && (op.isREG(i32e) || op.isMEM()))) throw ERR_BAD_COMBINATION;
if (is16bit) db(0x66); if (is16bit) db(0x66);
db(0xF3); opModRM(Reg(reg.getIdx(), Operand::REG, i32e == 32 ? 32 : reg.getBit()), op, op.isREG(), true, 0x0F, 0xB8); db(0xF3); opModRM(reg.changeBit(i32e == 32 ? 32 : reg.getBit()), op, op.isREG(), true, 0x0F, 0xB8);
} }
void crc32(const Reg32e& reg, const Operand& op) void crc32(const Reg32e& reg, const Operand& op)
{ {
@ -1278,17 +1397,86 @@ protected:
db(0xF2); db(0xF2);
opModRM(reg, op, op.isREG(), op.isMEM(), 0x0F, 0x38, 0xF0 | (op.isBit(8) ? 0 : 1)); opModRM(reg, op, op.isREG(), op.isMEM(), 0x0F, 0x38, 0xF0 | (op.isBit(8) ? 0 : 1));
} }
public: void vextractps(const Operand& op, const Xmm& xmm, uint8 imm)
{
if (!(op.isREG(32) || op.isMEM()) || xmm.isYMM()) throw ERR_BAD_COMBINATION;
opAVX_X_XM_IMM(xmm, cvtReg(op, op.isREG(), Operand::XMM), MM_0F3A | PP_66, 0x17, false, 0, imm);
}
// support (x, x, x/m), (y, y, y/m)
void opAVX_X_X_XM(const Xmm& xm1, const Operand& op1, const Operand& op2, int type, int code0, bool supportYMM, int w = -1)
{
const Xmm *xm2;
const Operand *op;
if (op2.isNone()) {
xm2 = &xm1;
op = &op1;
} else {
if (!(op1.isXMM() || (supportYMM && op1.isYMM()))) throw ERR_BAD_COMBINATION;
xm2 = static_cast<const Xmm*>(&op1);
op = &op2;
}
// (xm1, xm2, op)
if (!((xm1.isXMM() && xm2->isXMM()) || (supportYMM && xm1.isYMM() && xm2->isYMM()))) throw ERR_BAD_COMBINATION;
bool x, b;
if (op->isMEM()) {
const Address& addr = *static_cast<const Address*>(op);
uint8 rex = addr.getRex();
x = (rex & 2) != 0;
b = (rex & 1) != 0;
if (BIT == 64 && addr.is32bit_) db(0x67);
if (BIT == 64 && w == -1) w = (rex & 4) ? 1 : 0;
} else {
x = false;
b = static_cast<const Reg*>(op)->isExtIdx();
}
if (w == -1) w = 0;
vex(xm1.isExtIdx(), xm2->getIdx(), xm1.isYMM(), type, x, b, w);
db(code0);
if (op->isMEM()) {
const Address& addr = *static_cast<const Address*>(op);
addr.updateRegField(static_cast<uint8>(xm1.getIdx()));
db(addr.getCode(), static_cast<int>(addr.getSize()));
} else {
db(getModRM(3, xm1.getIdx(), op->getIdx()));
}
}
// If cvt is false, return op unchanged. Otherwise return this generator's own
// xmm register (kind == Operand::XMM) or ymm register (any other kind) that
// has the same index as op.
const Operand& cvtReg(const Operand& op, bool cvt, Operand::Kind kind) const
{
	if (!cvt) return op;
	// The tables hold addresses of members of *this*, so they must not be
	// function-local statics: a static table is filled in once, by whichever
	// object calls first, and every other generator would then hand out
	// references into that object (dangling once it has been destroyed).
	const Xmm* const xmTbl[] = {
		&xm0, &xm1, &xm2, &xm3, &xm4, &xm5, &xm6, &xm7,
#ifdef XBYAK64
		&xm8, &xm9, &xm10, &xm11, &xm12, &xm13, &xm14, &xm15
#endif
	};
	const Ymm* const ymTbl[] = {
		&ym0, &ym1, &ym2, &ym3, &ym4, &ym5, &ym6, &ym7,
#ifdef XBYAK64
		&ym8, &ym9, &ym10, &ym11, &ym12, &ym13, &ym14, &ym15
#endif
	};
	return (kind == Operand::XMM) ? *xmTbl[op.getIdx()] : *ymTbl[op.getIdx()];
}
// support (x, x/m, imm), (y, y/m, imm)
void opAVX_X_XM_IMM(const Xmm& xmm, const Operand& op, int type, int code, bool supportYMM, int w = -1, int imm = NONE)
{
opAVX_X_X_XM(xmm, xmm.isXMM() ? xm0 : ym0, op, type, code, supportYMM, w); if (imm != NONE) db((uint8)imm);
}
enum { NONE = 256 }; enum { NONE = 256 };
public:
CodeGenerator(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void *userPtr = 0) CodeGenerator(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void *userPtr = 0)
: CodeArray(maxSize, userPtr) : CodeArray(maxSize, userPtr)
, mm0(0), mm1(1), mm2(2), mm3(3), mm4(4), mm5(5), mm6(6), mm7(7) , mm0(0), mm1(1), mm2(2), mm3(3), mm4(4), mm5(5), mm6(6), mm7(7)
, xmm0(0), xmm1(1), xmm2(2), xmm3(3), xmm4(4), xmm5(5), xmm6(6), xmm7(7) , xmm0(0), xmm1(1), xmm2(2), xmm3(3), xmm4(4), xmm5(5), xmm6(6), xmm7(7)
, ymm0(0), ymm1(1), ymm2(2), ymm3(3), ymm4(4), ymm5(5), ymm6(6), ymm7(7)
, xm0(xmm0), xm1(xmm1), xm2(xmm2), xm3(xmm3), xm4(xmm4), xm5(xmm5), xm6(xmm6), xm7(xmm7) // for my convenience , xm0(xmm0), xm1(xmm1), xm2(xmm2), xm3(xmm3), xm4(xmm4), xm5(xmm5), xm6(xmm6), xm7(xmm7) // for my convenience
, ym0(ymm0), ym1(ymm1), ym2(ymm2), ym3(ymm3), ym4(ymm4), ym5(ymm5), ym6(ymm6), ym7(ymm7) // for my convenience
, eax(Operand::EAX), ecx(Operand::ECX), edx(Operand::EDX), ebx(Operand::EBX), esp(Operand::ESP), ebp(Operand::EBP), esi(Operand::ESI), edi(Operand::EDI) , eax(Operand::EAX), ecx(Operand::ECX), edx(Operand::EDX), ebx(Operand::EBX), esp(Operand::ESP), ebp(Operand::EBP), esi(Operand::ESI), edi(Operand::EDI)
, ax(Operand::EAX), cx(Operand::ECX), dx(Operand::EDX), bx(Operand::EBX), sp(Operand::ESP), bp(Operand::EBP), si(Operand::ESI), di(Operand::EDI) , ax(Operand::EAX), cx(Operand::ECX), dx(Operand::EDX), bx(Operand::EBX), sp(Operand::ESP), bp(Operand::EBP), si(Operand::ESI), di(Operand::EDI)
, al(Operand::AL), cl(Operand::CL), dl(Operand::DL), bl(Operand::BL), ah(Operand::AH), ch(Operand::CH), dh(Operand::DH), bh(Operand::BH) , al(Operand::AL), cl(Operand::CL), dl(Operand::DL), bl(Operand::BL), ah(Operand::AH), ch(Operand::CH), dh(Operand::DH), bh(Operand::BH)
, ptr(0), byte(8), word(16), dword(32), qword(64), xmmword(128) , ptr(0), byte(8), word(16), dword(32), qword(64)
, st0(0), st1(1), st2(2), st3(3), st4(4), st5(5), st6(6), st7(7)
#ifdef XBYAK64 #ifdef XBYAK64
, rax(Operand::RAX), rcx(Operand::RCX), rdx(Operand::RDX), rbx(Operand::RBX), rsp(Operand::RSP), rbp(Operand::RBP), rsi(Operand::RSI), rdi(Operand::RDI), r8(Operand::R8), r9(Operand::R9), r10(Operand::R10), r11(Operand::R11), r12(Operand::R12), r13(Operand::R13), r14(Operand::R14), r15(Operand::R15) , rax(Operand::RAX), rcx(Operand::RCX), rdx(Operand::RDX), rbx(Operand::RBX), rsp(Operand::RSP), rbp(Operand::RBP), rsi(Operand::RSI), rdi(Operand::RDI), r8(Operand::R8), r9(Operand::R9), r10(Operand::R10), r11(Operand::R11), r12(Operand::R12), r13(Operand::R13), r14(Operand::R14), r15(Operand::R15)
, r8d(Operand::R8D), r9d(Operand::R9D), r10d(Operand::R10D), r11d(Operand::R11D), r12d(Operand::R12D), r13d(Operand::R13D), r14d(Operand::R14D), r15d(Operand::R15D) , r8d(Operand::R8D), r9d(Operand::R9D), r10d(Operand::R10D), r11d(Operand::R11D), r12d(Operand::R12D), r13d(Operand::R13D), r14d(Operand::R14D), r15d(Operand::R15D)
@ -1296,7 +1484,9 @@ public:
, r8b(Operand::R8B), r9b(Operand::R9B), r10b(Operand::R10B), r11b(Operand::R11B), r12b(Operand::R12B), r13b(Operand::R13B), r14b(Operand::R14B), r15b(Operand::R15B) , r8b(Operand::R8B), r9b(Operand::R9B), r10b(Operand::R10B), r11b(Operand::R11B), r12b(Operand::R12B), r13b(Operand::R13B), r14b(Operand::R14B), r15b(Operand::R15B)
, spl(Operand::SPL, 1), bpl(Operand::BPL, 1), sil(Operand::SIL, 1), dil(Operand::DIL, 1) , spl(Operand::SPL, 1), bpl(Operand::BPL, 1), sil(Operand::SIL, 1), dil(Operand::DIL, 1)
, xmm8(8), xmm9(9), xmm10(10), xmm11(11), xmm12(12), xmm13(13), xmm14(14), xmm15(15) , xmm8(8), xmm9(9), xmm10(10), xmm11(11), xmm12(12), xmm13(13), xmm14(14), xmm15(15)
, ymm8(8), ymm9(9), ymm10(10), ymm11(11), ymm12(12), ymm13(13), ymm14(14), ymm15(15)
, xm8(xmm8), xm9(xmm9), xm10(xmm10), xm11(xmm11), xm12(xmm12), xm13(xmm13), xm14(xmm14), xm15(xmm15) // for my convenience , xm8(xmm8), xm9(xmm9), xm10(xmm10), xm11(xmm11), xm12(xmm12), xm13(xmm13), xm14(xmm14), xm15(xmm15) // for my convenience
, ym8(ymm8), ym9(ymm9), ym10(ymm10), ym11(ymm11), ym12(ymm12), ym13(ymm13), ym14(ymm14), ym15(ymm15) // for my convenience
, rip() , rip()
#endif #endif
{ {
@ -1309,7 +1499,7 @@ public:
// if (hasUndefinedLabel()) throw ERR_LABEL_IS_NOT_FOUND; // if (hasUndefinedLabel()) throw ERR_LABEL_IS_NOT_FOUND;
return top_; return top_;
} }
#ifdef TEST_NM #ifdef XBYAK_TEST
void dump(bool doClear = true) void dump(bool doClear = true)
{ {
CodeArray::dump(); CodeArray::dump();
@ -1322,7 +1512,7 @@ public:
void align(int x = 16) void align(int x = 16)
{ {
if (x != 4 && x != 8 && x != 16 && x != 32) throw ERR_BAD_ALIGN; if (x != 4 && x != 8 && x != 16 && x != 32) throw ERR_BAD_ALIGN;
while (inner::GetPtrDist(getCurr()) % x) { while (size_t(getCurr()) % x) {
nop(); nop();
} }
} }
@ -1335,4 +1525,4 @@ public:
} // end of namespace } // end of namespace
#endif // XBYAK_H_ #endif // XBYAK_XBYAK_H_

View File

@ -1,4 +1,4 @@
const char *getVersionString() const { return "2.07"; } const char *getVersionString() const { return "2.99"; }
void packssdw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x6B); } void packssdw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x6B); }
void packsswb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x63); } void packsswb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x63); }
void packuswb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x67); } void packuswb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x67); }
@ -184,88 +184,94 @@ void movhpd(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x16, 0
void movlpd(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x12, 0x66); } void movlpd(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, 0x12, 0x66); }
void cmovo(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 0); } void cmovo(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 0); }
void jo(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x70, 0x80, 0x0F); } void jo(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x70, 0x80, 0x0F); }
void seto(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 0); } void seto(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 0); }
void cmovno(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 1); } void cmovno(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 1); }
void jno(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x71, 0x81, 0x0F); } void jno(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x71, 0x81, 0x0F); }
void setno(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 1); } void setno(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 1); }
void cmovb(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); } void cmovb(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); }
void jb(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); } void jb(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
void setb(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 2); } void setb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 2); }
// carry-named forms for condition number 2: same encodings as cmovb / jb / setb
void cmovc(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); }
void jc(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
void setc(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 2); }
void cmovnae(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); } void cmovnae(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 2); }
void jnae(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); } void jnae(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x72, 0x82, 0x0F); }
void setnae(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 2); } void setnae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 2); }
void cmovnb(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); } void cmovnb(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); }
void jnb(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); } void jnb(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
void setnb(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 3); } void setnb(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 3); }
void cmovae(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); } void cmovae(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); }
void jae(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); } void jae(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
void setae(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 3); } void setae(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 3); }
// carry-named forms for condition number 3: same encodings as cmovnb / jnb / setnb
void cmovnc(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 3); }
void jnc(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x73, 0x83, 0x0F); }
void setnc(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 3); }
void cmove(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 4); } void cmove(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 4); }
void je(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); } void je(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }
void sete(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 4); } void sete(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 4); }
void cmovz(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 4); } void cmovz(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 4); }
void jz(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); } void jz(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }
void setz(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 4); } void setz(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 4); }
void cmovne(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 5); } void cmovne(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 5); }
void jne(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); } void jne(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }
void setne(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 5); } void setne(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 5); }
void cmovnz(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 5); } void cmovnz(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 5); }
void jnz(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); } void jnz(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x75, 0x85, 0x0F); }
void setnz(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 5); } void setnz(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 5); }
void cmovbe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 6); } void cmovbe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 6); }
void jbe(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); } void jbe(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }
void setbe(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 6); } void setbe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 6); }
void cmovna(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 6); } void cmovna(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 6); }
void jna(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); } void jna(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x76, 0x86, 0x0F); }
void setna(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 6); } void setna(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 6); }
void cmovnbe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 7); } void cmovnbe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 7); }
void jnbe(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); } void jnbe(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }
void setnbe(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 7); } void setnbe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 7); }
void cmova(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 7); } void cmova(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 7); }
void ja(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); } void ja(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x77, 0x87, 0x0F); }
void seta(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 7); } void seta(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 7); }
void cmovs(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 8); } void cmovs(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 8); }
void js(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x78, 0x88, 0x0F); } void js(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x78, 0x88, 0x0F); }
void sets(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 8); } void sets(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 8); }
void cmovns(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 9); } void cmovns(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 9); }
void jns(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x79, 0x89, 0x0F); } void jns(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x79, 0x89, 0x0F); }
void setns(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 9); } void setns(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 9); }
void cmovp(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 10); } void cmovp(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 10); }
void jp(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); } void jp(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }
void setp(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 10); } void setp(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 10); }
void cmovpe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 10); } void cmovpe(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 10); }
void jpe(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); } void jpe(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7A, 0x8A, 0x0F); }
void setpe(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 10); } void setpe(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 10); }
void cmovnp(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 11); } void cmovnp(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 11); }
void jnp(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); } void jnp(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }
void setnp(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 11); } void setnp(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 11); }
void cmovpo(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 11); } void cmovpo(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 11); }
void jpo(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); } void jpo(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7B, 0x8B, 0x0F); }
void setpo(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 11); } void setpo(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 11); }
void cmovl(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 12); } void cmovl(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 12); }
void jl(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); } void jl(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }
void setl(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 12); } void setl(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 12); }
void cmovnge(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 12); } void cmovnge(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 12); }
void jnge(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); } void jnge(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7C, 0x8C, 0x0F); }
void setnge(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 12); } void setnge(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 12); }
void cmovnl(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 13); } void cmovnl(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 13); }
void jnl(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); } void jnl(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }
void setnl(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 13); } void setnl(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 13); }
void cmovge(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 13); } void cmovge(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 13); }
void jge(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); } void jge(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7D, 0x8D, 0x0F); }
void setge(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 13); } void setge(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 13); }
void cmovle(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 14); } void cmovle(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 14); }
void jle(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); } void jle(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }
void setle(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 14); } void setle(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 14); }
void cmovng(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 14); } void cmovng(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 14); }
void jng(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); } void jng(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7E, 0x8E, 0x0F); }
void setng(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 14); } void setng(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 14); }
void cmovnle(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 15); } void cmovnle(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 15); }
void jnle(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); } void jnle(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }
void setnle(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 15); } void setnle(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 15); }
void cmovg(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 15); } void cmovg(const Reg32e& reg, const Operand& op) { opModRM(reg, op, op.isREG(i32e), op.isMEM(), 0x0F, B01000000 | 15); }
void jg(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); } void jg(const char *label, LabelType type = T_AUTO) { opJmp(label, type, 0x7F, 0x8F, 0x0F); }
void setg(const Operand& op) { opR_ModM(op, 8, 3, 0, 0x0F, B10010000 | 15); } void setg(const Operand& op) { opR_ModM(op, 8, 0, 0x0F, B10010000 | 15); }
#ifdef XBYAK64 #ifdef XBYAK64
// cdqe: emits the fixed byte pair 48 98. Defined once inside the XBYAK64 branch.
void cdqe() { db(0x48); db(0x98); }
#else #else
@ -308,12 +314,57 @@ void mwait() { db(0x0F); db(0x01); db(0xC9); }
// Operand-less instructions with a fixed encoding: each emitter writes its bytes
// verbatim through db(), in the order shown. Each mnemonic is defined exactly once.
void rdmsr() { db(0x0F); db(0x32); }
void rdpmc() { db(0x0F); db(0x33); }
void rdtsc() { db(0x0F); db(0x31); }
void rdtscp() { db(0x0F); db(0x01); db(0xF9); }
void wait() { db(0x9B); }
void wbinvd() { db(0x0F); db(0x09); }
void wrmsr() { db(0x0F); db(0x30); }
void xlatb() { db(0xD7); }
void popf() { db(0x9D); }
void pushf() { db(0x9C); }
// The two vzero* forms differ only in their second byte (0xFC vs 0xF8).
void vzeroall() { db(0xC5); db(0xFC); db(0x77); }
void vzeroupper() { db(0xC5); db(0xF8); db(0x77); }
void xgetbv() { db(0x0F); db(0x01); db(0xD0); }
// x87 instructions that take no explicit operand. Every emitter writes a fixed
// two-byte sequence through db(): a leading byte in the 0xD8..0xDE range followed
// by a second byte that selects the operation.
void f2xm1()
{
	db(0xD9);
	db(0xF0);
}
void fabs()
{
	db(0xD9);
	db(0xE1);
}
void faddp()
{
	db(0xDE);
	db(0xC1);
}
void fchs()
{
	db(0xD9);
	db(0xE0);
}
void fcom()
{
	db(0xD8);
	db(0xD1);
}
void fcomp()
{
	db(0xD8);
	db(0xD9);
}
void fcompp()
{
	db(0xDE);
	db(0xD9);
}
void fcos()
{
	db(0xD9);
	db(0xFF);
}
void fdecstp()
{
	db(0xD9);
	db(0xF6);
}
void fdivp()
{
	db(0xDE);
	db(0xF9);
}
void fdivrp()
{
	db(0xDE);
	db(0xF1);
}
void fincstp()
{
	db(0xD9);
	db(0xF7);
}
void fld1()
{
	db(0xD9);
	db(0xE8);
}
void fldl2t()
{
	db(0xD9);
	db(0xE9);
}
void fldl2e()
{
	db(0xD9);
	db(0xEA);
}
void fldpi()
{
	db(0xD9);
	db(0xEB);
}
void fldlg2()
{
	db(0xD9);
	db(0xEC);
}
void fldln2()
{
	db(0xD9);
	db(0xED);
}
void fldz()
{
	db(0xD9);
	db(0xEE);
}
void fmulp()
{
	db(0xDE);
	db(0xC9);
}
void fnop()
{
	db(0xD9);
	db(0xD0);
}
void fpatan()
{
	db(0xD9);
	db(0xF3);
}
void fprem()
{
	db(0xD9);
	db(0xF8);
}
void fprem1()
{
	db(0xD9);
	db(0xF5);
}
void fptan()
{
	db(0xD9);
	db(0xF2);
}
void frndint()
{
	db(0xD9);
	db(0xFC);
}
void fscale()
{
	db(0xD9);
	db(0xFD);
}
void fsin()
{
	db(0xD9);
	db(0xFE);
}
void fsincos()
{
	db(0xD9);
	db(0xFB);
}
void fsqrt()
{
	db(0xD9);
	db(0xFA);
}
void fsubp()
{
	db(0xDE);
	db(0xE9);
}
void fsubrp()
{
	db(0xDE);
	db(0xE1);
}
void ftst()
{
	db(0xD9);
	db(0xE4);
}
void fucom()
{
	db(0xDD);
	db(0xE1);
}
void fucomp()
{
	db(0xDD);
	db(0xE9);
}
void fucompp()
{
	db(0xDA);
	db(0xE9);
}
void fxam()
{
	db(0xD9);
	db(0xE5);
}
void fxch()
{
	db(0xD9);
	db(0xC9);
}
void fxtract()
{
	db(0xD9);
	db(0xF4);
}
void fyl2x()
{
	db(0xD9);
	db(0xF1);
}
void fyl2xp1()
{
	db(0xD9);
	db(0xF9);
}
// Two-operand ALU emitters. The operand/operand form goes through opRM_RM with the
// base opcode; the immediate form goes through opRM_I with the same base opcode
// plus a small selector (2 for adc). Each overload is defined exactly once.
void adc(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x10); }
void adc(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x10, 2); }
void add(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x00); }
@ -332,12 +383,12 @@ void xor(const Operand& op1, const Operand& op2) { opRM_RM(op1, op2, 0x30); }
// xor with an immediate: opRM_I with base opcode 0x30 and selector 6.
// NOTE(review): `xor` is an alternative token in ISO C++, so this member name only
// compiles on toolchains that do not treat it as an operator - confirm build flags.
void xor(const Operand& op, uint32 imm) { opRM_I(op, imm, 0x30, 6); }
// dec / inc share opIncDec and differ in the opcode (0x48 / 0x40) and selector (1 / 0).
void dec(const Operand& op) { opIncDec(op, 0x48, 1); }
void inc(const Operand& op) { opIncDec(op, 0x40, 0); }
void div(const Operand& op) { opR_ModM(op, 0, 3, 6, 0xF6); } void div(const Operand& op) { opR_ModM(op, 0, 6, 0xF6); }
void idiv(const Operand& op) { opR_ModM(op, 0, 3, 7, 0xF6); } void idiv(const Operand& op) { opR_ModM(op, 0, 7, 0xF6); }
void imul(const Operand& op) { opR_ModM(op, 0, 3, 5, 0xF6); } void imul(const Operand& op) { opR_ModM(op, 0, 5, 0xF6); }
void mul(const Operand& op) { opR_ModM(op, 0, 3, 4, 0xF6); } void mul(const Operand& op) { opR_ModM(op, 0, 4, 0xF6); }
void neg(const Operand& op) { opR_ModM(op, 0, 3, 3, 0xF6); } void neg(const Operand& op) { opR_ModM(op, 0, 3, 0xF6); }
void not(const Operand& op) { opR_ModM(op, 0, 3, 2, 0xF6); } void not(const Operand& op) { opR_ModM(op, 0, 2, 0xF6); }
// Rotate-through-carry emitters. The count is either an immediate or a Reg8
// (parameter named cl); opShift receives selector 2 for rcl and 3 for rcr.
// Each overload is defined exactly once.
void rcl(const Operand& op, int imm) { opShift(op, imm, 2); }
void rcl(const Operand& op, const Reg8& cl) { opShift(op, cl, 2); }
void rcr(const Operand& op, int imm) { opShift(op, imm, 3); }
@ -360,52 +411,57 @@ void shrd(const Operand& op, const Reg& reg, uint8 imm) { opShxd(op, reg, imm, 0
// shrd with the count in a Reg8 (parameter named cl): opShxd gets a zero immediate,
// opcode 0xAC and a pointer to the count register.
void shrd(const Operand& op, const Reg& reg, const Reg8& cl) { opShxd(op, reg, 0, 0xAC, &cl); }
// Bit scans: opcode 0x0F 0xBC (bsf) / 0x0F 0xBD (bsr). The source may be a 16-bit or
// i32e-class register, or memory. Each emitter is defined exactly once.
void bsf(const Reg&reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0xBC); }
void bsr(const Reg&reg, const Operand& op) { opModRM(reg, op, op.isREG(16 | i32e), op.isMEM(), 0x0F, 0xBD); }
// opMMX-based emitters taking an Mmx destination. All pass prefix 0x66 and a
// trailing opcode-map byte (0x38, or 0x3a for palignr); they differ in the opcode.
// The fifth argument is the immediate slot: palignr passes its imm there, the
// others pass NONE.
// NOTE(review): NONE presumably means "no immediate byte" and supersedes the older
// literal 256 - confirm against the opMMX definition.
void pshufb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x00, 0x66, NONE, 0x38); }
void phaddw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x01, 0x66, NONE, 0x38); }
void phaddd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x02, 0x66, NONE, 0x38); }
void phaddsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x03, 0x66, NONE, 0x38); }
void pmaddubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x04, 0x66, NONE, 0x38); }
void phsubw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x05, 0x66, NONE, 0x38); }
void phsubd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x06, 0x66, NONE, 0x38); }
void phsubsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x07, 0x66, NONE, 0x38); }
void psignb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x08, 0x66, NONE, 0x38); }
void psignw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x09, 0x66, NONE, 0x38); }
void psignd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0A, 0x66, NONE, 0x38); }
void pmulhrsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x0B, 0x66, NONE, 0x38); }
void pabsb(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1C, 0x66, NONE, 0x38); }
void pabsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1D, 0x66, NONE, 0x38); }
void pabsd(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0x1E, 0x66, NONE, 0x38); }
// palignr is the only member here with an immediate; it is truncated to uint8.
void palignr(const Mmx& mmx, const Operand& op, int imm) { opMMX(mmx, op, 0x0f, 0x66, static_cast<uint8>(imm), 0x3a); }
// opGen-based emitters with an Xmm destination and an operand checked by
// isXMM_XMMorMEM. All pass prefix 0x66, NONE in the immediate slot and 0x38 as the
// trailing opcode-map byte; only the opcode differs. Each is defined exactly once.
void blendvpd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x15, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void blendvps(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x14, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void packusdw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x2B, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pblendvb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x10, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pcmpeqq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x29, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void ptest(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x17, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
// pmovsx* / pmovzx* emitters: opGen with prefix 0x66, NONE in the immediate slot and
// opcode-map byte 0x38. The pmovsx forms use opcodes 0x20..0x25 and the pmovzx forms
// 0x30..0x35, in the same bw/bd/bq/wd/wq/dq order. Each is defined exactly once.
void pmovsxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x20, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovsxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x21, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovsxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x22, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovsxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x23, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovsxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x24, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovsxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x25, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovzxbw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x30, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovzxbd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x31, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovzxbq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x32, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovzxwd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x33, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovzxwq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x34, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmovzxdq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x35, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
// Packed min/max (opcodes 0x38..0x3F), multiply (0x28, 0x40), phminposuw (0x41) and
// pcmpgtq (0x37): opGen with prefix 0x66, NONE in the immediate slot and opcode-map
// byte 0x38. Each emitter is defined exactly once.
void pminsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x38, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pminsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x39, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pminuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3A, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pminud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3B, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmaxsb(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3C, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmaxsd(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3D, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmaxuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3E, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmaxud(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x3F, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmuldq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x28, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pmulld(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x40, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void phminposuw(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x41, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
void pcmpgtq(const Xmm& xmm, const Operand& op) { opGen(xmm, op, 0x37, 0x66, isXMM_XMMorMEM, NONE, 0x38); }
// AES round emitters. All five go through opGen with prefix 0x66, NONE in the
// immediate slot and opcode-map byte 0x38; only the opcode (0xDB..0xDF) differs.
void aesdec(const Xmm& xmm, const Operand& op)
{
	opGen(xmm, op, 0xDE, 0x66, isXMM_XMMorMEM, NONE, 0x38);
}
void aesdeclast(const Xmm& xmm, const Operand& op)
{
	opGen(xmm, op, 0xDF, 0x66, isXMM_XMMorMEM, NONE, 0x38);
}
void aesenc(const Xmm& xmm, const Operand& op)
{
	opGen(xmm, op, 0xDC, 0x66, isXMM_XMMorMEM, NONE, 0x38);
}
void aesenclast(const Xmm& xmm, const Operand& op)
{
	opGen(xmm, op, 0xDD, 0x66, isXMM_XMMorMEM, NONE, 0x38);
}
void aesimc(const Xmm& xmm, const Operand& op)
{
	opGen(xmm, op, 0xDB, 0x66, isXMM_XMMorMEM, NONE, 0x38);
}
// Immediate-taking opGen emitters on opcode-map byte 0x3A with prefix 0x66. The int
// immediate is truncated to uint8 before being passed on. Each is defined once.
void blendpd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0D, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
void blendps(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x0C, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
void dppd(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x41, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
@ -420,6 +476,8 @@ void pcmpestrm(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x60
// String-compare (opcodes 0x61..0x63), pclmulqdq (0x44) and aeskeygenassist (0xDF):
// opGen with prefix 0x66 and opcode-map byte 0x3A. The int immediate is truncated to
// uint8 before being passed on. Each emitter is defined exactly once.
void pcmpestri(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x61, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
void pcmpistrm(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x62, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
void pcmpistri(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x63, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
void pclmulqdq(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0x44, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
void aeskeygenassist(const Xmm& xmm, const Operand& op, int imm) { opGen(xmm, op, 0xDF, 0x66, isXMM_XMMorMEM, static_cast<uint8>(imm), 0x3A); }
// Memory-only emitters on opcode 0x0F 0xAE. The Reg32 argument is not a real register
// operand: its index (2, 3 or 7) is what tells the three instructions apart.
// Each emitter is defined exactly once.
void ldmxcsr(const Address& addr) { opModM(addr, Reg32(2), 0x0F, 0xAE); }
void stmxcsr(const Address& addr) { opModM(addr, Reg32(3), 0x0F, 0xAE); }
void clflush(const Address& addr) { opModM(addr, Reg32(7), 0x0F, 0xAE); }
@ -427,3 +485,540 @@ void movntpd(const Address& addr, const Xmm& reg) { opModM(addr, Reg16(reg.getId
// movntdq: opModM on opcode 0x0F 0xE7; the Xmm index is re-wrapped as a Reg16 before
// being passed on.
// NOTE(review): the Reg16 wrapper presumably makes opModM emit an operand-size
// prefix - confirm against opModM.
void movntdq(const Address& addr, const Xmm& reg) { opModM(addr, Reg16(reg.getIdx()), 0x0F, 0xE7); }
// Sign- and zero-extending moves share opMovxx and differ only in the opcode byte.
void movsx(const Reg& reg, const Operand& op) { opMovxx(reg, op, 0xBE); }
void movzx(const Reg& reg, const Operand& op) { opMovxx(reg, op, 0xB6); }
void fadd(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 0, 0); }
void fiadd(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 0, 0); }
void fcom(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 2, 0); }
void fcomp(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 3, 0); }
void fdiv(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 6, 0); }
void fidiv(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 6, 0); }
void fdivr(const Address& addr) { opFpuMem(addr, 0x00, 0xD8, 0xDC, 7, 0); }
void fidivr(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 7, 0); }
void ficom(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 2, 0); }
void ficomp(const Address& addr) { opFpuMem(addr, 0xDE, 0xDA, 0x00, 3, 0); }
// fild, memory-operand overload: hands addr and its opcode constants to opFpuMem.
// Unlike most siblings, the last selector is non-zero (5).
void fild(const Address& addr)
{
	opFpuMem(addr, 0xDF, 0xDB, 0xDF, 0, 5);
}
// fist, memory-operand overload: hands addr and its opcode constants to opFpuMem.
void fist(const Address& addr)
{
	opFpuMem(addr, 0xDF, 0xDB, 0x00, 2, 0);
}
// fistp, memory-operand overload: hands addr and its opcode constants to opFpuMem.
// Unlike most siblings, the last selector is non-zero (7).
void fistp(const Address& addr)
{
	opFpuMem(addr, 0xDF, 0xDB, 0xDF, 3, 7);
}
// fisttp, memory-operand overload: hands addr and its opcode constants to opFpuMem.
void fisttp(const Address& addr)
{
	opFpuMem(addr, 0xDF, 0xDB, 0xDD, 1, 0);
}
// fld, memory-operand overload: hands addr and its opcode constants to opFpuMem.
void fld(const Address& addr)
{
	opFpuMem(addr, 0x00, 0xD9, 0xDD, 0, 0);
}
// fmul, memory-operand overload: hands addr and its opcode constants to opFpuMem.
void fmul(const Address& addr)
{
	opFpuMem(addr, 0x00, 0xD8, 0xDC, 1, 0);
}
// fimul, memory-operand overload: hands addr and its opcode constants to opFpuMem.
void fimul(const Address& addr)
{
	opFpuMem(addr, 0xDE, 0xDA, 0x00, 1, 0);
}
// fst, memory-operand overload: hands addr and its opcode constants to opFpuMem.
void fst(const Address& addr)
{
	opFpuMem(addr, 0x00, 0xD9, 0xDD, 2, 0);
}
// fstp, memory-operand overload: hands addr and its opcode constants to opFpuMem.
void fstp(const Address& addr)
{
	opFpuMem(addr, 0x00, 0xD9, 0xDD, 3, 0);
}
// fsub, memory-operand overload: hands addr and its opcode constants to opFpuMem.
void fsub(const Address& addr)
{
	opFpuMem(addr, 0x00, 0xD8, 0xDC, 4, 0);
}
// fisub, memory-operand overload: hands addr and its opcode constants to opFpuMem.
void fisub(const Address& addr)
{
	opFpuMem(addr, 0xDE, 0xDA, 0x00, 4, 0);
}
// fsubr, memory-operand overload: hands addr and its opcode constants to opFpuMem.
void fsubr(const Address& addr)
{
	opFpuMem(addr, 0x00, 0xD8, 0xDC, 5, 0);
}
// fisubr, memory-operand overload: hands addr and its opcode constants to opFpuMem.
void fisubr(const Address& addr)
{
	opFpuMem(addr, 0xDE, 0xDA, 0x00, 5, 0);
}
// fadd, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fadd(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xD8C0, 0xDCC0);
}
// faddp, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void faddp(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0x0000, 0xDEC0);
}
// fcmovb, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fcmovb(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xDAC0, 0x00C0);
}
// fcmove, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fcmove(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xDAC8, 0x00C8);
}
// fcmovbe, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fcmovbe(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xDAD0, 0x00D0);
}
// fcmovu, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fcmovu(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xDAD8, 0x00D8);
}
// fcmovnb, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fcmovnb(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xDBC0, 0x00C0);
}
// fcmovne, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fcmovne(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xDBC8, 0x00C8);
}
// fcmovnbe, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fcmovnbe(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xDBD0, 0x00D0);
}
// fcmovnu, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fcmovnu(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xDBD8, 0x00D8);
}
// fcomi, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fcomi(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xDBF0, 0x00F0);
}
// fcomip, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fcomip(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xDFF0, 0x00F0);
}
// fucomi, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fucomi(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xDBE8, 0x00E8);
}
// fucomip, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fucomip(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xDFE8, 0x00E8);
}
// fdiv, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fdiv(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xD8F0, 0xDCF8);
}
// fdivp, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fdivp(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0x0000, 0xDEF8);
}
// fdivr, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fdivr(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xD8F8, 0xDCF0);
}
// fdivrp, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fdivrp(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0x0000, 0xDEF0);
}
// fmul, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fmul(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xD8C8, 0xDCC8);
}
// fmulp, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fmulp(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0x0000, 0xDEC8);
}
// fsub, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fsub(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xD8E0, 0xDCE8);
}
// fsubp, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fsubp(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0x0000, 0xDEE8);
}
// fsubr, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fsubr(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0xD8E8, 0xDCE0);
}
// fsubrp, two-register overload: hands both Fpu operands and the code pair to opFpuFpu.
void fsubrp(const Fpu& reg1, const Fpu& reg2)
{
	opFpuFpu(reg1, reg2, 0x0000, 0xDEE0);
}
// fcom, single-register overload: hands reg and the bytes 0xD8, 0xD0 to opFpu.
void fcom(const Fpu& reg)
{
	opFpu(reg, 0xD8, 0xD0);
}
// fcomp, single-register overload: hands reg and the bytes 0xD8, 0xD8 to opFpu.
void fcomp(const Fpu& reg)
{
	opFpu(reg, 0xD8, 0xD8);
}
// ffree: hands reg and the bytes 0xDD, 0xC0 to opFpu.
void ffree(const Fpu& reg)
{
	opFpu(reg, 0xDD, 0xC0);
}
// fld, single-register overload: hands reg and the bytes 0xD9, 0xC0 to opFpu.
void fld(const Fpu& reg)
{
	opFpu(reg, 0xD9, 0xC0);
}
// fst, single-register overload: hands reg and the bytes 0xDD, 0xD0 to opFpu.
void fst(const Fpu& reg)
{
	opFpu(reg, 0xDD, 0xD0);
}
// fstp, single-register overload: hands reg and the bytes 0xDD, 0xD8 to opFpu.
void fstp(const Fpu& reg)
{
	opFpu(reg, 0xDD, 0xD8);
}
// fucom: hands reg and the bytes 0xDD, 0xE0 to opFpu.
void fucom(const Fpu& reg)
{
	opFpu(reg, 0xDD, 0xE0);
}
// fucomp: hands reg and the bytes 0xDD, 0xE8 to opFpu.
void fucomp(const Fpu& reg)
{
	opFpu(reg, 0xDD, 0xE8);
}
// fxch: hands reg and the bytes 0xD9, 0xC8 to opFpu.
void fxch(const Fpu& reg)
{
	opFpu(reg, 0xD9, 0xC8);
}
// vaddpd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vaddpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x58, true);
}
// vaddps: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vaddps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x58, true);
}
// vaddsd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vaddsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x58, false);
}
// vaddss: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vaddss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x58, false);
}
// vsubpd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5C, true);
}
// vsubps: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5C, true);
}
// vsubsd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vsubsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5C, false);
}
// vsubss: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vsubss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5C, false);
}
// vmulpd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x59, true);
}
// vmulps: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vmulps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x59, true);
}
// vmulsd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vmulsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x59, false);
}
// vmulss: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vmulss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x59, false);
}
// vdivpd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vdivpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5E, true);
}
// vdivps: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vdivps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5E, true);
}
// vdivsd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vdivsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5E, false);
}
// vdivss: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vdivss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5E, false);
}
// vmaxpd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vmaxpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5F, true);
}
// vmaxps: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vmaxps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5F, true);
}
// vmaxsd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vmaxsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5F, false);
}
// vmaxss: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vmaxss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5F, false);
}
// vminpd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vminpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x5D, true);
}
// vminps: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vminps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x5D, true);
}
// vminsd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vminsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x5D, false);
}
// vminss: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vminss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F3, 0x5D, false);
}
// vandpd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x54, true);
}
// vandps: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x54, true);
}
// vandnpd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x55, true);
}
// vandnps: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x55, true);
}
// vorpd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x56, true);
}
// vorps: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x56, true);
}
// vxorpd: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vxorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x57, true);
}
// vxorps: op2 may be omitted (defaults to Operand()); all operands are forwarded to opAVX_X_X_XM.
void vxorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand())
{
	opAVX_X_X_XM(xmm, op1, op2, MM_0F, 0x57, true);
}
// vblendpd, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vblendpd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0D, true, 0);
	db(imm);
}
// vblendpd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vblendpd(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vblendpd(xmm, xmm, op, imm);
}
// vblendps, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vblendps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0C, true, 0);
	db(imm);
}
// vblendps, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vblendps(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vblendps(xmm, xmm, op, imm);
}
// vdppd, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vdppd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x41, false, 0);
	db(imm);
}
// vdppd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vdppd(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vdppd(xmm, xmm, op, imm);
}
// vdpps, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vdpps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x40, true, 0);
	db(imm);
}
// vdpps, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vdpps(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vdpps(xmm, xmm, op, imm);
}
// vmpsadbw, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vmpsadbw(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x42, false, 0);
	db(imm);
}
// vmpsadbw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vmpsadbw(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vmpsadbw(xmm, xmm, op, imm);
}
// vpblendw, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vpblendw(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0E, false, 0);
	db(imm);
}
// vpblendw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpblendw(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vpblendw(xmm, xmm, op, imm);
}
// vroundsd, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vroundsd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0B, false, 0);
	db(imm);
}
// vroundsd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vroundsd(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vroundsd(xmm, xmm, op, imm);
}
// vroundss, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vroundss(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0A, false, 0);
	db(imm);
}
// vroundss, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vroundss(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vroundss(xmm, xmm, op, imm);
}
// vpclmulqdq, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vpclmulqdq(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x44, false, 0);
	db(imm);
}
// vpclmulqdq, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpclmulqdq(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vpclmulqdq(xmm, xmm, op, imm);
}
// vpermilps, three-operand form only in this section: forwarded to opAVX_X_X_XM.
void vpermilps(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0C, true, 0);
}
// vpermilpd, three-operand form only in this section: forwarded to opAVX_X_X_XM.
void vpermilpd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0D, true, 0);
}
// vcmppd, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vcmppd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xC2, true, -1);
	db(imm);
}
// vcmppd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vcmppd(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vcmppd(xmm, xmm, op, imm);
}
// vcmpps, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vcmpps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F, 0xC2, true, -1);
	db(imm);
}
// vcmpps, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vcmpps(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vcmpps(xmm, xmm, op, imm);
}
// vcmpsd, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vcmpsd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F2, 0xC2, false, -1);
	db(imm);
}
// vcmpsd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vcmpsd(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vcmpsd(xmm, xmm, op, imm);
}
// vcmpss, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vcmpss(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0xC2, false, -1);
	db(imm);
}
// vcmpss, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vcmpss(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vcmpss(xmm, xmm, op, imm);
}
// vcvtsd2ss, three-operand form: forwarded to opAVX_X_X_XM.
void vcvtsd2ss(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F2, 0x5A, false, -1);
}
// vcvtsd2ss, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vcvtsd2ss(const Xmm& xmm, const Operand& op)
{
	vcvtsd2ss(xmm, xmm, op);
}
// vcvtss2sd, three-operand form: forwarded to opAVX_X_X_XM.
void vcvtss2sd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0x5A, false, -1);
}
// vcvtss2sd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vcvtss2sd(const Xmm& xmm, const Operand& op)
{
	vcvtss2sd(xmm, xmm, op);
}
// vinsertps, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vinsertps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x21, false, 0);
	db(imm);
}
// vinsertps, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vinsertps(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vinsertps(xmm, xmm, op, imm);
}
// vpacksswb, three-operand form: forwarded to opAVX_X_X_XM.
void vpacksswb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x63, false, -1);
}
// vpacksswb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpacksswb(const Xmm& xmm, const Operand& op)
{
	vpacksswb(xmm, xmm, op);
}
// vpackssdw, three-operand form: forwarded to opAVX_X_X_XM.
void vpackssdw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x6B, false, -1);
}
// vpackssdw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpackssdw(const Xmm& xmm, const Operand& op)
{
	vpackssdw(xmm, xmm, op);
}
// vpackuswb, three-operand form: forwarded to opAVX_X_X_XM.
void vpackuswb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x67, false, -1);
}
// vpackuswb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpackuswb(const Xmm& xmm, const Operand& op)
{
	vpackuswb(xmm, xmm, op);
}
// vpackusdw, three-operand form: forwarded to opAVX_X_X_XM.
void vpackusdw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x2B, false, -1);
}
// vpackusdw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpackusdw(const Xmm& xmm, const Operand& op)
{
	vpackusdw(xmm, xmm, op);
}
// vpaddb, three-operand form: forwarded to opAVX_X_X_XM.
void vpaddb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFC, false, -1);
}
// vpaddb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpaddb(const Xmm& xmm, const Operand& op)
{
	vpaddb(xmm, xmm, op);
}
// vpaddw, three-operand form: forwarded to opAVX_X_X_XM.
void vpaddw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFD, false, -1);
}
// vpaddw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpaddw(const Xmm& xmm, const Operand& op)
{
	vpaddw(xmm, xmm, op);
}
// vpaddd, three-operand form: forwarded to opAVX_X_X_XM.
void vpaddd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFE, false, -1);
}
// vpaddd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpaddd(const Xmm& xmm, const Operand& op)
{
	vpaddd(xmm, xmm, op);
}
// vpaddq, three-operand form: forwarded to opAVX_X_X_XM.
void vpaddq(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD4, false, -1);
}
// vpaddq, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpaddq(const Xmm& xmm, const Operand& op)
{
	vpaddq(xmm, xmm, op);
}
// vpaddsb, three-operand form: forwarded to opAVX_X_X_XM.
void vpaddsb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEC, false, -1);
}
// vpaddsb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpaddsb(const Xmm& xmm, const Operand& op)
{
	vpaddsb(xmm, xmm, op);
}
// vpaddsw, three-operand form: forwarded to opAVX_X_X_XM.
void vpaddsw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xED, false, -1);
}
// vpaddsw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpaddsw(const Xmm& xmm, const Operand& op)
{
	vpaddsw(xmm, xmm, op);
}
// vpaddusb, three-operand form: forwarded to opAVX_X_X_XM.
void vpaddusb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDC, false, -1);
}
// vpaddusb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpaddusb(const Xmm& xmm, const Operand& op)
{
	vpaddusb(xmm, xmm, op);
}
// vpaddusw, three-operand form: forwarded to opAVX_X_X_XM.
void vpaddusw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDD, false, -1);
}
// vpaddusw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpaddusw(const Xmm& xmm, const Operand& op)
{
	vpaddusw(xmm, xmm, op);
}
// vpalignr, three-operand form: opAVX_X_X_XM emits the instruction, then db() appends imm.
void vpalignr(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F3A | PP_66, 0x0F, false, -1);
	db(imm);
}
// vpalignr, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpalignr(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vpalignr(xmm, xmm, op, imm);
}
// vpand, three-operand form: forwarded to opAVX_X_X_XM.
void vpand(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDB, false, -1);
}
// vpand, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpand(const Xmm& xmm, const Operand& op)
{
	vpand(xmm, xmm, op);
}
// vpandn, three-operand form: forwarded to opAVX_X_X_XM.
void vpandn(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDF, false, -1);
}
// vpandn, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpandn(const Xmm& xmm, const Operand& op)
{
	vpandn(xmm, xmm, op);
}
// vpavgb, three-operand form: forwarded to opAVX_X_X_XM.
void vpavgb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE0, false, -1);
}
// vpavgb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpavgb(const Xmm& xmm, const Operand& op)
{
	vpavgb(xmm, xmm, op);
}
// vpavgw, three-operand form: forwarded to opAVX_X_X_XM.
void vpavgw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE3, false, -1);
}
// vpavgw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpavgw(const Xmm& xmm, const Operand& op)
{
	vpavgw(xmm, xmm, op);
}
// vpcmpeqb, three-operand form: forwarded to opAVX_X_X_XM.
void vpcmpeqb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x74, false, -1);
}
// vpcmpeqb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpcmpeqb(const Xmm& xmm, const Operand& op)
{
	vpcmpeqb(xmm, xmm, op);
}
// vpcmpeqw, three-operand form: forwarded to opAVX_X_X_XM.
void vpcmpeqw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x75, false, -1);
}
// vpcmpeqw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpcmpeqw(const Xmm& xmm, const Operand& op)
{
	vpcmpeqw(xmm, xmm, op);
}
// vpcmpeqd, three-operand form: forwarded to opAVX_X_X_XM.
void vpcmpeqd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x76, false, -1);
}
// vpcmpeqd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpcmpeqd(const Xmm& xmm, const Operand& op)
{
	vpcmpeqd(xmm, xmm, op);
}
// vpcmpeqq, three-operand form: forwarded to opAVX_X_X_XM.
void vpcmpeqq(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x29, false, -1);
}
// vpcmpeqq, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpcmpeqq(const Xmm& xmm, const Operand& op)
{
	vpcmpeqq(xmm, xmm, op);
}
// vpcmpgtb, three-operand form: forwarded to opAVX_X_X_XM.
void vpcmpgtb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x64, false, -1);
}
// vpcmpgtb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpcmpgtb(const Xmm& xmm, const Operand& op)
{
	vpcmpgtb(xmm, xmm, op);
}
// vpcmpgtw, three-operand form: forwarded to opAVX_X_X_XM.
void vpcmpgtw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x65, false, -1);
}
// vpcmpgtw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpcmpgtw(const Xmm& xmm, const Operand& op)
{
	vpcmpgtw(xmm, xmm, op);
}
// vpcmpgtd, three-operand form: forwarded to opAVX_X_X_XM.
void vpcmpgtd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x66, false, -1);
}
// vpcmpgtd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpcmpgtd(const Xmm& xmm, const Operand& op)
{
	vpcmpgtd(xmm, xmm, op);
}
// vpcmpgtq, three-operand form: forwarded to opAVX_X_X_XM.
void vpcmpgtq(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x37, false, -1);
}
// vpcmpgtq, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpcmpgtq(const Xmm& xmm, const Operand& op)
{
	vpcmpgtq(xmm, xmm, op);
}
// vphaddw, three-operand form: forwarded to opAVX_X_X_XM.
void vphaddw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x01, false, -1);
}
// vphaddw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vphaddw(const Xmm& xmm, const Operand& op)
{
	vphaddw(xmm, xmm, op);
}
// vphaddd, three-operand form: forwarded to opAVX_X_X_XM.
void vphaddd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x02, false, -1);
}
// vphaddd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vphaddd(const Xmm& xmm, const Operand& op)
{
	vphaddd(xmm, xmm, op);
}
// vphaddsw, three-operand form: forwarded to opAVX_X_X_XM.
void vphaddsw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x03, false, -1);
}
// vphaddsw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vphaddsw(const Xmm& xmm, const Operand& op)
{
	vphaddsw(xmm, xmm, op);
}
// vphsubw, three-operand form: forwarded to opAVX_X_X_XM.
void vphsubw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x05, false, -1);
}
// vphsubw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vphsubw(const Xmm& xmm, const Operand& op)
{
	vphsubw(xmm, xmm, op);
}
// vphsubd, three-operand form: forwarded to opAVX_X_X_XM.
void vphsubd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x06, false, -1);
}
// vphsubd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vphsubd(const Xmm& xmm, const Operand& op)
{
	vphsubd(xmm, xmm, op);
}
// vphsubsw, three-operand form: forwarded to opAVX_X_X_XM.
void vphsubsw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x07, false, -1);
}
// vphsubsw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vphsubsw(const Xmm& xmm, const Operand& op)
{
	vphsubsw(xmm, xmm, op);
}
// vpmaddwd, three-operand form: forwarded to opAVX_X_X_XM.
void vpmaddwd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF5, false, -1);
}
// vpmaddwd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmaddwd(const Xmm& xmm, const Operand& op)
{
	vpmaddwd(xmm, xmm, op);
}
// vpmaddubsw, three-operand form: forwarded to opAVX_X_X_XM.
void vpmaddubsw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x04, false, -1);
}
// vpmaddubsw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmaddubsw(const Xmm& xmm, const Operand& op)
{
	vpmaddubsw(xmm, xmm, op);
}
// vpmaxsb, three-operand form: forwarded to opAVX_X_X_XM.
void vpmaxsb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3C, false, -1);
}
// vpmaxsb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmaxsb(const Xmm& xmm, const Operand& op)
{
	vpmaxsb(xmm, xmm, op);
}
// vpmaxsw, three-operand form: forwarded to opAVX_X_X_XM.
void vpmaxsw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEE, false, -1);
}
// vpmaxsw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmaxsw(const Xmm& xmm, const Operand& op)
{
	vpmaxsw(xmm, xmm, op);
}
// vpmaxsd, three-operand form: forwarded to opAVX_X_X_XM.
void vpmaxsd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3D, false, -1);
}
// vpmaxsd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmaxsd(const Xmm& xmm, const Operand& op)
{
	vpmaxsd(xmm, xmm, op);
}
// vpmaxub, three-operand form: forwarded to opAVX_X_X_XM.
void vpmaxub(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDE, false, -1);
}
// vpmaxub, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmaxub(const Xmm& xmm, const Operand& op)
{
	vpmaxub(xmm, xmm, op);
}
// vpmaxuw, three-operand form: forwarded to opAVX_X_X_XM.
void vpmaxuw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3E, false, -1);
}
// vpmaxuw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmaxuw(const Xmm& xmm, const Operand& op)
{
	vpmaxuw(xmm, xmm, op);
}
// vpmaxud, three-operand form: forwarded to opAVX_X_X_XM.
void vpmaxud(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3F, false, -1);
}
// vpmaxud, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmaxud(const Xmm& xmm, const Operand& op)
{
	vpmaxud(xmm, xmm, op);
}
// vpminsb, three-operand form: forwarded to opAVX_X_X_XM.
void vpminsb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x38, false, -1);
}
// vpminsb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpminsb(const Xmm& xmm, const Operand& op)
{
	vpminsb(xmm, xmm, op);
}
// vpminsw, three-operand form: forwarded to opAVX_X_X_XM.
void vpminsw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEA, false, -1);
}
// vpminsw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpminsw(const Xmm& xmm, const Operand& op)
{
	vpminsw(xmm, xmm, op);
}
// vpminsd, three-operand form: forwarded to opAVX_X_X_XM.
void vpminsd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x39, false, -1);
}
// vpminsd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpminsd(const Xmm& xmm, const Operand& op)
{
	vpminsd(xmm, xmm, op);
}
// vpminub, three-operand form: forwarded to opAVX_X_X_XM.
void vpminub(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xDA, false, -1);
}
// vpminub, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpminub(const Xmm& xmm, const Operand& op)
{
	vpminub(xmm, xmm, op);
}
// vpminuw, three-operand form: forwarded to opAVX_X_X_XM.
void vpminuw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3A, false, -1);
}
// vpminuw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpminuw(const Xmm& xmm, const Operand& op)
{
	vpminuw(xmm, xmm, op);
}
// vpminud, three-operand form: forwarded to opAVX_X_X_XM.
void vpminud(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x3B, false, -1);
}
// vpminud, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpminud(const Xmm& xmm, const Operand& op)
{
	vpminud(xmm, xmm, op);
}
// vpmulhuw, three-operand form: forwarded to opAVX_X_X_XM.
void vpmulhuw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE4, false, -1);
}
// vpmulhuw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmulhuw(const Xmm& xmm, const Operand& op)
{
	vpmulhuw(xmm, xmm, op);
}
// vpmulhrsw, three-operand form: forwarded to opAVX_X_X_XM.
void vpmulhrsw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0B, false, -1);
}
// vpmulhrsw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmulhrsw(const Xmm& xmm, const Operand& op)
{
	vpmulhrsw(xmm, xmm, op);
}
// vpmulhw, three-operand form: forwarded to opAVX_X_X_XM.
void vpmulhw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE5, false, -1);
}
// vpmulhw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmulhw(const Xmm& xmm, const Operand& op)
{
	vpmulhw(xmm, xmm, op);
}
// vpmullw, three-operand form: forwarded to opAVX_X_X_XM.
void vpmullw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD5, false, -1);
}
// vpmullw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmullw(const Xmm& xmm, const Operand& op)
{
	vpmullw(xmm, xmm, op);
}
// vpmulld, three-operand form: forwarded to opAVX_X_X_XM.
void vpmulld(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x40, false, -1);
}
// vpmulld, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmulld(const Xmm& xmm, const Operand& op)
{
	vpmulld(xmm, xmm, op);
}
// vpmuludq, three-operand form: forwarded to opAVX_X_X_XM.
void vpmuludq(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF4, false, -1);
}
// vpmuludq, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmuludq(const Xmm& xmm, const Operand& op)
{
	vpmuludq(xmm, xmm, op);
}
// vpmuldq, three-operand form: forwarded to opAVX_X_X_XM.
void vpmuldq(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x28, false, -1);
}
// vpmuldq, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpmuldq(const Xmm& xmm, const Operand& op)
{
	vpmuldq(xmm, xmm, op);
}
// vpor, three-operand form: forwarded to opAVX_X_X_XM.
void vpor(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEB, false, -1);
}
// vpor, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpor(const Xmm& xmm, const Operand& op)
{
	vpor(xmm, xmm, op);
}
// vpsadbw, three-operand form: forwarded to opAVX_X_X_XM.
void vpsadbw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF6, false, -1);
}
// vpsadbw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsadbw(const Xmm& xmm, const Operand& op)
{
	vpsadbw(xmm, xmm, op);
}
// vpshufb, three-operand form: forwarded to opAVX_X_X_XM.
// NOTE(review): unlike its neighbours, no two-operand shorthand follows here — confirm whether that is intentional.
void vpshufb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x00, false, -1);
}
// vpsignb, three-operand form: forwarded to opAVX_X_X_XM.
void vpsignb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x08, false, -1);
}
// vpsignb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsignb(const Xmm& xmm, const Operand& op)
{
	vpsignb(xmm, xmm, op);
}
// vpsignw, three-operand form: forwarded to opAVX_X_X_XM.
void vpsignw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x09, false, -1);
}
// vpsignw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsignw(const Xmm& xmm, const Operand& op)
{
	vpsignw(xmm, xmm, op);
}
// vpsignd, three-operand form: forwarded to opAVX_X_X_XM.
void vpsignd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F38 | PP_66, 0x0A, false, -1);
}
// vpsignd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsignd(const Xmm& xmm, const Operand& op)
{
	vpsignd(xmm, xmm, op);
}
// vpsllw, three-operand form: forwarded to opAVX_X_X_XM.
void vpsllw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF1, false, -1);
}
// vpsllw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsllw(const Xmm& xmm, const Operand& op)
{
	vpsllw(xmm, xmm, op);
}
// vpslld, three-operand form: forwarded to opAVX_X_X_XM.
void vpslld(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF2, false, -1);
}
// vpslld, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpslld(const Xmm& xmm, const Operand& op)
{
	vpslld(xmm, xmm, op);
}
// vpsllq, three-operand form: forwarded to opAVX_X_X_XM.
void vpsllq(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF3, false, -1);
}
// vpsllq, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsllq(const Xmm& xmm, const Operand& op)
{
	vpsllq(xmm, xmm, op);
}
// vpsraw, three-operand form: forwarded to opAVX_X_X_XM.
void vpsraw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE1, false, -1);
}
// vpsraw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsraw(const Xmm& xmm, const Operand& op)
{
	vpsraw(xmm, xmm, op);
}
// vpsrad, three-operand form: forwarded to opAVX_X_X_XM.
void vpsrad(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE2, false, -1);
}
// vpsrad, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsrad(const Xmm& xmm, const Operand& op)
{
	vpsrad(xmm, xmm, op);
}
// vpsrlw, three-operand form: forwarded to opAVX_X_X_XM.
void vpsrlw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD1, false, -1);
}
// vpsrlw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsrlw(const Xmm& xmm, const Operand& op)
{
	vpsrlw(xmm, xmm, op);
}
// vpsrld, three-operand form: forwarded to opAVX_X_X_XM.
void vpsrld(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD2, false, -1);
}
// vpsrld, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsrld(const Xmm& xmm, const Operand& op)
{
	vpsrld(xmm, xmm, op);
}
// vpsrlq, three-operand form: forwarded to opAVX_X_X_XM.
void vpsrlq(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD3, false, -1);
}
// vpsrlq, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsrlq(const Xmm& xmm, const Operand& op)
{
	vpsrlq(xmm, xmm, op);
}
// vpsubb, three-operand form: forwarded to opAVX_X_X_XM.
void vpsubb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF8, false, -1);
}
// vpsubb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsubb(const Xmm& xmm, const Operand& op)
{
	vpsubb(xmm, xmm, op);
}
// vpsubw, three-operand form: forwarded to opAVX_X_X_XM.
void vpsubw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xF9, false, -1);
}
// vpsubw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsubw(const Xmm& xmm, const Operand& op)
{
	vpsubw(xmm, xmm, op);
}
// vpsubd, three-operand form: forwarded to opAVX_X_X_XM.
void vpsubd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFA, false, -1);
}
// vpsubd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsubd(const Xmm& xmm, const Operand& op)
{
	vpsubd(xmm, xmm, op);
}
// vpsubq, three-operand form: forwarded to opAVX_X_X_XM.
void vpsubq(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xFB, false, -1);
}
// vpsubq, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsubq(const Xmm& xmm, const Operand& op)
{
	vpsubq(xmm, xmm, op);
}
// vpsubsb, three-operand form: forwarded to opAVX_X_X_XM.
void vpsubsb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE8, false, -1);
}
// vpsubsb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsubsb(const Xmm& xmm, const Operand& op)
{
	vpsubsb(xmm, xmm, op);
}
// vpsubsw, three-operand form: forwarded to opAVX_X_X_XM.
void vpsubsw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xE9, false, -1);
}
// vpsubsw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsubsw(const Xmm& xmm, const Operand& op)
{
	vpsubsw(xmm, xmm, op);
}
// vpsubusb, three-operand form: forwarded to opAVX_X_X_XM.
void vpsubusb(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD8, false, -1);
}
// vpsubusb, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsubusb(const Xmm& xmm, const Operand& op)
{
	vpsubusb(xmm, xmm, op);
}
// vpsubusw, three-operand form: forwarded to opAVX_X_X_XM.
void vpsubusw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xD9, false, -1);
}
// vpsubusw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpsubusw(const Xmm& xmm, const Operand& op)
{
	vpsubusw(xmm, xmm, op);
}
// vpunpckhbw, three-operand form: forwarded to opAVX_X_X_XM.
void vpunpckhbw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x68, false, -1);
}
// vpunpckhbw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpunpckhbw(const Xmm& xmm, const Operand& op)
{
	vpunpckhbw(xmm, xmm, op);
}
// vpunpckhwd, three-operand form: forwarded to opAVX_X_X_XM.
void vpunpckhwd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x69, false, -1);
}
// vpunpckhwd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpunpckhwd(const Xmm& xmm, const Operand& op)
{
	vpunpckhwd(xmm, xmm, op);
}
// vpunpckhdq, three-operand form: forwarded to opAVX_X_X_XM.
void vpunpckhdq(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x6A, false, -1);
}
// vpunpckhdq, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpunpckhdq(const Xmm& xmm, const Operand& op)
{
	vpunpckhdq(xmm, xmm, op);
}
// vpunpckhqdq, three-operand form: forwarded to opAVX_X_X_XM.
void vpunpckhqdq(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x6D, false, -1);
}
// vpunpckhqdq, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpunpckhqdq(const Xmm& xmm, const Operand& op)
{
	vpunpckhqdq(xmm, xmm, op);
}
// vpunpcklbw, three-operand form: forwarded to opAVX_X_X_XM.
void vpunpcklbw(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x60, false, -1);
}
// vpunpcklbw, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpunpcklbw(const Xmm& xmm, const Operand& op)
{
	vpunpcklbw(xmm, xmm, op);
}
// vpunpcklwd, three-operand form: forwarded to opAVX_X_X_XM.
void vpunpcklwd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x61, false, -1);
}
// vpunpcklwd, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpunpcklwd(const Xmm& xmm, const Operand& op)
{
	vpunpcklwd(xmm, xmm, op);
}
// vpunpckldq, three-operand form: forwarded to opAVX_X_X_XM.
void vpunpckldq(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x62, false, -1);
}
// vpunpckldq, two-operand shorthand: xmm fills both register slots of the three-operand form.
void vpunpckldq(const Xmm& xmm, const Operand& op)
{
	vpunpckldq(xmm, xmm, op);
}
void vpunpcklqdq(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x6C, false, -1); }
void vpunpcklqdq(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x6C, false, -1); }
// vpxor: MM_0F | PP_66, opcode byte 0xEF.
// The two-operand overload uses the destination register as the first source too.
void vpxor(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xEF, false, -1);
}
void vpxor(const Xmm& xmm, const Operand& op)
{
	vpxor(xmm, xmm, op);
}
// vrcpss / vrsqrtss: MM_0F | PP_F3, opcode bytes 0x53 / 0x52.
// The two-operand overloads use the destination register as the first source too.
void vrcpss(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0x53, false, -1);
}
void vrcpss(const Xmm& xmm, const Operand& op)
{
	vrcpss(xmm, xmm, op);
}
void vrsqrtss(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0x52, false, -1);
}
void vrsqrtss(const Xmm& xmm, const Operand& op)
{
	vrsqrtss(xmm, xmm, op);
}
// vshufpd (MM_0F | PP_66) / vshufps (MM_0F): opcode byte 0xC6, followed by
// the immediate selector byte emitted with db().
// The shorter overloads use the destination register as the first source too.
void vshufpd(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0xC6, true, -1);
	db(imm);
}
void vshufpd(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vshufpd(xmm, xmm, op, imm);
}
void vshufps(const Xmm& xm1, const Xmm& xm2, const Operand& op, uint8 imm)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F, 0xC6, true, -1);
	db(imm);
}
void vshufps(const Xmm& xmm, const Operand& op, uint8 imm)
{
	vshufps(xmm, xmm, op, imm);
}
// vsqrtsd (MM_0F | PP_F2) / vsqrtss (MM_0F | PP_F3): opcode byte 0x51.
// The two-operand overloads use the destination register as the first source too.
void vsqrtsd(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F2, 0x51, false, -1);
}
void vsqrtsd(const Xmm& xmm, const Operand& op)
{
	vsqrtsd(xmm, xmm, op);
}
void vsqrtss(const Xmm& xm1, const Xmm& xm2, const Operand& op)
{
	opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_F3, 0x51, false, -1);
}
void vsqrtss(const Xmm& xmm, const Operand& op)
{
	vsqrtss(xmm, xmm, op);
}
// vunpckh* / vunpckl* for packed floating point.
// pd variants use MM_0F | PP_66, ps variants use MM_0F alone;
// opcode byte 0x15 for the "h" forms and 0x14 for the "l" forms.
// Each mnemonic has a three-operand overload and a two-operand overload that
// passes the destination as the first source as well.
// NOTE(review): the `true` argument presumably allows YMM operands — confirm against opAVX_X_X_XM.
void vunpckhpd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x15, true, -1); }
void vunpckhpd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x15, true, -1); }
void vunpckhps(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F, 0x15, true, -1); }
void vunpckhps(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F, 0x15, true, -1); }
void vunpcklpd(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F | PP_66, 0x14, true, -1); }
void vunpcklpd(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F | PP_66, 0x14, true, -1); }
void vunpcklps(const Xmm& xm1, const Xmm& xm2, const Operand& op) { opAVX_X_X_XM(xm1, xm2, op, MM_0F, 0x14, true, -1); }
void vunpcklps(const Xmm& xmm, const Operand& op) { opAVX_X_X_XM(xmm, xmm, op, MM_0F, 0x14, true, -1); }
// Two-operand instructions that take an 8-bit immediate; all use the
// MM_0F3A | PP_66 map and hand the immediate to opAVX_X_XM_IMM.
// Opcode bytes: vaeskeygenassist 0xDF, vroundpd 0x09, vroundps 0x08,
// vpermilpd 0x05, vpermilps 0x04, vpcmpestri 0x61, vpcmpestrm 0x60,
// vpcmpistri 0x63, vpcmpistrm 0x62.
// NOTE(review): the bool argument presumably selects YMM support (true only for the
// round/permil forms) and the following 0 presumably fixes the W bit — confirm against opAVX_X_XM_IMM.
void vaeskeygenassist(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0xDF, false, 0, imm); }
void vroundpd(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x09, true, 0, imm); }
void vroundps(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x08, true, 0, imm); }
void vpermilpd(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x05, true, 0, imm); }
void vpermilps(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x04, true, 0, imm); }
void vpcmpestri(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x61, false, 0, imm); }
void vpcmpestrm(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x60, false, 0, imm); }
void vpcmpistri(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x63, false, 0, imm); }
void vpcmpistrm(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F3A | PP_66, 0x62, false, 0, imm); }
// vtestps / vtestpd: MM_0F38 | PP_66, opcode bytes 0x0E / 0x0F.
void vtestps(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x0E, true, 0);
}
void vtestpd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x0F, true, 0);
}
// vcomisd (MM_0F | PP_66) / vcomiss (MM_0F): opcode byte 0x2F.
void vcomisd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x2F, false, -1);
}
void vcomiss(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F, 0x2F, false, -1);
}
// Packed dword <-> single conversions; all share opcode byte 0x5B in MM_0F and
// differ only in the prefix selector: none (vcvtdq2ps), PP_66 (vcvtps2dq),
// PP_F3 (vcvttps2dq).
void vcvtdq2ps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x5B, true, -1); }
void vcvtps2dq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x5B, true, -1); }
void vcvttps2dq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x5B, true, -1); }
// Register-destination move forms (destination register first, source operand second).
// Opcode bytes in MM_0F: vmovapd/vmovaps 0x28, vmovdqa/vmovdqu 0x6F,
// vmovupd/vmovups 0x10, vmovddup/vmovsldup 0x12, vmovshdup 0x16; the prefix
// selector (none / PP_66 / PP_F2 / PP_F3) distinguishes mnemonics sharing a byte.
// The store forms (Address first) are separate overloads using different opcode bytes.
void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x28, true, -1); }
void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x28, true, -1); }
void vmovddup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F2, 0x12, true, -1); }
void vmovdqa(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x6F, true, -1); }
void vmovdqu(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x6F, true, -1); }
void vmovshdup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x16, true, -1); }
void vmovsldup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x12, true, -1); }
void vmovupd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x10, true, -1); }
void vmovups(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x10, true, -1); }
// vpabsb / vpabsw / vpabsd / vphminposuw: MM_0F38 | PP_66,
// opcode bytes 0x1C, 0x1D, 0x1E and 0x41 respectively.
void vpabsb(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x1C, false, -1); }
void vpabsw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x1D, false, -1); }
void vpabsd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x1E, false, -1); }
void vphminposuw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x41, false, -1); }
// vpmovsx* / vpmovzx* family: MM_0F38 | PP_66.
// Opcode bytes run 0x20..0x25 for the sx forms and 0x30..0x35 for the zx forms,
// in the order bw, bd, bq, wd, wq, dq.
void vpmovsxbw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x20, false, -1); }
void vpmovsxbd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x21, false, -1); }
void vpmovsxbq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x22, false, -1); }
void vpmovsxwd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x23, false, -1); }
void vpmovsxwq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x24, false, -1); }
void vpmovsxdq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x25, false, -1); }
void vpmovzxbw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x30, false, -1); }
void vpmovzxbd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x31, false, -1); }
void vpmovzxbq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x32, false, -1); }
void vpmovzxwd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x33, false, -1); }
void vpmovzxwq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x34, false, -1); }
void vpmovzxdq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x35, false, -1); }
// vpshufd / vpshufhw / vpshuflw: opcode byte 0x70 in MM_0F, distinguished by
// prefix selector PP_66 / PP_F3 / PP_F2. The immediate is passed through to opAVX_X_XM_IMM.
void vpshufd(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x70, false, -1, imm); }
void vpshufhw(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F3, 0x70, false, -1, imm); }
void vpshuflw(const Xmm& xm, const Operand& op, uint8 imm) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_F2, 0x70, false, -1, imm); }
// vptest: MM_0F38 | PP_66, opcode byte 0x17.
// AVX defines a VEX.256 encoding of VPTEST (ymm1, ymm2/m256), so the YMM flag
// is enabled here, as it already is for vtestps / vtestpd. XMM operands encode
// exactly as before.
void vptest(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F38 | PP_66, 0x17, true, -1); }
// Packed reciprocal / square-root forms in MM_0F:
// vrcpps 0x53, vrsqrtps 0x52, vsqrtpd 0x51 (with PP_66), vsqrtps 0x51.
void vrcpps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x53, true, -1); }
void vrsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x52, true, -1); }
void vsqrtpd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x51, true, -1); }
void vsqrtps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, MM_0F, 0x51, true, -1); }
// vucomisd (MM_0F | PP_66) / vucomiss (MM_0F): opcode byte 0x2E.
void vucomisd(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F | PP_66, 0x2E, false, -1);
}
void vucomiss(const Xmm& xm, const Operand& op)
{
	opAVX_X_XM_IMM(xm, op, MM_0F, 0x2E, false, -1);
}
// Store forms of the packed moves (memory destination first, source register second).
// Opcode bytes in MM_0F: vmovapd/vmovaps 0x29, vmovdqa/vmovdqu 0x7F, vmovupd/vmovups 0x11.
// The register is still passed as the first argument of opAVX_X_XM_IMM and the address as the second.
void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_66, 0x29, true, -1); }
void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F, 0x29, true, -1); }
void vmovdqa(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_66, 0x7F, true, -1); }
void vmovdqu(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_F3, 0x7F, true, -1); }
void vmovupd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F | PP_66, 0x11, true, -1); }
void vmovups(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, MM_0F, 0x11, true, -1); }
// vaddsub / vhadd / vhsub, pd (PP_66) and ps (PP_F2) variants, all in MM_0F.
// Opcode bytes: vaddsub 0xD0, vhadd 0x7C, vhsub 0x7D.
// op2 defaults to an empty Operand, so callers may give two or three operands.
// NOTE(review): how the two-operand case is resolved is decided inside opAVX_X_X_XM, which is not visible here.
void vaddsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0xD0, true, -1); }
void vaddsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0xD0, true, -1); }
void vhaddpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x7C, true, -1); }
void vhaddps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x7C, true, -1); }
void vhsubpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_66, 0x7D, true, -1); }
void vhsubps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F | PP_F2, 0x7D, true, -1); }
// AES round instructions: MM_0F38 | PP_66, opcode bytes
// vaesenc 0xDC, vaesenclast 0xDD, vaesdec 0xDE, vaesdeclast 0xDF.
// op2 defaults to an empty Operand, so callers may give two or three operands.
void vaesenc(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDC, false, 0); }
void vaesenclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDD, false, 0); }
void vaesdec(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDE, false, 0); }
void vaesdeclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xDF, false, 0); }
// vmaskmovps / vmaskmovpd: MM_0F38 | PP_66.
// Load forms (register destination) use opcode bytes 0x2C / 0x2D.
// Store forms (memory destination) use 0x2E / 0x2F and pass the two registers
// to opAVX_X_X_XM in the reverse of their mnemonic order.
void vmaskmovps(const Xmm& xm1, const Xmm& xm2, const Address& addr)
{
	opAVX_X_X_XM(xm1, xm2, addr, MM_0F38 | PP_66, 0x2C, true, 0);
}
void vmaskmovps(const Address& addr, const Xmm& xm1, const Xmm& xm2)
{
	opAVX_X_X_XM(xm2, xm1, addr, MM_0F38 | PP_66, 0x2E, true, 0);
}
void vmaskmovpd(const Xmm& xm1, const Xmm& xm2, const Address& addr)
{
	opAVX_X_X_XM(xm1, xm2, addr, MM_0F38 | PP_66, 0x2D, true, 0);
}
void vmaskmovpd(const Address& addr, const Xmm& xm1, const Xmm& xm2)
{
	opAVX_X_X_XM(xm2, xm1, addr, MM_0F38 | PP_66, 0x2F, true, 0);
}
// vmovhpd / vmovhps / vmovlpd / vmovlps.
// Load overloads: when a trailing operand is supplied it must be a memory
// operand, otherwise ERR_BAD_COMBINATION is thrown. Opcode bytes 0x16 (h) / 0x12 (l).
// Store overloads: xm0 fills the middle slot of opAVX_X_X_XM. Opcode bytes 0x17 (h) / 0x13 (l).
// pd variants add PP_66 to MM_0F.
void vmovhpd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand())
{
	if (!(op2.isNone() || op2.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, op1, op2, MM_0F | PP_66, 0x16, false);
}
void vmovhpd(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x17, false);
}
void vmovhps(const Xmm& x, const Operand& op1, const Operand& op2 = Operand())
{
	if (!(op2.isNone() || op2.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, op1, op2, MM_0F, 0x16, false);
}
void vmovhps(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F, 0x17, false);
}
void vmovlpd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand())
{
	if (!(op2.isNone() || op2.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, op1, op2, MM_0F | PP_66, 0x12, false);
}
void vmovlpd(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x13, false);
}
void vmovlps(const Xmm& x, const Operand& op1, const Operand& op2 = Operand())
{
	if (!(op2.isNone() || op2.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, op1, op2, MM_0F, 0x12, false);
}
void vmovlps(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F, 0x13, false);
}
// vfmadd{132,213,231}{pd,ps,sd,ss}: MM_0F38 | PP_66.
// Packed forms use opcode bytes 0x98 / 0xA8 / 0xB8, scalar forms 0x99 / 0xA9 / 0xB9
// (for the 132 / 213 / 231 operand orders).
// The last argument is 1 for the double-precision variants and 0 for single precision.
// NOTE(review): presumably this is the VEX.W bit — confirm against opAVX_X_X_XM.
// The bool argument is true for packed and false for scalar forms.
void vfmadd132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x98, true, 1); }
void vfmadd213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA8, true, 1); }
void vfmadd231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB8, true, 1); }
void vfmadd132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x98, true, 0); }
void vfmadd213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA8, true, 0); }
void vfmadd231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB8, true, 0); }
void vfmadd132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x99, false, 1); }
void vfmadd213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA9, false, 1); }
void vfmadd231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB9, false, 1); }
void vfmadd132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x99, false, 0); }
void vfmadd213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA9, false, 0); }
void vfmadd231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB9, false, 0); }
// vfmaddsub{132,213,231}{pd,ps} and vfmsubadd{132,213,231}{pd,ps}: MM_0F38 | PP_66.
// Opcode bytes: vfmaddsub 0x96 / 0xA6 / 0xB6, vfmsubadd 0x97 / 0xA7 / 0xB7.
// Packed only; the last argument is 1 for pd and 0 for ps.
void vfmaddsub132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x96, true, 1); }
void vfmaddsub213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA6, true, 1); }
void vfmaddsub231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB6, true, 1); }
void vfmaddsub132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x96, true, 0); }
void vfmaddsub213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA6, true, 0); }
void vfmaddsub231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB6, true, 0); }
void vfmsubadd132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x97, true, 1); }
void vfmsubadd213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA7, true, 1); }
void vfmsubadd231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB7, true, 1); }
void vfmsubadd132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x97, true, 0); }
void vfmsubadd213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xA7, true, 0); }
void vfmsubadd231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xB7, true, 0); }
// vfmsub{132,213,231}{pd,ps,sd,ss}: MM_0F38 | PP_66.
// Packed forms use opcode bytes 0x9A / 0xAA / 0xBA, scalar forms 0x9B / 0xAB / 0xBB.
// The last argument is 1 for double precision and 0 for single precision;
// the bool argument is true for packed and false for scalar forms.
void vfmsub132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9A, true, 1); }
void vfmsub213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAA, true, 1); }
void vfmsub231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBA, true, 1); }
void vfmsub132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9A, true, 0); }
void vfmsub213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAA, true, 0); }
void vfmsub231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBA, true, 0); }
void vfmsub132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9B, false, 1); }
void vfmsub213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAB, false, 1); }
void vfmsub231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBB, false, 1); }
void vfmsub132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9B, false, 0); }
void vfmsub213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAB, false, 0); }
void vfmsub231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBB, false, 0); }
// vfnmadd{132,213,231}{pd,ps,sd,ss}: MM_0F38 | PP_66.
// Packed forms use opcode bytes 0x9C / 0xAC / 0xBC, scalar forms 0x9D / 0xAD / 0xBD.
// The last argument is 1 for double precision and 0 for single precision;
// the bool argument is true for packed and false for scalar forms.
void vfnmadd132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9C, true, 1); }
void vfnmadd213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAC, true, 1); }
void vfnmadd231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBC, true, 1); }
void vfnmadd132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9C, true, 0); }
void vfnmadd213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAC, true, 0); }
void vfnmadd231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBC, true, 0); }
void vfnmadd132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9D, false, 1); }
void vfnmadd213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAD, false, 1); }
void vfnmadd231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBD, false, 1); }
void vfnmadd132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9D, false, 0); }
void vfnmadd213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAD, false, 0); }
void vfnmadd231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBD, false, 0); }
// vfnmsub{132,213,231}{pd,ps,sd,ss}: MM_0F38 | PP_66.
// Packed forms use opcode bytes 0x9E / 0xAE / 0xBE, scalar forms 0x9F / 0xAF / 0xBF.
// The last argument is 1 for double precision and 0 for single precision;
// the bool argument is true for packed and false for scalar forms.
void vfnmsub132pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9E, true, 1); }
void vfnmsub213pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAE, true, 1); }
void vfnmsub231pd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBE, true, 1); }
void vfnmsub132ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9E, true, 0); }
void vfnmsub213ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAE, true, 0); }
void vfnmsub231ps(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBE, true, 0); }
void vfnmsub132sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9F, false, 1); }
void vfnmsub213sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAF, false, 1); }
void vfnmsub231sd(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBF, false, 1); }
void vfnmsub132ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0x9F, false, 0); }
void vfnmsub213ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xAF, false, 0); }
void vfnmsub231ss(const Xmm& xmm, const Xmm& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, MM_0F38 | PP_66, 0xBF, false, 0); }
// vaesimc: MM_0F38 | PP_66, opcode byte 0xDB.
void vaesimc(const Xmm& x, const Operand& op)
{
	opAVX_X_XM_IMM(x, op, MM_0F38 | PP_66, 0xDB, false, 0);
}
// Broadcast-from-memory forms: MM_0F38 | PP_66, opcode bytes
// vbroadcastf128 0x1A, vbroadcastsd 0x19, vbroadcastss 0x18.
// The f128 and sd overloads take a Ymm destination; the ss overload takes an Xmm parameter.
void vbroadcastf128(const Ymm& y, const Address& addr) { opAVX_X_XM_IMM(y, addr, MM_0F38 | PP_66, 0x1A, true, 0); }
void vbroadcastsd(const Ymm& y, const Address& addr) { opAVX_X_XM_IMM(y, addr, MM_0F38 | PP_66, 0x19, true, 0); }
void vbroadcastss(const Xmm& x, const Address& addr) { opAVX_X_XM_IMM(x, addr, MM_0F38 | PP_66, 0x18, true, 0); }
// 128-bit lane operations on YMM registers: MM_0F3A | PP_66, opcode bytes
// vextractf128 0x19, vinsertf128 0x18, vperm2f128 0x06.
// For extract/insert the 128-bit operand is passed through cvtReg(op, op.isXMM(), Operand::YMM).
// NOTE(review): presumably this re-tags an XMM operand as YMM so the encoder accepts it
// alongside the Ymm register — confirm against cvtReg.
// vinsertf128 / vperm2f128 emit the immediate with db() after the instruction bytes.
void vextractf128(const Operand& op, const Ymm& y, uint8 imm) { opAVX_X_XM_IMM(y, cvtReg(op, op.isXMM(), Operand::YMM), MM_0F3A | PP_66, 0x19, true, 0, imm); }
void vinsertf128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { opAVX_X_X_XM(y1, y2, cvtReg(op, op.isXMM(), Operand::YMM), MM_0F3A | PP_66, 0x18, true, 0); db(imm); }
void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8 imm) { opAVX_X_X_XM(y1, y2, op, MM_0F3A | PP_66, 0x06, true, 0); db(imm); }
void vlddqu(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F | PP_F2, 0xF0, true, 0); }
// vldmxcsr / vstmxcsr: MM_0F, opcode byte 0xAE. The two differ only in the
// register passed in the first slot (xm2 vs xm3).
// NOTE(review): presumably the first slot is the ModRM.reg opcode extension (/2, /3) — confirm.
void vldmxcsr(const Address& addr)
{
	opAVX_X_X_XM(xm2, xm0, addr, MM_0F, 0xAE, false, -1);
}
void vstmxcsr(const Address& addr)
{
	opAVX_X_X_XM(xm3, xm0, addr, MM_0F, 0xAE, false, -1);
}
// vmaskmovdqu: MM_0F | PP_66, opcode byte 0xF7; xm0 fills the middle slot.
void vmaskmovdqu(const Xmm& x1, const Xmm& x2)
{
	opAVX_X_X_XM(x1, xm0, x2, MM_0F | PP_66, 0xF7, false, -1);
}
// vpextrb: MM_0F3A | PP_66, opcode byte 0x14, followed by the immediate byte.
// The destination must be a memory operand or a register accepted by isREG(i32e);
// anything else throws ERR_BAD_COMBINATION. A register destination is passed
// through cvtReg(..., Operand::XMM) before being handed to the encoder.
void vpextrb(const Operand& op, const Xmm& x, uint8 imm)
{
	const bool isMem = op.isMEM();
	if (!(op.isREG(i32e) || isMem)) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, xm0, cvtReg(op, !isMem, Operand::XMM), MM_0F3A | PP_66, 0x14, false);
	db(imm);
}
// vpextrw reg, xmm, imm8: MM_0F | PP_66, opcode byte 0xC5, followed by the immediate byte.
// For this opcode the general-purpose destination is encoded in the first (reg)
// slot and the XMM source in the last (r/m) slot — the same layout used by
// vpmovmskb and vmovmskps. Passing them the other way round would encode
// vpextrw(eax, xm1, n) as if it had been written vpextrw(ecx, xm0, n).
void vpextrw(const Reg& r, const Xmm& x, uint8 imm) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, x, MM_0F | PP_66, 0xC5, false); db(imm); }
// vpextrw mem, xmm, imm8: MM_0F3A | PP_66, opcode byte 0x15, followed by the immediate byte.
void vpextrw(const Address& addr, const Xmm& x, uint8 imm)
{
	opAVX_X_X_XM(x, xm0, addr, MM_0F3A | PP_66, 0x15, false);
	db(imm);
}
// vpextrd: MM_0F3A | PP_66, opcode byte 0x16, last argument 0, followed by the immediate byte.
// The destination must be a 32-bit register or a memory operand, otherwise
// ERR_BAD_COMBINATION is thrown.
void vpextrd(const Operand& op, const Xmm& x, uint8 imm)
{
	const bool isMem = op.isMEM();
	if (!(op.isREG(32) || isMem)) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, xm0, cvtReg(op, !isMem, Operand::XMM), MM_0F3A | PP_66, 0x16, false, 0);
	db(imm);
}
// vpinsrb / vpinsrw / vpinsrd: insert from a 32-bit register or memory operand.
// Opcode bytes: vpinsrb 0x20 (MM_0F3A | PP_66), vpinsrw 0xC4 (MM_0F | PP_66),
// vpinsrd 0x22 (MM_0F3A | PP_66, last argument 0).
// Any source that is neither a 32-bit register nor memory throws ERR_BAD_COMBINATION.
// A register source is passed through cvtReg(..., Operand::XMM) before encoding.
// The shorter overloads use the destination as the first source too.
// The immediate byte is emitted with db() after the instruction bytes.
void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x20, false); db(imm); }
void vpinsrb(const Xmm& x, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, x, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x20, false); db(imm); }
void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F | PP_66, 0xC4, false); db(imm); }
void vpinsrw(const Xmm& x, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, x, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F | PP_66, 0xC4, false); db(imm); }
void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x22, false, 0); db(imm); }
void vpinsrd(const Xmm& x, const Operand& op, uint8 imm) { if (!op.isREG(32) && !op.isMEM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, x, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x22, false, 0); db(imm); }
void vpmovmskb(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, x, MM_0F | PP_66, 0xD7, false); }
// vpslldq / vpsrldq (immediate byte shifts): MM_0F | PP_66, opcode byte 0x73.
// The first slot carries a fixed register (xm7 for vpslldq, xm3 for vpsrldq).
// NOTE(review): presumably this is the /7 and /3 opcode extension in ModRM.reg — confirm.
// x1 is passed in the middle slot and x2 in the last; the one-register overloads
// pass the same register in both. The immediate is emitted with db().
void vpslldq(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm7, x1, x2, MM_0F | PP_66, 0x73, false); db(imm); }
void vpslldq(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm7, x, x, MM_0F | PP_66, 0x73, false); db(imm); }
void vpsrldq(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm3, x1, x2, MM_0F | PP_66, 0x73, false); db(imm); }
void vpsrldq(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm3, x, x, MM_0F | PP_66, 0x73, false); db(imm); }
// Immediate-count left shifts: MM_0F | PP_66 with xm6 in the first slot.
// Opcode bytes: vpsllw 0x71, vpslld 0x72, vpsllq 0x73.
// The one-register overloads pass the same register in the middle and last slots.
// The immediate is emitted with db().
void vpsllw(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm6, x1, x2, MM_0F | PP_66, 0x71, false); db(imm); }
void vpsllw(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm6, x, x, MM_0F | PP_66, 0x71, false); db(imm); }
void vpslld(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm6, x1, x2, MM_0F | PP_66, 0x72, false); db(imm); }
void vpslld(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm6, x, x, MM_0F | PP_66, 0x72, false); db(imm); }
void vpsllq(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm6, x1, x2, MM_0F | PP_66, 0x73, false); db(imm); }
void vpsllq(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm6, x, x, MM_0F | PP_66, 0x73, false); db(imm); }
// Immediate-count arithmetic right shifts: MM_0F | PP_66 with xm4 in the first slot.
// Opcode bytes: vpsraw 0x71, vpsrad 0x72.
// The one-register overloads pass the same register in the middle and last slots.
// The immediate is emitted with db().
void vpsraw(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm4, x1, x2, MM_0F | PP_66, 0x71, false); db(imm); }
void vpsraw(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm4, x, x, MM_0F | PP_66, 0x71, false); db(imm); }
void vpsrad(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm4, x1, x2, MM_0F | PP_66, 0x72, false); db(imm); }
void vpsrad(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm4, x, x, MM_0F | PP_66, 0x72, false); db(imm); }
// Immediate-count logical right shifts: MM_0F | PP_66 with xm2 in the first slot.
// Opcode bytes: vpsrlw 0x71, vpsrld 0x72, vpsrlq 0x73.
// The one-register overloads pass the same register in the middle and last slots.
// The immediate is emitted with db().
void vpsrlw(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm2, x1, x2, MM_0F | PP_66, 0x71, false); db(imm); }
void vpsrlw(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm2, x, x, MM_0F | PP_66, 0x71, false); db(imm); }
void vpsrld(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm2, x1, x2, MM_0F | PP_66, 0x72, false); db(imm); }
void vpsrld(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm2, x, x, MM_0F | PP_66, 0x72, false); db(imm); }
void vpsrlq(const Xmm& x1, const Xmm& x2, uint8 imm) { opAVX_X_X_XM(xm2, x1, x2, MM_0F | PP_66, 0x73, false); db(imm); }
void vpsrlq(const Xmm& x, uint8 imm) { opAVX_X_X_XM(xm2, x, x, MM_0F | PP_66, 0x73, false); db(imm); }
// Variable blends with a fourth register operand: MM_0F3A | PP_66, opcode bytes
// vblendvpd 0x4B, vblendvps 0x4A, vpblendvb 0x4C.
// After the instruction bytes, the index of x4 is emitted in the upper four
// bits of a trailing byte (x4.getIdx() << 4).
// The shorter overloads use x1 as both destination and first source.
void vblendvpd(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4B, true); db(x4.getIdx() << 4); }
void vblendvpd(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4B, true); db(x4.getIdx() << 4); }
void vblendvps(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); }
void vblendvps(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4A, true); db(x4.getIdx() << 4); }
void vpblendvb(const Xmm& x1, const Xmm& x2, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x2, op, MM_0F3A | PP_66, 0x4C, false); db(x4.getIdx() << 4); }
void vpblendvb(const Xmm& x1, const Operand& op, const Xmm& x4) { opAVX_X_X_XM(x1, x1, op, MM_0F3A | PP_66, 0x4C, false); db(x4.getIdx() << 4); }
// vmovd: MM_0F | PP_66, last argument 0.
// Opcode byte 0x6E moves into the XMM register (from Reg32 or memory);
// opcode byte 0x7E moves out of it (to Reg32 or memory).
// In all four overloads the XMM register is passed in the first slot and the
// general-purpose register (wrapped as Xmm(index)) or address in the last slot.
void vmovd(const Xmm& x, const Reg32& reg) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x6E, false, 0); }
void vmovd(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x6E, false, 0); }
void vmovd(const Reg32& reg, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x7E, false, 0); }
void vmovd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_66, 0x7E, false, 0); }
// vmovhlps / vmovlhps: MM_0F, opcode bytes 0x12 / 0x16.
// The optional trailing operand, when supplied, must be an XMM register,
// otherwise ERR_BAD_COMBINATION is thrown.
void vmovhlps(const Xmm& x1, const Xmm& x2, const Operand& op = Operand())
{
	if (!(op.isNone() || op.isXMM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x1, x2, op, MM_0F, 0x12, false);
}
void vmovlhps(const Xmm& x1, const Xmm& x2, const Operand& op = Operand())
{
	if (!(op.isNone() || op.isXMM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x1, x2, op, MM_0F, 0x16, false);
}
// vmovmskpd (MM_0F | PP_66) / vmovmskps (MM_0F): opcode byte 0x50.
// The destination must satisfy isBit(i32e), otherwise ERR_BAD_COMBINATION is thrown.
// The general-purpose destination is wrapped as an Xmm or Ymm of the same
// index, matching the width of the source, and register 0 of that width fills
// the middle slot.
void vmovmskpd(const Reg& r, const Xmm& x)
{
	if (!r.isBit(i32e)) throw ERR_BAD_COMBINATION;
	const bool is128 = x.isXMM();
	opAVX_X_X_XM(is128 ? Xmm(r.getIdx()) : Ymm(r.getIdx()), is128 ? xm0 : ym0, x, MM_0F | PP_66, 0x50, true, 0);
}
void vmovmskps(const Reg& r, const Xmm& x)
{
	if (!r.isBit(i32e)) throw ERR_BAD_COMBINATION;
	const bool is128 = x.isXMM();
	opAVX_X_X_XM(is128 ? Xmm(r.getIdx()) : Ymm(r.getIdx()), is128 ? xm0 : ym0, x, MM_0F, 0x50, true, 0);
}
// vmovnt* forms.
// Stores: vmovntdq 0xE7 (MM_0F | PP_66), vmovntpd 0x2B (MM_0F | PP_66), vmovntps 0x2B (MM_0F);
// the middle slot gets register 0 of the same width as the source (xm0 or ym0).
// Load: vmovntdqa 0x2A (MM_0F38 | PP_66), with xm0 in the middle slot and the bool argument false.
void vmovntdq(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F | PP_66, 0xE7, true); }
void vmovntpd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F | PP_66, 0x2B, true); }
void vmovntps(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, addr, MM_0F, 0x2B, true); }
void vmovntdqa(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F38 | PP_66, 0x2A, false); }
// vmovsd (MM_0F | PP_F2) / vmovss (MM_0F | PP_F3).
// Register forms and loads use opcode byte 0x10; stores use 0x11.
// In the register form the optional third operand, when supplied, must be an
// XMM register, otherwise ERR_BAD_COMBINATION is thrown.
// Memory forms pass xm0 in the middle slot.
void vmovsd(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F2, 0x10, false); }
void vmovsd(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F2, 0x10, false); }
void vmovsd(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F2, 0x11, false); }
void vmovss(const Xmm& x1, const Xmm& x2, const Operand& op = Operand()) { if (!op.isNone() && !op.isXMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x1, x2, op, MM_0F | PP_F3, 0x10, false); }
void vmovss(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F3, 0x10, false); }
void vmovss(const Address& addr, const Xmm& x) { opAVX_X_X_XM(x, xm0, addr, MM_0F | PP_F3, 0x11, false); }
// Scalar float -> 32-bit integer conversions, last argument 0.
// ss variants use MM_0F | PP_F3, sd variants MM_0F | PP_F2.
// Opcode byte 0x2D for vcvt*2si and 0x2C for vcvtt*2si.
// The Reg32 destination is wrapped as Xmm(index) for the first slot.
void vcvtss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2D, false, 0); }
void vcvttss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2C, false, 0); }
void vcvtsd2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2D, false, 0); }
void vcvttsd2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2C, false, 0); }
void vcvtsi2ss(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !(op2.isREG(i32e) || op2.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, op1, cvtReg(op2, op2.isREG(), Operand::XMM), MM_0F | PP_F3, 0x2A, false, (op1.isMEM() || op2.isMEM()) ? -1 : (op1.isREG(32) || op2.isREG(32)) ? 0 : 1); }
void vcvtsi2sd(const Xmm& x, const Operand& op1, const Operand& op2 = Operand()) { if (!op2.isNone() && !(op2.isREG(i32e) || op2.isMEM())) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, op1, cvtReg(op2, op2.isREG(), Operand::XMM), MM_0F | PP_F2, 0x2A, false, (op1.isMEM() || op2.isMEM()) ? -1 : (op1.isREG(32) || op2.isREG(32)) ? 0 : 1); }
void vcvtps2pd(const Xmm& x, const Operand& op) { if (!op.isMEM() && !op.isXMM()) throw ERR_BAD_COMBINATION; opAVX_X_X_XM(x, x.isXMM() ? xm0 : ym0, cvtReg(op, !op.isMEM(), x.isXMM() ? Operand::XMM : Operand::YMM), MM_0F, 0x5A, true); }
// vcvtdq2pd (F3 prefix, 0F map, opcode 0xE6).
// The source must be memory or an XMM register, else ERR_BAD_COMBINATION.
// A register source is re-tagged via cvtReg to the same kind (XMM/YMM) as the destination.
void vcvtdq2pd(const Xmm& x, const Operand& op)
{
	if (!(op.isMEM() || op.isXMM())) throw ERR_BAD_COMBINATION;
	const bool dstIsXmm = x.isXMM();
	opAVX_X_X_XM(x, dstIsXmm ? xm0 : ym0, cvtReg(op, !op.isMEM(), dstIsXmm ? Operand::XMM : Operand::YMM), MM_0F | PP_F3, 0xE6, true);
}
// vcvtpd2ps (66 prefix, 0F map, opcode 0x5A).
// A YMM destination is rejected with ERR_BAD_COMBINATION.
// When the source is YMM, the destination index is passed on as a Ymm and ym0 replaces xm0.
void vcvtpd2ps(const Xmm& x, const Operand& op)
{
	if (x.isYMM()) throw ERR_BAD_COMBINATION;
	const bool srcIsYmm = op.isYMM();
	opAVX_X_X_XM(srcIsYmm ? Ymm(x.getIdx()) : x, srcIsYmm ? ym0 : xm0, op, MM_0F | PP_66, 0x5A, true);
}
// vcvtpd2dq (F2 prefix, 0F map, opcode 0xE6).
// A YMM destination is rejected with ERR_BAD_COMBINATION.
// When the source is YMM, the destination index is passed on as a Ymm and ym0 replaces xm0.
void vcvtpd2dq(const Xmm& x, const Operand& op)
{
	if (x.isYMM()) throw ERR_BAD_COMBINATION;
	const bool srcIsYmm = op.isYMM();
	opAVX_X_X_XM(srcIsYmm ? Ymm(x.getIdx()) : x, srcIsYmm ? ym0 : xm0, op, MM_0F | PP_F2, 0xE6, true);
}
// vcvttpd2dq (66 prefix, 0F map, opcode 0xE6).
// A YMM destination is rejected with ERR_BAD_COMBINATION.
// When the source is YMM, the destination index is passed on as a Ymm and ym0 replaces xm0.
void vcvttpd2dq(const Xmm& x, const Operand& op)
{
	if (x.isYMM()) throw ERR_BAD_COMBINATION;
	const bool srcIsYmm = op.isYMM();
	opAVX_X_X_XM(srcIsYmm ? Ymm(x.getIdx()) : x, srcIsYmm ? ym0 : xm0, op, MM_0F | PP_66, 0xE6, true);
}
#ifdef XBYAK64
void vmovq(const Xmm& x, const Reg64& reg) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x6E, false, 1); }
// vmovq, xmm <- memory form (F3 prefix, 0F map, opcode 0x7E).
void vmovq(const Xmm& x, const Address& addr)
{
	opAVX_X_X_XM(
		x, xm0, addr,
		MM_0F | PP_F3, 0x7E, false, -1);
}
void vmovq(const Reg64& reg, const Xmm& x) { opAVX_X_X_XM(x, xm0, Xmm(reg.getIdx()), MM_0F | PP_66, 0x7E, false, 1); }
// vmovq, memory <- xmm form (66 prefix, 0F map, opcode 0xD6).
void vmovq(const Address& addr, const Xmm& x)
{
	opAVX_X_X_XM(
		x, xm0, addr,
		MM_0F | PP_66, 0xD6, false, -1);
}
// vmovq, xmm <- xmm form (F3 prefix, 0F map, opcode 0x7E).
void vmovq(const Xmm& x1, const Xmm& x2)
{
	opAVX_X_X_XM(
		x1, xm0, x2,
		MM_0F | PP_F3, 0x7E, false, -1);
}
// vpextrq (66 prefix, 0F3A map, opcode 0x16), followed by imm emitted through db().
// op must be a 64-bit register or memory, else ERR_BAD_COMBINATION; a register op is re-tagged as XMM via cvtReg.
void vpextrq(const Operand& op, const Xmm& x, uint8 imm)
{
	if (!(op.isREG(64) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, xm0, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x16, false, 1);
	db(imm);
}
// vpinsrq, three-operand form (66 prefix, 0F3A map, opcode 0x22), followed by imm emitted through db().
// op must be a 64-bit register or memory, else ERR_BAD_COMBINATION; a register op is re-tagged as XMM via cvtReg.
void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8 imm)
{
	if (!(op.isREG(64) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x1, x2, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x22, false, 1);
	db(imm);
}
// vpinsrq, two-operand form: x is passed as both the first and second operand
// (66 prefix, 0F3A map, opcode 0x22), followed by imm emitted through db().
// op must be a 64-bit register or memory, else ERR_BAD_COMBINATION; a register op is re-tagged as XMM via cvtReg.
void vpinsrq(const Xmm& x, const Operand& op, uint8 imm)
{
	if (!(op.isREG(64) || op.isMEM())) throw ERR_BAD_COMBINATION;
	opAVX_X_X_XM(x, x, cvtReg(op, !op.isMEM(), Operand::XMM), MM_0F3A | PP_66, 0x22, false, 1);
	db(imm);
}
void vcvtss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2D, false, 1); }
void vcvttss2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F3, 0x2C, false, 1); }
void vcvtsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2D, false, 1); }
void vcvttsd2si(const Reg64& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, MM_0F | PP_F2, 0x2C, false, 1); }
#endif

View File

@ -2,9 +2,10 @@
#define XBYAK_XBYAK_UTIL_H_ #define XBYAK_XBYAK_UTIL_H_
/** /**
utility class for Xbyak utility class and functions for Xbyak
@note this header is under construction @note this header is UNDER CONSTRUCTION!
*/ */
#include "xbyak/xbyak.h"
#ifdef _WIN32 #ifdef _WIN32
#if (_MSC_VER < 1400) && defined(XBYAK32) #if (_MSC_VER < 1400) && defined(XBYAK32)
@ -29,10 +30,17 @@
#include <intrin.h> // for __cpuid #include <intrin.h> // for __cpuid
#endif #endif
#else #else
#if __GNUC_PREREQ(4, 3) #ifndef __GNUC_PREREQ
// Fallback for toolchains whose headers do not provide __GNUC_PREREQ:
// true only when the compiler's reported GNU version is at least major.minor.
// (A bare "(major << 16) + minor" is always non-zero, so it would pass every version check.)
#define __GNUC_PREREQ(major, minor) ((__GNUC__ << 16) + __GNUC_MINOR__ >= (((major) << 16) + (minor)))
#endif
#if __GNUC_PREREQ(4, 3) && !defined(__APPLE__)
#include <cpuid.h> #include <cpuid.h>
#else #else
#define __cpuid(eaxIn, a, b, c, d) __asm__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn)) #if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm'
// ebx is not named as an output here, so it is saved on the stack around cpuid;
// the ebx result of cpuid is handed back through esi ("=S") before ebx is restored.
#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
#else
#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
#endif
#endif #endif
#endif #endif
@ -43,6 +51,10 @@ namespace Xbyak { namespace util {
*/ */
class Cpu { class Cpu {
unsigned int type_; unsigned int type_;
/**
	pack the first four bytes at x into one 32-bit value
	@param x pointer to at least four readable bytes
	@return x[0] | x[1] << 8 | x[2] << 16 | x[3] << 24
	@note despite the "BE" in the name, x[0] lands in the least significant byte
	@note each byte is widened through unsigned char, so a byte >= 0x80 is not
	sign-extended where plain char is signed and the shift by 24 never acts on a negative int
*/
unsigned int get32bitAsBE(const char *x) const
{
	const unsigned char *p = reinterpret_cast<const unsigned char*>(x);
	return static_cast<unsigned int>(p[0])
		| (static_cast<unsigned int>(p[1]) << 8)
		| (static_cast<unsigned int>(p[2]) << 16)
		| (static_cast<unsigned int>(p[3]) << 24);
}
public: public:
static inline void getCpuid(unsigned int eaxIn, unsigned int data[4]) static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
{ {
@ -64,11 +76,17 @@ public:
tSSE41 = 1 << 7, tSSE41 = 1 << 7,
tSSE42 = 1 << 8, tSSE42 = 1 << 8,
tPOPCNT = 1 << 9, tPOPCNT = 1 << 9,
tAESNI = 1 << 10,
tSSE5 = 1 << 11,
tOSXSACE = 1 << 12,
tPCLMULQDQ = 1 << 13,
tAVX = 1 << 14,
tFMA = 1 << 15,
t3DN = 1 << 16, t3DN = 1 << 16,
tE3DN = 1 << 17, tE3DN = 1 << 17,
tSSE4a = 1 << 18, tSSE4a = 1 << 18,
tSSE5 = 1 << 11, tRDTSCP = 1 << 19,
tINTEL = 1 << 24, tINTEL = 1 << 24,
tAMD = 1 << 25 tAMD = 1 << 25
@ -80,28 +98,39 @@ public:
getCpuid(0, data); getCpuid(0, data);
static const char intel[] = "ntel"; static const char intel[] = "ntel";
static const char amd[] = "cAMD"; static const char amd[] = "cAMD";
if (data[2] == *reinterpret_cast<const unsigned int*>(amd)) { if (data[2] == get32bitAsBE(amd)) {
type_ |= tAMD; type_ |= tAMD;
getCpuid(0x80000001, data); getCpuid(0x80000001, data);
if (data[3] & (1 << 31)) type_ |= t3DN; if (data[3] & (1U << 31)) type_ |= t3DN;
if (data[3] & (1 << 15)) type_ |= tCMOV; if (data[3] & (1U << 15)) type_ |= tCMOV;
if (data[3] & (1 << 30)) type_ |= tE3DN; if (data[3] & (1U << 30)) type_ |= tE3DN;
if (data[3] & (1 << 22)) type_ |= tMMX2; if (data[3] & (1U << 22)) type_ |= tMMX2;
if (data[3] & (1U << 27)) type_ |= tRDTSCP;
} }
if (data[2] == *reinterpret_cast<const unsigned int*>(intel)) { if (data[2] == get32bitAsBE(intel)) {
type_ |= tINTEL; type_ |= tINTEL;
getCpuid(0x80000001, data);
if (data[3] & (1U << 27)) type_ |= tRDTSCP;
} }
getCpuid(1, data); getCpuid(1, data);
if (data[2] & (1 << 0)) type_ |= tSSE3; if (data[2] & (1U << 0)) type_ |= tSSE3;
if (data[2] & (1 << 9)) type_ |= tSSSE3; if (data[2] & (1U << 9)) type_ |= tSSSE3;
if (data[2] & (1 << 19)) type_ |= tSSE41; if (data[2] & (1U << 19)) type_ |= tSSE41;
if (data[2] & (1 << 20)) type_ |= tSSE42; if (data[2] & (1U << 20)) type_ |= tSSE42;
if (data[2] & (1 << 23)) type_ |= tPOPCNT; if (data[2] & (1U << 23)) type_ |= tPOPCNT;
if (data[2] & (1U << 25)) type_ |= tAESNI;
if (data[3] & (1 << 15)) type_ |= tCMOV; if (data[2] & (1U << 1)) type_ |= tPCLMULQDQ;
if (data[3] & (1 << 23)) type_ |= tMMX; if (data[2] & (1U << 27)) type_ |= tOSXSACE;
if (data[3] & (1 << 25)) type_ |= tMMX2 | tSSE; #if _M_SSE >= 0x500
if (data[3] & (1 << 26)) type_ |= tSSE2; // QQQ
// should check XFEATURE_ENABLED_MASK[2:1] = '11b' by xgetbv
if (data[2] & (1U << 28)) type_ |= tAVX;
if (data[2] & (1U << 12)) type_ |= tFMA;
#endif
if (data[3] & (1U << 15)) type_ |= tCMOV;
if (data[3] & (1U << 23)) type_ |= tMMX;
if (data[3] & (1U << 25)) type_ |= tMMX2 | tSSE;
if (data[3] & (1U << 26)) type_ |= tSSE2;
} }
bool has(Type type) const bool has(Type type) const
{ {
@ -109,6 +138,40 @@ public:
} }
}; };
/**
	accumulates rdtsc deltas over begin()/end() pairs
	@note begin() subtracts the current counter and end() adds it back, so after a
	matched pair clock_ has grown by (end stamp - begin stamp); count_ is bumped by end() only
*/
class Clock {
	uint64 clock_;
	int count_;
public:
	Clock() : clock_(0), count_(0) {}
	/**
		read the time stamp counter as one 64-bit value
		(__rdtsc() under _MSC_VER, inline rdtsc asm otherwise)
	*/
	static inline uint64 getRdtsc()
	{
#ifdef _MSC_VER
		return __rdtsc();
#else
		unsigned int lo, hi;
		__asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
		uint64 tsc = hi;
		tsc <<= 32;
		tsc |= lo;
		return tsc;
#endif
	}
	void begin() { clock_ = clock_ - getRdtsc(); }
	void end()
	{
		clock_ = clock_ + getRdtsc();
		++count_;
	}
	uint64 getClock() const { return clock_; }
	int getCount() const { return count_; }
	void clear()
	{
		clock_ = 0;
		count_ = 0;
	}
};
#ifdef XBYAK32 #ifdef XBYAK32
namespace local { namespace local {
@ -133,53 +196,47 @@ XBYAK_LOCAL_DEFINE_SET_EIP_TO_REG(ebp)
#undef XBYAK_LOCAL_DEFINE_SET_EIP_TO_REG #undef XBYAK_LOCAL_DEFINE_SET_EIP_TO_REG
} // end of local } // end of local
template<class Gen> /**
struct EnableSetEip : public Gen { get eip to out register
EnableSetEip(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void *userPtr = 0) @note out is not esp
: Gen(maxSize, userPtr) */
{ template<class T>
} void setEipTo(T *self, const Xbyak::Reg32& out)
/** {
get pid to out register
@note out = eax or ecx or edx
*/
void setEipTo(const Xbyak::Reg32& out)
{
#if 0 #if 0
Gen::call(Gen::getCurr() + 5); self->call("@f");
Gen::pop(out); self->L("@@");
self->pop(out);
#else #else
int idx = out.getIdx(); int idx = out.getIdx();
switch (idx) { switch (idx) {
case Xbyak::Operand::EAX: case Xbyak::Operand::EAX:
Gen::call((void*)local::set_eip_to_eax); self->call((void*)local::set_eip_to_eax);
break; break;
case Xbyak::Operand::ECX: case Xbyak::Operand::ECX:
Gen::call((void*)local::set_eip_to_ecx); self->call((void*)local::set_eip_to_ecx);
break; break;
case Xbyak::Operand::EDX: case Xbyak::Operand::EDX:
Gen::call((void*)local::set_eip_to_edx); self->call((void*)local::set_eip_to_edx);
break; break;
case Xbyak::Operand::EBX: case Xbyak::Operand::EBX:
Gen::call((void*)local::set_eip_to_ebx); self->call((void*)local::set_eip_to_ebx);
break; break;
case Xbyak::Operand::ESI: case Xbyak::Operand::ESI:
Gen::call((void*)local::set_eip_to_esi); self->call((void*)local::set_eip_to_esi);
break; break;
case Xbyak::Operand::EDI: case Xbyak::Operand::EDI:
Gen::call((void*)local::set_eip_to_edi); self->call((void*)local::set_eip_to_edi);
break; break;
case Xbyak::Operand::EBP: case Xbyak::Operand::EBP:
Gen::call((void*)local::set_eip_to_ebp); self->call((void*)local::set_eip_to_ebp);
break; break;
default: default:
assert(0); assert(0);
}
#endif
} }
}; #endif
}
#endif #endif
} } // end of util } } // end of util
#endif #endif