dep: Add xbyak
This commit is contained in:
parent
b9089cac95
commit
0e8ff85f04
|
@ -0,0 +1,47 @@
|
||||||
|
|
||||||
|
Copyright (c) 2007 MITSUNARI Shigeo
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
Redistributions of source code must retain the above copyright notice, this
|
||||||
|
list of conditions and the following disclaimer.
|
||||||
|
Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
Neither the name of the copyright owner nor the names of its contributors may
|
||||||
|
be used to endorse or promote products derived from this software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||||
|
THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
ソースコード形式かバイナリ形式か、変更するかしないかを問わず、以下の条件を満た
|
||||||
|
す場合に限り、再頒布および使用が許可されます。
|
||||||
|
|
||||||
|
ソースコードを再頒布する場合、上記の著作権表示、本条件一覧、および下記免責条項
|
||||||
|
を含めること。
|
||||||
|
バイナリ形式で再頒布する場合、頒布物に付属のドキュメント等の資料に、上記の著作
|
||||||
|
権表示、本条件一覧、および下記免責条項を含めること。
|
||||||
|
書面による特別の許可なしに、本ソフトウェアから派生した製品の宣伝または販売促進
|
||||||
|
に、著作権者の名前またはコントリビューターの名前を使用してはならない。
|
||||||
|
本ソフトウェアは、著作権者およびコントリビューターによって「現状のまま」提供さ
|
||||||
|
れており、明示黙示を問わず、商業的な使用可能性、および特定の目的に対する適合性
|
||||||
|
に関する暗黙の保証も含め、またそれに限定されない、いかなる保証もありません。
|
||||||
|
著作権者もコントリビューターも、事由のいかんを問わず、 損害発生の原因いかんを
|
||||||
|
問わず、かつ責任の根拠が契約であるか厳格責任であるか(過失その他の)不法行為で
|
||||||
|
あるかを問わず、仮にそのような損害が発生する可能性を知らされていたとしても、
|
||||||
|
本ソフトウェアの使用によって発生した(代替品または代用サービスの調達、使用の
|
||||||
|
喪失、データの喪失、利益の喪失、業務の中断も含め、またそれに限定されない)直接
|
||||||
|
損害、間接損害、偶発的な損害、特別損害、懲罰的損害、または結果損害について、
|
||||||
|
一切責任を負わないものとします。
|
|
@ -0,0 +1,443 @@
|
||||||
|
|
||||||
|
Xbyak 5.41 ; JIT assembler for x86(IA32), x64(AMD64, x86-64) by C++
|
||||||
|
=============
|
||||||
|
|
||||||
|
Abstract
|
||||||
|
-------------
|
||||||
|
|
||||||
|
This is a header file which enables you to dynamically assemble x86(IA32), x64(AMD64, x86-64) mnemonics.
|
||||||
|
|
||||||
|
Feature
|
||||||
|
-------------
|
||||||
|
header file only
|
||||||
|
you can use Xbyak's functions at once if xbyak.h is included.
|
||||||
|
|
||||||
|
### Supported Instructions Sets
|
||||||
|
|
||||||
|
MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(*partial*)/AVX/AVX2/FMA/VEX-encoded GPR/AVX-512
|
||||||
|
|
||||||
|
### Supported OS
|
||||||
|
|
||||||
|
* Windows Xp, Vista, Windows 7(32bit, 64bit)
|
||||||
|
* Linux(32bit, 64bit)
|
||||||
|
* Intel Mac OSX
|
||||||
|
|
||||||
|
### Supported Compilers
|
||||||
|
|
||||||
|
* Visual Studio C++ VC2012 or later
|
||||||
|
* gcc 4.7 or later
|
||||||
|
* clang 3.3
|
||||||
|
* cygwin gcc 4.5.3
|
||||||
|
* icc 7.2
|
||||||
|
|
||||||
|
>Note: Xbyak uses and(), or(), xor(), not() functions, so "-fno-operator-names" option is required on gcc.
|
||||||
|
Or define XBYAK_NO_OP_NAMES and use and_(), or_(), xor_(), not_() instead of them.
|
||||||
|
and_(), or_(), xor_(), not_() are always available.
|
||||||
|
|
||||||
|
Install
|
||||||
|
-------------
|
||||||
|
|
||||||
|
The following files are necessary. Please add the path to your compile directories.
|
||||||
|
|
||||||
|
* xbyak.h
|
||||||
|
* xbyak_mnemonic.h
|
||||||
|
|
||||||
|
Linux:
|
||||||
|
|
||||||
|
make install
|
||||||
|
|
||||||
|
These files are copied into /usr/local/include/xbyak
|
||||||
|
|
||||||
|
New Feature
|
||||||
|
-------------
|
||||||
|
|
||||||
|
Add support for AVX-512 instruction set.
|
||||||
|
|
||||||
|
Syntax
|
||||||
|
-------------
|
||||||
|
|
||||||
|
Inherit from Xbyak::CodeGenerator, write the assembly in a class method, and get the function
|
||||||
|
pointer by calling getCode() and casting the return value.
|
||||||
|
|
||||||
|
NASM Xbyak
|
||||||
|
mov eax, ebx --> mov(eax, ebx);
|
||||||
|
inc ecx --> inc(ecx);
|
||||||
|
ret --> ret();
|
||||||
|
|
||||||
|
### Addressing
|
||||||
|
|
||||||
|
(ptr|dword|word|byte) [base + index * (1|2|4|8) + displacement]
|
||||||
|
[rip + 32bit disp] ; x64 only
|
||||||
|
|
||||||
|
NASM Xbyak
|
||||||
|
mov eax, [ebx+ecx] --> mov (eax, ptr[ebx+ecx]);
|
||||||
|
test byte [esp], 4 --> test (byte [esp], 4);
|
||||||
|
|
||||||
|
|
||||||
|
How to use Selector(Segment Register)
|
||||||
|
|
||||||
|
>Note: Segment class is not derived from Operand.
|
||||||
|
|
||||||
|
```
|
||||||
|
mov eax, [fs:eax] --> putSeg(fs); mov(eax, ptr [eax]);
|
||||||
|
mov ax, cs --> mov(ax, cs);
|
||||||
|
```
|
||||||
|
|
||||||
|
>You can use ptr for almost all memory accesses unless you need to specify the size of memory.
|
||||||
|
|
||||||
|
>dword, word and byte are member variables, so do not, for example, typedef dword as unsigned int.
|
||||||
|
|
||||||
|
### AVX
|
||||||
|
|
||||||
|
vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3
|
||||||
|
vaddps(xmm2, xmm3, ptr [rax]); // use ptr to access memory
|
||||||
|
vgatherdpd(xmm1, ptr [ebp+123+xmm2*4], xmm3);
|
||||||
|
|
||||||
|
*Remark*
|
||||||
|
The omitted destination syntax, as in the following, is disabled.
|
||||||
|
```
|
||||||
|
vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3
|
||||||
|
```
|
||||||
|
define `XBYAK_ENABLE_OMITTED_OPERAND` if you use it for backward compatibility.
|
||||||
|
But the newer version will not support it.
|
||||||
|
|
||||||
|
### AVX-512
|
||||||
|
|
||||||
|
```
|
||||||
|
vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30);
|
||||||
|
vaddpd xmm30, xmm20, [rax] --> vaddpd(xmm30, xmm20, ptr [rax]);
|
||||||
|
vaddps xmm30, xmm20, [rax] --> vaddps(xmm30, xmm20, ptr [rax]);
|
||||||
|
vaddpd zmm2{k5}, zmm4, zmm2 --> vaddpd(zmm2 | k5, zmm4, zmm2);
|
||||||
|
vaddpd zmm2{k5}{z}, zmm4, zmm2 --> vaddpd(zmm2 | k5 | T_z, zmm4, zmm2);
|
||||||
|
vaddpd zmm2{k5}{z}, zmm4, zmm2,{rd-sae} --> vaddpd(zmm2 | k5 | T_z, zmm4, zmm2 | T_rd_sae);
|
||||||
|
vaddpd(zmm2 | k5 | T_z | T_rd_sae, zmm4, zmm2); // the position of `|` is arbitrary.
|
||||||
|
vcmppd k4{k3}, zmm1, zmm2, {sae}, 5 --> vcmppd(k4 | k3, zmm1, zmm2 | T_sae, 5);
|
||||||
|
|
||||||
|
vaddpd xmm1, xmm2, [rax+256] --> vaddpd(xmm1, xmm2, ptr [rax+256]);
|
||||||
|
vaddpd xmm1, xmm2, [rax+256]{1to2} --> vaddpd(xmm1, xmm2, ptr_b [rax+256]);
|
||||||
|
vaddpd ymm1, ymm2, [rax+256]{1to4} --> vaddpd(ymm1, ymm2, ptr_b [rax+256]);
|
||||||
|
vaddpd zmm1, zmm2, [rax+256]{1to8} --> vaddpd(zmm1, zmm2, ptr_b [rax+256]);
|
||||||
|
vaddps zmm1, zmm2, [rax+rcx*8+8]{1to16} --> vaddps(zmm1, zmm2, ptr_b [rax+rcx*8+8]);
|
||||||
|
vmovsd [rax]{k1}, xmm4 --> vmovsd(ptr [rax] | k1, xmm4);
|
||||||
|
|
||||||
|
vcvtpd2dq xmm16, oword [eax+33] --> vcvtpd2dq(xmm16, xword [eax+33]); // use xword for m128 instead of oword
|
||||||
|
vcvtpd2dq(xmm16, ptr [eax+33]); // default xword
|
||||||
|
vcvtpd2dq xmm21, [eax+32]{1to2} --> vcvtpd2dq(xmm21, ptr_b [eax+32]);
|
||||||
|
vcvtpd2dq xmm0, yword [eax+33] --> vcvtpd2dq(xmm0, yword [eax+33]); // use yword for m256
|
||||||
|
vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); // use yword_b to broadcast
|
||||||
|
|
||||||
|
vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512
|
||||||
|
vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit
|
||||||
|
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], 5); // broadcast 32-bit to 128-bit
|
||||||
|
```
|
||||||
|
Remark
|
||||||
|
* k1, ..., k7 are new opmask registers.
|
||||||
|
* use `| T_z`, `| T_sae`, `| T_rn_sae`, `| T_rd_sae`, `| T_ru_sae`, `| T_rz_sae` instead of `,{z}`, `,{sae}`, `,{rn-sae}`, `,{rd-sae}`, `,{ru-sae}`, `,{rz-sae}` respectively.
|
||||||
|
* `k4 | k3` is different from `k3 | k4`.
|
||||||
|
* use `ptr_b` for broadcast `{1toX}`. X is automatically determined.
|
||||||
|
* specify xword/yword/zword(_b) for m128/m256/m512 if necessary.
|
||||||
|
|
||||||
|
### Label
|
||||||
|
|
||||||
|
L("L1");
|
||||||
|
jmp ("L1");
|
||||||
|
|
||||||
|
jmp ("L2");
|
||||||
|
...
|
||||||
|
a few mnemonics(8-bit displacement jmp)
|
||||||
|
...
|
||||||
|
L("L2");
|
||||||
|
|
||||||
|
jmp ("L3", T_NEAR);
|
||||||
|
...
|
||||||
|
a lot of mnemonics(32-bit displacement jmp)
|
||||||
|
...
|
||||||
|
L("L3");
|
||||||
|
|
||||||
|
>Call hasUndefinedLabel() to verify your code has no undefined label.
|
||||||
|
> You can use a label as the immediate value of mov, as in mov(eax, "L2");
|
||||||
|
|
||||||
|
#### 1. support @@, @f, @b like MASM
|
||||||
|
|
||||||
|
L("@@"); // <A>
|
||||||
|
jmp("@b"); // jmp to <A>
|
||||||
|
jmp("@f"); // jmp to <B>
|
||||||
|
L("@@"); // <B>
|
||||||
|
jmp("@b"); // jmp to <B>
|
||||||
|
mov(eax, "@b");
|
||||||
|
jmp(eax); // jmp to <B>
|
||||||
|
|
||||||
|
#### 2. localization of label by calling inLocalLabel(), outLocalLabel().
|
||||||
|
|
||||||
|
Labels beginning with a period between inLocalLabel() and outLocalLabel()
|
||||||
|
are treated as local labels.
|
||||||
|
inLocalLabel() and outLocalLabel() can be nested.
|
||||||
|
|
||||||
|
void func1()
|
||||||
|
{
|
||||||
|
inLocalLabel();
|
||||||
|
L(".lp"); // <A> ; local label
|
||||||
|
...
|
||||||
|
jmp(".lp"); // jmp to <A>
|
||||||
|
L("aaa"); // global label
|
||||||
|
outLocalLabel();
|
||||||
|
}
|
||||||
|
|
||||||
|
void func2()
|
||||||
|
{
|
||||||
|
inLocalLabel();
|
||||||
|
L(".lp"); // <B> ; local label
|
||||||
|
func1();
|
||||||
|
jmp(".lp"); // jmp to <B>
|
||||||
|
outLocalLabel();
|
||||||
|
}
|
||||||
|
|
||||||
|
### Label class
|
||||||
|
|
||||||
|
L() and jxx() functions support a new Label class.
|
||||||
|
|
||||||
|
Label label1, label2;
|
||||||
|
L(label1);
|
||||||
|
...
|
||||||
|
jmp(label1);
|
||||||
|
...
|
||||||
|
jmp(label2);
|
||||||
|
...
|
||||||
|
L(label2);
|
||||||
|
|
||||||
|
Moreover, assignL(dstLabel, srcLabel) method binds dstLabel with srcLabel.
|
||||||
|
|
||||||
|
Label label1, label2;
|
||||||
|
L(label1);
|
||||||
|
...
|
||||||
|
jmp(label2);
|
||||||
|
...
|
||||||
|
assignL(label2, label1); // label2 <= label1
|
||||||
|
|
||||||
|
The above jmp opcode jumps to label1.
|
||||||
|
|
||||||
|
* Restriction:
|
||||||
|
* srcLabel must be used in L().
|
||||||
|
* dstLabel must not be used in L().
|
||||||
|
|
||||||
|
Label::getAddress() returns the address specified by the label instance and 0 if not specified.
|
||||||
|
```
|
||||||
|
// not AutoGrow mode
|
||||||
|
Label label;
|
||||||
|
assert(label.getAddress() == 0);
|
||||||
|
L(label);
|
||||||
|
assert(label.getAddress() == getCurr());
|
||||||
|
```
|
||||||
|
|
||||||
|
### Rip
|
||||||
|
```
|
||||||
|
Label label;
|
||||||
|
mov(eax, ptr [rip + label]); // eax = 4
|
||||||
|
...
|
||||||
|
|
||||||
|
L(label);
|
||||||
|
dd(4);
|
||||||
|
```
|
||||||
|
```
|
||||||
|
int x;
|
||||||
|
...
|
||||||
|
mov(eax, ptr[rip + &x]); // throw exception if the difference between &x and current position is larger than 2GiB
|
||||||
|
```
|
||||||
|
### Code size
|
||||||
|
The default max code size is 4096 bytes. Please set it in constructor of CodeGenerator() if you want to use large size.
|
||||||
|
|
||||||
|
class Quantize : public Xbyak::CodeGenerator {
|
||||||
|
public:
|
||||||
|
Quantize()
|
||||||
|
: CodeGenerator(8192)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
...
|
||||||
|
};
|
||||||
|
|
||||||
|
### use user allocated memory
|
||||||
|
|
||||||
|
You can generate JIT code in memory that you have prepared yourself.
|
||||||
|
|
||||||
|
class Sample : public Xbyak::CodeGenerator {
|
||||||
|
public:
|
||||||
|
Sample(void *userPtr, size_t size)
|
||||||
|
: Xbyak::CodeGenerator(size, userPtr)
|
||||||
|
{
|
||||||
|
...
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const size_t codeSize = 1024;
|
||||||
|
uint8 buf[codeSize + 16];
|
||||||
|
|
||||||
|
// get 16-byte aligned address
|
||||||
|
uint8 *p = Xbyak::CodeArray::getAlignedAddress(buf);
|
||||||
|
|
||||||
|
// append executable attribute to the memory
|
||||||
|
Xbyak::CodeArray::protect(p, codeSize, true);
|
||||||
|
|
||||||
|
// construct your jit code on the memory
|
||||||
|
Sample s(p, codeSize);
|
||||||
|
|
||||||
|
>See *sample/test0.cpp*
|
||||||
|
|
||||||
|
AutoGrow
|
||||||
|
-------------
|
||||||
|
|
||||||
|
Under `AutoGrow` mode, Xbyak extends memory automatically if necessary.
|
||||||
|
Call ready() before calling getCode() to calc address of jmp.
|
||||||
|
```
|
||||||
|
struct Code : Xbyak::CodeGenerator {
|
||||||
|
Code()
|
||||||
|
: Xbyak::CodeGenerator(<default memory size>, Xbyak::AutoGrow)
|
||||||
|
{
|
||||||
|
...
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Code c;
|
||||||
|
c.ready(); // Don't forget to call this function
|
||||||
|
```
|
||||||
|
>Don't use the address returned by getCurr() before calling ready().
|
||||||
|
>It may be invalid address.
|
||||||
|
>RESTRICTION : rip addressing is not supported in AutoGrow
|
||||||
|
|
||||||
|
Macro
|
||||||
|
-------------
|
||||||
|
|
||||||
|
* **XBYAK32** is defined on 32bit.
|
||||||
|
* **XBYAK64** is defined on 64bit.
|
||||||
|
* **XBYAK64_WIN** is defined on 64bit Windows(VC)
|
||||||
|
* **XBYAK64_GCC** is defined on 64bit gcc, cygwin
|
||||||
|
* define **XBYAK_NO_OP_NAMES** on gcc without `-fno-operator-names`
|
||||||
|
* define **XBYAK_ENABLE_OMITTED_OPERAND** if you use omitted destination such as `vaddps(xmm2, xmm3);`(to be deprecated in the future)
|
||||||
|
|
||||||
|
Sample
|
||||||
|
-------------
|
||||||
|
|
||||||
|
* test0.cpp ; tiny sample of Xbyak(x86, x64)
|
||||||
|
* quantize.cpp ; JIT optimized quantization by fast division(x86 only)
|
||||||
|
* calc.cpp ; assemble and estimate a given polynomial(x86, x64)
|
||||||
|
* bf.cpp ; JIT brainfuck(x86, x64)
|
||||||
|
|
||||||
|
License
|
||||||
|
-------------
|
||||||
|
|
||||||
|
modified new BSD License
|
||||||
|
http://opensource.org/licenses/BSD-3-Clause
|
||||||
|
|
||||||
|
The files under test/cybozu/ are copied from cybozulib(https://github.com/herumi/cybozulib/),
|
||||||
|
which is licensed by BSD-3-Clause and are used for only tests.
|
||||||
|
The header files under xbyak/ are independent of cybozulib.
|
||||||
|
|
||||||
|
History
|
||||||
|
-------------
|
||||||
|
* 2017/Jan/26 ver 5.41 add prefetchwt1 and support for scale == 0(thanks to rsdubtso)
|
||||||
|
* 2016/Dec/14 ver 5.40 add Label::getAddress() method to get the pointer specified by the label
|
||||||
|
* 2016/Dec/09 ver 5.34 fix handling of negative offsets when encoding disp8N(thanks to rsdubtso)
|
||||||
|
* 2016/Dec/08 ver 5.33 fix encoding of vpbroadcast{b,w,d,q}, vpinsr{b,w}, vpextr{b,w} for disp8N
|
||||||
|
* 2016/Dec/01 ver 5.32 rename __xgetbv() to _xgetbv() to support clang for Visual Studio(thanks to freiro)
|
||||||
|
* 2016/Nov/27 ver 5.31 rename AVX512_4VNNI to AVX512_4VNNIW
|
||||||
|
* 2016/Nov/27 ver 5.30 add AVX512_4VNNI, AVX512_4FMAPS instructions(thanks to rsdubtso)
|
||||||
|
* 2016/Nov/26 ver 5.20 add detection of AVX512_4VNNI and AVX512_4FMAPS(thanks to rsdubtso)
|
||||||
|
* 2016/Nov/20 ver 5.11 lost vptest for ymm(thanks to gregory38)
|
||||||
|
* 2016/Nov/20 ver 5.10 add addressing [rip+&var]
|
||||||
|
* 2016/Sep/29 ver 5.03 fix detection ERR_INVALID_OPMASK_WITH_MEMORY(thanks to PVS-Studio)
|
||||||
|
* 2016/Aug/15 ver 5.02 xbyak does not include xbyak_bin2hex.h
|
||||||
|
* 2016/Aug/15 ver 5.011 fix detection of version of gcc 5.4
|
||||||
|
* 2016/Aug/03 ver 5.01 disable omitted operand
|
||||||
|
* 2016/Jun/24 ver 5.00 support avx-512 instruction set
|
||||||
|
* 2016/Jun/13 avx-512 add mask instructions
|
||||||
|
* 2016/May/05 ver 4.91 add detection of AVX-512 to Xbyak::util::Cpu
|
||||||
|
* 2016/Mar/14 ver 4.901 comment to ready() function(thanks to skmp)
|
||||||
|
* 2016/Feb/04 ver 4.90 add jcc(const void *addr);
|
||||||
|
* 2016/Jan/30 ver 4.89 vpblendvb supports ymm reg(thanks to John Funnell)
|
||||||
|
* 2016/Jan/24 ver 4.88 lea, cmov supports 16-bit register(thanks to whyisthisfieldhere)
|
||||||
|
* 2015/Oct/05 ver 4.87 support segment selectors
|
||||||
|
* 2015/Aug/18 ver 4.86 fix [rip + label] addressing with immediate value(thanks to whyisthisfieldhere)
|
||||||
|
* 2015/Aug/10 ver 4.85 Address::operator==() is not correct(thanks to inolen)
|
||||||
|
* 2015/Jun/22 ver 4.84 call() support variadic template if available(thanks to randomstuff)
|
||||||
|
* 2015/Jun/16 ver 4.83 support movbe(thanks to benvanik)
|
||||||
|
* 2015/May/24 ver 4.82 support detection of F16C
|
||||||
|
* 2015/Apr/25 ver 4.81 fix the condition to throw exception for setSize(thanks to whyisthisfieldhere)
|
||||||
|
* 2015/Apr/22 ver 4.80 rip supports label(thanks to whyisthisfieldhere)
|
||||||
|
* 2015/Jan/28 ver 4.71 support adcx, adox, cmpxchg, rdseed, stac
|
||||||
|
* 2014/Oct/14 ver 4.70 support MmapAllocator
|
||||||
|
* 2014/Jun/13 ver 4.62 disable warning of VC2014
|
||||||
|
* 2014/May/30 ver 4.61 support bt, bts, btr, btc
|
||||||
|
* 2014/May/28 ver 4.60 support vcvtph2ps, vcvtps2ph
|
||||||
|
* 2014/Apr/11 ver 4.52 add detection of rdrand
|
||||||
|
* 2014/Mar/25 ver 4.51 remove state information of unreferenced labels
|
||||||
|
* 2014/Mar/16 ver 4.50 support new Label
|
||||||
|
* 2014/Mar/05 ver 4.40 fix wrong detection of BMI/enhanced rep on VirtualBox
|
||||||
|
* 2013/Dec/03 ver 4.30 support Reg::cvt8(), cvt16(), cvt32(), cvt64()
|
||||||
|
* 2013/Oct/16 ver 4.21 label support std::string
|
||||||
|
* 2013/Jul/30 ver 4.20 [break backward compatibility] split Reg32e class into RegExp(base+index*scale+disp) and Reg32e(means Reg32 or Reg64)
|
||||||
|
* 2013/Jul/04 ver 4.10 [break backward compatibility] change the type of Xbyak::Error from enum to a class
|
||||||
|
* 2013/Jun/21 ver 4.02 add putL(LABEL) function to put the address of the label
|
||||||
|
* 2013/Jun/21 ver 4.01 vpsllw, vpslld, vpsllq, vpsraw, vpsrad, vpsrlw, vpsrld, vpsrlq support (ymm, ymm, xmm).
|
||||||
|
support vpbroadcastb, vpbroadcastw, vpbroadcastd, vpbroadcastq(thanks to Gabest).
|
||||||
|
* 2013/May/30 ver 4.00 support AVX2, VEX-encoded GPR-instructions
|
||||||
|
* 2013/Mar/27 ver 3.80 support mov(reg, "label");
|
||||||
|
* 2013/Mar/13 ver 3.76 add cqo(), jcxz(), jecxz(), jrcxz()
|
||||||
|
* 2013/Jan/15 ver 3.75 add setSize() to modify generated code
|
||||||
|
* 2013/Jan/12 ver 3.74 add CodeGenerator::reset() ; add Allocator::useProtect()
|
||||||
|
* 2013/Jan/06 ver 3.73 use unordered_map if possible
|
||||||
|
* 2012/Dec/04 ver 3.72 eax, ebx, ... are member variables of CodeGenerator(revert), Xbyak::util::eax, ... are static const.
|
||||||
|
* 2012/Nov/17 ver 3.71 and_(), or_(), xor_(), not_() are available if XBYAK_NO_OP_NAMES is not defined.
|
||||||
|
* 2012/Nov/17 change eax, ebx, ptr and so on in CodeGenerator as static member and alias of them are defined in Xbyak::util.
|
||||||
|
* 2012/Nov/09 ver 3.70 XBYAK_NO_OP_NAMES macro is added to use and_() instead of and() (thanks to Mattias)
|
||||||
|
* 2012/Nov/01 ver 3.62 add fwait/fnwait/finit/fninit
|
||||||
|
* 2012/Nov/01 ver 3.61 add fldcw/fstcw
|
||||||
|
* 2012/May/03 ver 3.60 change interface of Allocator
|
||||||
|
* 2012/Mar/23 ver 3.51 fix userPtr mode
|
||||||
|
* 2012/Mar/19 ver 3.50 support AutoGrow mode
|
||||||
|
* 2011/Nov/09 ver 3.05 fix bit property of rip addressing / support movsxd
|
||||||
|
* 2011/Aug/15 ver 3.04 fix dealing with imm8 such as add(dword [ebp-8], 0xda); (thanks to lolcat)
|
||||||
|
* 2011/Jun/16 ver 3.03 fix __GNUC_PREREQ macro for Mac gcc(thanks to t_teruya)
|
||||||
|
* 2011/Apr/28 ver 3.02 do not use xgetbv on Mac gcc
|
||||||
|
* 2011/May/24 ver 3.01 fix typo of OSXSAVE
|
||||||
|
* 2011/May/23 ver 3.00 add vcmpeqps and so on
|
||||||
|
* 2011/Feb/16 ver 2.994 beta add vmovq for 32-bit mode(I forgot it)
|
||||||
|
* 2011/Feb/16 ver 2.993 beta remove cvtReg to avoid thread unsafe
|
||||||
|
* 2011/Feb/10 ver 2.992 beta support one argument syntax for fadd like nasm
|
||||||
|
* 2011/Feb/07 ver 2.991 beta fix pextrw reg, xmm, imm(Thanks to Gabest)
|
||||||
|
* 2011/Feb/04 ver 2.99 beta support AVX
|
||||||
|
* 2010/Dec/08 ver 2.31 fix ptr [rip + 32bit offset], support rdtscp
|
||||||
|
* 2010/Oct/19 ver 2.30 support pclmulqdq, aesdec, aesdeclast, aesenc, aesenclast, aesimc, aeskeygenassist
|
||||||
|
* 2010/Jun/07 ver 2.29 fix call(<label>)
|
||||||
|
* 2010/Jun/17 ver 2.28 move some member functions to public
|
||||||
|
* 2010/Jun/01 ver 2.27 support encoding of mov(reg64, imm) like yasm(not nasm)
|
||||||
|
* 2010/May/24 ver 2.26 fix sub(rsp, 1000)
|
||||||
|
* 2010/Apr/26 ver 2.25 add jc/jnc(I forgot to implement them...)
|
||||||
|
* 2010/Apr/16 ver 2.24 change the prototype of rewrite() method
|
||||||
|
* 2010/Apr/15 ver 2.23 fix align() and xbyak_util.h for Mac
|
||||||
|
* 2010/Feb/16 ver 2.22 fix inLocalLabel()/outLocalLabel()
|
||||||
|
* 2009/Dec/09 ver 2.21 support cygwin(gcc 4.3.2)
|
||||||
|
* 2009/Nov/28 support a part of FPU
|
||||||
|
* 2009/Jun/25 fix mov(qword[rax], imm); (thanks to Martin)
|
||||||
|
* 2009/Mar/10 fix redundant REX.W prefix on jmp/call reg64
|
||||||
|
* 2009/Feb/24 add movq reg64, mmx/xmm; movq mmx/xmm, reg64
|
||||||
|
* 2009/Feb/13 movd(xmm7, dword[eax]) drops 0x66 prefix (thanks to Gabest)
|
||||||
|
* 2008/Dec/30 fix call in short relative address(thanks to kato san)
|
||||||
|
* 2008/Sep/18 support @@, @f, @b and localization of label(thanks to nobu-q san)
|
||||||
|
* 2008/Sep/18 support (ptr[rip + 32bit offset]) (thanks to Dango-Chu san)
|
||||||
|
* 2008/Jun/03 fix align(). mov(ptr[eax],1) throws ERR_MEM_SIZE_IS_NOT_SPECIFIED.
|
||||||
|
* 2008/Jun/02 support memory interface allocated by user
|
||||||
|
* 2008/May/26 fix protect() to avoid invalid setting(thanks to shinichiro_h san)
|
||||||
|
* 2008/Apr/30 add cmpxchg16b, cdqe
|
||||||
|
* 2008/Apr/29 support x64
|
||||||
|
* 2008/Apr/14 code refactoring
|
||||||
|
* 2008/Mar/12 add bsr/bsf
|
||||||
|
* 2008/Feb/14 fix output of sub eax, 1234 (thanks to Robert)
|
||||||
|
* 2007/Nov/5 support lock, xadd, xchg
|
||||||
|
* 2007/Nov/2 support SSSE3/SSE4 (thanks to Dango-Chu san)
|
||||||
|
* 2007/Feb/4 fix the bug that exception doesn't occur under the condition which the offset of jmp mnemonic without T_NEAR is over 127.
|
||||||
|
* 2007/Jan/21 fix the bug to create address like [disp] select smaller representation for mov (eax|ax|al, [disp])
|
||||||
|
* 2007/Jan/4 first version
|
||||||
|
|
||||||
|
Author
|
||||||
|
-------------
|
||||||
|
|
||||||
|
MITSUNARI Shigeo(herumi@nifty.com)
|
||||||
|
|
|
@ -0,0 +1,464 @@
|
||||||
|
|
||||||
|
C++用x86(IA-32), x64(AMD64, x86-64) JITアセンブラ Xbyak 5.41
|
||||||
|
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
◎概要
|
||||||
|
|
||||||
|
これはx86, x64(AMD64, x86-64)のマシン語命令を生成するC++のクラスライブラリです。
|
||||||
|
プログラム実行時に動的にアセンブルすることが可能です。
|
||||||
|
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
◎特徴
|
||||||
|
|
||||||
|
・ヘッダファイルオンリー
|
||||||
|
xbyak.hをインクルードするだけですぐ利用することができます。
|
||||||
|
C++の枠組み内で閉じているため、外部アセンブラは不要です。
|
||||||
|
32bit/64bit両対応です。
|
||||||
|
対応ニーモニック:特権命令除くx86, MMX/MMX2/SSE/SSE2/SSE3/SSSE3/SSE4/FPU(一部)/AVX/AVX2/FMA/VEX-encoded GPR
|
||||||
|
|
||||||
|
・Windows Xp(32bit, 64bit), Windows 7/Linux(32bit, 64bit)/Intel Mac対応
|
||||||
|
Windows Xp, Windows 7上ではVC2008, VC2010, VC2012
|
||||||
|
Linux (kernel 3.8)上ではgcc 4.7.3, clang 3.3
|
||||||
|
Intel Mac
|
||||||
|
などで動作確認をしています。
|
||||||
|
|
||||||
|
※ Xbyakはデフォルトでand(), or(), xor(), not()関数を使います。
|
||||||
|
gccではそれらを演算子として解釈してしまうため、-fno-operator-namesオプションを追加してコンパイルしてください。
|
||||||
|
あるいはXBYAK_NO_OP_NAMESを定義してand_(), or_(), xor_(), not_()を使ってください。
|
||||||
|
and_(), or_(), xor_(), not_()はXBYAK_NO_OP_NAMESされていないときでも使えます。
|
||||||
|
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
◎準備
|
||||||
|
xbyak.h
|
||||||
|
xbyak_bin2hex.h
|
||||||
|
xbyak_mnemonic.h
|
||||||
|
これらを同一のパスに入れてインクルードパスに追加してください。
|
||||||
|
|
||||||
|
Linuxではmake installで/usr/local/include/xbyakにコピーされます。
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
◎下位互換性の破れ
|
||||||
|
* Xbyak::Errorの型をenumからclassに変更
|
||||||
|
** 従来のenumの値をとるにはintにキャストしてください。
|
||||||
|
* (古い)Reg32eクラスを(新しい)Reg32eとRegExpに分ける。
|
||||||
|
** (新しい)Reg32eはReg32かReg64
|
||||||
|
** (新しい)RegExpは'Reg32e + (Reg32e|Xmm|Ymm) * scale + disp'の型
|
||||||
|
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
◎新機能
|
||||||
|
|
||||||
|
MmapAllocator追加
|
||||||
|
これはUnix系OSでのみの仕様です。XBYAK_USE_MMAP_ALLOCATORを使うと利用できます。
|
||||||
|
デフォルトのAllocatorはメモリ確保時にposix_memalignを使います。
|
||||||
|
この領域に対するmprotectはmap countを減らします。
|
||||||
|
map countの最大値は/proc/sys/vm/max_map_countに書かれています。
|
||||||
|
デフォルトでは3万個ほどのXbyak::CodeGeneratorインスタンスを生成するとエラーになります。
|
||||||
|
test/mprotect_test.cppで確認できます。
|
||||||
|
これを避けるためにはmmapを使うMmapAllocatorを使ってください。
|
||||||
|
将来この挙動がデフォルトになるかもしれません。
|
||||||
|
|
||||||
|
|
||||||
|
AutoGrowモード追加
|
||||||
|
これはメモリ伸長を動的に行うモードです。
|
||||||
|
今まではXbyak::CodeGenerator()に渡したメモリサイズを超えると例外が発生して
|
||||||
|
いましたが、このモードでは内部でメモリを再確保して伸長します。
|
||||||
|
ただし、getCode()を呼び出す前にジャンプ命令のアドレス解決をするためにready()
|
||||||
|
関数を呼ぶ必要があります。
|
||||||
|
|
||||||
|
次のように使います。
|
||||||
|
|
||||||
|
struct Code : Xbyak::CodeGenerator {
|
||||||
|
Code()
|
||||||
|
: Xbyak::CodeGenerator(<default memory size>, Xbyak::AutoGrow)
|
||||||
|
{
|
||||||
|
...
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Code c;
|
||||||
|
c.ready(); // この呼び出しを忘れてはいけない
|
||||||
|
|
||||||
|
注意1. ready()を呼んで確定するまではgetCurr()で得たポインタは無効化されている
|
||||||
|
可能性があります。getSize()でoffsetを保持しておきready()のあとにgetCode()を
|
||||||
|
呼び出してからgetCode() + offsetで新しいポインタを取得してください。
|
||||||
|
|
||||||
|
注意2. AutoGrowモードでは64bitモードの相対アドレッシング[rip]は非サポートです。
|
||||||
|
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
◎文法
|
||||||
|
|
||||||
|
Xbyak::CodeGeneratorクラスを継承し、そのクラスメソッド内でx86, x64アセンブラを
|
||||||
|
記述します。そのメソッドを呼び出した後、getCode()メソッドを呼び出し、その戻
|
||||||
|
り値を自分が使いたい関数ポインタに変換して利用します。アセンブルエラーは例外
|
||||||
|
により通知されます(cf. main.cpp)。
|
||||||
|
|
||||||
|
・基本的にnasmの命令で括弧をつければよいです。
|
||||||
|
|
||||||
|
mov eax, ebx --> mov(eax, ebx);
|
||||||
|
inc ecx --> inc(ecx);
|
||||||
|
ret --> ret();
|
||||||
|
|
||||||
|
・アドレッシング
|
||||||
|
|
||||||
|
(ptr|dword|word|byte) [base + index * (1|2|4|8) + displacement]
|
||||||
|
[rip + 32bit disp] ; x64 only
|
||||||
|
という形で指定します。サイズを指定する必要がない限りptrを使えばよいです。
|
||||||
|
|
||||||
|
セレクター(セグメントレジスタ)をサポートしました。
|
||||||
|
(注意)セグメントレジスタはOperandを継承していません。
|
||||||
|
|
||||||
|
mov eax, [fs:eax] --> putSeg(fs); mov(eax, ptr [eax]);
|
||||||
|
mov ax, cs --> mov(ax, cs);
|
||||||
|
|
||||||
|
mov eax, [ebx+ecx] --> mov (eax, ptr[ebx+ecx]);
|
||||||
|
test byte [esp], 4 --> test (byte [esp], 4);
|
||||||
|
|
||||||
|
(注意) dword, word, byteはメンバ変数です。従ってたとえばunsigned intの
|
||||||
|
つもりでdwordをtypedefしないでください。
|
||||||
|
|
||||||
|
・AVX
|
||||||
|
|
||||||
|
FMAについては簡略表記を導入するか検討中です(アイデア募集中)。
|
||||||
|
|
||||||
|
vaddps(xmm1, xmm2, xmm3); // xmm1 <- xmm2 + xmm3
|
||||||
|
vaddps(xmm2, xmm3, ptr [rax]); // メモリアクセスはptrで
|
||||||
|
|
||||||
|
vfmadd231pd(xmm1, xmm2, xmm3); // xmm1 <- (xmm2 * xmm3) + xmm1
|
||||||
|
|
||||||
|
*注意*
|
||||||
|
デスティネーションの省略形はサポートされなくなりました。
|
||||||
|
|
||||||
|
vaddps(xmm2, xmm3); // xmm2 <- xmm2 + xmm3
|
||||||
|
|
||||||
|
XBYAK_ENABLE_OMITTED_OPERANDを定義すると使えますが、将来はそれも非サポートになるでしょう。
|
||||||
|
|
||||||
|
・AVX-512
|
||||||
|
|
||||||
|
vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30);
|
||||||
|
vaddpd xmm30, xmm20, [rax] --> vaddpd(xmm30, xmm20, ptr [rax]);
|
||||||
|
vaddps xmm30, xmm20, [rax] --> vaddps(xmm30, xmm20, ptr [rax]);
|
||||||
|
vaddpd zmm2{k5}, zmm4, zmm2 --> vaddpd(zmm2 | k5, zmm4, zmm2);
|
||||||
|
vaddpd zmm2{k5}{z}, zmm4, zmm2 --> vaddpd(zmm2 | k5 | T_z, zmm4, zmm2);
|
||||||
|
vaddpd zmm2{k5}{z}, zmm4, zmm2,{rd-sae} --> vaddpd(zmm2 | k5 | T_z, zmm4, zmm2 | T_rd_sae);
|
||||||
|
vaddpd(zmm2 | k5 | T_z | T_rd_sae, zmm4, zmm2); // the position of `|` is arbitrary.
|
||||||
|
vcmppd k4{k3}, zmm1, zmm2, {sae}, 5 --> vcmppd(k4 | k3, zmm1, zmm2 | T_sae, 5);
|
||||||
|
|
||||||
|
vaddpd xmm1, xmm2, [rax+256]{1to2} --> vaddpd(xmm1, xmm2, ptr_b [rax+256]);
|
||||||
|
vaddpd ymm1, ymm2, [rax+256]{1to4} --> vaddpd(ymm1, ymm2, ptr_b [rax+256]);
|
||||||
|
vaddpd zmm1, zmm2, [rax+256]{1to8} --> vaddpd(zmm1, zmm2, ptr_b [rax+256]);
|
||||||
|
vaddps zmm1, zmm2, [rax+rcx*8+8]{1to16} --> vaddps(zmm1, zmm2, ptr_b [rax+rcx*8+8]);
|
||||||
|
vmovsd [rax]{k1}, xmm4 --> vmovsd(ptr [rax] | k1, xmm4);
|
||||||
|
|
||||||
|
vcvtpd2dq xmm16, oword [eax+33] --> vcvtpd2dq(xmm16, xword [eax+33]); // use xword for m128 instead of oword
|
||||||
|
vcvtpd2dq(xmm16, ptr [eax+33]); // default xword
|
||||||
|
vcvtpd2dq xmm21, [eax+32]{1to2} --> vcvtpd2dq(xmm21, ptr_b [eax+32]);
|
||||||
|
vcvtpd2dq xmm0, yword [eax+33] --> vcvtpd2dq(xmm0, yword [eax+33]); // use yword for m256
|
||||||
|
vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); // use yword_b to broadcast
|
||||||
|
|
||||||
|
vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512
|
||||||
|
vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit
|
||||||
|
vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], 5); // broadcast 32-bit to 128-bit
|
||||||
|
|
||||||
|
|
||||||
|
注意
|
||||||
|
* k1, ..., k7 は新しいopmaskレジスタです。
|
||||||
|
* z, sae, rn-sae, rd-sae, ru-sae, rz-saeの代わりにT_z, T_sae, T_rn_sae, T_rd_sae, T_ru_sae, T_rz_saeを使ってください。
|
||||||
|
* `k4 | k3`と`k3 | k4`は意味が異なります。
|
||||||
|
* {1toX}の代わりにptr_bを使ってください。Xは自動的に決まります。
|
||||||
|
* 一部の命令はメモリサイズを指定するためにxword/yword/zword(_b)を使ってください。
|
||||||
|
|
||||||
|
・ラベル
|
||||||
|
|
||||||
|
L(文字列);
|
||||||
|
で定義します。ジャンプするときはその文字列を指定します。後方参照も可能ですが、
|
||||||
|
相対アドレスが8ビットに収まらない場合はT_NEARをつけないと実行時に例外が発生
|
||||||
|
します。
|
||||||
|
mov(eax, "L2");の様にラベルが表すアドレスをmovの即値として使えます。
|
||||||
|
|
||||||
|
・hasUndefinedLabel()を呼び出して真ならジャンプ先が存在しないことを示します。
|
||||||
|
コードを見直してください。
|
||||||
|
|
||||||
|
L("L1");
|
||||||
|
jmp ("L1");
|
||||||
|
|
||||||
|
jmp ("L2");
|
||||||
|
...
|
||||||
|
少しの命令の場合。
|
||||||
|
...
|
||||||
|
L("L2");
|
||||||
|
|
||||||
|
jmp ("L3", T_NEAR);
|
||||||
|
...
|
||||||
|
沢山の命令がある場合
|
||||||
|
...
|
||||||
|
L("L3");
|
||||||
|
|
||||||
|
<応用編>
|
||||||
|
|
||||||
|
1. MASMライクな@@, @f, @bをサポート
|
||||||
|
|
||||||
|
L("@@"); // <A>
|
||||||
|
jmp("@b"); // jmp to <A>
|
||||||
|
jmp("@f"); // jmp to <B>
|
||||||
|
L("@@"); // <B>
|
||||||
|
jmp("@b"); // jmp to <B>
|
||||||
|
mov(eax, "@b");
|
||||||
|
jmp(eax); // jmp to <B>
|
||||||
|
|
||||||
|
2. ラベルの局所化
|
||||||
|
|
||||||
|
ピリオドで始まるラベルをinLocalLabel(), outLocalLabel()で挟むことで局所化できます。
|
||||||
|
inLocalLabel(), outLocalLabel()は入れ子にすることができます。
|
||||||
|
|
||||||
|
void func1()
|
||||||
|
{
|
||||||
|
inLocalLabel();
|
||||||
|
L(".lp"); // <A> ; ローカルラベル
|
||||||
|
...
|
||||||
|
jmp(".lp"); // jmp to <A>
|
||||||
|
L("aaa"); // グローバルラベル
|
||||||
|
outLocalLabel();
|
||||||
|
}
|
||||||
|
|
||||||
|
void func2()
|
||||||
|
{
|
||||||
|
inLocalLabel();
|
||||||
|
L(".lp"); // <B> ; ローカルラベル
|
||||||
|
func1();
|
||||||
|
jmp(".lp"); // jmp to <B>
|
||||||
|
outLocalLabel();
|
||||||
|
}
|
||||||
|
|
||||||
|
上記サンプルではinLocalLabel(), outLocalLabel()が無いと、
|
||||||
|
".lp"ラベルの二重定義エラーになります。
|
||||||
|
|
||||||
|
3. 新しいLabelクラスによるジャンプ命令
|
||||||
|
|
||||||
|
ジャンプ先を文字列による指定だけでなくラベルクラスを使えるようになりました。
|
||||||
|
|
||||||
|
Label label1, label2;
|
||||||
|
L(label1);
|
||||||
|
...
|
||||||
|
jmp(label1);
|
||||||
|
...
|
||||||
|
jmp(label2);
|
||||||
|
...
|
||||||
|
L(label2);
|
||||||
|
|
||||||
|
更にラベルの割り当てを行うassignL(dstLabel, srcLabel)という命令も追加されました。
|
||||||
|
|
||||||
|
Label label1, label2;
|
||||||
|
L(label1);
|
||||||
|
...
|
||||||
|
jmp(label2);
|
||||||
|
...
|
||||||
|
assignL(label2, label1);
|
||||||
|
|
||||||
|
上記jmp命令はlabel1にジャンプします。
|
||||||
|
|
||||||
|
制限
|
||||||
|
* srcLabelはL()により飛び先が確定していないといけません。
|
||||||
|
* dstLabelはL()により飛び先が確定していてはいけません。
|
||||||
|
|
||||||
|
ラベルは`getAddress()`によりそのアドレスを取得できます。
|
||||||
|
未定義のときは0が返ります。
|
||||||
|
```
|
||||||
|
// not AutoGrow mode
|
||||||
|
Label label;
|
||||||
|
assert(label.getAddress() == 0);
|
||||||
|
L(label);
|
||||||
|
assert(label.getAddress() == getCurr());
|
||||||
|
```
|
||||||
|
|
||||||
|
・Xbyak::CodeGenerator()コンストラクタインタフェース
|
||||||
|
|
||||||
|
@param maxSize [in] コード生成最大サイズ(デフォルト4096byte)
|
||||||
|
@param userPtr [in] ユーザ指定メモリ
|
||||||
|
|
||||||
|
CodeGenerator(size_t maxSize = DEFAULT_MAX_CODE_SIZE, void *userPtr = 0);
|
||||||
|
|
||||||
|
デフォルトコードサイズは4096(=DEFAULT_MAX_CODE_SIZE)バイトです。
|
||||||
|
それより大きなコードを生成する場合はCodeGenerator()のコンストラクタに指定してください。
|
||||||
|
|
||||||
|
class Quantize : public Xbyak::CodeGenerator {
|
||||||
|
public:
|
||||||
|
Quantize()
|
||||||
|
: CodeGenerator(8192)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
...
|
||||||
|
};
|
||||||
|
|
||||||
|
またユーザ指定メモリをコード生成最大サイズと共に指定すると、CodeGeneratorは
|
||||||
|
指定されたメモリ上にバイト列を生成します。
|
||||||
|
|
||||||
|
補助関数として指定されたアドレスの実行属性を変更するCodeArray::protect()と
|
||||||
|
与えられたポインタからアライメントされたポインタを取得するCodeArray::getAlignedAddress()
|
||||||
|
も用意しました。詳細はsample/test0.cppのuse memory allocated by userを参考に
|
||||||
|
してください。
|
||||||
|
|
||||||
|
/**
|
||||||
|
change exec permission of memory
|
||||||
|
@param addr [in] buffer address
|
||||||
|
@param size [in] buffer size
|
||||||
|
@param canExec [in] true(enable to exec), false(disable to exec)
|
||||||
|
@return true(success), false(failure)
|
||||||
|
*/
|
||||||
|
bool CodeArray::protect(const void *addr, size_t size, bool canExec);
|
||||||
|
|
||||||
|
/**
|
||||||
|
get aligned memory pointer
|
||||||
|
*/
|
||||||
|
uint8 *CodeArray::getAlignedAddress(uint8 *addr, size_t alignedSize = ALIGN_SIZE);
|
||||||
|
|
||||||
|
その他詳細は各種サンプルを参照してください。
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
◎マクロ
|
||||||
|
|
||||||
|
32bit環境上でコンパイルするとXBYAK32が、64bit環境上でコンパイルするとXBYAK64が
|
||||||
|
定義されます。さらに64bit環境上ではWindows(VC)ならXBYAK64_WIN、cygwin, gcc上では
|
||||||
|
XBYAK64_GCCが定義されます。
|
||||||
|
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
◎使用例
|
||||||
|
|
||||||
|
test0.cpp ; 簡単な例(x86, x64)
|
||||||
|
quantize.cpp ; 割り算のJITアセンブルによる量子化の高速化(x86)
|
||||||
|
calc.cpp ; 与えられた多項式をアセンブルして実行(x86, x64)
|
||||||
|
boost(http://www.boost.org/)が必要
|
||||||
|
bf.cpp ; JIT Brainfuck(x86, x64)
|
||||||
|
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
◎ライセンス
|
||||||
|
|
||||||
|
修正された新しいBSDライセンスに従います。
|
||||||
|
http://opensource.org/licenses/BSD-3-Clause
|
||||||
|
|
||||||
|
sample/{echo,hello}.bfは http://www.kmonos.net/alang/etc/brainfuck.php から
|
||||||
|
いただきました。
|
||||||
|
|
||||||
|
test/cybozu/以下のファイルはcybozulib(https://github.com/herumi/cybozulib/)
|
||||||
|
の一部を使っています。cybozulibはBSD-3-Clauseライセンスです。
|
||||||
|
cybozulibは単体テストでのみ利用されていて、xbyak/ディレクトリ以下のヘッダ
|
||||||
|
ファイルはcybozulibとは独立に利用できます。
|
||||||
|
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
◎履歴
|
||||||
|
|
||||||
|
2017/01/26 ver 5.41 prefetchwt1追加とscale == 0対応(thanks to rsdubtso)
|
||||||
|
2016/12/14 ver 5.40 Labelが示すアドレスを取得するLabel::getAddress()追加
|
||||||
|
2016/12/07 ver 5.34 disp8N時の負のオフセット処理の修正(thanks to rsdubtso)
|
||||||
|
2016/12/06 ver 5.33 disp8N時のvpbroadcast{b,w,d,q}, vpinsr{b,w}, vpextr{b,w}のバグ修正
|
||||||
|
2016/12/01 ver 5.32 clang for Visual Studioサポートのために__xgetbv()を_xgetbv()に変更(thanks to freiro)
|
||||||
|
2016/11/27 ver 5.31 AVX512_4VNNIをAVX512_4VNNIWに変更
|
||||||
|
2016/11/27 ver 5.30 AVX512_4VNNI, AVX512_4FMAPS命令の追加(thanks to rsdubtso)
|
||||||
|
2016/11/26 ver 5.20 AVX512_4VNNIとAVX512_4FMAPSの判定追加(thanks to rsdubtso)
|
||||||
|
2016/11/20 ver 5.11 何故か消えていたvptest for ymm追加(thanks to gregory38)
|
||||||
|
2016/11/20 ver 5.10 [rip+&var]の形のアドレッシング追加
|
||||||
|
2016/09/29 ver 5.03 ERR_INVALID_OPMASK_WITH_MEMORYの判定ミス修正(thanks to PVS-Studio)
|
||||||
|
2016/08/15 ver 5.02 xbyak_bin2hex.hをincludeしない
|
||||||
|
2016/08/15 ver 5.011 gcc 5.4のバージョン取得ミスの修正
|
||||||
|
2016/08/03 ver 5.01 AVXの省略表記非サポート
|
||||||
|
2016/07/24 ver 5.00 avx-512フルサポート
|
||||||
|
2016/06/13 avx-512 opmask命令サポート
|
||||||
|
2016/05/05 ver 4.91 AVX-512命令の検出サポート
|
||||||
|
2016/03/14 ver 4.901 ready()関数にコメント加筆(thanks to skmp)
|
||||||
|
2016/02/04 ver 4.90 条件分岐命令にjcc(const void *addr);のタイプを追加
|
||||||
|
2016/01/30 ver 4.89 vpblendvbがymmレジスタをサポートしていなかった(thanks to John Funnell)
|
||||||
|
2016/01/24 ver 4.88 lea, cmovの16bitレジスタ対応(thanks to whyisthisfieldhere)
|
||||||
|
2015/08/16 ver 4.87 セグメントセレクタに対応
|
||||||
|
2015/08/16 ver 4.86 [rip + label]アドレッシングで即値を使うと壊れる(thanks to whyisthisfieldhere)
|
||||||
|
2015/08/10 ver 4.85 Address::operator==()が間違っている(thanks to inolen)
|
||||||
|
2015/07/22 ver 4.84 call()がvariadic template対応
|
||||||
|
2015/05/24 ver 4.83 movbeサポート(thanks to benvanik)
|
||||||
|
2015/05/24 ver 4.82 F16Cが使えるかどうかの判定追加
|
||||||
|
2015/04/25 ver 4.81 setSizeが例外を投げる条件を修正(thanks to whyisthisfieldhere)
|
||||||
|
2015/04/22 ver 4.80 rip相対でLabelのサポート(thanks to whyisthisfieldhere)
|
||||||
|
2015/01/28 ver 4.71 adcx, adox, cmpxchg, rdseed, stacのサポート
|
||||||
|
2014/10/14 ver 4.70 MmapAllocatorのサポート
|
||||||
|
2014/06/13 ver 4.62 VC2014で警告抑制
|
||||||
|
2014/05/30 ver 4.61 bt, bts, btr, btcのサポート
|
||||||
|
2014/05/28 ver 4.60 vcvtph2ps, vcvtps2phのサポート
|
||||||
|
2014/04/11 ver 4.52 rdrandの判定追加
|
||||||
|
2014/03/25 ver 4.51 参照されなくなったラベルの状態を削除する
|
||||||
|
2014/03/16 ver 4.50 新しいラベルクラスのサポート
|
||||||
|
2014/03/05 ver 4.40 VirtualBox上でBMI/enhanced repのサポート判定を間違うことがあるのを修正
|
||||||
|
2013/12/03 ver 4.30 Reg::cvt8(), cvt16(), cvt32()のサポート
|
||||||
|
2013/10/16 ver 4.21 ラベルでstd::stringを受け付ける。
|
||||||
|
2013/07/30 ver 4.20 [break backward compatibility] 従来のReg32eクラスをアドレッシング用のRegExpとReg32, Reg64を表すReg32eに分離
|
||||||
|
2013/07/04 ver 4.10 [break backward compatibility] Xbyak::Errorの型をenumからclassに変更
|
||||||
|
2013/06/21 ver 4.02 LABELの指すアドレスを書き込むputL(LABEL)関数の追加。
|
||||||
|
2013/06/21 ver 4.01 vpsllw, vpslld, vpsllq, vpsraw, vpsrad, vpsrlw, vpsrld, vpsrlq support (ymm, ymm, xmm)
|
||||||
|
support vpbroadcastb, vpbroadcastw, vpbroadcastd, vpbroadcastq(thanks to Gabest)
|
||||||
|
2013/05/30 ver 4.00 AVX2, VEX-encoded GPR-instructionをサポート
|
||||||
|
2013/03/27 ver 3.80 mov(reg, "label");をサポート
|
||||||
|
2013/03/13 ver 3.76 cqo, jcxz, jecxz, jrcxz追加
|
||||||
|
2013/01/15 ver 3.75 生成されたコードを修正するためにsetSize()を追加
|
||||||
|
2013/01/12 ver 3.74 CodeGenerator::reset()とAllocator::useProtect()を追加
|
||||||
|
2013/01/06 ver 3.73 可能ならunordered_mapを使う
|
||||||
|
2012/12/04 ver 3.72 eaxなどをCodeGeneratorのメンバ変数に戻す. Xbyak::util::eaxはstatic const変数
|
||||||
|
2012/11/17 ver 3.71 and_(), or_(), xor_(), not_()をXBYAK_NO_OP_NAMESが定義されていないときでも使えるようにした
|
||||||
|
2012/11/17 CodeGeneratorのeax, ecx, ptrなどのメンバ変数をstaticにし、const参照をXbyak::utilにも定義
|
||||||
|
2012/11/09 ver 3.70 and()をand_()にするためのマクロXBYAK_NO_OP_NAMESを追加(thanks to Mattias)
|
||||||
|
2012/11/01 ver 3.62 add fwait/fnwait/finit/fninit
|
||||||
|
2012/11/01 ver 3.61 add fldcw/fstcw
|
||||||
|
2012/05/03 ver 3.60 Allocatorクラスのインタフェースを変更
|
||||||
|
2012/03/23 ver 3.51 userPtrモードがバグったのを修正
|
||||||
|
2012/03/19 ver 3.50 AutoGrowモードサポート
|
||||||
|
2011/11/09 ver 3.05 rip相対の64bitサイズ以外の扱いのバグ修正 / movsxdサポート
|
||||||
|
2011/08/15 ver 3.04 add(dword [ebp-8], 0xda);などにおけるimm8の扱いのバグ修正(thanks to lolcat)
|
||||||
|
2011/06/16 ver 3.03 Macのgcc上での__GNUC_PREREQがミスってたのを修正(thanks to t_teruya)
|
||||||
|
2011/04/28 ver 3.02 Macのgcc上ではxgetbvをdisable
|
||||||
|
2011/03/24 ver 3.01 fix typo of OSXSAVE
|
||||||
|
2011/03/23 ver 3.00 vcmpeqpsなどを追加
|
||||||
|
2011/02/16 ver 2.994 beta add vmovq for 32-bit mode(I forgot it)
|
||||||
|
2011/02/16 ver 2.993 beta remove cvtReg to avoid thread unsafe
|
||||||
|
2011/02/10 ver 2.992 beta support one argument syntax for fadd like nasm
|
||||||
|
2011/02/07 ver 2.991 beta fix pextrw reg, xmm, imm(Thanks to Gabest)
|
||||||
|
2011/02/04 ver 2.99 beta support AVX
|
||||||
|
2010/12/08 ver 2.31 fix ptr [rip + 32bit offset], support rdtscp
|
||||||
|
2010/10/19 ver 2.30 support pclmulqdq, aesdec, aesdeclast, aesenc, aesenclast, aesimc, aeskeygenassist
|
||||||
|
2010/07/07 ver 2.29 fix call(<label>)
|
||||||
|
2010/06/17 ver 2.28 move some member functions to public
|
||||||
|
2010/06/01 ver 2.27 support encoding of mov(reg64, imm) like yasm(not nasm)
|
||||||
|
2010/05/24 ver 2.26 fix sub(rsp, 1000)
|
||||||
|
2010/04/26 ver 2.25 add jc/jnc(I forgot to implement them...)
|
||||||
|
2010/04/16 ver 2.24 change the prototype of rewrite() method
|
||||||
|
2010/04/15 ver 2.23 fix align() and xbyak_util.h for Mac
|
||||||
|
2010/02/16 ver 2.22 fix inLocalLabel()/outLocalLabel()
|
||||||
|
2009/12/09 ver 2.21 support cygwin(gcc 4.3.2)
|
||||||
|
2009/11/28 ver 2.20 FPUの一部命令サポート
|
||||||
|
2009/06/25 ver 2.11 64bitモードでの mov(qword[rax], imm); 修正(thanks to Martinさん)
|
||||||
|
2009/03/10 ver 2.10 jmp/call reg64の冗長なREG.W削除
|
||||||
|
2009/02/24 ver 2.09 movq reg64, mmx/xmm; movq mmx/xmm, reg64追加
|
||||||
|
2009/02/13 ver 2.08 movd(xmm7, dword[eax])が0x66を落とすバグ修正(thanks to Gabestさん)
|
||||||
|
2008/12/30 ver 2.07 call()の相対アドレスが8bit以下のときのバグ修正(thanks to katoさん)
|
||||||
|
2008/09/18 ver 2.06 @@, @f, @bとラベルの局所化機能追加(thanks to nobu-qさん)
|
||||||
|
2008/09/18 ver 2.05 ptr [rip + 32bit offset]サポート(thanks to 団子厨(Dango-Chu)さん)
|
||||||
|
2008/06/03 ver 2.04 align()のポカミス修正。mov(ptr[eax],1);などをエラーに
|
||||||
|
2008/06/02 ver 2.03 ユーザ定義メモリインタフェースサポート
|
||||||
|
2008/05/26 ver 2.02 protect()(on Linux)で不正な設定になることがあるのを修正(thanks to sinichiro_hさん)
|
||||||
|
2008/04/30 ver 2.01 cmpxchg16b, cdqe追加
|
||||||
|
2008/04/29 ver 2.00 x86/x64-64版公開
|
||||||
|
2008/04/25 ver 1.90 x64版β公開
|
||||||
|
2008/04/18 ver 1.12 コード整理
|
||||||
|
2008/04/14 ver 1.11 コード整理
|
||||||
|
2008/03/12 ver 1.10 bsf/bsr追加(忘れていた)
|
||||||
|
2008/02/14 ver 1.09 sub eax, 1234が16bitモードで出力されていたのを修正(thanks to Robertさん)
|
||||||
|
2007/11/05 ver 1.08 lock, xadd, xchg追加
|
||||||
|
2007/11/02 ver 1.07 SSSE3/SSE4対応(thanks to 団子厨(Dango-Chu)さん)
|
||||||
|
2007/09/25 ver 1.06 call((int)関数ポインタ); jmp((int)関数ポインタ);のサポート
|
||||||
|
2007/08/04 ver 1.05 細かい修正
|
||||||
|
2007/02/04 後方へのジャンプでT_NEARをつけないときに8bit相対アドレスに入らない
|
||||||
|
場合に例外が発生しないバグの修正
|
||||||
|
2007/01/21 [disp]の形のアドレス生成のバグ修正
|
||||||
|
mov (eax|ax|al, [disp]); mov([disp], eax|ax|al);の短い表現選択
|
||||||
|
2007/01/17 webページ作成
|
||||||
|
2007/01/04 公開開始
|
||||||
|
|
||||||
|
-----------------------------------------------------------------------------
|
||||||
|
◎著作権者
|
||||||
|
|
||||||
|
光成滋生(MITSUNARI Shigeo, herumi@nifty.com)
|
||||||
|
|
||||||
|
---
|
||||||
|
$Revision: 1.56 $
|
||||||
|
$Date: 2010/04/16 11:58:22 $
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,258 @@
|
||||||
|
/*
	Binary-looking names for every 8-bit value: Bxxxxxxxx is the number whose
	binary representation is xxxxxxxx (most significant bit first).
	Each group of four rows shares its upper four bits, i.e. the first hex
	digit of the value; the lower four bits give the second hex digit.
*/
enum {
	B00000000 = 0x00, B00000001 = 0x01, B00000010 = 0x02, B00000011 = 0x03,
	B00000100 = 0x04, B00000101 = 0x05, B00000110 = 0x06, B00000111 = 0x07,
	B00001000 = 0x08, B00001001 = 0x09, B00001010 = 0x0A, B00001011 = 0x0B,
	B00001100 = 0x0C, B00001101 = 0x0D, B00001110 = 0x0E, B00001111 = 0x0F,

	B00010000 = 0x10, B00010001 = 0x11, B00010010 = 0x12, B00010011 = 0x13,
	B00010100 = 0x14, B00010101 = 0x15, B00010110 = 0x16, B00010111 = 0x17,
	B00011000 = 0x18, B00011001 = 0x19, B00011010 = 0x1A, B00011011 = 0x1B,
	B00011100 = 0x1C, B00011101 = 0x1D, B00011110 = 0x1E, B00011111 = 0x1F,

	B00100000 = 0x20, B00100001 = 0x21, B00100010 = 0x22, B00100011 = 0x23,
	B00100100 = 0x24, B00100101 = 0x25, B00100110 = 0x26, B00100111 = 0x27,
	B00101000 = 0x28, B00101001 = 0x29, B00101010 = 0x2A, B00101011 = 0x2B,
	B00101100 = 0x2C, B00101101 = 0x2D, B00101110 = 0x2E, B00101111 = 0x2F,

	B00110000 = 0x30, B00110001 = 0x31, B00110010 = 0x32, B00110011 = 0x33,
	B00110100 = 0x34, B00110101 = 0x35, B00110110 = 0x36, B00110111 = 0x37,
	B00111000 = 0x38, B00111001 = 0x39, B00111010 = 0x3A, B00111011 = 0x3B,
	B00111100 = 0x3C, B00111101 = 0x3D, B00111110 = 0x3E, B00111111 = 0x3F,

	B01000000 = 0x40, B01000001 = 0x41, B01000010 = 0x42, B01000011 = 0x43,
	B01000100 = 0x44, B01000101 = 0x45, B01000110 = 0x46, B01000111 = 0x47,
	B01001000 = 0x48, B01001001 = 0x49, B01001010 = 0x4A, B01001011 = 0x4B,
	B01001100 = 0x4C, B01001101 = 0x4D, B01001110 = 0x4E, B01001111 = 0x4F,

	B01010000 = 0x50, B01010001 = 0x51, B01010010 = 0x52, B01010011 = 0x53,
	B01010100 = 0x54, B01010101 = 0x55, B01010110 = 0x56, B01010111 = 0x57,
	B01011000 = 0x58, B01011001 = 0x59, B01011010 = 0x5A, B01011011 = 0x5B,
	B01011100 = 0x5C, B01011101 = 0x5D, B01011110 = 0x5E, B01011111 = 0x5F,

	B01100000 = 0x60, B01100001 = 0x61, B01100010 = 0x62, B01100011 = 0x63,
	B01100100 = 0x64, B01100101 = 0x65, B01100110 = 0x66, B01100111 = 0x67,
	B01101000 = 0x68, B01101001 = 0x69, B01101010 = 0x6A, B01101011 = 0x6B,
	B01101100 = 0x6C, B01101101 = 0x6D, B01101110 = 0x6E, B01101111 = 0x6F,

	B01110000 = 0x70, B01110001 = 0x71, B01110010 = 0x72, B01110011 = 0x73,
	B01110100 = 0x74, B01110101 = 0x75, B01110110 = 0x76, B01110111 = 0x77,
	B01111000 = 0x78, B01111001 = 0x79, B01111010 = 0x7A, B01111011 = 0x7B,
	B01111100 = 0x7C, B01111101 = 0x7D, B01111110 = 0x7E, B01111111 = 0x7F,

	B10000000 = 0x80, B10000001 = 0x81, B10000010 = 0x82, B10000011 = 0x83,
	B10000100 = 0x84, B10000101 = 0x85, B10000110 = 0x86, B10000111 = 0x87,
	B10001000 = 0x88, B10001001 = 0x89, B10001010 = 0x8A, B10001011 = 0x8B,
	B10001100 = 0x8C, B10001101 = 0x8D, B10001110 = 0x8E, B10001111 = 0x8F,

	B10010000 = 0x90, B10010001 = 0x91, B10010010 = 0x92, B10010011 = 0x93,
	B10010100 = 0x94, B10010101 = 0x95, B10010110 = 0x96, B10010111 = 0x97,
	B10011000 = 0x98, B10011001 = 0x99, B10011010 = 0x9A, B10011011 = 0x9B,
	B10011100 = 0x9C, B10011101 = 0x9D, B10011110 = 0x9E, B10011111 = 0x9F,

	B10100000 = 0xA0, B10100001 = 0xA1, B10100010 = 0xA2, B10100011 = 0xA3,
	B10100100 = 0xA4, B10100101 = 0xA5, B10100110 = 0xA6, B10100111 = 0xA7,
	B10101000 = 0xA8, B10101001 = 0xA9, B10101010 = 0xAA, B10101011 = 0xAB,
	B10101100 = 0xAC, B10101101 = 0xAD, B10101110 = 0xAE, B10101111 = 0xAF,

	B10110000 = 0xB0, B10110001 = 0xB1, B10110010 = 0xB2, B10110011 = 0xB3,
	B10110100 = 0xB4, B10110101 = 0xB5, B10110110 = 0xB6, B10110111 = 0xB7,
	B10111000 = 0xB8, B10111001 = 0xB9, B10111010 = 0xBA, B10111011 = 0xBB,
	B10111100 = 0xBC, B10111101 = 0xBD, B10111110 = 0xBE, B10111111 = 0xBF,

	B11000000 = 0xC0, B11000001 = 0xC1, B11000010 = 0xC2, B11000011 = 0xC3,
	B11000100 = 0xC4, B11000101 = 0xC5, B11000110 = 0xC6, B11000111 = 0xC7,
	B11001000 = 0xC8, B11001001 = 0xC9, B11001010 = 0xCA, B11001011 = 0xCB,
	B11001100 = 0xCC, B11001101 = 0xCD, B11001110 = 0xCE, B11001111 = 0xCF,

	B11010000 = 0xD0, B11010001 = 0xD1, B11010010 = 0xD2, B11010011 = 0xD3,
	B11010100 = 0xD4, B11010101 = 0xD5, B11010110 = 0xD6, B11010111 = 0xD7,
	B11011000 = 0xD8, B11011001 = 0xD9, B11011010 = 0xDA, B11011011 = 0xDB,
	B11011100 = 0xDC, B11011101 = 0xDD, B11011110 = 0xDE, B11011111 = 0xDF,

	B11100000 = 0xE0, B11100001 = 0xE1, B11100010 = 0xE2, B11100011 = 0xE3,
	B11100100 = 0xE4, B11100101 = 0xE5, B11100110 = 0xE6, B11100111 = 0xE7,
	B11101000 = 0xE8, B11101001 = 0xE9, B11101010 = 0xEA, B11101011 = 0xEB,
	B11101100 = 0xEC, B11101101 = 0xED, B11101110 = 0xEE, B11101111 = 0xEF,

	B11110000 = 0xF0, B11110001 = 0xF1, B11110010 = 0xF2, B11110011 = 0xF3,
	B11110100 = 0xF4, B11110101 = 0xF5, B11110110 = 0xF6, B11110111 = 0xF7,
	B11111000 = 0xF8, B11111001 = 0xF9, B11111010 = 0xFA, B11111011 = 0xFB,
	B11111100 = 0xFC, B11111101 = 0xFD, B11111110 = 0xFE, B11111111 = 0xFF
};
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,561 @@
|
||||||
|
#ifndef XBYAK_XBYAK_UTIL_H_
|
||||||
|
#define XBYAK_XBYAK_UTIL_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
utility class and functions for Xbyak
|
||||||
|
Xbyak::util::Clock ; rdtsc timer
|
||||||
|
Xbyak::util::Cpu ; detect CPU
|
||||||
|
@note this header is UNDER CONSTRUCTION!
|
||||||
|
*/
|
||||||
|
#include "xbyak.h"
|
||||||
|
|
||||||
|
/*
	Bring a CPUID primitive into scope for every supported compiler.
	- MSVC older than _MSC_VER 1400 in 32-bit mode: hand-written naked __cpuid(int[4], int)
	- other MSVC: <intrin.h>
	- gcc >= 4.3 (except Apple): <cpuid.h>
	- anything else: inline-asm __cpuid / __cpuid_count macros
*/
#ifdef _MSC_VER
	#if (_MSC_VER < 1400) && defined(XBYAK32)
	// naked function: no prologue, so the arguments are addressed relative to esp
	// after the two pushes below (2 saved registers + return address)
	static inline __declspec(naked) void __cpuid(int[4], int)
	{
		__asm {
			push	ebx
			push	esi
			mov		eax, dword ptr [esp + 4 * 2 + 8] // eaxIn
			cpuid
			mov		esi, dword ptr [esp + 4 * 2 + 4] // data
			mov		dword ptr [esi], eax
			mov		dword ptr [esi + 4], ebx
			mov		dword ptr [esi + 8], ecx
			mov		dword ptr [esi + 12], edx
			pop		esi
			pop		ebx
			ret
		}
	}
	#else
		#include <intrin.h> // for __cpuid
	#endif
#else
	#ifndef __GNUC_PREREQ
		#define __GNUC_PREREQ(major, minor) ((((__GNUC__) << 16) + (__GNUC_MINOR__)) >= (((major) << 16) + (minor)))
	#endif
	#if __GNUC_PREREQ(4, 3) && !defined(__APPLE__)
		#include <cpuid.h>
	#else
		#if defined(__APPLE__) && defined(XBYAK32) // avoid err : can't find a register in class `BREG' while reloading `asm'
			// ebx cannot be named as an output here, so it is saved, the ebx result of
			// cpuid is handed back through esi ("=S"), and ebx is restored afterwards.
			// (The value copied must be ebx, not ebp, otherwise b receives the frame pointer.)
			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("pushl %%ebx\ncpuid\nmovl %%ebx, %%esi\npopl %%ebx" : "=a"(a), "=S"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
		#else
			#define __cpuid(eaxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn))
			#define __cpuid_count(eaxIn, ecxIn, a, b, c, d) __asm__ __volatile__("cpuid\n" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(eaxIn), "2"(ecxIn))
		#endif
	#endif
#endif
|
||||||
|
|
||||||
|
namespace Xbyak { namespace util {
|
||||||
|
|
||||||
|
/**
|
||||||
|
CPU detection class
|
||||||
|
*/
|
||||||
|
class Cpu {
|
||||||
|
uint64 type_;
|
||||||
|
unsigned int get32bitAsBE(const char *x) const
|
||||||
|
{
|
||||||
|
return x[0] | (x[1] << 8) | (x[2] << 16) | (x[3] << 24);
|
||||||
|
}
|
||||||
|
unsigned int mask(int n) const
|
||||||
|
{
|
||||||
|
return (1U << n) - 1;
|
||||||
|
}
|
||||||
|
void setFamily()
|
||||||
|
{
|
||||||
|
unsigned int data[4];
|
||||||
|
getCpuid(1, data);
|
||||||
|
stepping = data[0] & mask(4);
|
||||||
|
model = (data[0] >> 4) & mask(4);
|
||||||
|
family = (data[0] >> 8) & mask(4);
|
||||||
|
// type = (data[0] >> 12) & mask(2);
|
||||||
|
extModel = (data[0] >> 16) & mask(4);
|
||||||
|
extFamily = (data[0] >> 20) & mask(8);
|
||||||
|
if (family == 0x0f) {
|
||||||
|
displayFamily = family + extFamily;
|
||||||
|
} else {
|
||||||
|
displayFamily = family;
|
||||||
|
}
|
||||||
|
if (family == 6 || family == 0x0f) {
|
||||||
|
displayModel = (extModel << 4) + model;
|
||||||
|
} else {
|
||||||
|
displayModel = model;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
public:
|
||||||
|
int model;
|
||||||
|
int family;
|
||||||
|
int stepping;
|
||||||
|
int extModel;
|
||||||
|
int extFamily;
|
||||||
|
int displayFamily; // family + extFamily
|
||||||
|
int displayModel; // model + extModel
|
||||||
|
/*
|
||||||
|
data[] = { eax, ebx, ecx, edx }
|
||||||
|
*/
|
||||||
|
static inline void getCpuid(unsigned int eaxIn, unsigned int data[4])
|
||||||
|
{
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
__cpuid(reinterpret_cast<int*>(data), eaxIn);
|
||||||
|
#else
|
||||||
|
__cpuid(eaxIn, data[0], data[1], data[2], data[3]);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
static inline void getCpuidEx(unsigned int eaxIn, unsigned int ecxIn, unsigned int data[4])
|
||||||
|
{
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
__cpuidex(reinterpret_cast<int*>(data), eaxIn, ecxIn);
|
||||||
|
#else
|
||||||
|
__cpuid_count(eaxIn, ecxIn, data[0], data[1], data[2], data[3]);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
static inline uint64 getXfeature()
|
||||||
|
{
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
return _xgetbv(0);
|
||||||
|
#else
|
||||||
|
unsigned int eax, edx;
|
||||||
|
// xgetvb is not support on gcc 4.2
|
||||||
|
// __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(0));
|
||||||
|
__asm__ volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
|
||||||
|
return ((uint64)edx << 32) | eax;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
typedef uint64 Type;
|
||||||
|
static const Type NONE = 0;
|
||||||
|
static const Type tMMX = 1 << 0;
|
||||||
|
static const Type tMMX2 = 1 << 1;
|
||||||
|
static const Type tCMOV = 1 << 2;
|
||||||
|
static const Type tSSE = 1 << 3;
|
||||||
|
static const Type tSSE2 = 1 << 4;
|
||||||
|
static const Type tSSE3 = 1 << 5;
|
||||||
|
static const Type tSSSE3 = 1 << 6;
|
||||||
|
static const Type tSSE41 = 1 << 7;
|
||||||
|
static const Type tSSE42 = 1 << 8;
|
||||||
|
static const Type tPOPCNT = 1 << 9;
|
||||||
|
static const Type tAESNI = 1 << 10;
|
||||||
|
static const Type tSSE5 = 1 << 11;
|
||||||
|
static const Type tOSXSAVE = 1 << 12;
|
||||||
|
static const Type tPCLMULQDQ = 1 << 13;
|
||||||
|
static const Type tAVX = 1 << 14;
|
||||||
|
static const Type tFMA = 1 << 15;
|
||||||
|
|
||||||
|
static const Type t3DN = 1 << 16;
|
||||||
|
static const Type tE3DN = 1 << 17;
|
||||||
|
static const Type tSSE4a = 1 << 18;
|
||||||
|
static const Type tRDTSCP = 1 << 19;
|
||||||
|
static const Type tAVX2 = 1 << 20;
|
||||||
|
static const Type tBMI1 = 1 << 21; // andn, bextr, blsi, blsmsk, blsr, tzcnt
|
||||||
|
static const Type tBMI2 = 1 << 22; // bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx
|
||||||
|
static const Type tLZCNT = 1 << 23;
|
||||||
|
|
||||||
|
static const Type tINTEL = 1 << 24;
|
||||||
|
static const Type tAMD = 1 << 25;
|
||||||
|
|
||||||
|
static const Type tENHANCED_REP = 1 << 26; // enhanced rep movsb/stosb
|
||||||
|
static const Type tRDRAND = 1 << 27;
|
||||||
|
static const Type tADX = 1 << 28; // adcx, adox
|
||||||
|
static const Type tRDSEED = 1 << 29; // rdseed
|
||||||
|
static const Type tSMAP = 1 << 30; // stac
|
||||||
|
static const Type tHLE = uint64(1) << 31; // xacquire, xrelease, xtest
|
||||||
|
static const Type tRTM = uint64(1) << 32; // xbegin, xend, xabort
|
||||||
|
static const Type tF16C = uint64(1) << 33; // vcvtph2ps, vcvtps2ph
|
||||||
|
static const Type tMOVBE = uint64(1) << 34; // mobve
|
||||||
|
static const Type tAVX512F = uint64(1) << 35;
|
||||||
|
static const Type tAVX512DQ = uint64(1) << 36;
|
||||||
|
static const Type tAVX512IFMA = uint64(1) << 37;
|
||||||
|
static const Type tAVX512PF = uint64(1) << 38;
|
||||||
|
static const Type tAVX512ER = uint64(1) << 39;
|
||||||
|
static const Type tAVX512CD = uint64(1) << 40;
|
||||||
|
static const Type tAVX512BW = uint64(1) << 41;
|
||||||
|
static const Type tAVX512VL = uint64(1) << 42;
|
||||||
|
static const Type tAVX512VBMI = uint64(1) << 43;
|
||||||
|
static const Type tAVX512_4VNNIW = uint64(1) << 44;
|
||||||
|
static const Type tAVX512_4FMAPS = uint64(1) << 45;
|
||||||
|
static const Type tPREFETCHWT1 = uint64(1) << 46;
|
||||||
|
|
||||||
|
Cpu()
|
||||||
|
: type_(NONE)
|
||||||
|
{
|
||||||
|
unsigned int data[4];
|
||||||
|
getCpuid(0, data);
|
||||||
|
const unsigned int maxNum = data[0];
|
||||||
|
static const char intel[] = "ntel";
|
||||||
|
static const char amd[] = "cAMD";
|
||||||
|
if (data[2] == get32bitAsBE(amd)) {
|
||||||
|
type_ |= tAMD;
|
||||||
|
getCpuid(0x80000001, data);
|
||||||
|
if (data[3] & (1U << 31)) type_ |= t3DN;
|
||||||
|
if (data[3] & (1U << 15)) type_ |= tCMOV;
|
||||||
|
if (data[3] & (1U << 30)) type_ |= tE3DN;
|
||||||
|
if (data[3] & (1U << 22)) type_ |= tMMX2;
|
||||||
|
if (data[3] & (1U << 27)) type_ |= tRDTSCP;
|
||||||
|
}
|
||||||
|
if (data[2] == get32bitAsBE(intel)) {
|
||||||
|
type_ |= tINTEL;
|
||||||
|
getCpuid(0x80000001, data);
|
||||||
|
if (data[3] & (1U << 27)) type_ |= tRDTSCP;
|
||||||
|
if (data[2] & (1U << 5)) type_ |= tLZCNT;
|
||||||
|
}
|
||||||
|
getCpuid(1, data);
|
||||||
|
if (data[2] & (1U << 0)) type_ |= tSSE3;
|
||||||
|
if (data[2] & (1U << 9)) type_ |= tSSSE3;
|
||||||
|
if (data[2] & (1U << 19)) type_ |= tSSE41;
|
||||||
|
if (data[2] & (1U << 20)) type_ |= tSSE42;
|
||||||
|
if (data[2] & (1U << 22)) type_ |= tMOVBE;
|
||||||
|
if (data[2] & (1U << 23)) type_ |= tPOPCNT;
|
||||||
|
if (data[2] & (1U << 25)) type_ |= tAESNI;
|
||||||
|
if (data[2] & (1U << 1)) type_ |= tPCLMULQDQ;
|
||||||
|
if (data[2] & (1U << 27)) type_ |= tOSXSAVE;
|
||||||
|
if (data[2] & (1U << 30)) type_ |= tRDRAND;
|
||||||
|
if (data[2] & (1U << 29)) type_ |= tF16C;
|
||||||
|
|
||||||
|
if (data[3] & (1U << 15)) type_ |= tCMOV;
|
||||||
|
if (data[3] & (1U << 23)) type_ |= tMMX;
|
||||||
|
if (data[3] & (1U << 25)) type_ |= tMMX2 | tSSE;
|
||||||
|
if (data[3] & (1U << 26)) type_ |= tSSE2;
|
||||||
|
|
||||||
|
if (type_ & tOSXSAVE) {
|
||||||
|
// check XFEATURE_ENABLED_MASK[2:1] = '11b'
|
||||||
|
uint64 bv = getXfeature();
|
||||||
|
if ((bv & 6) == 6) {
|
||||||
|
if (data[2] & (1U << 28)) type_ |= tAVX;
|
||||||
|
if (data[2] & (1U << 12)) type_ |= tFMA;
|
||||||
|
if (((bv >> 5) & 7) == 7) {
|
||||||
|
getCpuidEx(7, 0, data);
|
||||||
|
if (data[1] & (1U << 16)) type_ |= tAVX512F;
|
||||||
|
if (type_ & tAVX512F) {
|
||||||
|
if (data[1] & (1U << 17)) type_ |= tAVX512DQ;
|
||||||
|
if (data[1] & (1U << 21)) type_ |= tAVX512IFMA;
|
||||||
|
if (data[1] & (1U << 26)) type_ |= tAVX512PF;
|
||||||
|
if (data[1] & (1U << 27)) type_ |= tAVX512ER;
|
||||||
|
if (data[1] & (1U << 28)) type_ |= tAVX512CD;
|
||||||
|
if (data[1] & (1U << 30)) type_ |= tAVX512BW;
|
||||||
|
if (data[1] & (1U << 31)) type_ |= tAVX512VL;
|
||||||
|
if (data[2] & (1U << 1)) type_ |= tAVX512VBMI;
|
||||||
|
if (data[3] & (1U << 2)) type_ |= tAVX512_4VNNIW;
|
||||||
|
if (data[3] & (1U << 3)) type_ |= tAVX512_4FMAPS;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (maxNum >= 7) {
|
||||||
|
getCpuidEx(7, 0, data);
|
||||||
|
if (type_ & tAVX && data[1] & 0x20) type_ |= tAVX2;
|
||||||
|
if (data[1] & (1U << 3)) type_ |= tBMI1;
|
||||||
|
if (data[1] & (1U << 8)) type_ |= tBMI2;
|
||||||
|
if (data[1] & (1U << 9)) type_ |= tENHANCED_REP;
|
||||||
|
if (data[1] & (1U << 18)) type_ |= tRDSEED;
|
||||||
|
if (data[1] & (1U << 19)) type_ |= tADX;
|
||||||
|
if (data[1] & (1U << 20)) type_ |= tSMAP;
|
||||||
|
if (data[1] & (1U << 4)) type_ |= tHLE;
|
||||||
|
if (data[1] & (1U << 11)) type_ |= tRTM;
|
||||||
|
if (data[2] & (1U << 0)) type_ |= tPREFETCHWT1;
|
||||||
|
}
|
||||||
|
setFamily();
|
||||||
|
}
|
||||||
|
void putFamily() const
|
||||||
|
{
|
||||||
|
printf("family=%d, model=%X, stepping=%d, extFamily=%d, extModel=%X\n",
|
||||||
|
family, model, stepping, extFamily, extModel);
|
||||||
|
printf("display:family=%X, model=%X\n", displayFamily, displayModel);
|
||||||
|
}
|
||||||
|
bool has(Type type) const
|
||||||
|
{
|
||||||
|
return (type & type_) != 0;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class Clock {
|
||||||
|
public:
|
||||||
|
static inline uint64 getRdtsc()
|
||||||
|
{
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
return __rdtsc();
|
||||||
|
#else
|
||||||
|
unsigned int eax, edx;
|
||||||
|
__asm__ volatile("rdtsc" : "=a"(eax), "=d"(edx));
|
||||||
|
return ((uint64)edx << 32) | eax;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
Clock()
|
||||||
|
: clock_(0)
|
||||||
|
, count_(0)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
void begin()
|
||||||
|
{
|
||||||
|
clock_ -= getRdtsc();
|
||||||
|
}
|
||||||
|
void end()
|
||||||
|
{
|
||||||
|
clock_ += getRdtsc();
|
||||||
|
count_++;
|
||||||
|
}
|
||||||
|
int getCount() const { return count_; }
|
||||||
|
uint64 getClock() const { return clock_; }
|
||||||
|
void clear() { count_ = 0; clock_ = 0; }
|
||||||
|
private:
|
||||||
|
uint64 clock_;
|
||||||
|
int count_;
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef XBYAK64
|
||||||
|
// Single-bit flag values (bit 6 and bit 7).
// NOTE(review): presumably OR-ed into a register-count argument to request rcx / rdx — confirm against the code that consumes them.
const int UseRCX = 0x40;
const int UseRDX = 0x80;
|
||||||
|
|
||||||
|
class Pack {
|
||||||
|
static const size_t maxTblNum = 10;
|
||||||
|
const Xbyak::Reg64 *tbl_[maxTblNum];
|
||||||
|
size_t n_;
|
||||||
|
public:
|
||||||
|
Pack() : tbl_(), n_(0) {}
|
||||||
|
Pack(const Xbyak::Reg64 *tbl, size_t n) { init(tbl, n); }
|
||||||
|
Pack(const Pack& rhs)
|
||||||
|
: n_(rhs.n_)
|
||||||
|
{
|
||||||
|
if (n_ > maxTblNum) throw Error(ERR_INTERNAL);
|
||||||
|
for (size_t i = 0; i < n_; i++) tbl_[i] = rhs.tbl_[i];
|
||||||
|
}
|
||||||
|
Pack(const Xbyak::Reg64& t0)
|
||||||
|
{ n_ = 1; tbl_[0] = &t0; }
|
||||||
|
Pack(const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||||
|
{ n_ = 2; tbl_[0] = &t0; tbl_[1] = &t1; }
|
||||||
|
Pack(const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||||
|
{ n_ = 3; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; }
|
||||||
|
Pack(const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||||
|
{ n_ = 4; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; }
|
||||||
|
Pack(const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||||
|
{ n_ = 5; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; }
|
||||||
|
Pack(const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||||
|
{ n_ = 6; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; }
|
||||||
|
Pack(const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||||
|
{ n_ = 7; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; }
|
||||||
|
Pack(const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||||
|
{ n_ = 8; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; }
|
||||||
|
Pack(const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||||
|
{ n_ = 9; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; }
|
||||||
|
Pack(const Xbyak::Reg64& t9, const Xbyak::Reg64& t8, const Xbyak::Reg64& t7, const Xbyak::Reg64& t6, const Xbyak::Reg64& t5, const Xbyak::Reg64& t4, const Xbyak::Reg64& t3, const Xbyak::Reg64& t2, const Xbyak::Reg64& t1, const Xbyak::Reg64& t0)
|
||||||
|
{ n_ = 10; tbl_[0] = &t0; tbl_[1] = &t1; tbl_[2] = &t2; tbl_[3] = &t3; tbl_[4] = &t4; tbl_[5] = &t5; tbl_[6] = &t6; tbl_[7] = &t7; tbl_[8] = &t8; tbl_[9] = &t9; }
|
||||||
|
Pack& append(const Xbyak::Reg64& t)
|
||||||
|
{
|
||||||
|
if (n_ == 10) {
|
||||||
|
fprintf(stderr, "ERR Pack::can't append\n");
|
||||||
|
throw Error(ERR_BAD_PARAMETER);
|
||||||
|
}
|
||||||
|
tbl_[n_++] = &t;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
void init(const Xbyak::Reg64 *tbl, size_t n)
|
||||||
|
{
|
||||||
|
if (n > maxTblNum) {
|
||||||
|
fprintf(stderr, "ERR Pack::init bad n=%d\n", (int)n);
|
||||||
|
throw Error(ERR_BAD_PARAMETER);
|
||||||
|
}
|
||||||
|
n_ = n;
|
||||||
|
for (size_t i = 0; i < n; i++) {
|
||||||
|
tbl_[i] = &tbl[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const Xbyak::Reg64& operator[](size_t n) const
|
||||||
|
{
|
||||||
|
if (n >= n_) {
|
||||||
|
fprintf(stderr, "ERR Pack bad n=%d\n", (int)n);
|
||||||
|
throw Error(ERR_BAD_PARAMETER);
|
||||||
|
}
|
||||||
|
return *tbl_[n];
|
||||||
|
}
|
||||||
|
size_t size() const { return n_; }
|
||||||
|
/*
|
||||||
|
get tbl[pos, pos + num)
|
||||||
|
*/
|
||||||
|
Pack sub(size_t pos, size_t num = size_t(-1)) const
|
||||||
|
{
|
||||||
|
if (num == size_t(-1)) num = n_ - pos;
|
||||||
|
if (pos + num > n_) {
|
||||||
|
fprintf(stderr, "ERR Pack::sub bad pos=%d, num=%d\n", (int)pos, (int)num);
|
||||||
|
throw Error(ERR_BAD_PARAMETER);
|
||||||
|
}
|
||||||
|
Pack pack;
|
||||||
|
pack.n_ = num;
|
||||||
|
for (size_t i = 0; i < num; i++) {
|
||||||
|
pack.tbl_[i] = tbl_[pos + i];
|
||||||
|
}
|
||||||
|
return pack;
|
||||||
|
}
|
||||||
|
void put() const
|
||||||
|
{
|
||||||
|
for (size_t i = 0; i < n_; i++) {
|
||||||
|
printf("%s ", tbl_[i]->toString());
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class StackFrame {
|
||||||
|
#ifdef XBYAK64_WIN
|
||||||
|
static const int noSaveNum = 6;
|
||||||
|
static const int rcxPos = 0;
|
||||||
|
static const int rdxPos = 1;
|
||||||
|
#else
|
||||||
|
static const int noSaveNum = 8;
|
||||||
|
static const int rcxPos = 3;
|
||||||
|
static const int rdxPos = 2;
|
||||||
|
#endif
|
||||||
|
Xbyak::CodeGenerator *code_;
|
||||||
|
int pNum_;
|
||||||
|
int tNum_;
|
||||||
|
bool useRcx_;
|
||||||
|
bool useRdx_;
|
||||||
|
int saveNum_;
|
||||||
|
int P_;
|
||||||
|
bool makeEpilog_;
|
||||||
|
Xbyak::Reg64 pTbl_[4];
|
||||||
|
Xbyak::Reg64 tTbl_[10];
|
||||||
|
Pack p_;
|
||||||
|
Pack t_;
|
||||||
|
StackFrame(const StackFrame&);
|
||||||
|
void operator=(const StackFrame&);
|
||||||
|
public:
|
||||||
|
const Pack& p;
|
||||||
|
const Pack& t;
|
||||||
|
/*
|
||||||
|
make stack frame
|
||||||
|
@param sf [in] this
|
||||||
|
@param pNum [in] num of function parameter(0 <= pNum <= 4)
|
||||||
|
@param tNum [in] num of temporary register(0 <= tNum <= 10, with UseRCX, UseRDX)
|
||||||
|
@param stackSizeByte [in] local stack size
|
||||||
|
@param makeEpilog [in] automatically call close() if true
|
||||||
|
|
||||||
|
you can use
|
||||||
|
rax
|
||||||
|
gp0, ..., gp(pNum - 1)
|
||||||
|
gt0, ..., gt(tNum-1)
|
||||||
|
rcx if tNum & UseRCX
|
||||||
|
rdx if tNum & UseRDX
|
||||||
|
rsp[0..stackSizeByte - 1]
|
||||||
|
*/
|
||||||
|
StackFrame(Xbyak::CodeGenerator *code, int pNum, int tNum = 0, int stackSizeByte = 0, bool makeEpilog = true)
|
||||||
|
: code_(code)
|
||||||
|
, pNum_(pNum)
|
||||||
|
, tNum_(tNum & ~(UseRCX | UseRDX))
|
||||||
|
, useRcx_((tNum & UseRCX) != 0)
|
||||||
|
, useRdx_((tNum & UseRDX) != 0)
|
||||||
|
, saveNum_(0)
|
||||||
|
, P_(0)
|
||||||
|
, makeEpilog_(makeEpilog)
|
||||||
|
, p(p_)
|
||||||
|
, t(t_)
|
||||||
|
{
|
||||||
|
using namespace Xbyak;
|
||||||
|
if (pNum < 0 || pNum > 4) throw Error(ERR_BAD_PNUM);
|
||||||
|
const int allRegNum = pNum + tNum_ + (useRcx_ ? 1 : 0) + (useRdx_ ? 1 : 0);
|
||||||
|
if (allRegNum < pNum || allRegNum > 14) throw Error(ERR_BAD_TNUM);
|
||||||
|
const Reg64& _rsp = code->rsp;
|
||||||
|
const AddressFrame& _ptr = code->ptr;
|
||||||
|
saveNum_ = (std::max)(0, allRegNum - noSaveNum);
|
||||||
|
const int *tbl = getOrderTbl() + noSaveNum;
|
||||||
|
P_ = saveNum_ + (stackSizeByte + 7) / 8;
|
||||||
|
if (P_ > 0 && (P_ & 1) == 0) P_++; // here (rsp % 16) == 8, then increment P_ for 16 byte alignment
|
||||||
|
P_ *= 8;
|
||||||
|
if (P_ > 0) code->sub(_rsp, P_);
|
||||||
|
#ifdef XBYAK64_WIN
|
||||||
|
for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
|
||||||
|
code->mov(_ptr [_rsp + P_ + (i + 1) * 8], Reg64(tbl[i]));
|
||||||
|
}
|
||||||
|
for (int i = 4; i < saveNum_; i++) {
|
||||||
|
code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
for (int i = 0; i < saveNum_; i++) {
|
||||||
|
code->mov(_ptr [_rsp + P_ - 8 * (saveNum_ - i)], Reg64(tbl[i]));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
int pos = 0;
|
||||||
|
for (int i = 0; i < pNum; i++) {
|
||||||
|
pTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
|
||||||
|
}
|
||||||
|
for (int i = 0; i < tNum_; i++) {
|
||||||
|
tTbl_[i] = Xbyak::Reg64(getRegIdx(pos));
|
||||||
|
}
|
||||||
|
if (useRcx_ && rcxPos < pNum) code_->mov(code_->r10, code_->rcx);
|
||||||
|
if (useRdx_ && rdxPos < pNum) code_->mov(code_->r11, code_->rdx);
|
||||||
|
p_.init(pTbl_, pNum);
|
||||||
|
t_.init(tTbl_, tNum_);
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
make epilog manually
|
||||||
|
@param callRet [in] call ret() if true
|
||||||
|
*/
|
||||||
|
void close(bool callRet = true)
|
||||||
|
{
|
||||||
|
using namespace Xbyak;
|
||||||
|
const Reg64& _rsp = code_->rsp;
|
||||||
|
const AddressFrame& _ptr = code_->ptr;
|
||||||
|
const int *tbl = getOrderTbl() + noSaveNum;
|
||||||
|
#ifdef XBYAK64_WIN
|
||||||
|
for (int i = 0; i < (std::min)(saveNum_, 4); i++) {
|
||||||
|
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ + (i + 1) * 8]);
|
||||||
|
}
|
||||||
|
for (int i = 4; i < saveNum_; i++) {
|
||||||
|
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
for (int i = 0; i < saveNum_; i++) {
|
||||||
|
code_->mov(Reg64(tbl[i]), _ptr [_rsp + P_ - 8 * (saveNum_ - i)]);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
if (P_ > 0) code_->add(_rsp, P_);
|
||||||
|
|
||||||
|
if (callRet) code_->ret();
|
||||||
|
}
|
||||||
|
~StackFrame()
|
||||||
|
{
|
||||||
|
if (!makeEpilog_) return;
|
||||||
|
try {
|
||||||
|
close();
|
||||||
|
} catch (std::exception& e) {
|
||||||
|
printf("ERR:StackFrame %s\n", e.what());
|
||||||
|
exit(1);
|
||||||
|
} catch (...) {
|
||||||
|
printf("ERR:StackFrame otherwise\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
private:
|
||||||
|
const int *getOrderTbl() const
|
||||||
|
{
|
||||||
|
using namespace Xbyak;
|
||||||
|
static const int tbl[] = {
|
||||||
|
#ifdef XBYAK64_WIN
|
||||||
|
Operand::RCX, Operand::RDX, Operand::R8, Operand::R9, Operand::R10, Operand::R11, Operand::RDI, Operand::RSI,
|
||||||
|
#else
|
||||||
|
Operand::RDI, Operand::RSI, Operand::RDX, Operand::RCX, Operand::R8, Operand::R9, Operand::R10, Operand::R11,
|
||||||
|
#endif
|
||||||
|
Operand::RBX, Operand::RBP, Operand::R12, Operand::R13, Operand::R14, Operand::R15
|
||||||
|
};
|
||||||
|
return &tbl[0];
|
||||||
|
}
|
||||||
|
int getRegIdx(int& pos) const
|
||||||
|
{
|
||||||
|
assert(pos < 14);
|
||||||
|
using namespace Xbyak;
|
||||||
|
const int *tbl = getOrderTbl();
|
||||||
|
int r = tbl[pos++];
|
||||||
|
if (useRcx_) {
|
||||||
|
if (r == Operand::RCX) { return Operand::R10; }
|
||||||
|
if (r == Operand::R10) { r = tbl[pos++]; }
|
||||||
|
}
|
||||||
|
if (useRdx_) {
|
||||||
|
if (r == Operand::RDX) { return Operand::R11; }
|
||||||
|
if (r == Operand::R11) { return tbl[pos++]; }
|
||||||
|
}
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
} } // end of util
|
||||||
|
#endif
|
Loading…
Reference in New Issue