diff --git a/amd64.c b/amd64.c index 5f1cfca9..37122f44 100644 --- a/amd64.c +++ b/amd64.c @@ -1,11 +1,16 @@ /* - libco.amd64 (2009-10-12) + libco.amd64 (2015-06-19) author: byuu license: public domain */ #define LIBCO_C #include "libco.h" + +//Win64 only: provides a substantial speed-up, but will thrash XMM regs +//do not use this unless you are certain your application won't use SSE +//#define LIBCO_AMD64_NO_SSE + #include #include @@ -18,21 +23,54 @@ static thread_local cothread_t co_active_handle = 0; static void (*co_swap)(cothread_t, cothread_t) = 0; #ifdef _WIN32 - //ABI: Win64 + /* ABI: Win64 */ static unsigned char co_swap_function[] = { - 0x48, 0x89, 0x22, 0x48, 0x8B, 0x21, 0x58, 0x48, 0x89, 0x6A, 0x08, 0x48, 0x89, 0x72, 0x10, 0x48, - 0x89, 0x7A, 0x18, 0x48, 0x89, 0x5A, 0x20, 0x4C, 0x89, 0x62, 0x28, 0x4C, 0x89, 0x6A, 0x30, 0x4C, - 0x89, 0x72, 0x38, 0x4C, 0x89, 0x7A, 0x40, 0x48, 0x81, 0xC2, 0x80, 0x00, 0x00, 0x00, 0x48, 0x83, - 0xE2, 0xF0, 0x0F, 0x29, 0x32, 0x0F, 0x29, 0x7A, 0x10, 0x44, 0x0F, 0x29, 0x42, 0x20, 0x44, 0x0F, - 0x29, 0x4A, 0x30, 0x44, 0x0F, 0x29, 0x52, 0x40, 0x44, 0x0F, 0x29, 0x5A, 0x50, 0x44, 0x0F, 0x29, - 0x62, 0x60, 0x44, 0x0F, 0x29, 0x6A, 0x70, 0x44, 0x0F, 0x29, 0xB2, 0x80, 0x00, 0x00, 0x00, 0x44, - 0x0F, 0x29, 0xBA, 0x90, 0x00, 0x00, 0x00, 0x48, 0x8B, 0x69, 0x08, 0x48, 0x8B, 0x71, 0x10, 0x48, - 0x8B, 0x79, 0x18, 0x48, 0x8B, 0x59, 0x20, 0x4C, 0x8B, 0x61, 0x28, 0x4C, 0x8B, 0x69, 0x30, 0x4C, - 0x8B, 0x71, 0x38, 0x4C, 0x8B, 0x79, 0x40, 0x48, 0x81, 0xC1, 0x80, 0x00, 0x00, 0x00, 0x48, 0x83, - 0xE1, 0xF0, 0x0F, 0x29, 0x31, 0x0F, 0x29, 0x79, 0x10, 0x44, 0x0F, 0x29, 0x41, 0x20, 0x44, 0x0F, - 0x29, 0x49, 0x30, 0x44, 0x0F, 0x29, 0x51, 0x40, 0x44, 0x0F, 0x29, 0x59, 0x50, 0x44, 0x0F, 0x29, - 0x61, 0x60, 0x44, 0x0F, 0x29, 0x69, 0x70, 0x44, 0x0F, 0x29, 0xB1, 0x80, 0x00, 0x00, 0x00, 0x44, - 0x0F, 0x29, 0xB9, 0x90, 0x00, 0x00, 0x00, 0xFF, 0xE0, + 0x48, 0x89, 0x22, /* mov [rdx],rsp */ + 0x48, 0x8b, 0x21, /* mov rsp,[rcx] */ + 0x58, /* pop rax */ + 0x48, 0x89, 0x6a, 0x08, /* mov [rdx+ 8],rbp */ + 0x48, 0x89, 0x72, 0x10, /* mov [rdx+16],rsi */ + 0x48, 0x89, 0x7a, 0x18, /* mov [rdx+24],rdi */ + 0x48, 0x89, 0x5a, 0x20, /* mov [rdx+32],rbx */ + 0x4c, 0x89, 0x62, 0x28, /* mov [rdx+40],r12 */ + 0x4c, 0x89, 0x6a, 0x30, /* mov [rdx+48],r13 */ + 0x4c, 0x89, 0x72, 0x38, /* mov [rdx+56],r14 */ + 0x4c, 0x89, 0x7a, 0x40, /* mov [rdx+64],r15 */ + #if !defined(LIBCO_AMD64_NO_SSE) + 0x0f, 0x29, 0x72, 0x50, /* movaps [rdx+ 80],xmm6 */ + 0x0f, 0x29, 0x7a, 0x60, /* movaps [rdx+ 96],xmm7 */ + 0x44, 0x0f, 0x29, 0x42, 0x70, /* movaps [rdx+112],xmm8 */ + 0x48, 0x83, 0xc2, 0x70, /* add rdx,112 */ + 0x44, 0x0f, 0x29, 0x4a, 0x10, /* movaps [rdx+ 16],xmm9 */ + 0x44, 0x0f, 0x29, 0x52, 0x20, /* movaps [rdx+ 32],xmm10 */ + 0x44, 0x0f, 0x29, 0x5a, 0x30, /* movaps [rdx+ 48],xmm11 */ + 0x44, 0x0f, 0x29, 0x62, 0x40, /* movaps [rdx+ 64],xmm12 */ + 0x44, 0x0f, 0x29, 0x6a, 0x50, /* movaps [rdx+ 80],xmm13 */ + 0x44, 0x0f, 0x29, 0x72, 0x60, /* movaps [rdx+ 96],xmm14 */ + 0x44, 0x0f, 0x29, 0x7a, 0x70, /* movaps [rdx+112],xmm15 */ + #endif + 0x48, 0x8b, 0x69, 0x08, /* mov rbp,[rcx+ 8] */ + 0x48, 0x8b, 0x71, 0x10, /* mov rsi,[rcx+16] */ + 0x48, 0x8b, 0x79, 0x18, /* mov rdi,[rcx+24] */ + 0x48, 0x8b, 0x59, 0x20, /* mov rbx,[rcx+32] */ + 0x4c, 0x8b, 0x61, 0x28, /* mov r12,[rcx+40] */ + 0x4c, 0x8b, 0x69, 0x30, /* mov r13,[rcx+48] */ + 0x4c, 0x8b, 0x71, 0x38, /* mov r14,[rcx+56] */ + 0x4c, 0x8b, 0x79, 0x40, /* mov r15,[rcx+64] */ + #if !defined(LIBCO_AMD64_NO_SSE) + 0x0f, 0x28, 0x71, 0x50, /* movaps xmm6, [rcx+ 80] */ + 0x0f, 0x28, 0x79, 0x60, /* movaps xmm7, [rcx+ 96] */ + 0x44, 0x0f, 0x28, 0x41, 0x70, /* movaps xmm8, [rcx+112] */ + 0x48, 0x83, 0xc1, 0x70, /* add rcx,112 */ + 0x44, 0x0f, 0x28, 0x49, 0x10, /* movaps xmm9, [rcx+ 16] */ + 0x44, 0x0f, 0x28, 0x51, 0x20, /* movaps xmm10,[rcx+ 32] */ + 0x44, 0x0f, 0x28, 0x59, 0x30, /* movaps xmm11,[rcx+ 48] */ + 0x44, 0x0f, 0x28, 0x61, 0x40, /* movaps xmm12,[rcx+ 64] */ + 0x44, 0x0f, 0x28, 0x69, 0x50, /* movaps xmm13,[rcx+ 80] */ + 0x44, 0x0f, 0x28, 0x71, 0x60, /* movaps xmm14,[rcx+ 96] */ + 0x44, 0x0f, 0x28, 0x79, 0x70, /* movaps xmm15,[rcx+112] */ + #endif + 0xff, 0xe0, /* jmp rax */ }; #include @@ -42,12 +80,24 @@ static void (*co_swap)(cothread_t, cothread_t) = 0; VirtualProtect(co_swap_function, sizeof co_swap_function, PAGE_EXECUTE_READWRITE, &old_privileges); } #else - //ABI: SystemV + /* ABI: SystemV */ static unsigned char co_swap_function[] = { - 0x48, 0x89, 0x26, 0x48, 0x8B, 0x27, 0x58, 0x48, 0x89, 0x6E, 0x08, 0x48, 0x89, 0x5E, 0x10, 0x4C, - 0x89, 0x66, 0x18, 0x4C, 0x89, 0x6E, 0x20, 0x4C, 0x89, 0x76, 0x28, 0x4C, 0x89, 0x7E, 0x30, 0x48, - 0x8B, 0x6F, 0x08, 0x48, 0x8B, 0x5F, 0x10, 0x4C, 0x8B, 0x67, 0x18, 0x4C, 0x8B, 0x6F, 0x20, 0x4C, - 0x8B, 0x77, 0x28, 0x4C, 0x8B, 0x7F, 0x30, 0xFF, 0xE0, + 0x48, 0x89, 0x26, /* mov [rsi],rsp */ + 0x48, 0x8b, 0x27, /* mov rsp,[rdi] */ + 0x58, /* pop rax */ + 0x48, 0x89, 0x6e, 0x08, /* mov [rsi+ 8],rbp */ + 0x48, 0x89, 0x5e, 0x10, /* mov [rsi+16],rbx */ + 0x4c, 0x89, 0x66, 0x18, /* mov [rsi+24],r12 */ + 0x4c, 0x89, 0x6e, 0x20, /* mov [rsi+32],r13 */ + 0x4c, 0x89, 0x76, 0x28, /* mov [rsi+40],r14 */ + 0x4c, 0x89, 0x7e, 0x30, /* mov [rsi+48],r15 */ + 0x48, 0x8b, 0x6f, 0x08, /* mov rbp,[rdi+ 8] */ + 0x48, 0x8b, 0x5f, 0x10, /* mov rbx,[rdi+16] */ + 0x4c, 0x8b, 0x67, 0x18, /* mov r12,[rdi+24] */ + 0x4c, 0x8b, 0x6f, 0x20, /* mov r13,[rdi+32] */ + 0x4c, 0x8b, 0x77, 0x28, /* mov r14,[rdi+40] */ + 0x4c, 0x8b, 0x7f, 0x30, /* mov r15,[rdi+48] */ + 0xff, 0xe0, /* jmp rax */ }; #include @@ -62,7 +112,7 @@ static void (*co_swap)(cothread_t, cothread_t) = 0; #endif static void crash() { - assert(0); /* called only if cothread_t entrypoint returns */ + assert(0); /* called only if cothread_t entrypoint returns */ } cothread_t co_active() { @@ -77,14 +127,14 @@ cothread_t co_create(unsigned int size, void (*entrypoint)(void)) { co_swap = (void (*)(cothread_t, cothread_t))co_swap_function; } if(!co_active_handle) co_active_handle = &co_active_buffer; - size += 512; /* allocate additional space for storage */ - size &= ~15; /* align stack to 16-byte boundary */ + size += 512; /* allocate additional space for storage */ + size &= ~15; /* align stack to 16-byte boundary */ if(handle = (cothread_t)malloc(size)) { - long long *p = (long long*)((char*)handle + size); /* seek to top of stack */ - *--p = (long long)crash; /* crash if entrypoint returns */ - *--p = (long long)entrypoint; /* start of function */ - *(long long*)handle = (long long)p; /* stack pointer */ + long long *p = (long long*)((char*)handle + size); /* seek to top of stack */ + *--p = (long long)crash; /* crash if entrypoint returns */ + *--p = (long long)entrypoint; /* start of function */ + *(long long*)handle = (long long)p; /* stack pointer */ } return handle; diff --git a/arm.c b/arm.c new file mode 100644 index 00000000..70dbdd1b --- /dev/null +++ b/arm.c @@ -0,0 +1,71 @@ +/* + libco.arm (2015-06-18) + author: byuu + license: public domain +*/ + +#define LIBCO_C +#include "libco.h" + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +static thread_local unsigned long co_active_buffer[64]; +static thread_local cothread_t co_active_handle = 0; +static void (*co_swap)(cothread_t, cothread_t) = 0; + +static unsigned long co_swap_function[] = { + 0xe8a16ff0, /* stmia r1!, {r4-r11,sp,lr} */ + 0xe8b0aff0, /* ldmia r0!, {r4-r11,sp,pc} */ + 0xe12fff1e, /* bx lr */ +}; + +void co_init() { + unsigned long addr = (unsigned long)co_swap_function; + unsigned long base = addr - (addr % sysconf(_SC_PAGESIZE)); + unsigned long size = (addr - base) + sizeof co_swap_function; + mprotect((void*)base, size, PROT_READ | PROT_WRITE | PROT_EXEC); +} + +cothread_t co_active() { + if(!co_active_handle) co_active_handle = &co_active_buffer; + return co_active_handle; +} + +cothread_t co_create(unsigned int size, void (*entrypoint)(void)) { + unsigned long* handle = 0; + if(!co_swap) { + co_init(); + co_swap = (void (*)(cothread_t, cothread_t))co_swap_function; + } + if(!co_active_handle) co_active_handle = &co_active_buffer; + size += 256; + size &= ~15; + + if(handle = (unsigned long*)malloc(size)) { + unsigned long* p = (unsigned long*)((unsigned char*)handle + size); + handle[8] = (unsigned long)p; + handle[9] = (unsigned long)entrypoint; + } + + return handle; +} + +void co_delete(cothread_t handle) { + free(handle); +} + +void co_switch(cothread_t handle) { + cothread_t co_previous_handle = co_active_handle; + co_swap(co_active_handle = handle, co_previous_handle); +} + +#ifdef __cplusplus +} +#endif diff --git a/fiber.c b/fiber.c index 02ef5bc7..f57c0799 100644 --- a/fiber.c +++ b/fiber.c @@ -6,9 +6,9 @@ #define LIBCO_C #include "libco.h" + #define WINVER 0x0400 #define _WIN32_WINNT 0x0400 -#define WIN32_LEAN_AND_MEAN #include #ifdef __cplusplus diff --git a/libco.c b/libco.c index 55676263..c48ffd97 100644 --- a/libco.c +++ b/libco.c @@ -1,23 +1,30 @@ /* libco - auto-selection module license: public domain */ -#if defined(__GNUC__) && defined(__i386__) - #include "x86.c" -#elif defined(__GNUC__) && defined(__amd64__) - #include "amd64.c" -#elif defined(__GNUC__) && defined(_ARCH_PPC) - #include "ppc.c" -#elif defined(__GNUC__) - #include "sjlj.c" -#elif defined(_MSC_VER) && defined(_M_IX86) - #include "x86.c" -#elif defined(_MSC_VER) && defined(_M_AMD64) - #include "amd64.c" +#if defined(__clang__) || defined(__GNUC__) + #if defined(__i386__) + #include "x86.c" + #elif defined(__amd64__) + #include "amd64.c" + #elif defined(__arm__) + #include "arm.c" + #elif defined(_ARCH_PPC) + #include "ppc.c" + #elif defined(_WIN32) + #include "fiber.c" + #else + #include "sjlj.c" + #endif #elif defined(_MSC_VER) - #include "fiber.c" + #if defined(_M_IX86) + #include "x86.c" + #elif defined(_M_AMD64) + #include "amd64.c" + #else + #include "fiber.c" + #endif #else #error "libco: unsupported processor, compiler or operating system" #endif diff --git a/libco.h b/libco.h index deb954fb..1851696e 100644 --- a/libco.h +++ b/libco.h @@ -1,6 +1,7 @@ /* libco - version: 0.16 (2010-12-24) + version: 0.17 (2015-06-18) + author: byuu license: public domain */ diff --git a/ppc.c b/ppc.c index a6028fdb..d509cd9e 100644 --- a/ppc.c +++ b/ppc.c @@ -9,6 +9,7 @@ floating-point and AltiVec save/restore */ #define LIBCO_C #include "libco.h" + #include #include #include diff --git a/sjlj.c b/sjlj.c index 8b72b614..9203efe7 100644 --- a/sjlj.c +++ b/sjlj.c @@ -13,6 +13,7 @@ #define LIBCO_C #include "libco.h" + #include #include #include diff --git a/x86.c b/x86.c index d8f820b0..44bbe4b8 100644 --- a/x86.c +++ b/x86.c @@ -6,6 +6,7 @@ #define LIBCO_C #include "libco.h" + #include #include @@ -13,10 +14,10 @@ extern "C" { #endif -#if defined(_MSC_VER) - #define fastcall __fastcall -#elif defined(__GNUC__) +#if defined(__clang__) || defined(__GNUC__) #define fastcall __attribute__((fastcall)) +#elif defined(_MSC_VER) + #define fastcall __fastcall #else #error "libco: please define fastcall macro" #endif @@ -25,10 +26,20 @@ static thread_local long co_active_buffer[64]; static thread_local cothread_t co_active_handle = 0; static void (fastcall *co_swap)(cothread_t, cothread_t) = 0; -//ABI: fastcall +/* ABI: fastcall */ static unsigned char co_swap_function[] = { - 0x89, 0x22, 0x8B, 0x21, 0x58, 0x89, 0x6A, 0x04, 0x89, 0x72, 0x08, 0x89, 0x7A, 0x0C, 0x89, 0x5A, - 0x10, 0x8B, 0x69, 0x04, 0x8B, 0x71, 0x08, 0x8B, 0x79, 0x0C, 0x8B, 0x59, 0x10, 0xFF, 0xE0, + 0x89, 0x22, /* mov [edx],esp */ + 0x8b, 0x21, /* mov esp,[ecx] */ + 0x58, /* pop eax */ + 0x89, 0x6a, 0x04, /* mov [edx+ 4],ebp */ + 0x89, 0x72, 0x08, /* mov [edx+ 8],esi */ + 0x89, 0x7a, 0x0c, /* mov [edx+12],edi */ + 0x89, 0x5a, 0x10, /* mov [edx+16],ebx */ + 0x8b, 0x69, 0x04, /* mov ebp,[ecx+ 4] */ + 0x8b, 0x71, 0x08, /* mov esi,[ecx+ 8] */ + 0x8b, 0x79, 0x0c, /* mov edi,[ecx+12] */ + 0x8b, 0x59, 0x10, /* mov ebx,[ecx+16] */ + 0xff, 0xe0, /* jmp eax */ }; #ifdef _WIN32 @@ -51,7 +62,7 @@ static unsigned char co_swap_function[] = { #endif static void crash() { - assert(0); /* called only if cothread_t entrypoint returns */ + assert(0); /* called only if cothread_t entrypoint returns */ } cothread_t co_active() { @@ -66,14 +77,14 @@ cothread_t co_create(unsigned int size, void (*entrypoint)(void)) { co_swap = (void (fastcall*)(cothread_t, cothread_t))co_swap_function; } if(!co_active_handle) co_active_handle = &co_active_buffer; - size += 256; /* allocate additional space for storage */ - size &= ~15; /* align stack to 16-byte boundary */ + size += 256; /* allocate additional space for storage */ + size &= ~15; /* align stack to 16-byte boundary */ if(handle = (cothread_t)malloc(size)) { - long *p = (long*)((char*)handle + size); /* seek to top of stack */ - *--p = (long)crash; /* crash if entrypoint returns */ - *--p = (long)entrypoint; /* start of function */ - *(long*)handle = (long)p; /* stack pointer */ + long *p = (long*)((char*)handle + size); /* seek to top of stack */ + *--p = (long)crash; /* crash if entrypoint returns */ + *--p = (long)entrypoint; /* start of function */ + *(long*)handle = (long)p; /* stack pointer */ } return handle;