Experiment to speed up libco and hence libsnes (#2248)

The compiler now can fully inline the co_switch, and with most registers being specified as clobbers and not saved explicitly, the compiler can choose to save only what it needs to (we don't have to defensively save everything).

Practically speaking, the co_switch calls are usually inlined, but the functions they're in don't seem to be that big and don't make direct use of r12..r15 too much anyway, so (push r12..r15, switch, pop r12..r15) is a common emit. But I see a minuscule FPS increase.
This commit is contained in:
nattthebear 2020-07-22 16:13:19 -04:00 committed by GitHub
parent 966a2abe3f
commit 07c7c329d3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 42 additions and 66 deletions

Binary file not shown.

View File

@ -1,8 +1,8 @@
NO_WBX_TARGETS := 1
CCFLAGS := -std=c99 -Wall
SRCS := amd64.c coswap.s
AS := nasm
ASFLAGS := -f elf64 -Wall
SRCS := amd64.c #coswap.s
#AS := nasm
#ASFLAGS := -f elf64 -Wall
include ../common.mak
@ -10,7 +10,7 @@ $(OBJ_DIR)/%.s.o: %.s
@echo as $<
@mkdir -p $(@D)
@$(AS) $(ASFLAGS) -o $@ $<
$(DOBJ_DIR)/%.s.o: %.s
@echo as $<
@mkdir -p $(@D)
@$(AS) $(ASFLAGS) -o $@ $<
# $(DOBJ_DIR)/%.s.o: %.s
# @echo as $<
# @mkdir -p $(@D)
# @$(AS) $(ASFLAGS) -o $@ $<

View File

@ -20,12 +20,8 @@ typedef struct {
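// used by the context-switch code (coswap.s, or the inline asm in co_switch), which addresses these fields at fixed offsets from the handle; has to be at the beginning of the struct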
struct {
uint64_t rsp;
uint64_t rbp;
uint64_t rbx;
uint64_t r12;
uint64_t r13;
uint64_t r14;
uint64_t r15;
uint64_t rbp; // we have to save rbp because unless fomit-frame-pointer is set, the compiler regards it as "special" and won't allow clobbers
uint64_t rip;
} jmp_buf;
// points to the lowest address in the stack
// NB: because of guard space, this is not valid stack
@ -104,8 +100,8 @@ cothread_t co_create(unsigned int sz, void (*entrypoint)(void))
{
uint64_t* p = (uint64_t*)((char*)co->stack + co->stack_size); // seek to top of stack
*--p = (uint64_t)crash; // crash if entrypoint returns
*--p = (uint64_t)entrypoint; // start of function
co->jmp_buf.rsp = (uint64_t)p; // stack pointer
co->jmp_buf.rip = (uint64_t)entrypoint; // start of function
}
return co;
@ -116,40 +112,41 @@ void co_delete(cothread_t handle)
free_thread(handle);
}
// static uint64_t hoststart;
// static uint64_t hostend;
// Switch execution from the currently active cothread to `handle`.
//
// Records `handle` as co_active_handle, stores the outgoing context's rsp, rbp
// and a resume address into the previous handle (offsets 0, 8, 16), then loads
// rsp and rbp from the new handle and jumps to the address stored at its
// offset 16. Every other general-purpose register and zmm0..zmm15 is declared
// as a clobber instead of being saved here, so the compiler decides what it
// needs to preserve around the switch.
//
// NOTE(review): as shown, this body declares co_previous_handle twice and
// contains both a co_swap() call and the inline-asm switch. It looks like
// removed and added diff lines displayed together -- confirm which lines are
// live before changing any code here.
void co_switch(cothread_t handle)
{
cothread_impl* co = handle;
cothread_impl* co_previous_handle = co_active_handle;
co_active_handle = co;
// Disabled code below: bookkeeping of the stack limits kept at %gs:0x08 and
// %gs:0x10 when migrating between the real (host) thread and a cothread.
// uint64_t start;
// uint64_t end;
// if (co_active_handle == &co_host_buffer)
// {
// // migrating off of real thread; save stack params
// __asm__("movq %%gs:0x08, %0": "=r"(end));
// __asm__("movq %%gs:0x10, %0": "=r"(start));
// hoststart = start;
// hostend = end;
// }
// if (handle == &co_host_buffer)
// {
// // migrating onto real thread; load stack params
// start = hoststart;
// end = hostend;
// hoststart = 0;
// hostend = 0;
// }
// else
// {
// // migrating onto cothread; compute its extents we allocated them
// start = (uintptr_t)co->stack;
// end = start + co->stack_size;
// }
// __asm__("movq %0, %%gs:0x08":: "r"(end));
// __asm__("movq %0, %%gs:0x10":: "r"(start));
// Pin the two handles to rdi/rsi: the asm template below names those
// registers literally instead of using %0/%1 operand placeholders.
register uint64_t _rdi __asm__("rdi") = (uint64_t)co_previous_handle;
register uint64_t _rsi __asm__("rsi") = (uint64_t)co_active_handle;
// NOTE(review): the next two lines redeclare co_previous_handle and switch
// through the external co_swap routine; presumably the pre-change
// implementation -- verify against the actual file.
register cothread_impl* co_previous_handle = co_active_handle;
co_swap(co_active_handle = co, co_previous_handle);
/*
Intel-syntax rendering of the AT&T-syntax template that follows
(rdi = previous handle, rsi = new handle):
mov [rdi + 0], rsp
mov [rdi + 8], rbp
lea rax, [rip + 17]
mov [rdi + 16], rax
mov rsp, [rsi + 0]
mov rbp, [rsi + 8]
mov rax, [rsi + 16]
jmp rax
*/
__asm__(
"mov %%rsp, 0(%%rdi)\n" // save outgoing stack pointer
"mov %%rbp, 8(%%rdi)\n" // save outgoing frame pointer
// Resume address for the outgoing context. The literal 17 is presumably the
// total encoded size of the five instructions after this lea, so the saved
// address is the one right after the final jmp -- re-verify if any
// instruction in this template is changed.
"lea 17(%%rip), %%rax\n"
"mov %%rax, 16(%%rdi)\n" // store resume address
"mov 0(%%rsi), %%rsp\n" // adopt incoming stack pointer
"mov 8(%%rsi), %%rbp\n" // adopt incoming frame pointer
"mov 16(%%rsi), %%rax\n" // fetch incoming resume address
"jmp *%%rax\n" // continue the incoming context
// No outputs; the two pinned register variables are the only inputs.
::"r"(_rdi), "r"(_rsi)
// Clobbers: rbp is left out because it is saved/restored by hand above;
// rsi and rdi are left out because they carry the input operands.
:"rax", "rbx", "rcx", "rdx", /*"rbp",*/ /*"rsi", "rdi",*/ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9",
"zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
/*"zmm16", "zmm17", "zmm18", "zmm19",
"zmm20", "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
"zmm30", "zmm31",*/
// "memory": the compiler must not keep memory values cached in registers
// across the switch.
"memory"
);
}

View File

@ -1,21 +0,0 @@
; co_swap: cothread context switch.
; Loads the incoming context through rdi and stores the outgoing context
; through rsi. The C side calls it as co_swap(new_handle, previous_handle);
; the rdi/rsi mapping assumes the System V AMD64 calling convention -- confirm
; for the target in use.
; Context layout accessed here:
;   +0 rsp, +8 rbp, +16 rbx, +24 r12, +32 r13, +40 r14, +48 r15
; The resume address is not kept in the context: it is whatever sits on top of
; the saved stack (the return address when entered via call).
section .text
global co_swap
align 16
co_swap:
mov [rsi],rsp       ; save outgoing stack pointer
mov rsp,[rdi]       ; switch to the incoming stack
pop rax             ; take the incoming resume address off that stack
; save the outgoing context's remaining registers
mov [rsi+ 8],rbp
mov [rsi+16],rbx
mov [rsi+24],r12
mov [rsi+32],r13
mov [rsi+40],r14
mov [rsi+48],r15
; restore the same registers for the incoming context
mov rbp,[rdi+ 8]
mov rbx,[rdi+16]
mov r12,[rdi+24]
mov r13,[rdi+32]
mov r14,[rdi+40]
mov r15,[rdi+48]
jmp rax             ; resume the incoming context