Fix a complete savestate snafu in waterbox

This was broken by 175556529e, with two related issues: when we allowed some operations to happen even while the block is inactive, we didn't account for the fact that in swapin, the block is technically not active yet (the lock is not yet held by self), and similarly in swapout, the lock has already been moved out of self. The former caused all memory areas to revert to RWX at the host OS level after a swap, so no dirty detection was done. Once the former was fixed, the latter still caused saved costacks to be missed.

At the same time we ran into a perfect storm with costacks on Windows: if a stack page is not yet dirty but we hit a fault for something else, Windows will not call our VEH handler unless the TIB stack extents are satisfactory, because it needs userspace to fix up the TIB extents via a VEH or SEH handler while an exception is already pending.
This commit is contained in:
nattthebear 2020-07-23 12:09:08 -04:00
parent ae6d512e11
commit e343f6bd26
9 changed files with 198 additions and 36 deletions

Binary file not shown.

Binary file not shown.

View File

@ -9,11 +9,24 @@ struc Context
.extcall_slots resq 64
endstruc
times 0-($-$$) int3 ; CALL_GUEST_IMPL_ADDR
; sets up guest stack and calls a function
; r11 - guest entry point
; r10 - address of context structure
; regular arg registers are 0..6 args passed through to guest
call_guest_impl:
; save host TIB data
mov rax, [gs:0x08]
push rax
mov rax, [gs:0x10]
push rax
; set guest TIB data
xor rax, rax
mov [gs:0x10], rax
sub rax, 1
mov [gs:0x08], rax
mov [gs:0x18], r10
mov [r10 + Context.host_rsp], rsp
mov rsp, [r10 + Context.guest_rsp]
@ -23,18 +36,16 @@ call_guest_impl:
mov rsp, [r10 + Context.host_rsp]
mov r11, 0
mov [gs:0x18], r11
; restore host TIB data
pop r10
mov [gs:0x10], r10
pop r10
mov [gs:0x08], r10
ret
align 64, int3
; alternative to guest call thunks for functions with 0 args
; rdi - guest entry point
; rsi - address of context structure
call_guest_simple:
mov r11, rdi
mov r10, rsi
jmp call_guest_impl
align 64, int3
times 0x80-($-$$) int3
; called by guest when it wishes to make a syscall
; must be loaded at fixed address, as that address is burned into guest executables
; rax - syscall number
@ -43,16 +54,52 @@ guest_syscall:
mov r10, [gs:0x18]
mov [r10 + Context.guest_rsp], rsp
mov rsp, [r10 + Context.host_rsp]
; restore host TIB data
mov r11, [rsp]
mov [gs:0x10], r11
mov r11, [rsp + 8]
mov [gs:0x08], r11
sub rsp, 8 ; align
mov r11, [r10 + Context.host_ptr]
push r11 ; arg 8 to dispatch_syscall: host
push rax ; arg 7 to dispatch_syscall: nr
mov rax, [r10 + Context.dispatch_syscall]
call rax
; set guest TIB data
xor r10, r10
mov [gs:0x10], r10
sub r10, 1
mov [gs:0x08], r10
mov r10, [gs:0x18]
mov rsp, [r10 + Context.guest_rsp]
ret
align 64, int3
times 0x100-($-$$) int3 ; CALL_GUEST_SIMPLE_ADDR
; alternative to guest call thunks for functions with 0 args
; rdi - guest entry point
; rsi - address of context structure
call_guest_simple:
mov r11, rdi
mov r10, rsi
jmp call_guest_impl
times 0x200-($-$$) int3 ; EXTCALL_THUNK_ADDR
; individual thunks to each of 64 call slots
; should be in fixed locations for memory hygiene in the core, since they may be stored there for some time
%macro guest_extcall_thunk 1
mov rax, %1
jmp guest_extcall_impl
align 16, int3
%endmacro
%assign j 0
%rep 64
guest_extcall_thunk j
%assign j j+1
%endrep
; called by individual extcall thunks when the guest wishes to make an external call
; (very similar to guest_syscall)
@ -62,25 +109,23 @@ guest_extcall_impl:
mov r10, [gs:0x18]
mov [r10 + Context.guest_rsp], rsp
mov rsp, [r10 + Context.host_rsp]
; restore host TIB data
mov r11, [rsp]
mov [gs:0x10], r11
mov r11, [rsp + 8]
mov [gs:0x08], r11
mov r11, [r10 + Context.extcall_slots + rax * 8] ; get slot ptr
sub rsp, 8 ; align
call r11
; set guest TIB data
xor r10, r10
mov [gs:0x10], r10
sub r10, 1
mov [gs:0x08], r10
mov r10, [gs:0x18]
mov rsp, [r10 + Context.guest_rsp]
ret
align 64, int3
; individual thunks to each of 64 call slots
; should be in fixed locations for memory hygiene in the core, since they may be stored there for some time
%macro guest_extcall_thunk 1
mov rax, %1
jmp guest_extcall_impl
align 16, int3
%endmacro
%assign j 0
%rep 64
guest_extcall_thunk j
%assign j j+1
%endrep

View File

@ -10,14 +10,15 @@ pub mod thunks;
const ORG: usize = 0x35f00000000;
const CALL_GUEST_IMPL_ADDR: usize = ORG;
const CALL_GUEST_SIMPLE_ADDR: usize = ORG + 64;
const CALL_GUEST_SIMPLE_ADDR: usize = ORG + 0x100;
const EXTCALL_THUNK_ADDR: usize = ORG + 0x200;
pub const CALLBACK_SLOTS: usize = 64;
/// Retrieves a function pointer suitable for sending to the guest that will cause
/// the host to callback to `slot` when called. Slot must be less than CALLBACK_SLOTS
pub fn get_callback_ptr(slot: usize) -> usize{
assert!(slot < CALLBACK_SLOTS);
ORG + 0x100 + slot * 16
EXTCALL_THUNK_ADDR + slot * 16
}
fn init_interop_area() -> AddressRange {

View File

@ -2,6 +2,7 @@
#![feature(try_trait)]
#![feature(core_intrinsics)]
#![feature(asm)]
#![allow(dead_code)]

View File

@ -233,6 +233,8 @@ pub struct MemoryBlock {
debug_id: u32,
active_guard: Option<BlockGuard>,
/// Pretend this block is active. Used internally by activate/deactivate for blocks that are about to gain / just lost the lock.
assume_active: bool,
}
type BlockGuard = MutexGuard<'static, Option<MemoryBlockRef>>;
@ -277,6 +279,7 @@ impl MemoryBlock {
debug_id,
active_guard: None,
assume_active: false,
});
// res.trace("new");
res
@ -290,7 +293,7 @@ impl MemoryBlock {
}
pub fn active(&self) -> bool {
match self.active_guard {
self.assume_active || match self.active_guard {
Some(_) => true,
None => false
}
@ -360,11 +363,15 @@ impl MemoryBlock {
// self.trace("swapin");
pal::map_handle(&self.handle, self.addr).unwrap();
tripguard::register(self);
self.assume_active = true;
self.refresh_all_protections();
self.assume_active = false;
}
unsafe fn swapout(&mut self) {
// self.trace("swapout");
self.assume_active = true;
self.get_stack_dirty();
self.assume_active = false;
pal::unmap_handle(self.addr).unwrap();
tripguard::unregister(self);
}
@ -763,6 +770,7 @@ impl MemoryBlock {
if self.sealed {
return Err(anyhow!("Already sealed!"))
}
self.get_stack_dirty();
for p in self.pages.iter_mut() {
if p.dirty && !p.invisible {

View File

@ -63,6 +63,7 @@ mod win {
/// Leaks if not later unmapped
/// addr.start can be 0, which means the OS chooses a location, or non-zero, which gives fixed address behavior
/// Returned address range will be identical in the case of non-zero, or give the actual address in the case of zero.
/// Returned address range will have no access permissions initially; use protect() to change that.
pub fn map_handle(handle: &Handle, addr: AddressRange) -> anyhow::Result<AddressRange> {
unsafe {
let res = MapViewOfFileEx(
@ -76,7 +77,9 @@ mod win {
if res == null_mut() {
Err(error())
} else {
Ok(AddressRange { start: res as usize, size: addr.size })
let ret = AddressRange { start: res as usize, size: addr.size };
protect(ret, Protection::None).unwrap();
Ok(ret)
}
}
}
@ -203,6 +206,7 @@ mod nix {
/// Leaks if not later unmapped
/// addr.start can be 0, which means the OS chooses a location, or non-zero, which gives fixed address behavior
/// Returned address range will be identical in the case of non-zero, or give the actual address in the case of zero.
/// Returned address range will have no access permissions initially; use protect() to change that.
pub fn map_handle(handle: &Handle, addr: AddressRange) -> anyhow::Result<AddressRange> {
unsafe {
let mut flags = MAP_SHARED;
@ -211,7 +215,7 @@ mod nix {
}
let ptr = mmap(addr.start as *mut c_void,
addr.size,
PROT_READ | PROT_WRITE | PROT_EXEC,
PROT_NONE,
flags,
handle.0 as i32,
0

View File

@ -5,6 +5,44 @@ use super::*;
type TestResult = anyhow::Result<()>;
struct TibSave {
stack_top: usize,
stack_bottom: usize,
}
impl TibSave {
fn new() -> TibSave {
context::prepare_thread();
let mut res = TibSave {
stack_top: 0,
stack_bottom: 0,
};
// nt will decline to call ntdll!KiUserExceptionDispatcher if it doesn't like rsp. The most
// obvious example of this is if [rsp] is not writable, which is why we have the whole RWStack nonsense.
// But it seems there can be other issues if you hit a fault handler for a non-stack location while the
// stack has not yet been unguarded (see test_regular_fault_on_new_stack). So we add garbage values to TIB
// stack extents to fix things up.
// In normal, non-test code, this TIB manipulation happens in interop.s
unsafe {
asm!("mov {}, gs:0x08", out(reg) res.stack_top);
asm!("mov {}, gs:0x10", out(reg) res.stack_bottom);
asm!("mov gs:0x08, {}", in(reg) -1isize);
asm!("mov gs:0x10, {}", in(reg) 0isize);
}
res
}
}
impl Drop for TibSave {
fn drop(&mut self) {
unsafe {
asm!("mov gs:0x08, {}", in(reg) self.stack_top);
asm!("mov gs:0x10, {}", in(reg) self.stack_bottom);
}
}
}
/// new / drop, activate / deactivate
#[test]
fn test_create() {
@ -93,6 +131,8 @@ fn test_stk_norm() -> TestResult {
fn test_stack() -> TestResult {
use std::convert::TryInto;
unsafe {
let _t = TibSave::new();
let addr = AddressRange { start: 0x36f00000000, size: 0x10000 };
let mut b = MemoryBlock::new(addr);
b.activate();
@ -113,6 +153,9 @@ fn test_stack() -> TestResult {
let res = transmute::<usize, extern "sysv64" fn(rsp: usize) -> u8>(addr.start)(tmp_rsp);
assert_eq!(res, 42);
b.deactivate();
b.activate();
b.get_stack_dirty();
assert!(b.pages[0].dirty);
@ -127,6 +170,49 @@ fn test_stack() -> TestResult {
}
}
/// regular dirt detection, but we're on a usermode stack that has not yet been guard tripped (windows)
#[test]
fn test_regular_fault_on_new_stack() -> TestResult {
unsafe {
let _t = TibSave::new();
let addr = AddressRange { start: 0x36f00000000, size: 0x10000 };
let mut b = MemoryBlock::new(addr);
b.activate();
b.mmap_fixed(addr, Protection::RW, true)?;
let ptr = b.addr.slice_mut();
let mut i = 0;
ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xe0 ; i += 1; // mov rax,rsp
ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xfc ; i += 1; // mov rsp,rdi
// Important: Intentionally make no writes or reads from our stack area before tripping this access violation.
ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0x06 ; i += 1; // mov [rsi],rax
ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xc4 ; i += 1; // mov rsp,rax
ptr[i] = 0xc3 ; // ret
b.mprotect(AddressRange { start: 0x36f00000000, size: 0x1000 }, Protection::RX)?;
b.mprotect(AddressRange { start: 0x36f00001000, size: 0x1000 }, Protection::RW)?;
b.mprotect(AddressRange { start: 0x36f00008000, size: 0x8000 }, Protection::RWStack)?;
let tmp_rsp = addr.end();
let dest_ptr: usize = 0x36f00001000;
let res = transmute::<_, extern "sysv64" fn(rsp: usize, dest: usize) -> isize>(addr.start)(tmp_rsp, dest_ptr);
b.get_stack_dirty();
assert!(b.pages[0].dirty);
assert!(b.pages[1].dirty);
assert!(!b.pages[2].dirty);
// assert!(!b.pages[14].dirty);
// assert!(b.pages[15].dirty);
let real_rsp = *transmute::<_, *mut isize>(dest_ptr);
let current_rsp = &real_rsp as *const isize as isize;
assert!((real_rsp - current_rsp).abs() < 0x10000);
assert_eq!(real_rsp, res);
Ok(())
}
}
#[test]
fn test_state_basic() -> TestResult {
unsafe {
@ -139,6 +225,9 @@ fn test_state_basic() -> TestResult {
ptr[0x1000] = 40;
ptr[0x2000] = 60;
ptr[0x3000] = 80;
b.deactivate();
b.activate();
b.deactivate();
b.seal()?;
let mut state0 = Vec::new();
@ -147,9 +236,13 @@ fn test_state_basic() -> TestResult {
// no pages should be in the state
assert!(state0.len() < 0x1000);
b.activate();
ptr[0x1000] = 100;
ptr[0x3000] = 44;
b.deactivate();
let mut state1 = Vec::new();
b.save_state(&mut state1)?;
@ -159,18 +252,26 @@ fn test_state_basic() -> TestResult {
b.load_state(&mut state0.as_slice())?;
b.activate();
assert_eq!(ptr[0x0000], 20);
assert_eq!(ptr[0x1000], 40);
assert_eq!(ptr[0x2000], 60);
assert_eq!(ptr[0x3000], 80);
b.deactivate();
b.load_state(&mut state1.as_slice())?;
b.activate();
assert_eq!(ptr[0x0000], 20);
assert_eq!(ptr[0x1000], 100);
assert_eq!(ptr[0x2000], 60);
assert_eq!(ptr[0x3000], 44);
b.deactivate();
Ok(())
}
}
@ -227,13 +328,15 @@ fn test_thready_stack() -> TestResult {
let blocker = barrier.clone();
ress.push(thread::spawn(move|| {
unsafe {
let addr = AddressRange { start: 0x36000000000 + i * 0x100000000, size: PAGESIZE * 2 };
let _t = TibSave::new();
let addr = AddressRange { start: 0x36000000000 + i * 0x100000000, size: PAGESIZE * 8 };
let mut b = MemoryBlock::new(addr);
b.activate();
blocker.wait();
b.mmap_fixed(addr, Protection::RWX, true)?;
b.mprotect(AddressRange { start: addr.start + PAGESIZE, size: PAGESIZE }, Protection::RWStack)?;
b.mprotect(AddressRange { start: addr.start + PAGESIZE, size: PAGESIZE * 7 }, Protection::RWStack)?;
let ptr = b.addr.slice_mut();
let mut i = 0;
@ -248,7 +351,7 @@ fn test_thready_stack() -> TestResult {
b.seal()?;
assert!(!b.pages[0].dirty);
assert!(!b.pages[1].dirty);
assert!(!b.pages[7].dirty);
let tmp_rsp = addr.end();
let res = transmute::<usize, extern "sysv64" fn(rsp: usize) -> u8>(addr.start)(tmp_rsp);
assert_eq!(res, 42);
@ -256,7 +359,7 @@ fn test_thready_stack() -> TestResult {
b.get_stack_dirty();
assert!(!b.pages[0].dirty);
assert!(b.pages[1].dirty);
assert!(b.pages[7].dirty);
Ok(())
}