Fix a complete savestate snafu in waterbox

This was broken by 175556529e, with two related issues. When we allowed some operations to happen even while a block is inactive, we didn't account for the fact that in swapin the block is technically not active yet (the lock is not yet stored on self), and similarly that in swapout the lock has already been moved out of self. The former caused all memory areas to revert to RWX at the host OS level after a swap, so no dirty detection was done; once that was fixed, the latter still caused saved costacks to be missed.

At the same time, we ran into a perfect storm with costacks on Windows: if a stack page is not yet dirty but we hit a fault for something else, Windows will not call our VEH handler unless the TIB stack extents are satisfactory, since it needs userspace to fix up the TIB extents via a VEH or SEH handler, but there's already an exception pending.
parent ae6d512e11
commit e343f6bd26

(3 changed binary files not shown)
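
The Windows issue described above is worked around by faking the TIB stack extents. A minimal sketch of the idea, assuming the x64 TEB layout (StackBase at gs:0x08, StackLimit at gs:0x10); the function name here is illustrative, and the diff below implements the real thing in interop.s and in the TibSave test helper:

    // Sketch: widen the TIB stack extents so NT considers any rsp in bounds
    // and will dispatch faults to our VEH handler even mid-stack-growth.
    unsafe fn widen_tib_extents() -> (usize, usize) {
        let (mut base, mut limit) = (0usize, 0usize);
        asm!("mov {}, gs:0x08", out(reg) base);    // save StackBase
        asm!("mov {}, gs:0x10", out(reg) limit);   // save StackLimit
        asm!("mov gs:0x08, {}", in(reg) -1isize);  // top = highest address
        asm!("mov gs:0x10, {}", in(reg) 0isize);   // bottom = 0
        (base, limit) // caller restores these afterwards, as TibSave::drop does
    }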
@@ -9,11 +9,24 @@ struc Context
     .extcall_slots resq 64
 endstruc
 
+times 0-($-$$) int3 ; CALL_GUEST_IMPL_ADDR
 ; sets up guest stack and calls a function
 ; r11 - guest entry point
 ; r10 - address of context structure
 ; regular arg registers are 0..6 args passed through to guest
 call_guest_impl:
+    ; save host TIB data
+    mov rax, [gs:0x08]
+    push rax
+    mov rax, [gs:0x10]
+    push rax
+
+    ; set guest TIB data
+    xor rax, rax
+    mov [gs:0x10], rax
+    sub rax, 1
+    mov [gs:0x08], rax
+
     mov [gs:0x18], r10
     mov [r10 + Context.host_rsp], rsp
     mov rsp, [r10 + Context.guest_rsp]
@@ -23,18 +36,16 @@ call_guest_impl:
     mov rsp, [r10 + Context.host_rsp]
     mov r11, 0
     mov [gs:0x18], r11
+
+    ; restore host TIB data
+    pop r10
+    mov [gs:0x10], r10
+    pop r10
+    mov [gs:0x08], r10
 
     ret
 align 64, int3
 
-; alternative to guest call thunks for functions with 0 args
-; rdi - guest entry point
-; rsi - address of context structure
-call_guest_simple:
-    mov r11, rdi
-    mov r10, rsi
-    jmp call_guest_impl
-align 64, int3
-
 times 0x80-($-$$) int3
 ; called by guest when it wishes to make a syscall
 ; must be loaded at fixed address, as that address is burned into guest executables
 ; rax - syscall number
@@ -43,16 +54,52 @@ guest_syscall:
     mov r10, [gs:0x18]
     mov [r10 + Context.guest_rsp], rsp
     mov rsp, [r10 + Context.host_rsp]
+
+    ; restore host TIB data
+    mov r11, [rsp]
+    mov [gs:0x10], r11
+    mov r11, [rsp + 8]
+    mov [gs:0x08], r11
 
     sub rsp, 8 ; align
     mov r11, [r10 + Context.host_ptr]
     push r11 ; arg 8 to dispatch_syscall: host
     push rax ; arg 7 to dispatch_syscall: nr
     mov rax, [r10 + Context.dispatch_syscall]
     call rax
+
+    ; set guest TIB data
+    xor r10, r10
+    mov [gs:0x10], r10
+    sub r10, 1
+    mov [gs:0x08], r10
 
     mov r10, [gs:0x18]
     mov rsp, [r10 + Context.guest_rsp]
     ret
 align 64, int3
 
+times 0x100-($-$$) int3 ; CALL_GUEST_SIMPLE_ADDR
+; alternative to guest call thunks for functions with 0 args
+; rdi - guest entry point
+; rsi - address of context structure
+call_guest_simple:
+    mov r11, rdi
+    mov r10, rsi
+    jmp call_guest_impl
+
+times 0x200-($-$$) int3 ; EXTCALL_THUNK_ADDR
+; individual thunks to each of 64 call slots
+; should be in fixed locations for memory hygiene in the core, since they may be stored there for some time
+%macro guest_extcall_thunk 1
+    mov rax, %1
+    jmp guest_extcall_impl
+align 16, int3
+%endmacro
+%assign j 0
+%rep 64
+    guest_extcall_thunk j
+%assign j j+1
+%endrep
+
 ; called by individual extcall thunks when the guest wishes to make an external call
 ; (very similar to guest_syscall)
@@ -62,25 +109,23 @@ guest_extcall_impl:
     mov r10, [gs:0x18]
     mov [r10 + Context.guest_rsp], rsp
     mov rsp, [r10 + Context.host_rsp]
+
+    ; restore host TIB data
+    mov r11, [rsp]
+    mov [gs:0x10], r11
+    mov r11, [rsp + 8]
+    mov [gs:0x08], r11
 
     mov r11, [r10 + Context.extcall_slots + rax * 8] ; get slot ptr
     sub rsp, 8 ; align
     call r11
+
+    ; set guest TIB data
+    xor r10, r10
+    mov [gs:0x10], r10
+    sub r10, 1
+    mov [gs:0x08], r10
 
     mov r10, [gs:0x18]
     mov rsp, [r10 + Context.guest_rsp]
     ret
 align 64, int3
-
-; individual thunks to each of 64 call slots
-; should be in fixed locations for memory hygiene in the core, since they may be stored there for some time
-
-%macro guest_extcall_thunk 1
-    mov rax, %1
-    jmp guest_extcall_impl
-align 16, int3
-%endmacro
-
-%assign j 0
-%rep 64
-    guest_extcall_thunk j
-%assign j j+1
-%endrep
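
guest_syscall and guest_extcall_impl read the saved host TIB words back at fixed offsets from the restored host rsp. A small Rust model of that two-slot save area (struct and field names are illustrative, not from the source):

    // call_guest_impl pushes gs:0x08 (StackBase) first and gs:0x10 (StackLimit)
    // second, so after `mov rsp, [r10 + Context.host_rsp]` the exit paths see:
    #[repr(C)]
    struct SavedHostTib {
        stack_limit: usize, // at [rsp]     -> restored to gs:0x10
        stack_base: usize,  // at [rsp + 8] -> restored to gs:0x08
    }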
@@ -10,14 +10,15 @@ pub mod thunks;
 const ORG: usize = 0x35f00000000;
 
 const CALL_GUEST_IMPL_ADDR: usize = ORG;
-const CALL_GUEST_SIMPLE_ADDR: usize = ORG + 64;
+const CALL_GUEST_SIMPLE_ADDR: usize = ORG + 0x100;
+const EXTCALL_THUNK_ADDR: usize = ORG + 0x200;
 
 pub const CALLBACK_SLOTS: usize = 64;
 /// Retrieves a function pointer suitable for sending to the guest that will cause
 /// the host to callback to `slot` when called. Slot must be less than CALLBACK_SLOTS
 pub fn get_callback_ptr(slot: usize) -> usize{
     assert!(slot < CALLBACK_SLOTS);
-    ORG + 0x100 + slot * 16
+    EXTCALL_THUNK_ADDR + slot * 16
 }
 
 fn init_interop_area() -> AddressRange {
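
A quick usage sketch of the relocated thunks, assuming the 16-byte thunk spacing pinned in interop.s (the expected value is computed from the constants above):

    // Slot 3's thunk now lives in the EXTCALL_THUNK_ADDR block at ORG + 0x200,
    // no longer colliding with call_guest_simple's fixed slot at ORG + 0x100.
    let p = get_callback_ptr(3);
    assert_eq!(p, 0x35f00000200 + 3 * 16);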
@@ -2,6 +2,7 @@
 
 #![feature(try_trait)]
 #![feature(core_intrinsics)]
+#![feature(asm)]
 
 #![allow(dead_code)]
@@ -233,6 +233,8 @@ pub struct MemoryBlock {
 
     debug_id: u32,
     active_guard: Option<BlockGuard>,
+    /// Pretend this block is active. Used internally by activate/deactivate for blocks that are about to gain / just lost the lock.
+    assume_active: bool,
 }
 
 type BlockGuard = MutexGuard<'static, Option<MemoryBlockRef>>;
@@ -277,6 +279,7 @@ impl MemoryBlock {
 
             debug_id,
             active_guard: None,
+            assume_active: false,
         });
         // res.trace("new");
         res
@@ -290,7 +293,7 @@ impl MemoryBlock {
     }
 
     pub fn active(&self) -> bool {
-        match self.active_guard {
+        self.assume_active || match self.active_guard {
             Some(_) => true,
             None => false
         }
@@ -360,11 +363,15 @@ impl MemoryBlock {
         // self.trace("swapin");
        pal::map_handle(&self.handle, self.addr).unwrap();
         tripguard::register(self);
+        self.assume_active = true;
         self.refresh_all_protections();
+        self.assume_active = false;
     }
     unsafe fn swapout(&mut self) {
         // self.trace("swapout");
+        self.assume_active = true;
         self.get_stack_dirty();
+        self.assume_active = false;
         pal::unmap_handle(self.addr).unwrap();
         tripguard::unregister(self);
     }
@@ -763,6 +770,7 @@ impl MemoryBlock {
         if self.sealed {
             return Err(anyhow!("Already sealed!"))
         }
+        self.get_stack_dirty();
 
         for p in self.pages.iter_mut() {
             if p.dirty && !p.invisible {
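
Why assume_active matters: active() gates protection refresh and stack-dirt harvesting, but during swapin/swapout the lock guard is in transit and not yet (or no longer) stored on self. A standalone sketch of the invariant the fix restores (Block is a stand-in for MemoryBlock; only the relevant fields are modeled):

    struct Block { locked: bool, assume_active: bool }

    impl Block {
        fn active(&self) -> bool {
            // post-fix: honor assume_active while the lock is in transit
            self.assume_active || self.locked
        }
    }

    fn main() {
        // mid-swapin: the lock exists but is not yet stored on self
        let mut b = Block { locked: false, assume_active: false };
        assert!(!b.active());   // pre-fix view: everything fell back to RWX
        b.assume_active = true; // the fix
        assert!(b.active());    // real protections applied, dirty tracking works
    }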
@@ -63,6 +63,7 @@ mod win {
     /// Leaks if not later unmapped
     /// addr.start can be 0, which means the OS chooses a location, or non-zero, which gives fixed address behavior
     /// Returned address range will be identical in the case of non-zero, or give the actual address in the case of zero.
+    /// Returned address range will have no access permissions initially; use protect() to change that.
     pub fn map_handle(handle: &Handle, addr: AddressRange) -> anyhow::Result<AddressRange> {
         unsafe {
             let res = MapViewOfFileEx(
@@ -76,7 +77,9 @@ mod win {
             if res == null_mut() {
                 Err(error())
             } else {
-                Ok(AddressRange { start: res as usize, size: addr.size })
+                let ret = AddressRange { start: res as usize, size: addr.size };
+                protect(ret, Protection::None).unwrap();
+                Ok(ret)
             }
         }
     }
@@ -203,6 +206,7 @@ mod nix {
     /// Leaks if not later unmapped
     /// addr.start can be 0, which means the OS chooses a location, or non-zero, which gives fixed address behavior
     /// Returned address range will be identical in the case of non-zero, or give the actual address in the case of zero.
+    /// Returned address range will have no access permissions initially; use protect() to change that.
     pub fn map_handle(handle: &Handle, addr: AddressRange) -> anyhow::Result<AddressRange> {
         unsafe {
             let mut flags = MAP_SHARED;
@@ -211,7 +215,7 @@ mod nix {
             }
             let ptr = mmap(addr.start as *mut c_void,
                 addr.size,
-                PROT_READ | PROT_WRITE | PROT_EXEC,
+                PROT_NONE,
                 flags,
                 handle.0 as i32,
                 0
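
With both backends now mapping with no access, the intended call pattern is roughly (a sketch; Protection::RW stands in for whatever the caller actually needs):

    // A fresh mapping has no access permissions; callers opt in explicitly.
    let range = map_handle(&handle, addr)?; // no access yet (None / PROT_NONE)
    protect(range, Protection::RW)?;        // now safe to touch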
@@ -5,6 +5,44 @@ use super::*;
 
 type TestResult = anyhow::Result<()>;
 
+struct TibSave {
+    stack_top: usize,
+    stack_bottom: usize,
+}
+
+impl TibSave {
+    fn new() -> TibSave {
+        context::prepare_thread();
+        let mut res = TibSave {
+            stack_top: 0,
+            stack_bottom: 0,
+        };
+        // nt will decline to call ntdll!KiUserExceptionDispatcher if it doesn't like rsp. The most
+        // obvious example of this is if [rsp] is not writable, which is why we have the whole RWStack nonsense.
+        // But it seems there can be other issues if you hit a fault handler for a non-stack location while the
+        // stack has not yet been unguarded (see test_regular_fault_on_new_stack). So we add garbage values to TIB
+        // stack extents to fix things up.
+
+        // In normal, non-test code, this TIB manipulation happens in interop.s
+        unsafe {
+            asm!("mov {}, gs:0x08", out(reg) res.stack_top);
+            asm!("mov {}, gs:0x10", out(reg) res.stack_bottom);
+            asm!("mov gs:0x08, {}", in(reg) -1isize);
+            asm!("mov gs:0x10, {}", in(reg) 0isize);
+        }
+        res
+    }
+}
+impl Drop for TibSave {
+    fn drop(&mut self) {
+        unsafe {
+            asm!("mov gs:0x08, {}", in(reg) self.stack_top);
+            asm!("mov gs:0x10, {}", in(reg) self.stack_bottom);
+        }
+    }
+}
+
+
 /// new / drop, activate / deactivate
 #[test]
 fn test_create() {
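
TibSave is used as an RAII guard in the tests below; a minimal usage sketch (the test name here is hypothetical):

    #[test]
    fn some_guest_stack_test() {
        unsafe {
            let _t = TibSave::new(); // widen TIB extents for this thread
            // ... run code on a guest stack; faults can now reach the VEH handler ...
        } // extents restored when _t drops
    }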
@@ -93,6 +131,8 @@ fn test_stk_norm() -> TestResult {
 fn test_stack() -> TestResult {
     use std::convert::TryInto;
     unsafe {
+        let _t = TibSave::new();
+
         let addr = AddressRange { start: 0x36f00000000, size: 0x10000 };
         let mut b = MemoryBlock::new(addr);
         b.activate();
@@ -113,6 +153,9 @@ fn test_stack() -> TestResult {
         let res = transmute::<usize, extern "sysv64" fn(rsp: usize) -> u8>(addr.start)(tmp_rsp);
         assert_eq!(res, 42);
 
+        b.deactivate();
+        b.activate();
+
         b.get_stack_dirty();
 
         assert!(b.pages[0].dirty);
@@ -127,6 +170,49 @@ fn test_stack() -> TestResult {
     }
 }
 
+/// regular dirt detection, but we're on a usermode stack that has not yet been guard tripped (windows)
+#[test]
+fn test_regular_fault_on_new_stack() -> TestResult {
+    unsafe {
+        let _t = TibSave::new();
+
+        let addr = AddressRange { start: 0x36f00000000, size: 0x10000 };
+        let mut b = MemoryBlock::new(addr);
+        b.activate();
+        b.mmap_fixed(addr, Protection::RW, true)?;
+        let ptr = b.addr.slice_mut();
+        let mut i = 0;
+
+        ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xe0 ; i += 1; // mov rax,rsp
+        ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xfc ; i += 1; // mov rsp,rdi
+        // Important: Intentionally make no writes or reads from our stack area before tripping this access violation.
+        ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0x06 ; i += 1; // mov [rsi],rax
+        ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xc4 ; i += 1; // mov rsp,rax
+        ptr[i] = 0xc3 ; // ret
+
+        b.mprotect(AddressRange { start: 0x36f00000000, size: 0x1000 }, Protection::RX)?;
+        b.mprotect(AddressRange { start: 0x36f00001000, size: 0x1000 }, Protection::RW)?;
+        b.mprotect(AddressRange { start: 0x36f00008000, size: 0x8000 }, Protection::RWStack)?;
+        let tmp_rsp = addr.end();
+        let dest_ptr: usize = 0x36f00001000;
+        let res = transmute::<_, extern "sysv64" fn(rsp: usize, dest: usize) -> isize>(addr.start)(tmp_rsp, dest_ptr);
+
+        b.get_stack_dirty();
+
+        assert!(b.pages[0].dirty);
+        assert!(b.pages[1].dirty);
+        assert!(!b.pages[2].dirty);
+        // assert!(!b.pages[14].dirty);
+        // assert!(b.pages[15].dirty);
+
+        let real_rsp = *transmute::<_, *mut isize>(dest_ptr);
+        let current_rsp = &real_rsp as *const isize as isize;
+        assert!((real_rsp - current_rsp).abs() < 0x10000);
+        assert_eq!(real_rsp, res);
+        Ok(())
+    }
+}
+
 #[test]
 fn test_state_basic() -> TestResult {
     unsafe {
@@ -139,6 +225,9 @@ fn test_state_basic() -> TestResult {
         ptr[0x1000] = 40;
         ptr[0x2000] = 60;
         ptr[0x3000] = 80;
+        b.deactivate();
+        b.activate();
+        b.deactivate();
 
         b.seal()?;
         let mut state0 = Vec::new();
@@ -147,9 +236,13 @@ fn test_state_basic() -> TestResult {
         // no pages should be in the state
         assert!(state0.len() < 0x1000);
 
+        b.activate();
+
         ptr[0x1000] = 100;
         ptr[0x3000] = 44;
 
+        b.deactivate();
+
         let mut state1 = Vec::new();
         b.save_state(&mut state1)?;
 
@@ -159,18 +252,26 @@ fn test_state_basic() -> TestResult {
 
         b.load_state(&mut state0.as_slice())?;
 
+        b.activate();
+
         assert_eq!(ptr[0x0000], 20);
         assert_eq!(ptr[0x1000], 40);
         assert_eq!(ptr[0x2000], 60);
         assert_eq!(ptr[0x3000], 80);
 
+        b.deactivate();
+
         b.load_state(&mut state1.as_slice())?;
 
+        b.activate();
+
         assert_eq!(ptr[0x0000], 20);
         assert_eq!(ptr[0x1000], 100);
         assert_eq!(ptr[0x2000], 60);
         assert_eq!(ptr[0x3000], 44);
 
+        b.deactivate();
+
         Ok(())
     }
 }
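
The added activate/deactivate pairs bracket every access to guest memory, so each swapout exercises the fixed get_stack_dirty path. In brief (a sketch; the values are from the test above):

    b.activate();      // map the block and arm per-page protections
    ptr[0x1000] = 100; // the write faults once and marks the page dirty
    b.deactivate();    // swapout harvests stack dirt before unmapping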
@@ -227,13 +328,15 @@ fn test_thready_stack() -> TestResult {
         let blocker = barrier.clone();
         ress.push(thread::spawn(move|| {
             unsafe {
-                let addr = AddressRange { start: 0x36000000000 + i * 0x100000000, size: PAGESIZE * 2 };
+                let _t = TibSave::new();
+
+                let addr = AddressRange { start: 0x36000000000 + i * 0x100000000, size: PAGESIZE * 8 };
                 let mut b = MemoryBlock::new(addr);
                 b.activate();
 
                 blocker.wait();
                 b.mmap_fixed(addr, Protection::RWX, true)?;
-                b.mprotect(AddressRange { start: addr.start + PAGESIZE, size: PAGESIZE }, Protection::RWStack)?;
+                b.mprotect(AddressRange { start: addr.start + PAGESIZE, size: PAGESIZE * 7 }, Protection::RWStack)?;
 
                 let ptr = b.addr.slice_mut();
                 let mut i = 0;
@@ -248,7 +351,7 @@ fn test_thready_stack() -> TestResult {
             b.seal()?;
 
             assert!(!b.pages[0].dirty);
-            assert!(!b.pages[1].dirty);
+            assert!(!b.pages[7].dirty);
             let tmp_rsp = addr.end();
             let res = transmute::<usize, extern "sysv64" fn(rsp: usize) -> u8>(addr.start)(tmp_rsp);
             assert_eq!(res, 42);
@@ -256,7 +359,7 @@ fn test_thready_stack() -> TestResult {
             b.get_stack_dirty();
 
             assert!(!b.pages[0].dirty);
-            assert!(b.pages[1].dirty);
+            assert!(b.pages[7].dirty);
 
             Ok(())
         }