diff --git a/waterbox/waterboxhost/Cargo.lock b/waterbox/waterboxhost/Cargo.lock index 433ab30dea..719892696c 100644 --- a/waterbox/waterboxhost/Cargo.lock +++ b/waterbox/waterboxhost/Cargo.lock @@ -1,11 +1,26 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +[[package]] +name = "anyhow" +version = "1.0.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f" + [[package]] name = "bitflags" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +[[package]] +name = "block-buffer" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" +dependencies = [ + "generic-array", +] + [[package]] name = "cfg-if" version = "0.1.10" @@ -21,12 +36,37 @@ dependencies = [ "bitflags", ] +[[package]] +name = "cpuid-bool" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d375c433320f6c5057ae04a04376eef4d04ce2801448cf8863a78da99107be4" + +[[package]] +name = "digest" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +dependencies = [ + "generic-array", +] + [[package]] name = "either" version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3" +[[package]] +name = "generic-array" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac746a5f3bbfdadd6106868134545e684693d54d9d44f6e9588a7d54af0bf980" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getset" version = "0.1.1" @@ -39,6 +79,17 @@ dependencies = [ 
"syn", ] +[[package]] +name = "goblin" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d20fd25aa456527ce4f544271ae4fea65d2eda4a6561ea56f39fb3ee4f7e3884" +dependencies = [ + "log", + "plain", + "scroll", +] + [[package]] name = "itertools" version = "0.9.0" @@ -69,6 +120,21 @@ dependencies = [ "scopeguard", ] +[[package]] +name = "log" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "opaque-debug" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" + [[package]] name = "page_size" version = "0.4.2" @@ -103,6 +169,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + [[package]] name = "proc-macro-error" version = "1.0.2" @@ -159,6 +231,39 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "scroll" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abb2332cb595d33f7edd5700f4cbf94892e680c7f0ae56adab58a35190b66cb1" +dependencies = [ + "scroll_derive", +] + +[[package]] +name = "scroll_derive" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e367622f934864ffa1c704ba2b82280aab856e3d8213c84c5720257eb34b15b9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "sha2" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2933378ddfeda7ea26f48c555bdad8bb446bf8a3d17832dc83e380d444cfb8c1" +dependencies = [ + "block-buffer", + "cfg-if", + "cpuid-bool", + "digest", + "opaque-debug", +] + [[package]] name = "smallvec" version = "1.4.0" @@ -187,6 +292,12 @@ dependencies = [ "syn", ] +[[package]] +name = "typenum" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" + [[package]] name = "unicode-xid" version = "0.2.0" @@ -203,13 +314,16 @@ checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" name = "waterboxhost" version = "0.1.0" dependencies = [ + "anyhow", "bitflags", "getset", + "goblin", "itertools", "lazy_static", "libc", "page_size", "parking_lot", + "sha2", "winapi", ] diff --git a/waterbox/waterboxhost/Cargo.toml b/waterbox/waterboxhost/Cargo.toml index a001b71c0d..3780e0e49f 100644 --- a/waterbox/waterboxhost/Cargo.toml +++ b/waterbox/waterboxhost/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" authors = ["nattthebear "] edition = "2018" publish = false +rust = "nightly" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -14,6 +15,9 @@ lazy_static = "1.4.0" getset = "0.1.1" parking_lot = "0.10.2" itertools = "0.9.0" +goblin = { version = "0.2.3", features = ["elf64", "std"] } +anyhow = "1.0" +sha2 = "0.9.1" [target.'cfg(windows)'.dependencies] winapi = { version = "0.3.8", features = ["memoryapi", "handleapi", "errhandlingapi", "winnt"] } @@ -23,3 +27,4 @@ libc = "0.2.71" [lib] doctest = false +crate-type=["cdylib"] diff --git a/waterbox/waterboxhost/src/bin.rs b/waterbox/waterboxhost/src/bin.rs new file mode 100644 index 0000000000..42a8ef7d84 --- /dev/null +++ b/waterbox/waterboxhost/src/bin.rs @@ -0,0 +1,55 @@ +use std::io::*; +use std::mem::{transmute, size_of, zeroed}; +use anyhow::anyhow; +use sha2::{Sha256, Digest}; + +pub fn write(stream: &mut dyn Write, val: &T) -> Result<()> { + let s = 
unsafe { std::slice::from_raw_parts(transmute::<&T, *const u8>(val), size_of::()) }; + stream.write_all(s)?; + Ok(()) +} +pub fn read(stream: &mut dyn Read, val: &mut T) -> Result<()> { + let s = unsafe { std::slice::from_raw_parts_mut(transmute::<&mut T, *mut u8>(val), size_of::()) }; + stream.read_exact(s)?; + Ok(()) +} +pub fn writeval(stream: &mut dyn Write, val: T) -> Result<()> { + let s = unsafe { std::slice::from_raw_parts(transmute::<&T, *const u8>(&val), size_of::()) }; + stream.write_all(s)?; + Ok(()) +} +pub fn readval(stream: &mut dyn Read) -> Result { + let mut v = unsafe { zeroed::() }; + read(stream, &mut v)?; + Ok(v) +} +pub fn write_magic(stream: &mut dyn Write, magic: &str) -> anyhow::Result<()> { + stream.write_all(magic.as_bytes())?; + Ok(()) +} +pub fn verify_magic(stream: &mut dyn Read, magic: &str) -> anyhow::Result<()> { + let mut read_tag = vec![0u8; magic.len()]; + stream.read_exact(&mut read_tag[..])?; + match std::str::from_utf8(&read_tag[..]) { + Ok(s) if s == magic => Ok(()), + _ => Err(anyhow!("Bad magic for {} state", magic)) + } +} +pub fn write_hash(stream: &mut dyn Write, hash: &[u8]) -> anyhow::Result<()> { + stream.write_all(hash)?; + Ok(()) +} +pub fn verify_hash(stream: &mut dyn Read, hash: &[u8]) -> anyhow::Result<()> { + let mut read_hash = vec![0u8; hash.len()]; + stream.read_exact(&mut read_hash[..])?; + if read_hash == hash { + Ok(()) + } else { + Err(anyhow!("Bad hash for state")) + } +} +pub fn hash(data: &[u8]) -> Vec { + let mut hasher = Sha256::new(); + hasher.update(data); + hasher.finalize()[..].to_owned() +} diff --git a/waterbox/waterboxhost/src/cinterface.rs b/waterbox/waterboxhost/src/cinterface.rs new file mode 100644 index 0000000000..90f9b1c763 --- /dev/null +++ b/waterbox/waterboxhost/src/cinterface.rs @@ -0,0 +1,265 @@ +use crate::*; +use host::{ActivatedWaterboxHost, WaterboxHost}; +use std::{os::raw::c_char, ffi::CStr}; + +/// The memory template for a WaterboxHost. 
Don't worry about +/// making every size as small as possible, since the savestater handles sparse regions +/// well enough. All values should be PAGESIZE aligned. +#[repr(C)] +pub struct MemoryLayoutTemplate { + /// Absolute pointer to the start of the mapped space + pub start: usize, + /// Memory space for the elf executable. The elf must be non-relocatable and + /// all loaded segments must fit within [start..start + elf_size] + pub elf_size: usize, + /// Memory space to serve brk(2) + pub sbrk_size: usize, + /// Memory space to serve alloc_sealed(3) + pub sealed_size: usize, + /// Memory space to serve alloc_invisible(3) + pub invis_size: usize, + /// Memory space to serve alloc_plain(3) + pub plain_size: usize, + /// Memory space to serve mmap(2) and friends. + /// Calls without MAP_FIXED or MREMAP_FIXED will be placed in this area. + /// TODO: Are we allowing fixed calls to happen anywhere in the block? + pub mmap_size: usize, +} +impl MemoryLayoutTemplate { + /// Aligns the template values and lays the regions out back to back (elf, sbrk, sealed, invis, plain, mmap); errors unless the whole span sits inside a single 4GiB-aligned window + pub fn make_layout(&self) -> anyhow::Result { + let start = align_down(self.start); + let elf_size = align_up(self.elf_size); + let sbrk_size = align_up(self.sbrk_size); + let sealed_size = align_up(self.sealed_size); + let invis_size = align_up(self.invis_size); + let plain_size = align_up(self.plain_size); + let mmap_size = align_up(self.mmap_size); + let mut res = unsafe { std::mem::zeroed::() }; + res.elf = AddressRange { + start, + size: elf_size + }; + res.sbrk = AddressRange { + start: res.elf.end(), + size: sbrk_size + }; + res.sealed = AddressRange { + start: res.sbrk.end(), + size: sealed_size + }; + res.invis = AddressRange { + start: res.sealed.end(), + size: invis_size + }; + res.plain = AddressRange { + start: res.invis.end(), + size: plain_size + }; + res.mmap = AddressRange { + start: res.plain.end(), // mmap follows plain; starting at invis.end() made it overlap the plain region + size: mmap_size + }; + if start >> 32 != (res.mmap.end() - 1) >> 32 { + Err(anyhow!("HostMemoryLayout must fit into a single 4GiB region!")) + }
else { + Ok(res) + } + } +} + +/// "return" struct. On successful funtion call, error_message[0] will be 0 and data will be the return value. +/// On failed call, error_message will contain a string describing the error, and data will be unspecified. +/// Any function that takes this object as an argument can fail and should be checked for failure, even if +/// it does not return data. +#[repr(C)] +pub struct Return { + pub error_message: [u8; 1024], + pub data: T, +} +impl Return { + pub fn put(&mut self, result: anyhow::Result) { + match result { + Err(e) => { + let s = format!("Waterbox Error: {:?}", e); + let len = std::cmp::min(s.len(), 1023); + self.error_message[0..len].copy_from_slice(&s.as_bytes()[0..len]); + self.error_message[len] = 0; + }, + Ok(t) => { + self.error_message[0] = 0; + self.data = t; + } + } + } +} + +/// stream writer +#[repr(C)] +pub struct CWriter { + /// will be passed to callback + pub userdata: usize, + /// write bytes. Return number of bytes written on success, or < 0 on failure. + /// Permitted to write less than the provided number of bytes. + pub callback: extern fn(userdata: usize, data: *const u8, size: usize) -> isize, +} +impl Write for CWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + let res = (self.callback)(self.userdata, buf.as_ptr(), buf.len()); + if res < 0 { + Err(std::io::Error::new(std::io::ErrorKind::Other, "Callback signaled abnormal failure")) + } else { + Ok(res as usize) + } + } + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +/// stream reader +#[repr(C)] +pub struct CReader { + /// will be passed to callback + pub userdata: usize, + /// Read bytes into the buffer. Return number of bytes read on success, or < 0 on failure. + /// permitted to read less than the provided buffer size, but must always read at least 1 + /// byte if EOF is not reached. If EOF is reached, should return 0. 
+ pub callback: extern fn(userdata: usize, data: *mut u8, size: usize) -> isize, +} +impl Read for CReader { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + let res = (self.callback)(self.userdata, buf.as_mut_ptr(), buf.len()); + if res < 0 { + Err(std::io::Error::new(std::io::ErrorKind::Other, "Callback signaled abnormal failure")) + } else { + Ok(res as usize) + } + } +} + +fn arg_to_str(arg: *const c_char) -> anyhow::Result { + let cs = unsafe { CStr::from_ptr(arg as *const c_char) }; + match cs.to_str() { + Ok(s) => Ok(s.to_string()), + Err(_) => Err(anyhow!("Bad UTF-8 string")), + } +} + +fn read_whole_file(reader: &mut CReader) -> anyhow::Result> { + let mut res = Vec::::new(); + std::io::copy(reader, &mut res)?; + Ok(res) +} + +/// Given a guest executable and a memory layout, create a new host environment. All data will be immediately consumed from the reader, +/// which will not be used after this call. +#[no_mangle] +pub extern fn wbx_create_host(layout: &MemoryLayoutTemplate, module_name: *const c_char, wbx: &mut CReader, ret: &mut Return<*mut WaterboxHost>) { + let res = (|| { + let data = read_whole_file(wbx)?; + WaterboxHost::new(&data[..], &arg_to_str(module_name)?[..], layout) + })(); + ret.put(res.map(|boxed| Box::into_raw(boxed))); +} + +/// Tear down a host environment. May not be called while the environment is active. +#[no_mangle] +pub extern fn wbx_destroy_host(obj: *mut WaterboxHost, ret: &mut Return<()>) { + let res = (|| { + unsafe { + if (*obj).active() { + return Err(anyhow!("WaterboxHost is still active!")) + } + Box::from_raw(obj); + Ok(()) + } + })(); + ret.put(res); +} + +/// Activate a host environment. This swaps it into memory and makes it available for use. +/// Pointers to inside the environment are only valid while active. Uses a mutex internally +/// so as to not stomp over other host environments in the same 4GiB slice. +/// Returns a pointer to the activated object, used to do most other functions. 
+#[no_mangle] +pub extern fn wbx_activate_host(obj: *mut WaterboxHost, ret: &mut Return<*mut ActivatedWaterboxHost>) { + let res = (|| { + unsafe { + if (*obj).active() { + return Err(anyhow!("WaterboxHost is already active!")) + } + Ok((&mut (*obj)).activate()) + } + })(); + ret.put(res.map(|boxed| Box::into_raw(boxed))); +} + +/// Deactivates a host environment, and releases the mutex. +#[no_mangle] +pub extern fn wbx_deactivate_host(obj: *mut ActivatedWaterboxHost, ret: &mut Return<()>) { + unsafe { Box::from_raw(obj); } + ret.put(Ok(())); +} + +/// Returns the address of an exported function from the guest executable. This pointer is only valid +/// while the host is active. A missing proc is not an error and simply returns 0. +#[no_mangle] +pub extern fn wbx_get_proc_addr(obj: &mut ActivatedWaterboxHost, name: *const c_char, ret: &mut Return) { + match arg_to_str(name) { + Ok(s) => { + ret.put(Ok(obj.get_proc_addr(&s))); + }, + Err(e) => { + ret.put(Err(e)) + } + } +} + +/// Calls the seal operation, which is a one time action that prepares the host to save states. +#[no_mangle] +pub extern fn wbx_seal(obj: &mut ActivatedWaterboxHost, ret: &mut Return<()>) { + ret.put(obj.seal()); +} + +/// Mounts a file in the environment. All data will be immediately consumed from the reader, which will not be used after this call. +/// To prevent nondeterminism, adding and removing files is very limited WRT savestates. If a file is writable, it must never exist +/// when save_state is called, and can only be used for transient operations. If a file is readable, it can appear in savestates, +/// but it must exist in every savestate and the exact sequence of add_file calls must be consistent from savestate to savestate. 
+#[no_mangle] +pub extern fn wbx_mount_file(obj: &mut ActivatedWaterboxHost, name: *const c_char, reader: &mut CReader, writable: bool, ret: &mut Return<()>) { + let res: anyhow::Result<()> = (|| { + obj.mount_file(arg_to_str(name)?, read_whole_file(reader)?, writable)?; + Ok(()) + })(); + ret.put(res); +} + +/// Remove a file previously added. Writer is optional; if provided, the contents of the file at time of removal will be dumped to it. +/// It is an error to remove a file which is currently open in the guest. +#[no_mangle] +pub extern fn wbx_unmount_file(obj: &mut ActivatedWaterboxHost, name: *const c_char, writer: Option<&mut CWriter>, ret: &mut Return<()>) { + let res: anyhow::Result<()> = (|| { + let data = obj.unmount_file(&arg_to_str(name)?)?; + if let Some(w) = writer { + std::io::copy(&mut &data[..], w)?; + } + Ok(()) + })(); + ret.put(res); +} + +/// Save state. Must not be called before seal. Must not be called with any writable files mounted. +/// Must always be called with the same sequence and contents of readonly files. +#[no_mangle] +pub extern fn wbx_save_state(obj: &mut ActivatedWaterboxHost, writer: &mut CWriter, ret: &mut Return<()>) { + ret.put(obj.save_state(writer)); +} +/// Load state. Must not be called before seal. Must not be called with any writable files mounted. +/// Must always be called with the same sequence and contents of readonly files that were in the save state. +/// Must be called with the same wbx executable and memory layout as in the savestate. +/// Errors generally poison the environment; sorry! 
+#[no_mangle] +pub extern fn wbx_load_state(obj: &mut ActivatedWaterboxHost, reader: &mut CReader, ret: &mut Return<()>) { + ret.put(obj.load_state(reader)); +} diff --git a/waterbox/waterboxhost/src/elf.rs b/waterbox/waterboxhost/src/elf.rs new file mode 100644 index 0000000000..8d6c04d541 --- /dev/null +++ b/waterbox/waterboxhost/src/elf.rs @@ -0,0 +1,208 @@ +use goblin; +use goblin::elf64::{sym::*, section_header::*}; +use crate::*; +use crate::memory_block::ActivatedMemoryBlock; +use crate::memory_block::Protection; +use std::collections::HashMap; + +/// Special system import area +const IMPORTS_OBJECT_NAME: &str = "__wbxsysarea"; + +/// Section names that are not marked as readonly, but we'll make them readonly anyway +fn section_name_is_readonly(name: &str) -> bool { + name.contains(".rel.ro") + || name.starts_with(".got") + || name == ".init_array" + || name == ".fini_array" + || name == ".tbss" + || name == ".sealed" +} + +pub struct SectionInfo { + name: String, + addr: AddressRange, +} +pub struct ElfLoader { + sections: Vec, + exports: HashMap, + entry_point: usize, + hash: Vec, + import_area: AddressRange, +} +impl ElfLoader { + pub fn new(data: &[u8], + module_name: &str, + layout: &WbxSysLayout, + b: &mut ActivatedMemoryBlock + ) -> anyhow::Result { + let wbx = goblin::elf::Elf::parse(data)?; + + let start = wbx.program_headers.iter() + .map(|x| x.vm_range().start) + .min() + .ok_or_else(|| anyhow!("{} has no program headers", module_name))?; // an empty header table is a load error, not a panic + let end = wbx.program_headers.iter() + .map(|x| x.vm_range().end) + .max() + .ok_or_else(|| anyhow!("{} has no program headers", module_name))?; // same guard for the upper bound + if start < layout.elf.start || end > layout.elf.end() { + return Err(anyhow!("{} from {}..{} did not fit in the provided region", module_name, start, end)) + } + + println!("Mounting `{}` @{:x}", module_name, start); + + let mut sections = Vec::new(); + + for section in wbx.section_headers.iter() { + let name = match wbx.shdr_strtab.get(section.sh_name) { + Some(Ok(s)) => s, + _ => "" + }; + println!(" @{:x} {}{}{} `{}` {} bytes", + section.sh_addr, + if section.sh_flags &
(SHF_ALLOC as u64) != 0 { "R" } else { " " }, + if section.sh_flags & (SHF_WRITE as u64) != 0 { "W" } else { " " }, + if section.sh_flags & (SHF_EXECINSTR as u64) != 0 { "X" } else { " " }, + name, + section.sh_size + ); + if section.sh_type != SHT_NOBITS + && name != "" + && section.sh_addr != 0 { + let si = SectionInfo { + name: name.to_string(), + addr: AddressRange { + start: section.sh_addr as usize, + size: section.sh_size as usize + } + }; + sections.push(si); + } + } + + let mut exports = HashMap::new(); + let mut import_area_opt = None; + + for sym in wbx.syms.iter() { + let name = match wbx.strtab.get(sym.st_name) { + Some(Ok(s)) => s, + _ => continue + }; + if sym.st_visibility() == STV_DEFAULT && sym.st_bind() == STB_GLOBAL { + exports.insert( + name.to_string(), + AddressRange { start: sym.st_value as usize, size: sym.st_size as usize } + ); + } + if name == IMPORTS_OBJECT_NAME { + import_area_opt = Some(AddressRange { start: sym.st_value as usize, size: sym.st_size as usize }); + } + } + + let import_area = match import_area_opt { + Some(i) => { + if i.size != std::mem::size_of::() { + return Err(anyhow!("Symbol {} is the wrong size", IMPORTS_OBJECT_NAME)) + } + i + }, + None => return Err(anyhow!("Symbol {} is missing", IMPORTS_OBJECT_NAME)) + }; + + { + let invis_opt = sections.iter().find(|x| x.name == ".invis"); + if let Some(invis) = invis_opt { + let invis_range = invis.addr.align_expand(); // expanded extent of .invis, computed once + let overlaps = sections.iter().any(|x| !std::ptr::eq(x, invis) && x.addr.align_expand().end() > invis_range.start && x.addr.align_expand().start < invis_range.end()); // true only when another section's expanded range intersects .invis; .invis itself is skipped + if overlaps { + return Err(anyhow!("Overlap between .invis and other sections -- check linkscript.")); + } + b.mark_invisible(invis.addr.align_expand())?; + } + } + + b.mark_invisible(layout.invis)?; + + for segment in wbx.program_headers.iter() { + let addr = AddressRange { + start: segment.vm_range().start, + size: segment.vm_range().end - segment.vm_range().start + }; + let
prot_addr = addr.align_expand(); + let prot = match (segment.is_read(), segment.is_write(), segment.is_executable()) { + (false, false, false) => Protection::None, + (true, false, false) => Protection::R, + (_, false, true) => Protection::RX, + (_, true, false) => Protection::RW, + (_, true, true) => Protection::RWX + }; + b.mmap_fixed(prot_addr, prot)?; + unsafe { + let src = &data[segment.file_range()]; + let dst = AddressRange { start: addr.start, size: segment.file_range().end - segment.file_range().start }.slice_mut(); + dst.copy_from_slice(src); + } + } + + Ok(ElfLoader { + sections, + exports, + entry_point: wbx.entry as usize, + hash: bin::hash(data), + import_area + }) + } + pub fn seal(&self, b: &mut ActivatedMemoryBlock) { + for section in self.sections.iter() { + if section_name_is_readonly(section.name.as_str()) { + b.mprotect(section.addr, Protection::R).unwrap(); + } + } + } + pub fn connect_syscalls(&mut self, _b: &mut ActivatedMemoryBlock, sys: &WbxSysArea) { + let addr = self.import_area; + unsafe { *(addr.start as *mut WbxSysArea) = *sys; } + } + pub fn clear_syscalls(&mut self, _b: &mut ActivatedMemoryBlock) { + let addr = self.import_area; + unsafe { addr.zero(); } + } + pub fn native_init(&mut self, _b: &mut ActivatedMemoryBlock) { + println!("Calling _start()"); + unsafe { + std::mem::transmute:: ()>(self.entry_point)(); + } + } + pub fn co_clean(&mut self, _b: &mut ActivatedMemoryBlock) { + match self.get_proc_addr("co_clean") { + 0 => (), + ptr => { + println!("Calling co_clean()"); + unsafe { + std::mem::transmute:: ()>(ptr)(); + } + }, + } + } + pub fn get_proc_addr(&self, proc: &str) -> usize { + match self.exports.get(proc) { + Some(addr) => addr.start, + None => 0, + } + } +} + +const MAGIC: &str = "ElfLoader"; + +impl IStateable for ElfLoader { + fn save_state(&mut self, stream: &mut dyn Write) -> anyhow::Result<()> { + bin::write_magic(stream, MAGIC)?; + bin::write_hash(stream, &self.hash[..])?; + Ok(()) + } + fn load_state(&mut 
self, stream: &mut dyn Read) -> anyhow::Result<()> { + bin::verify_magic(stream, MAGIC)?; + bin::verify_hash(stream, &self.hash[..])?; + Ok(()) + } +} diff --git a/waterbox/waterboxhost/src/fs/empty_read.rs b/waterbox/waterboxhost/src/fs/empty_read.rs new file mode 100644 index 0000000000..4ca19e02c4 --- /dev/null +++ b/waterbox/waterboxhost/src/fs/empty_read.rs @@ -0,0 +1,48 @@ +use crate::syscall_defs::*; +use crate::*; +use std::io::{Write, Read}; +use super::*; + +/// stdin +pub struct EmptyRead { +} +impl IStateable for EmptyRead { + fn save_state(&mut self, stream: &mut dyn Write) -> anyhow::Result<()> { + bin::write_magic(stream, "EmptyRead")?; + Ok(()) + } + fn load_state(&mut self, stream: &mut dyn Read) -> anyhow::Result<()> { + bin::verify_magic(stream, "EmptyRead")?; + Ok(()) + } +} +impl FileObject for EmptyRead { + fn can_read(&self) -> bool { + true + } + fn read(&mut self, _buf: &mut [u8]) -> Result { + Ok(0) + } + fn can_write(&self) -> bool { + false + } + fn write(&mut self, _buf: &[u8]) -> Result { + Err(EBADF) + } + fn seek(&mut self, _offset: i64, _whence: i32) -> Result { + Err(ESPIPE) + } + fn truncate(&mut self, _size: i64) -> SyscallResult { + Err(EINVAL) + } + fn stat(&self, statbuff: &mut KStat) -> SyscallResult { + fill_stat(statbuff, true, false, false, 0) + } + fn can_unmount(&self) -> bool { + false + } + fn unmount(self: Box) -> Vec { + panic!() + } + fn reset(&mut self) {} +} diff --git a/waterbox/waterboxhost/src/fs/mod.rs b/waterbox/waterboxhost/src/fs/mod.rs new file mode 100644 index 0000000000..2a47bdb8c9 --- /dev/null +++ b/waterbox/waterboxhost/src/fs/mod.rs @@ -0,0 +1,345 @@ +mod empty_read; +mod sys_out; +mod regular_file; + +use crate::syscall_defs::*; +use crate::*; +use std::io::{Write, Read}; +use empty_read::EmptyRead; +use sys_out::SysOutObj; +use regular_file::RegularFile; + +#[derive(Clone, Copy, PartialEq, Eq)] +#[repr(transparent)] +pub struct FileDescriptor(pub i32); + +const BAD_FD: FileDescriptor = 
FileDescriptor(-1); + +pub trait FileObject: IStateable { + fn stat(&self, statbuff: &mut KStat) -> SyscallResult; + fn truncate(&mut self, size: i64) -> SyscallResult; + fn can_read(&self) -> bool; + fn read(&mut self, buf: &mut [u8]) -> Result; + fn can_write(&self) -> bool; + fn write(&mut self, buf: &[u8]) -> Result; + fn seek(&mut self, offset: i64, whence: i32) -> Result; + fn reset(&mut self); + fn can_unmount(&self) -> bool; + fn unmount(self: Box) -> Vec; +} + +fn fill_stat(s: &mut KStat, can_read: bool, can_write: bool, can_seek: bool, length: i64) -> SyscallResult { + s.st_dev = 1; + s.st_ino = 1; + s.st_nlink = 0; + + let mut flags = 0; + if can_read { + flags |= S_IRUSR | S_IRGRP | S_IROTH; + } + if can_write { + flags |= S_IWUSR | S_IWGRP | S_IWOTH; + } + if can_seek { + flags |= S_IFREG; + } else { + flags |= S_IFIFO; + } + s.st_mode = flags; + s.st_uid = 0; + s.st_gid = 0; + s.__pad0 = 0; + s.st_rdev = 0; + if can_seek { + s.st_size = length; + } else { + s.st_size = 0; + } + s.st_blksize = 4096; + s.st_blocks = (s.st_size + 511) / 512; + + s.st_atime_sec = 1262304000000; + s.st_atime_nsec = 1000000000 / 2; + s.st_mtime_sec = 1262304000000; + s.st_mtime_nsec = 1000000000 / 2; + s.st_ctime_sec = 1262304000000; + s.st_ctime_nsec = 1000000000 / 2; + + Ok(()) +} + +struct MountedFile { + name: String, + fd: FileDescriptor, + obj: Box, +} +impl IStateable for MountedFile { + fn save_state(&mut self, stream: &mut dyn Write) -> anyhow::Result<()> { + bin::write_magic(stream, "MountedFile")?; + bin::write_magic(stream, &self.name)?; + bin::write(stream, &self.fd)?; + self.obj.save_state(stream)?; + Ok(()) + } + fn load_state(&mut self, stream: &mut dyn Read) -> anyhow::Result<()> { + bin::verify_magic(stream, "MountedFile")?; + bin::verify_magic(stream, &self.name)?; + bin::read(stream, &mut self.fd)?; + self.obj.load_state(stream)?; + Ok(()) + } +} + +pub struct FileSystem { + files: Vec, +} +impl FileSystem { + pub fn new() -> FileSystem { + FileSystem { 
+ files: vec![ + MountedFile { + name: "/dev/stdin".to_string(), + fd: FileDescriptor(0), + obj: Box::new(EmptyRead {}) + }, + MountedFile { + name: "/dev/stdout".to_string(), + fd: FileDescriptor(1), + obj: Box::new(SysOutObj { host_handle: Box::new(std::io::stdout()) }) + }, + MountedFile { + name: "/dev/stderr".to_string(), + fd: FileDescriptor(2), + obj: Box::new(SysOutObj { host_handle: Box::new(std::io::stderr()) }) + }, + ], + } + } + /// Accept a file from the outside world. Writable files may never appear in a savestate, + /// and readonly files must not be added or removed from savestate to savestate, so all uses + /// are either transient or read only resources that last for the life of emulation. + pub fn mount(&mut self, name: String, data: Vec, writable: bool) -> anyhow::Result<()> { + if self.files.iter().any(|f| f.name == name) { + return Err(anyhow!("File with name {} already mounted.", name)) + } + self.files.push(MountedFile { + name: name.to_string(), + fd: BAD_FD, + obj: Box::new(RegularFile::new(data, writable)) + }); + Ok(()) + } + /// Remove a file previously loaded with mount(). Returns the content of the file at this time. + /// Not possible if the guest has yet to close the file. 
+ pub fn unmount(&mut self, name: &str) -> anyhow::Result> { + let idx = match self.files.iter().position(|f| f.name == name) { + Some(f) => f, + None => return Err(anyhow!("File with name {} not previously mounted.", name)) + }; + let file = &self.files[idx]; + if file.fd != BAD_FD { + return Err(anyhow!("File {} is still open in the system", name)) + } + if !file.obj.can_unmount() { + return Err(anyhow!("File {} cannot be unmounted as it is permanently attached", name)) + } + Ok(self.files.remove(idx).obj.unmount()) + } + /// Implements a subset of open(2) + pub fn open(&mut self, name: &str, flags: i32, _mode: i32) -> Result { + // TODO: Missing file callback + let fd = { + let mut i = 0; + loop { + if !self.files.iter().any(|f| f.fd.0 == i) { + break FileDescriptor(i) + } + i += 1; + } + }; + let file = match self.files.iter_mut().find(|f| f.name == name) { + Some(f) => f, + None => return Err(ENOENT) + }; + if file.fd != BAD_FD { + return Err(EACCES) + } + // TODO: We should be doing more with flags and mode + match flags & O_ACCMODE { + O_RDONLY => { + if !file.obj.can_read() { + return Err(EACCES) + } + } + O_WRONLY => { + if !file.obj.can_write() { + return Err(EACCES) + } + + }, + O_RDWR => { + if !file.obj.can_read() || !file.obj.can_write() { + return Err(EACCES) + } + }, + _ => return Err(EINVAL) + } + // TODO: If the requested access was R on an RW file (transient), we still allow writing once opened + file.fd = fd; + Ok(fd) + } + /// Implements a subset of close(2) + pub fn close(&mut self, fd: FileDescriptor) -> SyscallResult { + let file = match self.files.iter_mut().find(|f| f.fd == fd) { + Some(f) => f, + None => return Err(EBADF) + }; + file.obj.reset(); + file.fd = BAD_FD; + Ok(()) + } + fn wrap_action Result>(&mut self, name: &str, action: P) -> Result { + match self.files.iter_mut().find(|f| f.name == name) { + Some(f) => action(f.obj.as_mut()), + None => Err(ENOENT) + } + } + fn wrap_faction Result>(&mut self, fd: FileDescriptor, action: P) 
-> Result { + match self.files.iter_mut().find(|f| f.fd != BAD_FD && f.fd == fd) { // closed files hold BAD_FD, so a guest-supplied -1 must never match one + Some(f) => action(f.obj.as_mut()), + None => Err(EBADF) // unknown descriptor: report the same errno close() uses + } + } + /// Implements a subset of stat(2) + pub fn stat(&mut self, name: &str, statbuff: &mut KStat) -> SyscallResult { + self.wrap_action(name, |f| f.stat(statbuff)) + } + /// Implements a subset of fstat(2) + pub fn fstat(&mut self, fd: FileDescriptor, statbuff: &mut KStat) -> SyscallResult { + self.wrap_faction(fd, |f| f.stat(statbuff)) + } + /// Implements a subset of truncate(2) + pub fn truncate(&mut self, name: &str, size: i64) -> SyscallResult { + self.wrap_action(name, |f| f.truncate(size)) + } + /// Implements a subset of ftruncate(2) + pub fn ftruncate(&mut self, fd: FileDescriptor, size: i64) -> SyscallResult { + self.wrap_faction(fd, |f| f.truncate(size)) + } + /// Implements a subset of read(2) + pub fn read(&mut self, fd: FileDescriptor, buf: &mut [u8]) -> Result { + self.wrap_faction(fd, |f| f.read(buf)) + } + /// Implements a subset of write(2) + pub fn write(&mut self, fd: FileDescriptor, buf: &[u8]) -> Result { + self.wrap_faction(fd, |f| f.write(buf)) + } + /// Implements a subset of lseek(2) + pub fn seek(&mut self, fd: FileDescriptor, offset: i64, whence: i32) -> Result { + self.wrap_faction(fd, |f| f.seek(offset, whence)) + } +} +impl IStateable for FileSystem { + fn save_state(&mut self, stream: &mut dyn Write) -> anyhow::Result<()> { + bin::write_magic(stream, "FileSystem")?; + for f in self.files.iter_mut() { + f.save_state(stream)?; + } + bin::write_magic(stream, "FileSystemEnd")?; + Ok(()) + } + fn load_state(&mut self, stream: &mut dyn Read) -> anyhow::Result<()> { + bin::verify_magic(stream, "FileSystem")?; + for f in self.files.iter_mut() { + f.load_state(stream)?; + } + bin::verify_magic(stream, "FileSystemEnd")?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + type TestResult = anyhow::Result<()>; + + #[test] + fn test_create() -> TestResult { + let mut fs = FileSystem::new(); + let mut
state0 = Vec::new(); + fs.save_state(&mut state0)?; + fs.load_state(&mut &state0[..])?; + Ok(()) + } + + #[test] + fn test_ro_state() -> TestResult { + let mut fs = FileSystem::new(); + fs.mount("myfile".to_string(), + "The quick brown fox jumps over the lazy dog.".to_string().into_bytes(), false)?; + let fd = fs.open("myfile", O_RDONLY, 0)?; + assert_eq!(fd.0, 3); + let mut buff = vec![0u8; 8]; + assert!(fs.write(fd, &buff[..]).is_err()); + assert_eq!(fs.read(fd, &mut buff[..])?, 8); + assert_eq!(buff, "The quic".as_bytes()); + let mut state0 = Vec::new(); + fs.save_state(&mut state0)?; + assert_eq!(fs.read(fd, &mut buff[..])?, 8); + assert_eq!(buff, "k brown ".as_bytes()); + fs.load_state(&mut &state0[..])?; + assert_eq!(fs.read(fd, &mut buff[..])?, 8); + assert_eq!(buff, "k brown ".as_bytes()); + assert_eq!(fs.read(fd, &mut buff[..])?, 8); + assert_eq!(buff, "fox jump".as_bytes()); + assert_eq!(fs.read(fd, &mut buff[..])?, 8); + assert_eq!(buff, "s over t".as_bytes()); + assert_eq!(fs.read(fd, &mut buff[..])?, 8); + assert_eq!(buff, "he lazy ".as_bytes()); + assert_eq!(fs.read(fd, &mut buff[..])?, 4); + assert_eq!(&buff[0..4], "dog.".as_bytes()); + Ok(()) + } + + #[test] + fn test_negative() -> TestResult { + let mut fs = FileSystem::new(); + assert!(fs.mount("/dev/stdin".to_string(), Vec::new(), false).is_err()); // overriding existing name + assert!(fs.unmount("oopopo").is_err()); // unmounting nonexistant file + assert!(fs.unmount("/dev/stdout").is_err()); // unmounting permanent file + fs.mount("oopopo".to_string(), Vec::new(), true)?; + let mut state0 = Vec::new(); + assert!(fs.save_state(&mut state0).is_err()); // save state with transient file + state0.resize(0, 0); + fs.unmount("oopopo")?; + fs.mount("oopopo".to_string(), Vec::new(), false)?; + fs.save_state(&mut state0)?; + fs.unmount("oopopo")?; + assert!(fs.load_state(&mut &state0[..]).is_err()); // loading state with different list of files + // TODO: Our general contract is that after a failed 
loadstate, the entire core is poisoned. + // Can we do better? Should we do better? + Ok(()) + } + + #[test] + fn test_rw_unmount() -> TestResult { + let mut fs = FileSystem::new(); + fs.mount("z".to_string(), Vec::new(), true)?; + let fd = fs.open("z", O_RDWR, 0)?; + fs.write(fd, "Big test".as_bytes())?; + fs.seek(fd, 0, SEEK_SET)?; + fs.write(fd, "Q".as_bytes())?; + fs.seek(fd, 2, SEEK_CUR)?; + fs.write(fd, ")".as_bytes())?; + fs.seek(fd, -1, SEEK_END)?; + fs.write(fd, "$$$$".as_bytes())?; + let mut statbuff = Box::new(KStat::default()); + fs.fstat(fd, statbuff.as_mut())?; + assert_eq!(statbuff.st_size, 11); + fs.close(fd)?; + let vec = fs.unmount("z")?; + assert_eq!(vec, "Qig)tes$$$$".as_bytes()); + Ok(()) + } +} diff --git a/waterbox/waterboxhost/src/fs/regular_file.rs b/waterbox/waterboxhost/src/fs/regular_file.rs new file mode 100644 index 0000000000..5a3bd171dd --- /dev/null +++ b/waterbox/waterboxhost/src/fs/regular_file.rs @@ -0,0 +1,126 @@ +use crate::syscall_defs::*; +use crate::*; +use std::io::{Write, Read}; +use super::*; + +/// A file whose content is in memory and managed by the waterbox host +pub struct RegularFile { + data: Vec, + hash: Option>, + position: usize, +} +impl RegularFile { + pub fn new(data: Vec, writable: bool) -> RegularFile { + let hash = if writable { + None + } else { + Some(bin::hash(&data[..])) + }; + RegularFile { + data, + hash, + position: 0, + } + } +} +impl IStateable for RegularFile { + fn save_state(&mut self, stream: &mut dyn Write) -> anyhow::Result<()> { + match &self.hash { + Some(hash) => { + bin::write_magic(stream, "RegularFile")?; + bin::write_hash(stream, &hash[..])?; + bin::write(stream, &self.position)?; + Ok(()) + + }, + None => Err(anyhow!("Cannot save state while transient files are active")) + } + } + fn load_state(&mut self, stream: &mut dyn Read) -> anyhow::Result<()> { + match &self.hash { + Some(hash) => { + bin::verify_magic(stream, "RegularFile")?; + bin::verify_hash(stream, &hash[..])?; + 
bin::read(stream, &mut self.position)?; + Ok(()) + } + None => Err(anyhow!("Cannot load state while transient files are active")) + } + } +} +impl FileObject for RegularFile { + fn can_read(&self) -> bool { + true + } + fn read(&mut self, buf: &mut [u8]) -> Result { + let n = std::cmp::min(buf.len(), self.data.len() - self.position); + let dst = &mut buf[0..n]; + let src = &self.data[self.position..self.position + n]; + dst.copy_from_slice(src); + self.position += n; + Ok(n as i64) + } + fn can_write(&self) -> bool { + match self.hash { + None => true, + Some(_) => false + } + } + fn write(&mut self, buf: &[u8]) -> Result { + if !self.can_write() { + return Err(EBADF) + } + let n = buf.len(); + let newpos = self.position + n; + if newpos > self.data.len() { + self.data.resize(newpos, 0); + } + let dst = &mut self.data[self.position..newpos]; + + dst.copy_from_slice(buf); + self.position = newpos; + Ok(n as i64) + } + fn seek(&mut self, offset: i64, whence: i32) -> Result { + let newpos = match whence { + SEEK_SET => { + offset // SEEK_SET: absolute position from the start of the file + }, + SEEK_CUR => { + (self.position as i64).checked_add(offset).ok_or(EINVAL)? + }, + SEEK_END => { + (self.data.len() as i64).checked_add(offset).ok_or(EINVAL)? + } + _ => return Err(EINVAL) + }; + if newpos < 0 || newpos > self.data.len() as i64 { + return Err(EINVAL) + } + self.position = newpos as usize; + Ok(newpos) + } + fn truncate(&mut self, size: i64) -> SyscallResult { + if !self.can_write() { + return Err(EBADF) + } + if size < 0 { + return Err(EINVAL) + } + self.data.resize(size as usize, 0); + self.position = std::cmp::min(self.position, size as usize); + Ok(()) + } + fn reset(&mut self) { + self.position = 0; + } + fn stat(&self, statbuff: &mut KStat) -> SyscallResult { + fill_stat(statbuff, true, self.can_write(), true, self.data.len() as i64) + } + fn can_unmount(&self) -> bool { + true + } + fn unmount(self: Box) -> Vec { + self.data + } +} diff --git a/waterbox/waterboxhost/src/fs/sys_out.rs b/waterbox/waterboxhost/src/fs/sys_out.rs new file mode 100644 index 0000000000..18e6d41494
--- /dev/null +++ b/waterbox/waterboxhost/src/fs/sys_out.rs @@ -0,0 +1,51 @@ +use crate::syscall_defs::*; +use crate::*; +use std::io::{Write, Read}; +use super::*; + +/// stdout, stderr +pub struct SysOutObj { + pub host_handle: Box, +} +impl IStateable for SysOutObj { + fn save_state(&mut self, stream: &mut dyn Write) -> anyhow::Result<()> { + bin::write_magic(stream, "SysOutObj")?; + Ok(()) + } + fn load_state(&mut self, stream: &mut dyn Read) -> anyhow::Result<()> { + bin::verify_magic(stream, "SysOutObj")?; + Ok(()) + } +} +impl FileObject for SysOutObj { + fn can_read(&self) -> bool { + false + } + fn read(&mut self, _buf: &mut [u8]) -> Result { + Err(EBADF) + } + fn can_write(&self) -> bool { + true + } + fn write(&mut self, buf: &[u8]) -> Result { + // do not propogate host errors up to the waterbox! + let _ = self.host_handle.write_all(buf); + Ok(buf.len() as i64) + } + fn seek(&mut self, _offset: i64, _whence: i32) -> Result { + Err(ESPIPE) + } + fn truncate(&mut self, _size: i64) -> SyscallResult { + Err(EINVAL) + } + fn stat(&self, statbuff: &mut KStat) -> SyscallResult { + fill_stat(statbuff, false, true, false, 0) + } + fn can_unmount(&self) -> bool { + false + } + fn unmount(self: Box) -> Vec { + panic!() + } + fn reset(&mut self) {} +} diff --git a/waterbox/waterboxhost/src/host.rs b/waterbox/waterboxhost/src/host.rs new file mode 100644 index 0000000000..952e77e4f3 --- /dev/null +++ b/waterbox/waterboxhost/src/host.rs @@ -0,0 +1,313 @@ +use crate::*; +use crate::{memory_block::ActivatedMemoryBlock, syscall_defs::*}; +use memory_block::{MemoryBlock, Protection}; +use std::{os::raw::c_char, ffi::CStr}; +use fs::{FileDescriptor, FileSystem}; +use elf::ElfLoader; +use cinterface::MemoryLayoutTemplate; + +pub struct WaterboxHost { + fs: FileSystem, + program_break: usize, + elf: ElfLoader, + layout: WbxSysLayout, + memory_block: Box, + active: bool, + sealed: bool, +} +impl WaterboxHost { + pub fn new(wbx: &[u8], module_name: &str, layout_template: 
&MemoryLayoutTemplate) -> anyhow::Result> { + let layout = layout_template.make_layout()?; + let mut memory_block = MemoryBlock::new(layout.all()); + let mut b = memory_block.enter(); + let elf = ElfLoader::new(wbx, module_name, &layout, &mut b)?; + let fs = FileSystem::new(); + drop(b); + let mut res = Box::new(WaterboxHost { + fs, + program_break: layout.sbrk.start, + elf, + layout, + memory_block, + active: false, + sealed: false, + }); + + let mut active = res.activate(); + active.h.elf.connect_syscalls(&mut active.b, &mut active.sys); + active.h.elf.native_init(&mut active.b); + drop(active); + + Ok(res) + } + + pub fn active(&self) -> bool { + self.active + } + + pub fn activate(&mut self) -> Box { + let h = unsafe { &mut *(self as *mut WaterboxHost) }; + let b = self.memory_block.enter(); + let sys = WbxSysArea { + layout: self.layout, + syscall: WbxSysSyscall { + ud: 0, + syscall, + } + }; + let mut res = Box::new(ActivatedWaterboxHost { + tag: TAG, + h, + b, + sys + }); + res.sys.syscall.ud = res.as_mut() as *mut ActivatedWaterboxHost as usize; + res.h.active = true; + res + } +} + +const TAG: u64 = 0xd01487803948acff; +pub struct ActivatedWaterboxHost<'a> { + tag: u64, + h: &'a mut WaterboxHost, + b: ActivatedMemoryBlock<'a>, + sys: WbxSysArea, +} +impl<'a> Drop for ActivatedWaterboxHost<'a> { + fn drop(&mut self) { + self.h.active = false; + } +} + +impl<'a> ActivatedWaterboxHost<'a> { + pub fn get_proc_addr(&self, name: &str) -> usize { + self.h.elf.get_proc_addr(name) + } + fn check_sealed(&self) -> anyhow::Result<()> { + if !self.h.sealed { + Err(anyhow!("Not sealed!")) + } else { + Ok(()) + } + } + pub fn seal(&mut self) -> anyhow::Result<()> { + if self.h.sealed { + return Err(anyhow!("Already sealed!")) + } + self.h.elf.clear_syscalls(&mut self.b); + self.h.elf.seal(&mut self.b); + self.h.elf.connect_syscalls(&mut self.b, &self.sys); + self.h.elf.co_clean(&mut self.b); + self.b.seal(); + self.h.sealed = true; + Ok(()) + } + pub fn mount_file(&mut 
self, name: String, data: Vec, writable: bool) -> anyhow::Result<()> { + self.h.fs.mount(name, data, writable) + } + pub fn unmount_file(&mut self, name: &str) -> anyhow::Result> { + self.h.fs.unmount(name) + } +} + +const SAVE_START_MAGIC: &str = "ActivatedWaterboxHost_v1"; +const SAVE_END_MAGIC: &str = "ʇsoHxoqɹǝʇɐMpǝʇɐʌᴉʇɔ∀"; +impl<'a> IStateable for ActivatedWaterboxHost<'a> { + fn save_state(&mut self, stream: &mut dyn Write) -> anyhow::Result<()> { + self.check_sealed()?; + bin::write_magic(stream, SAVE_START_MAGIC)?; + self.h.fs.save_state(stream)?; + bin::write(stream, &self.h.program_break)?; + self.h.elf.save_state(stream)?; + self.b.save_state(stream)?; + bin::write_magic(stream, SAVE_END_MAGIC)?; + Ok(()) + } + fn load_state(&mut self, stream: &mut dyn Read) -> anyhow::Result<()> { + self.check_sealed()?; + bin::verify_magic(stream, SAVE_START_MAGIC)?; + self.h.fs.load_state(stream)?; + bin::read(stream, &mut self.h.program_break)?; + self.h.elf.load_state(stream)?; + self.b.load_state(stream)?; + bin::verify_magic(stream, SAVE_END_MAGIC)?; + self.h.elf.connect_syscalls(&mut self.b, &self.sys); + Ok(()) + } +} + +fn unimp(nr: SyscallNumber) -> SyscallResult { + eprintln!("Stopped on unimplemented syscall {}", lookup_syscall(&nr)); + unsafe { std::intrinsics::breakpoint() } + Err(ENOSYS) +} + +fn gethost<'a>(ud: usize) -> &'a mut ActivatedWaterboxHost<'a> { + let res = unsafe { &mut *(ud as *mut ActivatedWaterboxHost) }; + if res.tag != TAG { + unsafe { std::intrinsics::breakpoint() } + std::process::abort(); + } + res +} + +fn arg_to_prot(arg: usize) -> Result { + use Protection::*; + if arg != arg & (PROT_READ | PROT_WRITE | PROT_EXEC) { + Err(EINVAL) + } else if arg & PROT_EXEC != 0 { + if arg & PROT_WRITE != 0 { + Ok(RWX) + } else { + Ok(RX) + } + } else if arg & PROT_WRITE != 0 { + Ok(RW) + } else if arg & PROT_READ != 0 { + Ok(R) + } else { + Ok(None) + } +} + +fn arg_to_fd(arg: usize) -> Result { + if arg < 0x80000000 { + Ok(FileDescriptor(arg as 
i32)) + } else { + Err(EBADFD) + } +} + +fn arg_to_str(arg: usize) -> Result { + let cs = unsafe { CStr::from_ptr(arg as *const c_char) }; + match cs.to_str() { + Ok(s) => Ok(s.to_string()), + Err(_) => Err(EINVAL), + } +} + +fn arg_to_statbuff<'a>(arg: usize) -> &'a mut KStat { + unsafe { &mut *(arg as *mut KStat) } +} + +pub extern "win64" fn syscall(nr: SyscallNumber, ud: usize, a1: usize, a2: usize, a3: usize, a4: usize, _a5: usize, _a6: usize) -> SyscallReturn { + let mut h = gethost(ud); + match nr { + NR_MMAP => { + let prot = arg_to_prot(a3)?; + let flags = a4; + if flags & MAP_ANONYMOUS == 0 { + // anonymous + private is easy + // anonymous by itself is hard + // nothing needs either right now + return syscall_err(EOPNOTSUPP) + } + if flags & 0xf00 != 0 { + // various unsupported flags + return syscall_err(EOPNOTSUPP) + } + let arena_addr = h.sys.layout.mmap; + let res = h.b.mmap(AddressRange { start: a1, size: a2 }, prot, arena_addr)?; + syscall_ok(res) + }, + NR_MREMAP => { + let arena_addr = h.sys.layout.mmap; + let res = h.b.mremap(AddressRange { start: a1, size: a2 }, a3, arena_addr)?; + syscall_ok(res) + }, + NR_MPROTECT => { + let prot = arg_to_prot(a3)?; + let res = h.b.mprotect(AddressRange { start: a1, size: a2 }, prot); + syscall_ret(res) + }, + NR_MUNMAP => syscall_ret(h.b.munmap(AddressRange { start: a1, size: a2 })), + NR_MADVISE => { + match a3 { + MADV_DONTNEED => syscall_ret(h.b.madvise_dontneed(AddressRange { start: a1, size: a2 })), + _ => syscall_ok(0), + } + }, + NR_STAT => { + let name = arg_to_str(a1)?; + syscall_ret(h.h.fs.stat(&name, arg_to_statbuff(a2))) + }, + NR_FSTAT => { + syscall_ret(h.h.fs.fstat(arg_to_fd(a1)?, arg_to_statbuff(a2))) + }, + NR_IOCTL => syscall_ok(0), + NR_READ => { + unsafe { + syscall_ret_i64(h.h.fs.read(arg_to_fd(a1)?, std::slice::from_raw_parts_mut(a2 as *mut u8, a3))) + } + }, + NR_WRITE => { + unsafe { + syscall_ret_i64(h.h.fs.write(arg_to_fd(a1)?, std::slice::from_raw_parts(a2 as *const u8, a3))) + } + 
}, + NR_READV => { + let fd = arg_to_fd(a1)?; + unsafe { + let mut ret = 0; + let iov = std::slice::from_raw_parts_mut(a2 as *mut Iovec, a3); + for io in iov { + if io.iov_base != 0 { + ret += h.h.fs.read(fd, io.slice_mut())?; + } + } + syscall_ok(ret as usize) + } + }, + NR_WRITEV => { + let fd = arg_to_fd(a1)?; + unsafe { + let mut ret = 0; + let iov = std::slice::from_raw_parts(a2 as *const Iovec, a3); + for io in iov { + if io.iov_base != 0 { + ret += h.h.fs.write(fd, io.slice())?; + } + } + syscall_ok(ret as usize) + } + }, + NR_OPEN => { + syscall_ret_val(h.h.fs.open(&arg_to_str(a1)?, a2 as i32, a3 as i32).map(|x| x.0 as usize)) + }, + NR_CLOSE => syscall_ret(h.h.fs.close(arg_to_fd(a1)?)), + NR_LSEEK => syscall_ret_i64(h.h.fs.seek(arg_to_fd(a1)?, a2 as i64, a3 as i32)), + NR_TRUNCATE => syscall_ret(h.h.fs.truncate(&arg_to_str(a1)?, a2 as i64)), + NR_FTRUNCATE => syscall_ret(h.h.fs.ftruncate(arg_to_fd(a1)?, a2 as i64)), + // TODO: 99% sure nothing calls this + NR_SET_THREAD_AREA => syscall_err(ENOSYS), + // TODO: What calls this? 
+ NR_SET_TID_ADDRESS => syscall_ok(8675309), + NR_CLOCK_GETTIME => { + let ts = a2 as *mut TimeSpec; + unsafe { + (*ts).tv_sec = 1495889068; + (*ts).tv_nsec = 0; + } + syscall_ok(0) + }, + NR_BRK => { + // TODO: This could be done on the C side + let addr = h.sys.layout.sbrk; + let old = h.h.program_break; + let res = if a1 != align_down(a1) { + old + } else if a1 < addr.start || a1 > addr.end() { + old + } else if a1 > old { + h.b.mmap_fixed(AddressRange { start: old, size: a1 - old }, Protection::RW).unwrap(); + a1 + } else { + old + }; + h.h.program_break = res; + syscall_ok(res) + }, + _ => syscall_ret(unimp(nr)), + } +} diff --git a/waterbox/waterboxhost/src/lib.rs b/waterbox/waterboxhost/src/lib.rs index c735039d02..eb7510f853 100644 --- a/waterbox/waterboxhost/src/lib.rs +++ b/waterbox/waterboxhost/src/lib.rs @@ -1,9 +1,13 @@ #![crate_type = "cdylib"] -// TODO: Turn this off once we've built the exported public API +#![feature(try_trait)] +#![feature(core_intrinsics)] + #![allow(dead_code)] -use std::io::{Read, Write, Error}; +use std::io::{Read, Write}; +use anyhow::anyhow; +use syscall_defs::{SyscallNumber, SyscallReturn}; const PAGESIZE: usize = 0x1000; const PAGEMASK: usize = 0xfff; @@ -11,13 +15,19 @@ const PAGESHIFT: i32 = 12; mod memory_block; mod syscall_defs; +mod bin; +mod elf; +mod fs; +mod host; +mod cinterface; pub trait IStateable { - fn save_sate(&mut self, stream: Box) -> Result<(), Error>; - fn load_state(&mut self, stream: Box) -> Result<(), Error>; + fn save_state(&mut self, stream: &mut dyn Write) -> anyhow::Result<()>; + fn load_state(&mut self, stream: &mut dyn Read) -> anyhow::Result<()>; } -#[derive(Debug, Clone, Copy)] +#[repr(C)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct AddressRange { pub start: usize, pub size: usize, @@ -37,6 +47,72 @@ impl AddressRange { pub unsafe fn slice_mut(&self) -> &'static mut [u8] { std::slice::from_raw_parts_mut(self.start as *mut u8, self.size) } + /// Unsafe: Pointers are unchecked and 
mut is not required (TODO: but why?) + pub unsafe fn zero(&self) { + std::ptr::write_bytes(self.start as *mut u8, 0, self.size); + } + /// Expands an address range to page alignment + pub fn align_expand(&self) -> AddressRange { + return AddressRange { + start: align_down(self.start), + size: align_up(self.end()) - align_down(self.start), + } + } +} +impl IStateable for AddressRange { + fn save_state(&mut self, stream: &mut dyn Write) -> anyhow::Result<()> { + bin::write(stream, &self.start)?; + bin::write(stream, &self.size)?; + Ok(()) + } + fn load_state(&mut self, stream: &mut dyn Read) -> anyhow::Result<()> { + bin::read(stream, &mut self.start)?; + bin::read(stream, &mut self.size)?; + Ok(()) + } +} + +fn align_down(p: usize) -> usize { + p & !PAGEMASK +} +fn align_up(p: usize) -> usize { + ((p - 1) | PAGEMASK) + 1 +} + +/// Information about memory layout injected into the guest application +#[repr(C)] +#[derive(Copy, Clone)] +pub struct WbxSysLayout { + pub elf: AddressRange, + pub sbrk: AddressRange, + pub sealed: AddressRange, + pub invis: AddressRange, + pub plain: AddressRange, + pub mmap: AddressRange, +} +impl WbxSysLayout { + pub fn all(&self) -> AddressRange { + AddressRange { + start: self.elf.start, + size: self.mmap.end() - self.elf.start + } + } +} + +/// Information for making syscalls injected into the guest application +#[repr(C)] +#[derive(Copy, Clone)] +pub struct WbxSysSyscall { + pub ud: usize, + pub syscall: extern "win64" fn(nr: SyscallNumber, ud: usize, a1: usize, a2: usize, a3: usize, a4: usize, a5: usize, a6: usize) -> SyscallReturn, +} + +/// Data that is injected into the guest application +#[repr(C)] +#[derive(Copy, Clone)] +pub struct WbxSysArea { + pub layout: WbxSysLayout, + pub syscall: WbxSysSyscall, } #[cfg(test)] diff --git a/waterbox/waterboxhost/src/memory_block/mod.rs b/waterbox/waterboxhost/src/memory_block/mod.rs index c6d163b7f5..b75cbf425b 100644 --- a/waterbox/waterboxhost/src/memory_block/mod.rs +++ 
b/waterbox/waterboxhost/src/memory_block/mod.rs @@ -1,16 +1,18 @@ mod pageblock; mod pal; mod tripguard; +mod tests; use std::sync::MutexGuard; -use std::ops::{DerefMut, Deref}; +use std::ops::DerefMut; use pageblock::PageBlock; use crate::*; use getset::Getters; use crate::syscall_defs::*; use itertools::Itertools; -use std::io; use std::sync::atomic::AtomicU32; +use crate::bin; +use sha2::{Sha256, Digest}; /// Tracks one lock for each 4GB memory area mod lock_list { @@ -26,10 +28,12 @@ mod lock_list { unsafe fn extend(o: &T) -> &'static T { std::mem::transmute::<&T, &'static T>(o) } + /// adds a lock if it does not exist; no effect if it already does. pub fn maybe_add(lock_index: u32) { let map = &mut LOCK_LIST.lock().unwrap(); map.entry(lock_index).or_insert_with(|| Box::new(Mutex::new(None))); } + /// Gets the lock for a particular index. pub fn get(lock_index: u32) -> &'static Mutex> { let map = &mut LOCK_LIST.lock().unwrap(); unsafe { @@ -38,13 +42,6 @@ mod lock_list { } } -fn align_down(p: usize) -> usize { - p & !PAGEMASK -} -fn align_up(p: usize) -> usize { - ((p - 1) | PAGEMASK) + 1 -} - #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Protection { None, @@ -73,8 +70,17 @@ impl PageAllocation { _ => false, } } + pub fn readable(&self) -> bool { + use PageAllocation::*; + match self { + Allocated(Protection::None) => false, + Free => false, + _ => true, + } + } } +/// Stores information about the original data content of a memory area, before it got dirty #[derive(Debug)] enum Snapshot { None, @@ -89,6 +95,8 @@ struct Page { /// if true, the page has changed from its original state pub dirty: bool, pub snapshot: Snapshot, + /// If true, the page content is not stored in states (but status still is). 
+ pub invisible: bool, } impl Page { pub fn new() -> Page { @@ -96,10 +104,12 @@ impl Page { status: PageAllocation::Free, dirty: false, snapshot: Snapshot::ZeroFilled, + invisible: false, } } /// Take a snapshot if one is not yet stored /// unsafe: caller must ensure pages are mapped and addr is correct + /// Does not check dirty or invisible pub unsafe fn maybe_snapshot(&mut self, addr: usize) { if match self.snapshot { Snapshot:: None => true, _ => false } { let mut snapshot = PageBlock::new(); @@ -124,6 +134,67 @@ impl Page { } } +/// Used internally to talk about regions of memory together with their allocation status +struct PageRange<'a> { + pub start: usize, + pub pages: &'a mut [Page] +} +impl<'a> PageRange<'a> { + pub fn addr(&self) -> AddressRange { + AddressRange { + start: self.start, + size: self.pages.len() << PAGESHIFT + } + } + pub fn split_at_size(&mut self, size: usize) -> (PageRange, PageRange) { + let (sl, sr) = self.pages.split_at_mut(size >> PAGESHIFT); + ( + PageRange { + start: self.start, + pages: sl + }, + PageRange { + start: self.start + size, + pages: sr + } + ) + } + pub fn iter(&self) -> std::slice::Iter { + self.pages.iter() + } + pub fn iter_mut(&mut self) -> std::slice::IterMut { + self.pages.iter_mut() + } + pub fn iter_with_addr(&self) -> impl Iterator { + let mut start = self.start; + self.pages.iter().map(move |p| { + let page_start = start; + start += PAGESIZE; + (AddressRange { start: page_start, size: PAGESIZE}, p) + }) + } + pub fn iter_mut_with_addr(&mut self) -> impl Iterator { + let mut start = self.start; + self.pages.iter_mut().map(move |p| { + let page_start = start; + start += PAGESIZE; + (AddressRange { start: page_start, size: PAGESIZE}, p) + }) + } + /// fuse two adjacent ranges. 
panics if they do not exactly touch + pub fn fuse(left: Self, right: Self) -> PageRange<'a> { + unsafe { + let lp = left.pages.as_mut_ptr(); + let rp = right.pages.as_mut_ptr(); + assert_eq!(lp.add(left.pages.len()), rp); + PageRange { + start: left.start, + pages: std::slice::from_raw_parts_mut(lp, left.pages.len() + right.pages.len()) + } + } + } +} + static NEXT_DEBUG_ID: AtomicU32 = AtomicU32::new(0); #[derive(Getters)] @@ -135,32 +206,28 @@ pub struct MemoryBlock { addr: AddressRange, #[get] sealed: bool, + #[get] + hash: Vec, lock_index: u32, handle: pal::Handle, - lock_count: u32, - mutex_guard: Option>>, debug_id: u32, + active: bool, } -pub struct MemoryBlockGuard<'a> { - block: &'a mut MemoryBlock, +type BlockGuard = MutexGuard<'static, Option>; + +pub struct ActivatedMemoryBlock<'block> { + b: &'block mut MemoryBlock, + mutex_guard: Option, } -impl<'a> Drop for MemoryBlockGuard<'a> { +impl<'block> Drop for ActivatedMemoryBlock<'block> { fn drop(&mut self) { - self.block.deactivate(); - } -} -impl<'a> Deref for MemoryBlockGuard<'a> { - type Target = MemoryBlock; - fn deref(&self) -> &MemoryBlock { - self.block - } -} -impl<'a> DerefMut for MemoryBlockGuard<'a> { - fn deref_mut(&mut self) -> &mut MemoryBlock { - self.block + unsafe { + let guard = std::mem::replace(&mut self.mutex_guard, None); + self.b.deactivate(guard.unwrap()); + } } } @@ -188,13 +255,13 @@ impl MemoryBlock { pages, addr, sealed: false, + hash: Vec::new(), lock_index, handle, - lock_count: 0, - mutex_guard: None, debug_id, + active: false, }); // res.trace("new"); res @@ -203,80 +270,77 @@ impl MemoryBlock { pub fn trace(&self, name: &str) { let ptr = unsafe { std::mem::transmute::<&Self, usize>(self) }; let tid = unsafe { std::mem::transmute::(std::thread::current().id()) }; - eprintln!("{}#{} {} [{}]@[{}] thr{}", - name, self.debug_id, ptr, self.lock_count, self.lock_index, tid) + eprintln!("{}#{} {} [{}] thr{}", + name, self.debug_id, ptr, self.lock_index, tid) } - pub fn enter(&mut 
self) -> MemoryBlockGuard { - self.activate(); - MemoryBlockGuard { - block: self, + pub fn enter(&mut self) -> ActivatedMemoryBlock { + unsafe { + let mutex_guard = self.activate(); + ActivatedMemoryBlock { + b: self, + mutex_guard: Some(mutex_guard), + } } } - /// lock self, and potentially swap this block into memory - pub fn activate(&mut self) { + /// lock memory region and potentially swap this block into memory + unsafe fn activate(&mut self) -> BlockGuard { // self.trace("activate"); - unsafe { - if !self.active() { - let area = lock_list::get(self.lock_index); - let mut guard = area.lock().unwrap(); + assert!(!self.active); + let area = lock_list::get(self.lock_index); + let mut guard = area.lock().unwrap(); - let other_opt = guard.deref_mut(); - match *other_opt { - Some(MemoryBlockRef(other)) => { - if other != self { - assert!(!(*other).active()); - (*other).swapout(); - self.swapin(); - *other_opt = Some(MemoryBlockRef(self)); - } - }, - None => { - self.swapin(); - *other_opt = Some(MemoryBlockRef(self)); - } + let other_opt = guard.deref_mut(); + match *other_opt { + Some(MemoryBlockRef(other)) => { + if other != self { + assert!(!(*other).active); + (*other).swapout(); + self.swapin(); + *other_opt = Some(MemoryBlockRef(self)); } - - self.mutex_guard = Some(guard); + }, + None => { + self.swapin(); + *other_opt = Some(MemoryBlockRef(self)); } - self.lock_count += 1; } + + self.active = true; + guard } - /// unlock self, and potentially swap this block out of memory - pub fn deactivate(&mut self) { + /// unlock memory region, and potentially swap this block out of memory + #[allow(unused_variables)] // unused stuff in release mode only + #[allow(unused_mut)] + unsafe fn deactivate(&mut self, mut guard: BlockGuard) { // self.trace("deactivate"); - unsafe { - assert!(self.active()); - self.lock_count -= 1; - if !self.active() { - let mut guard = std::mem::replace(&mut self.mutex_guard, None).unwrap(); - #[cfg(debug_assertions)] - { - // in debug mode, 
forcibly evict to catch dangling pointers - let other_opt = guard.deref_mut(); - match *other_opt { - Some(MemoryBlockRef(other)) => { - if other != self { - panic!(); - } - self.swapout(); - *other_opt = None; - }, - None => { - panic!() - } + assert!(self.active); + #[cfg(debug_assertions)] + { + // in debug mode, forcibly evict to catch dangling pointers + let other_opt = guard.deref_mut(); + match *other_opt { + Some(MemoryBlockRef(other)) => { + if other != self { + panic!(); } + self.swapout(); + *other_opt = None; + }, + None => { + panic!() } } } + self.active = false; } unsafe fn swapin(&mut self) { // self.trace("swapin"); assert!(pal::map(&self.handle, self.addr)); tripguard::register(self); - MemoryBlock::refresh_protections(self.addr.start, self.pages.as_slice()); + self.refresh_all_protections(); } unsafe fn swapout(&mut self) { // self.trace("swapout"); @@ -285,34 +349,14 @@ impl MemoryBlock { tripguard::unregister(self); } - pub fn active(&self) -> bool { - self.lock_count > 0 - } -} - -impl Drop for MemoryBlock { - fn drop(&mut self) { - // self.trace("drop"); - assert!(!self.active()); - let area = lock_list::get(self.lock_index); - let mut guard = area.lock().unwrap(); - let other_opt = guard.deref_mut(); - match *other_opt { - Some(MemoryBlockRef(other)) => { - if other == self { - unsafe { self.swapout(); } - *other_opt = None; - } - }, - None => () + fn page_range(&mut self) -> PageRange { + PageRange { + start: self.addr.start, + pages: &mut self.pages[..] 
} - let h = std::mem::replace(&mut self.handle, pal::bad()); - unsafe { pal::close(h); } } -} -impl MemoryBlock { - fn validate_range(&mut self, addr: AddressRange) -> Result<&mut [Page], i32> { + fn validate_range(&mut self, addr: AddressRange) -> Result { if addr.start < self.addr.start || addr.end() > self.addr.end() || addr.size == 0 @@ -322,16 +366,22 @@ impl MemoryBlock { } else { let pstart = (addr.start - self.addr.start) >> PAGESHIFT; let psize = (addr.size) >> PAGESHIFT; - Ok(&mut self.pages[pstart..pstart + psize]) + Ok(PageRange { + start: addr.start, + pages: &mut self.pages[pstart..pstart + psize] + }) } } - fn refresh_protections(mut start: usize, pages: &[Page]) { + /// Refresh the correct protections in underlying host RAM on a page range. Use after + /// temporary pal::protect(...) modifications, or to apply the effect of a dirty/prot change on the page + fn refresh_protections(range: &PageRange) { struct Chunk { addr: AddressRange, prot: Protection, }; - let chunks = pages.iter() + let mut start = range.start; + let chunks = range.iter() .map(|p| { let cstart = start; start += PAGESIZE; @@ -356,16 +406,21 @@ impl MemoryBlock { } } - fn set_protections(start: usize, pages: &mut [Page], status: PageAllocation) { - for p in pages.iter_mut() { + fn refresh_all_protections(&mut self) { + MemoryBlock::refresh_protections(&self.page_range()) + } + + /// Applies new protections to a pagerange, including special RWStack handling on Windows + fn set_protections(range: &mut PageRange, status: PageAllocation) { + for p in range.iter_mut() { p.status = status; } - MemoryBlock::refresh_protections(start, pages); + MemoryBlock::refresh_protections(&range); #[cfg(windows)] if status == PageAllocation::Allocated(Protection::RWStack) { // have to precapture snapshots here - let mut addr = start; - for p in pages { + let mut addr = range.start; + for p in range.iter_mut() { unsafe { p.maybe_snapshot(addr); } @@ -399,75 +454,377 @@ impl MemoryBlock { } } } +} - /// 
implements a subset of mmap(2) +impl Drop for MemoryBlock { + fn drop(&mut self) { + // self.trace("drop"); + let area = lock_list::get(self.lock_index); + let mut guard = area.lock().unwrap(); + let other_opt = guard.deref_mut(); + match *other_opt { + Some(MemoryBlockRef(other)) => { + if other == self { + unsafe { self.swapout(); } + *other_opt = None; + } + }, + None => () + } + let h = std::mem::replace(&mut self.handle, pal::bad()); + unsafe { pal::close(h); } + } +} + +impl<'block> ActivatedMemoryBlock<'block> { + /// Looks for some free pages inside an arena + fn find_free_pages<'a>(arena: &'a mut PageRange<'a>, npages: usize) -> Result, SyscallError> { + struct Chunk<'a> { + range: PageRange<'a>, + free: bool, + } + let range = arena.iter_mut_with_addr() + .map(|(a, p)| Chunk { + free: p.status == PageAllocation::Free, + range: PageRange { start: a.start, pages: std::slice::from_mut(p) }, + }) + .coalesce(|x, y| { + if x.free == y.free { + Ok(Chunk { + free: x.free, + range: PageRange::fuse(x.range, y.range) + }) + } else { + Err((x, y)) + } + }) + .filter(|c| c.free && c.range.pages.len() >= npages) + .map(|c| c.range) + .sorted_by(|x, y| x.pages.len().cmp(&y.pages.len())) + .next(); + match range { + Some(r) => { + if r.pages.len() == npages { + Ok(r) + } else { + Ok(PageRange { + start: r.start, + pages: &mut r.pages[0..npages] + }) + } + }, + None => Err(ENOMEM) + } + } + + /// implements a subset of mmap(2) for anonymous, movable address mappings + fn mmap_movable(&mut self, size: usize, prot: Protection, arena_addr: AddressRange) -> Result { + if size != align_down(size) { + return Err(EINVAL) + } + let mut arena = self.b.validate_range(arena_addr).unwrap(); + match ActivatedMemoryBlock::find_free_pages(&mut arena, size >> PAGESHIFT) { + Ok(mut range) => { + MemoryBlock::set_protections(&mut range, PageAllocation::Allocated(prot)); + Ok(range.start) + }, + Err(e) => Err(e), + } + } + + /// implements a subset of mmap(2) for anonymous, fixed address 
mappings pub fn mmap_fixed(&mut self, addr: AddressRange, prot: Protection) -> SyscallResult { - self.get_stack_dirty(); // not needed here technically? - let pages = self.validate_range(addr)?; - if pages.iter().any(|p| p.status != PageAllocation::Free) { + let mut range = self.b.validate_range(addr)?; + if range.iter().any(|p| p.status != PageAllocation::Free) { // assume MAP_FIXED_NOREPLACE at all times return Err(EEXIST) } - MemoryBlock::set_protections(addr.start, pages, PageAllocation::Allocated(prot)); + MemoryBlock::set_protections(&mut range, PageAllocation::Allocated(prot)); Ok(()) } + /// implements a subset of mremap(2) when MREMAP_MAYMOVE is not set, and MREMAP_FIXED is not + fn mremap_nomove(&mut self, addr: AddressRange, new_size: usize) -> SyscallResult { + self.b.get_stack_dirty(); + if new_size > addr.size { + let full_addr = AddressRange { start: addr.start, size: new_size }; + let mut range = self.b.validate_range(full_addr)?; + let (old_range, mut new_range) = range.split_at_size(addr.size); + if old_range.iter().any(|p| p.status == PageAllocation::Free) { + return Err(EINVAL) + } + if new_range.iter().any(|p| p.status != PageAllocation::Free) { + return Err(EEXIST) + } + MemoryBlock::set_protections(&mut new_range, old_range.pages[0].status); + Ok(()) + } else { + let range = self.b.validate_range(addr)?; + if range.iter().any(|p| p.status == PageAllocation::Free) { + return Err(EINVAL) + } + self.munmap_impl(AddressRange { start: addr.start + new_size, size: addr.size - new_size }, false) + } + } + + /// implements a subset of mremap(2) when MREMAP_MAYMOVE is set, and MREMAP_FIXED is not + fn mremap_maymove(&mut self, addr: AddressRange, new_size: usize, arena_addr: AddressRange) -> Result { + // This could be a lot more clever, but it's a difficult problem and doesn't come up often. + // So I use a "simple" solution here. 
+ self.b.get_stack_dirty(); + if new_size != align_down(new_size) { + return Err(EINVAL) + } + + // save a copy of src, and unmap + let mut src = self.b.validate_range(addr)?; + if src.iter().any(|p| p.status == PageAllocation::Free) { + return Err(EINVAL) + } + let src_addr = src.addr(); + let mut old_status = Vec::new(); + old_status.reserve_exact(src.pages.len()); + let mut old_data = vec![0u8; src_addr.size]; + for p in src.iter() { + old_status.push(p.status); + } + unsafe { + pal::protect(src_addr, Protection::R); + old_data.copy_from_slice(src_addr.slice()); + } + ActivatedMemoryBlock::free_pages_impl(&mut src, false); + + // find new location to map to, and copy into there + let mut arena = self.b.validate_range(arena_addr).unwrap(); + let mut dest = match ActivatedMemoryBlock::find_free_pages(&mut arena, new_size >> PAGESHIFT) { + Ok(r) => r, + Err(_) => { + // woops! reallocate at the old address. + // Or just panic because that probably won't happen + panic!("Failure in realloc") + }, + }; + let nbcopy = std::cmp::min(addr.size, new_size); + let npcopy = nbcopy >> PAGESHIFT; + unsafe { + pal::protect(dest.addr(), Protection::RW); + dest.addr().slice_mut()[0..nbcopy].copy_from_slice(&old_data[0..nbcopy]); + } + for (status, pdst) in old_status.iter().zip(dest.iter_mut()) { + pdst.status = *status; + // this is conservative; there are situations where dirty might be false, + // but we're unlikely to see them with real world realloc usage + pdst.dirty = true; + } + for pdst in dest.pages[npcopy..].iter_mut() { + pdst.status = old_status[0]; + } + MemoryBlock::refresh_protections(&dest); + Ok(dest.start) + } + /// implements a subset of mprotect(2) pub fn mprotect(&mut self, addr: AddressRange, prot: Protection) -> SyscallResult { - self.get_stack_dirty(); - let pages = self.validate_range(addr)?; - if pages.iter().any(|p| p.status == PageAllocation::Free) { + self.b.get_stack_dirty(); + let mut range = self.b.validate_range(addr)?; + if range.iter().any(|p| 
p.status == PageAllocation::Free) { return Err(ENOMEM) } - MemoryBlock::set_protections(addr.start, pages, PageAllocation::Allocated(prot)); + MemoryBlock::set_protections(&mut range, PageAllocation::Allocated(prot)); Ok(()) } /// implements a subset of munmap(2) pub fn munmap(&mut self, addr: AddressRange) -> SyscallResult { - self.get_stack_dirty(); - let pages = self.validate_range(addr)?; - if pages.iter().any(|p| p.status == PageAllocation::Free) { + self.munmap_impl(addr, false) + } + + pub fn mmap(&mut self, addr: AddressRange, prot: Protection, arena_addr: AddressRange) -> Result { + if addr.size == 0 { return Err(EINVAL) } + if addr.start == 0 { + self.mmap_movable(addr.size, prot, arena_addr) + } else { + self.mmap_fixed(addr, prot)?; + Ok(addr.start) + } + } + + pub fn mremap(&mut self, addr: AddressRange, new_size: usize, arena_addr: AddressRange) -> Result { + if addr.size == 0 || new_size == 0 { + return Err(EINVAL) + } + if addr.start == 0 { + self.mremap_maymove(addr, new_size, arena_addr) + } else { + self.mremap_nomove(addr, new_size)?; + Ok(addr.start) + } + } + + /// release pages, assuming the range has been fully validated already + fn free_pages_impl(range: &mut PageRange, advise_only: bool) { + let addr = range.addr(); // we do not save the current state of unmapped pages, and if they are later remapped, // the expectation is that they will start out as zero filled. 
accordingly, the most // sensible way to do this is to zero them now unsafe { pal::protect(addr, Protection::RW); - std::ptr::write_bytes(addr.start as *mut u8, 0, addr.size); + addr.zero(); // simple state size optimization: we can undirty pages in this case depending on the initial state - for p in pages.iter_mut() { - p.dirty = match p.snapshot { + for p in range.iter_mut() { + p.dirty = !p.invisible && match p.snapshot { Snapshot::ZeroFilled => false, _ => true }; } } - MemoryBlock::set_protections(addr.start, pages, PageAllocation::Free); + if advise_only { + MemoryBlock::refresh_protections(range); + } else { + MemoryBlock::set_protections(range, PageAllocation::Free); + } + } + + /// munmap or MADV_DONTNEED + fn munmap_impl(&mut self, addr: AddressRange, advise_only: bool) -> SyscallResult { + self.b.get_stack_dirty(); + let mut range = self.b.validate_range(addr)?; + if range.iter().any(|p| p.status == PageAllocation::Free) { + return Err(EINVAL) + } + ActivatedMemoryBlock::free_pages_impl(&mut range, advise_only); + Ok(()) + } + /// Marks an address range as invisible. Its page content will not be saved in states (but + /// their allocation status still will be.) Cannot be revoked. Must be done before sealing. + /// The pages need not be currently mapped; they will always be invisible regardless of that. + /// !!Not actually saved in states, as is assumed to be unchanging for a particular layout.!! + pub fn mark_invisible(&mut self, addr: AddressRange) -> SyscallResult { + // The limitations on this method are mostly because we want to not need a snapshot or dirty + // tracking for invisible pages. But if we didn't have one and later the pages became visible, + // we'd need one and wouldn't be able to reconstruct one. 
+ assert!(!self.b.sealed); + let mut range = self.b.validate_range(addr)?; + for p in range.iter_mut() { + p.dirty = true; + p.invisible = true; + } + MemoryBlock::refresh_protections(&range); Ok(()) } + /// implements a subset of madvise(2) + pub fn madvise_dontneed(&mut self, addr: AddressRange) -> SyscallResult { + self.munmap_impl(addr, true) + } + pub fn seal(&mut self) { - assert!(!self.sealed); - for p in self.pages.iter_mut() { - if p.dirty { + assert!(!self.b.sealed); + for p in self.b.pages.iter_mut() { + if p.dirty && !p.invisible { p.dirty = false; - } else { - p.snapshot = Snapshot::ZeroFilled; + p.snapshot = Snapshot::None; } } + self.b.refresh_all_protections(); + self.b.sealed = true; + self.b.hash = { + let mut hasher = Sha256::new(); + bin::write(&mut hasher, &self.b.addr).unwrap(); + for p in self.b.pages.iter() { + match &p.snapshot { + Snapshot::None => bin::writeval(&mut hasher, 1).unwrap(), + Snapshot::ZeroFilled => bin::writeval(&mut hasher, 2).unwrap(), + Snapshot::Data(d) => { hasher.write(d.slice()).unwrap(); }, + } + } + hasher.finalize()[..].to_owned() + }; } } -impl IStateable for MemoryBlock { - fn save_sate(&mut self, stream: Box) -> Result<(), io::Error> { - assert!(self.sealed); - self.get_stack_dirty(); + +const MAGIC: &str = "ActivatedMemoryBlock"; + +impl<'block> IStateable for ActivatedMemoryBlock<'block> { + fn save_state(&mut self, stream: &mut dyn Write) -> anyhow::Result<()> { + if !self.b.sealed { + return Err(anyhow!("Must seal first")) + } + bin::write_magic(stream, MAGIC)?; + bin::write_hash(stream, &self.b.hash[..])?; + self.b.get_stack_dirty(); + self.b.addr.save_state(stream)?; + + for (paddr, p) in self.b.page_range().iter_with_addr() { + bin::write(stream, &p.status)?; + if !p.invisible { + bin::write(stream, &p.dirty)?; + if p.dirty { + unsafe { + if !p.status.readable() { + assert!(pal::protect(paddr, Protection::R)); + } + stream.write_all(paddr.slice())?; + if !p.status.readable() { + 
assert!(pal::protect(paddr, Protection::None)); + } + } + } + } + } Ok(()) } - fn load_state(&mut self, stream: Box) -> Result<(), io::Error> { - assert!(self.sealed); - self.get_stack_dirty(); + fn load_state(&mut self, stream: &mut dyn Read) -> anyhow::Result<()> { + assert!(self.b.sealed); + bin::verify_magic(stream, MAGIC)?; + match bin::verify_hash(stream, &self.b.hash[..]) { + Ok(_) => (), + Err(_) => eprintln!("Unexpected MemoryBlock hash mismatch."), + } + self.b.get_stack_dirty(); + { + let mut addr = AddressRange { start:0, size: 0 }; + addr.load_state(stream)?; + if addr != self.b.addr { + return Err(anyhow!("Bad state data (addr) for ActivatedMemoryBlock")) + } + } + + unsafe { + pal::protect(self.b.addr, Protection::RW); + + for (paddr, p) in self.b.page_range().iter_mut_with_addr() { + let status = bin::readval::(stream)?; + if !p.invisible { + let dirty = bin::readval::(stream)?; + match (p.dirty, dirty) { + (false, false) => (), + (false, true) => { + p.maybe_snapshot(paddr.start); + stream.read_exact(paddr.slice_mut())?; + }, + (true, false) => { + match &p.snapshot { + Snapshot::ZeroFilled => paddr.zero(), + Snapshot::Data(b) => { + std::ptr::copy_nonoverlapping(b.as_ptr(), paddr.start as *mut u8, PAGESIZE) + }, + Snapshot::None => panic!("Missing snapshot for dirty region"), + } + } + (true, true) => { + stream.read_exact(paddr.slice_mut())?; + } + } + p.dirty = dirty; + } + p.status = status; + } + + self.b.refresh_all_protections(); + } Ok(()) } } @@ -482,127 +839,3 @@ impl Eq for MemoryBlock {} #[derive(Debug)] pub struct MemoryBlockRef(*mut MemoryBlock); unsafe impl Send for MemoryBlockRef {} - -#[cfg(test)] -mod tests { - use std::mem::transmute; - use super::*; - - /// new / drop, activate / deactivate - #[test] - fn test_create() { - drop(MemoryBlock::new(AddressRange { start: 0x36300000000, size: 0x50000 })); - drop(MemoryBlock::new(AddressRange { start: 0x36b00000000, size: 0x2000 })); - { - let mut b = MemoryBlock::new(AddressRange { 
start: 0x36100000000, size: 0x65000 }); - b.activate(); - b.deactivate(); - b.enter(); - } - { - let mut b = MemoryBlock::new(AddressRange { start: 0x36e00000000, size: 0x5000 }); - b.activate(); - b.activate(); - let mut guard = b.enter(); - guard.activate(); - guard.deactivate(); - drop(guard); - b.deactivate(); - b.deactivate(); - b.enter(); - } - } - - /// simple test of dirt detection - #[test] - fn test_dirty() -> SyscallResult { - unsafe { - let addr = AddressRange { start: 0x36f00000000, size: 0x10000 }; - let mut b = MemoryBlock::new(addr); - let mut g = b.enter(); - g.mmap_fixed(addr, Protection::RW)?; - let ptr = g.addr.slice_mut(); - ptr[0x2003] = 5; - assert!(g.pages[2].dirty); - Ok(()) - } - } - - /// dirt detection away from the start of a block - #[test] - fn test_offset() -> SyscallResult { - unsafe { - let addr = AddressRange { start: 0x36f00000000, size: 0x20000 }; - let mut b = MemoryBlock::new(addr); - let mut g = b.enter(); - g.mmap_fixed(AddressRange { start: 0x36f00003000, size: 0x1000 }, Protection::RW)?; - let ptr = g.addr.slice_mut(); - ptr[0x3663] = 12; - assert!(g.pages[3].dirty); - Ok(()) - } - } - - /// dirt detection in RWStack area when $rsp points there - #[test] - fn test_stk_norm() -> SyscallResult { - unsafe { - let addr = AddressRange { start: 0x36200000000, size: 0x10000 }; - let mut b = MemoryBlock::new(addr); - let mut g = b.enter(); - g.mmap_fixed(addr, Protection::RWStack)?; - let ptr = g.addr.slice_mut(); - ptr[0xeeee] = 0xee; - ptr[0x44] = 0x44; - assert!(g.pages[0].dirty); - assert!(g.pages[14].dirty); - assert_eq!(ptr[0x8000], 0); - - // This is an unfair test, but it's just documenting the current limitations of the system. - // Ideally, page 8 would be clean because we read from it but did not write to it. - // Due to limitations of RWStack tracking on windows, it is dirty. 
- #[cfg(windows)] - assert!(g.pages[8].dirty); - #[cfg(unix)] - assert!(!g.pages[8].dirty); - - Ok(()) - } - } - - /// dirt detection in RWStack area when $rsp points there - #[test] - fn test_stack() -> SyscallResult { - use std::convert::TryInto; - unsafe { - let addr = AddressRange { start: 0x36f00000000, size: 0x10000 }; - let mut b = MemoryBlock::new(addr); - let mut g = b.enter(); - g.mmap_fixed(addr, Protection::RW)?; - let ptr = g.addr.slice_mut(); - let mut i = 0; - - ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xe0 ; i += 1; // mov rax,rsp - ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xfc ; i += 1; // mov rsp,rdi - ptr[i] = 0x50 ; i += 1; // push rax - ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xc4 ; i += 1; // mov rsp,rax - ptr[i] = 0xb0 ; i += 1; ptr[i] = 0x2a ; i += 1; // mov al,0x2a - ptr[i] = 0xc3 ; // ret - - g.mprotect(AddressRange { start: 0x36f00000000, size: 0x1000 }, Protection::RX)?; - g.mprotect(AddressRange { start: 0x36f00008000, size: 0x8000 }, Protection::RWStack)?; - let tmp_rsp = addr.end(); - let res = transmute:: u8>(addr.start)(tmp_rsp); - assert_eq!(res, 42); - assert!(g.pages[0].dirty); - assert!(!g.pages[1].dirty); - assert!(!g.pages[14].dirty); - assert!(g.pages[15].dirty); - - let real_rsp = isize::from_le_bytes(ptr[addr.size - 8..].try_into().unwrap()); - let current_rsp = &real_rsp as *const isize as isize; - assert!((real_rsp - current_rsp).abs() < 0x10000); - Ok(()) - } - } -} diff --git a/waterbox/waterboxhost/src/memory_block/pageblock.rs b/waterbox/waterboxhost/src/memory_block/pageblock.rs index 52b42b0753..183693f1f4 100644 --- a/waterbox/waterboxhost/src/memory_block/pageblock.rs +++ b/waterbox/waterboxhost/src/memory_block/pageblock.rs @@ -32,6 +32,12 @@ impl PageBlock { std::slice::from_raw_parts_mut(self.ptr.as_ptr(), PAGESIZE) } } + pub fn as_ptr(&self) -> *const u8 { + self.ptr.as_ptr() + } + pub fn as_mut_ptr(&mut self) -> *mut u8 { + self.ptr.as_ptr() + } } impl Drop for 
PageBlock { diff --git a/waterbox/waterboxhost/src/memory_block/tests.rs b/waterbox/waterboxhost/src/memory_block/tests.rs new file mode 100644 index 0000000000..d511368821 --- /dev/null +++ b/waterbox/waterboxhost/src/memory_block/tests.rs @@ -0,0 +1,411 @@ +#![cfg(test)] + +use std::mem::transmute; +use super::*; + +type TestResult = anyhow::Result<()>; + +/// new / drop, activate / deactivate +#[test] +fn test_create() { + // these tests don't test much anymore... + drop(MemoryBlock::new(AddressRange { start: 0x36300000000, size: 0x50000 })); + drop(MemoryBlock::new(AddressRange { start: 0x36b00000000, size: 0x2000 })); + { + let mut b = MemoryBlock::new(AddressRange { start: 0x36100000000, size: 0x65000 }); + b.enter(); + b.enter(); + } + { + let mut b = MemoryBlock::new(AddressRange { start: 0x36e00000000, size: 0x5000 }); + let guard = b.enter(); + drop(guard); + b.enter(); + } +} + +/// simple test of dirt detection +#[test] +fn test_dirty() -> TestResult { + unsafe { + let addr = AddressRange { start: 0x36f00000000, size: 0x10000 }; + let mut b = MemoryBlock::new(addr); + let mut g = b.enter(); + g.mmap_fixed(addr, Protection::RW)?; + let ptr = g.b.addr.slice_mut(); + ptr[0x2003] = 5; + assert!(g.b.pages[2].dirty); + Ok(()) + } +} + +/// dirt detection away from the start of a block +#[test] +fn test_offset() -> TestResult { + unsafe { + let addr = AddressRange { start: 0x36f00000000, size: 0x20000 }; + let mut b = MemoryBlock::new(addr); + let mut g = b.enter(); + g.mmap_fixed(AddressRange { start: 0x36f00003000, size: 0x1000 }, Protection::RW)?; + let ptr = g.b.addr.slice_mut(); + ptr[0x3663] = 12; + assert!(g.b.pages[3].dirty); + Ok(()) + } +} + +/// dirt detection in RWStack area when $rsp does not point there, and it was just a conventional write +#[test] +fn test_stk_norm() -> TestResult { + unsafe { + let addr = AddressRange { start: 0x36200000000, size: 0x10000 }; + let mut b = MemoryBlock::new(addr); + let mut g = b.enter(); + g.mmap_fixed(addr, 
Protection::RWStack)?; + let ptr = g.b.addr.slice_mut(); + ptr[0xeeee] = 0xee; + ptr[0x44] = 0x44; + assert!(g.b.pages[0].dirty); + assert!(g.b.pages[14].dirty); + assert_eq!(ptr[0x8000], 0); + + // This is an unfair test, but it's just documenting the current limitations of the system. + // Ideally, page 8 would be clean because we read from it but did not write to it. + // Due to limitations of RWStack tracking on windows, it is dirty. + #[cfg(windows)] + assert!(g.b.pages[8].dirty); + #[cfg(unix)] + assert!(!g.b.pages[8].dirty); + + Ok(()) + } +} + +/// dirt detection in RWStack area when $rsp points there +#[test] +fn test_stack() -> TestResult { + use std::convert::TryInto; + unsafe { + let addr = AddressRange { start: 0x36f00000000, size: 0x10000 }; + let mut b = MemoryBlock::new(addr); + let mut g = b.enter(); + g.mmap_fixed(addr, Protection::RW)?; + let ptr = g.b.addr.slice_mut(); + let mut i = 0; + + ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xe0 ; i += 1; // mov rax,rsp + ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xfc ; i += 1; // mov rsp,rdi + ptr[i] = 0x50 ; i += 1; // push rax + ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xc4 ; i += 1; // mov rsp,rax + ptr[i] = 0xb0 ; i += 1; ptr[i] = 0x2a ; i += 1; // mov al,0x2a + ptr[i] = 0xc3 ; // ret + + g.mprotect(AddressRange { start: 0x36f00000000, size: 0x1000 }, Protection::RX)?; + g.mprotect(AddressRange { start: 0x36f00008000, size: 0x8000 }, Protection::RWStack)?; + let tmp_rsp = addr.end(); + let res = transmute:: u8>(addr.start)(tmp_rsp); + assert_eq!(res, 42); + assert!(g.b.pages[0].dirty); + assert!(!g.b.pages[1].dirty); + assert!(!g.b.pages[14].dirty); + assert!(g.b.pages[15].dirty); + + let real_rsp = isize::from_le_bytes(ptr[addr.size - 8..].try_into().unwrap()); + let current_rsp = &real_rsp as *const isize as isize; + assert!((real_rsp - current_rsp).abs() < 0x10000); + Ok(()) + } +} + +#[test] +fn test_state_basic() -> TestResult { + unsafe { + let addr 
= AddressRange { start: 0x36c00000000, size: 0x4000 }; + let mut b = MemoryBlock::new(addr); + let mut g = b.enter(); + let ptr = g.b.addr.slice_mut(); + g.mmap_fixed(addr, Protection::RW)?; + ptr[0x0000] = 20; + ptr[0x1000] = 40; + ptr[0x2000] = 60; + ptr[0x3000] = 80; + + g.seal(); + let mut state0 = Vec::new(); + g.save_state(&mut state0)?; + + // no pages should be in the state + assert!(state0.len() < 0x1000); + + ptr[0x1000] = 100; + ptr[0x3000] = 44; + + let mut state1 = Vec::new(); + g.save_state(&mut state1)?; + + // two pages should be in the state + assert!(state1.len() > 0x2000); + assert!(state1.len() < 0x3000); + + g.load_state(&mut state0.as_slice())?; + + assert_eq!(ptr[0x0000], 20); + assert_eq!(ptr[0x1000], 40); + assert_eq!(ptr[0x2000], 60); + assert_eq!(ptr[0x3000], 80); + + g.load_state(&mut state1.as_slice())?; + + assert_eq!(ptr[0x0000], 20); + assert_eq!(ptr[0x1000], 100); + assert_eq!(ptr[0x2000], 60); + assert_eq!(ptr[0x3000], 44); + + Ok(()) + } +} + +#[test] +fn test_state_unreadable() -> TestResult { + unsafe { + let addr = AddressRange { start: 0x36c00000000, size: 0x1000 }; + let mut b = MemoryBlock::new(addr); + let mut g = b.enter(); + let ptr = g.b.addr.slice_mut(); + g.mmap_fixed(addr, Protection::RW)?; + g.seal(); + + ptr[200] = 200; + ptr[500] = 100; + g.mprotect(addr, Protection::None)?; + let mut state0 = Vec::new(); + g.save_state(&mut state0)?; + + g.mprotect(addr, Protection::RW)?; + ptr[300] = 50; + ptr[600] = 11; + g.mprotect(addr, Protection::None)?; + let mut state1 = Vec::new(); + g.save_state(&mut state1)?; + + g.load_state(&mut state0.as_slice())?; + g.mprotect(addr, Protection::R)?; + assert_eq!(ptr[200], 200); + assert_eq!(ptr[500], 100); + assert_eq!(ptr[300], 0); + assert_eq!(ptr[600], 0); + + g.load_state(&mut state1.as_slice())?; + g.mprotect(addr, Protection::R)?; + assert_eq!(ptr[200], 200); + assert_eq!(ptr[500], 100); + assert_eq!(ptr[300], 50); + assert_eq!(ptr[600], 11); + + Ok(()) + } +} + +#[test] +fn 
test_thready_stack() -> TestResult { + use std::sync::{Arc, Barrier}; + use std::thread; + + let barrier = Arc::new(Barrier::new(16)); + let mut ress = Vec::>::new(); + for i in 0..16 { + let blocker = barrier.clone(); + ress.push(thread::spawn(move|| { + unsafe { + let addr = AddressRange { start: 0x36000000000 + i * 0x100000000, size: PAGESIZE * 2 }; + let mut b = MemoryBlock::new(addr); + let mut g = b.enter(); + + blocker.wait(); + g.mmap_fixed(addr, Protection::RWX)?; + g.mprotect(AddressRange { start: addr.start + PAGESIZE, size: PAGESIZE }, Protection::RWStack)?; + + let ptr = g.b.addr.slice_mut(); + let mut i = 0; + + ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xe0 ; i += 1; // mov rax,rsp + ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xfc ; i += 1; // mov rsp,rdi + ptr[i] = 0x50 ; i += 1; // push rax + ptr[i] = 0x48 ; i += 1; ptr[i] = 0x89 ; i += 1; ptr[i] = 0xc4 ; i += 1; // mov rsp,rax + ptr[i] = 0xb0 ; i += 1; ptr[i] = 0x2a ; i += 1; // mov al,0x2a + ptr[i] = 0xc3 ; // ret + + g.seal(); + + assert!(!g.b.pages[0].dirty); + assert!(!g.b.pages[1].dirty); + let tmp_rsp = addr.end(); + let res = transmute:: u8>(addr.start)(tmp_rsp); + assert_eq!(res, 42); + assert!(!g.b.pages[0].dirty); + assert!(g.b.pages[1].dirty); + + Ok(()) + } + })); + } + for h in ress { + match h.join() { + Ok(v) => v, + Err(_) => return Err(anyhow!("Thread error")), + }? 
+ } + + Ok(()) +} + +#[test] +fn test_state_invisible() -> TestResult { + unsafe { + let addr = AddressRange { start: 0x36400000000, size: 0x4000 }; + let mut b = MemoryBlock::new(addr); + let mut g = b.enter(); + let ptr = g.b.addr.slice_mut(); + g.mmap_fixed(addr, Protection::RW)?; + ptr[0x0055] = 11; + ptr[0x1055] = 22; + g.mark_invisible(AddressRange { start: 0x36400001000, size: 0x2000 })?; + ptr[0x2055] = 33; + ptr[0x3055] = 44; + + g.seal(); + + ptr[0x0055] = 0x11; + ptr[0x1055] = 0x22; + ptr[0x2055] = 0x33; + ptr[0x3055] = 0x44; + + let mut state0 = Vec::new(); + g.save_state(&mut state0)?; + + // two pages should be in the state + assert!(state0.len() > 0x2000); + assert!(state0.len() < 0x3000); + + ptr[0x0055] = 0x55; + ptr[0x1055] = 0x66; + ptr[0x2055] = 0x77; + ptr[0x3055] = 0x88; + + g.load_state(&mut state0.as_slice())?; + + assert_eq!(ptr[0x0055], 0x11); + // Some current cores require this behavior, where the invisible values are actually left untouched. + // (VB for config settings?) + // In the long term, it might be nice to redefine things so that invisible means invisible and ephemeral, + // and forcibly zero any active invisible page on loadstate. 
+ assert_eq!(ptr[0x1055], 0x66); + assert_eq!(ptr[0x2055], 0x77); + assert_eq!(ptr[0x3055], 0x44); + + Ok(()) + } +} + +#[test] +fn test_dontneed() -> TestResult { + unsafe { + let addr = AddressRange { start: 0x36500000000, size: 0x10000 }; + let mut b = MemoryBlock::new(addr); + let mut g = b.enter(); + g.seal(); + let ptr = g.b.addr.slice_mut(); + + g.mmap_fixed(addr, Protection::RW)?; + for i in 0..addr.size { + ptr[i] = i as u8; + } + let addr2 = AddressRange { start: addr.start + 0x3000, size: 0x5000 }; + g.madvise_dontneed(addr2)?; + let ptr2 = addr2.slice_mut(); + for i in 0..addr2.size { + assert_eq!(ptr2[i], 0); + } + + let mut state0 = Vec::new(); + g.save_state(&mut state0)?; + assert!(state0.len() < 0xc000); + + Ok(()) + } +} + +#[test] +fn test_remap_nomove() -> TestResult { + let addr = AddressRange { start: 0x36600000000, size: 0x10000 }; + let mut b = MemoryBlock::new(addr); + let mut g = b.enter(); + + g.mmap_fixed(AddressRange { start: addr.start, size: 0x4000 }, Protection::RWX)?; + g.mremap_nomove(AddressRange { start: addr.start, size: 0x4000 }, 0x6000)?; + assert_eq!(g.b.pages[3].status, PageAllocation::Allocated(Protection::RWX)); + assert_eq!(g.b.pages[5].status, PageAllocation::Allocated(Protection::RWX)); + g.mremap_nomove(AddressRange { start: addr.start, size: 0x6000 }, 0x3000)?; + assert_eq!(g.b.pages[2].status, PageAllocation::Allocated(Protection::RWX)); + assert_eq!(g.b.pages[3].status, PageAllocation::Free); + assert_eq!(g.b.pages[5].status, PageAllocation::Free); + + Ok(()) +} + +#[test] +fn test_mmap_move() -> TestResult { + let addr = AddressRange { start: 0x36700000000, size: 0x10000 }; + let mut b = MemoryBlock::new(addr); + let mut g = b.enter(); + + let p0 = g.mmap_movable(0x10000, Protection::RW, addr)?; + assert_eq!(p0, 0x36700000000); + g.munmap(AddressRange { start: 0x36700002000, size: 0x2000 })?; + g.munmap(AddressRange { start: 0x3670000a000, size: 0x1000 })?; + + let p1: usize = g.mmap_movable(0x1000, Protection::RW, 
addr)?; + assert_eq!(p1, 0x3670000a000); // fit in smallest hole + + Ok(()) +} + +#[test] +fn test_mremap_move_expand() -> TestResult { + unsafe { + let addr = AddressRange { start: 0x36800000000, size: 0x4000 }; + let mut b = MemoryBlock::new(addr); + let mut g = b.enter(); + let ptr = g.b.addr.slice_mut(); + + let initial_addr = AddressRange { start: 0x36800002000, size: 0x1000 }; + g.mmap_fixed(initial_addr, Protection::RW)?; + ptr[0x2004] = 11; + let p1 = g.mremap_maymove(initial_addr, 0x2000, addr)?; + assert_eq!(p1, addr.start); + assert_eq!(ptr[4], 11); + g.mmap_fixed(initial_addr, Protection::RW)?; + assert_eq!(ptr[0x2004], 0); + } + Ok(()) +} + +#[test] +fn test_mremap_move_shrink() -> TestResult { + unsafe { + let addr = AddressRange { start: 0x36900000000, size: 0x4000 }; + let mut b = MemoryBlock::new(addr); + let mut g = b.enter(); + let ptr = g.b.addr.slice_mut(); + + let initial_addr = AddressRange { start: 0x36900001000, size: 0x3000 }; + g.mmap_fixed(initial_addr, Protection::RW)?; + ptr[0x1004] = 11; + let p1 = g.mremap_maymove(initial_addr, 0x1000, addr)?; + assert_eq!(p1, addr.start); + assert_eq!(ptr[4], 11); + g.mmap_fixed(initial_addr, Protection::RW)?; + assert_eq!(ptr[0x1004], 0); + } + Ok(()) +} diff --git a/waterbox/waterboxhost/src/memory_block/tripguard.rs b/waterbox/waterboxhost/src/memory_block/tripguard.rs index 946a283ab9..42447fc741 100644 --- a/waterbox/waterboxhost/src/memory_block/tripguard.rs +++ b/waterbox/waterboxhost/src/memory_block/tripguard.rs @@ -1,5 +1,4 @@ -use std::ptr::null_mut; use super::MemoryBlock; use std::sync::Mutex; use crate::*; @@ -21,7 +20,7 @@ struct GlobalData { pub unsafe fn register(block: *mut MemoryBlock) { let mut data = GLOBAL_DATA.lock().unwrap(); if !data.initialized { - initialize(); + trip_pal::initialize(); data.initialized = true; } data.active_blocks.push(MemoryBlockRef(block)); @@ -60,81 +59,87 @@ unsafe fn trip(addr: usize) -> TripResult { } page.maybe_snapshot(page_start_addr); page.dirty 
= true; - assert!(pal::protect(AddressRange { start: page_start_addr, size: PAGESIZE }, page.native_prot())); - TripResult::Handled + if pal::protect(AddressRange { start: page_start_addr, size: PAGESIZE }, page.native_prot()) { + TripResult::Handled + } else { + std::process::abort(); + } } #[cfg(windows)] -fn initialize() { +mod trip_pal { + use super::*; use winapi::um::errhandlingapi::*; use winapi::um::winnt::*; use winapi::vc::excpt::*; - unsafe extern "system" fn handler(p_info: *mut EXCEPTION_POINTERS) -> i32 { - let p_record = &mut *(*p_info).ExceptionRecord; - let flags = p_record.ExceptionInformation[0]; - match p_record.ExceptionCode { - STATUS_ACCESS_VIOLATION if (flags & 1) != 0 => (), // write exception - STATUS_GUARD_PAGE_VIOLATION => (), // guard exception - _ => return EXCEPTION_CONTINUE_SEARCH - } - let fault_address = p_record.ExceptionInformation[1] as usize; - match trip(fault_address) { - TripResult::Handled => EXCEPTION_CONTINUE_EXECUTION, - TripResult::NotHandled => EXCEPTION_CONTINUE_SEARCH, - } - } - unsafe { - let res = AddVectoredExceptionHandler(1 /* CALL_FIRST */, Some(handler)); - assert!(res != null_mut(), "AddVectoredExceptionHandler failed"); - } -} - -#[cfg(unix)] -type SaHandler = unsafe extern fn(i32) -> (); -#[cfg(unix)] -type SaSigaction = unsafe extern fn(i32, *const siginfo_t, *const ucontext_t) -> (); -#[cfg(unix)] -use libc::*; -#[cfg(unix)] -static mut ALTSTACK: [u8; SIGSTKSZ] = [0; SIGSTKSZ]; -#[cfg(unix)] -static mut SA_OLD: Option> = None; -#[cfg(unix)] -fn initialize() { - use std::mem::{transmute, zeroed}; - - unsafe extern fn handler(sig: i32, info: *const siginfo_t, ucontext: *const ucontext_t) { - let fault_address = (*info).si_addr() as usize; - let write = (*ucontext).uc_mcontext.gregs[REG_ERR as usize] & 2 != 0; - let rethrow = !write || match trip(fault_address) { - TripResult::NotHandled => true, - _ => false - }; - if rethrow { - if SA_OLD.as_ref().unwrap().sa_flags & SA_SIGINFO != 0 { - 
transmute::(SA_OLD.as_ref().unwrap().sa_sigaction)(sig, info, ucontext); - } else { - transmute::(SA_OLD.as_ref().unwrap().sa_sigaction)(sig); + pub fn initialize() { + unsafe extern "system" fn handler(p_info: *mut EXCEPTION_POINTERS) -> i32 { + let p_record = &*(*p_info).ExceptionRecord; + let flags = p_record.ExceptionInformation[0]; + match p_record.ExceptionCode { + STATUS_ACCESS_VIOLATION if (flags & 1) != 0 => (), // write exception + STATUS_GUARD_PAGE_VIOLATION => (), // guard exception + _ => return EXCEPTION_CONTINUE_SEARCH } - abort(); + let fault_address = p_record.ExceptionInformation[1] as usize; + match trip(fault_address) { + TripResult::Handled => EXCEPTION_CONTINUE_EXECUTION, + TripResult::NotHandled => EXCEPTION_CONTINUE_SEARCH, + } + } + unsafe { + let res = AddVectoredExceptionHandler(1 /* CALL_FIRST */, Some(handler)); + assert!(!res.is_null(), "AddVectoredExceptionHandler failed"); } } - unsafe { - SA_OLD = Some(Box::new(zeroed::())); - let ss = stack_t { - ss_flags: 0, - ss_sp: &mut ALTSTACK[0] as *mut u8 as *mut c_void, - ss_size: SIGSTKSZ - }; - assert!(sigaltstack(&ss, null_mut()) == 0, "sigaltstack failed"); - let mut sa = sigaction { - sa_mask: zeroed::(), - sa_sigaction: transmute::(handler), - sa_flags: SA_ONSTACK | SA_SIGINFO, - sa_restorer: None, - }; - sigfillset(&mut sa.sa_mask); - assert!(sigaction(SIGSEGV, &sa, &mut **SA_OLD.as_mut().unwrap() as *mut sigaction) == 0, "sigaction failed"); +} + +#[cfg(unix)] +mod trip_pal { + use libc::*; + use super::*; + + type SaHandler = unsafe extern fn(i32) -> (); + type SaSigaction = unsafe extern fn(i32, *const siginfo_t, *const ucontext_t) -> (); + static mut SA_OLD: Option> = None; + + pub fn initialize() { + use std::mem::{transmute, zeroed}; + + unsafe extern fn handler(sig: i32, info: *const siginfo_t, ucontext: *const ucontext_t) { + let fault_address = (*info).si_addr() as usize; + let write = (*ucontext).uc_mcontext.gregs[REG_ERR as usize] & 2 != 0; + let rethrow = !write || match 
trip(fault_address) { + TripResult::NotHandled => true, + _ => false + }; + if rethrow { + let sa_old = SA_OLD.as_ref().unwrap(); + if sa_old.sa_flags & SA_SIGINFO != 0 { + transmute::(sa_old.sa_sigaction)(sig, info, ucontext); + } else { + transmute::(sa_old.sa_sigaction)(sig); + } + abort(); + } + } + unsafe { + SA_OLD = Some(Box::new(zeroed())); + let ss = stack_t { + ss_flags: 0, + ss_sp: Box::into_raw(Box::new(zeroed::<[u8; SIGSTKSZ]>())) as *mut c_void, // must CALL zeroed(): boxing the bare fn item is zero-sized and leaves ss_sp dangling + ss_size: SIGSTKSZ + }; + assert!(sigaltstack(&ss, 0 as *mut stack_t) == 0, "sigaltstack failed"); + let mut sa = sigaction { + sa_mask: zeroed(), + sa_sigaction: transmute::(handler), + sa_flags: SA_ONSTACK | SA_SIGINFO, + sa_restorer: None, + }; + sigfillset(&mut sa.sa_mask); + assert!(sigaction(SIGSEGV, &sa, &mut **SA_OLD.as_mut().unwrap() as *mut sigaction) == 0, "sigaction failed"); + } } } diff --git a/waterbox/waterboxhost/src/syscall_defs.rs b/waterbox/waterboxhost/src/syscall_defs.rs index a0203ed3cd..803570c0ac 100644 --- a/waterbox/waterboxhost/src/syscall_defs.rs +++ b/waterbox/waterboxhost/src/syscall_defs.rs @@ -2,147 +2,751 @@ // There are various crates that contain these, but they're #[cfg]'ed to the HOST system. 
// We want exactly the ones that waterbox guest MUSL uses, exactly the way they're defined there
+use std::{ops::Try, fmt};
+
 /// the result of a syscall in Rust-friendly form; OK or errno
-pub type SyscallResult = Result<(), i32>;
+pub type SyscallResult = Result<(), SyscallError>;
 /// map a syscall result as the kernel would return it
-pub fn map_syscall_result(result: SyscallResult) -> isize {
+pub fn syscall_ret(result: SyscallResult) -> SyscallReturn {
     match result {
-        Ok(()) => 0,
-        Err(i) => -i as isize,
+        Ok(()) => SyscallReturn::from_ok(0),
+        Err(e) => SyscallReturn::from_error(e)
+    }
+}
+/// map a value-carrying syscall result as the kernel would return it
+pub fn syscall_ret_val(result: Result<usize, SyscallError>) -> SyscallReturn {
+    match result {
+        Ok(v) => SyscallReturn::from_ok(v),
+        Err(e) => SyscallReturn::from_error(e)
+    }
+}
+/// map an i64-carrying syscall result as the kernel would return it.
+/// The value is cast to usize, so a negative `Ok` value lands above ERROR_THRESH and trips
+/// the assertion in `from_ok`.
+pub fn syscall_ret_i64(result: Result<i64, SyscallError>) -> SyscallReturn {
+    match result {
+        Ok(v) => SyscallReturn::from_ok(v as usize),
+        Err(e) => SyscallReturn::from_error(e)
+    }
+}
+/// encode an error as the kernel would return it (the negated errno)
+pub fn syscall_err(result: SyscallError) -> SyscallReturn {
+    SyscallReturn::from_error(result)
+}
+/// encode a success value as the kernel would return it; panics if it is above ERROR_THRESH
+pub fn syscall_ok(result: usize) -> SyscallReturn {
+    SyscallReturn::from_ok(result)
+}
+
+/// Raw syscall return value: either a success value or a negated errno stored in one word.
+#[repr(transparent)]
+pub struct SyscallReturn(pub usize);
+impl SyscallReturn {
+    /// Largest raw value still treated as success; anything strictly above it is an error.
+    pub const ERROR_THRESH: usize = -4096 as isize as usize;
+}
+impl Try for SyscallReturn {
+    type Ok = usize;
+    type Error = SyscallError;
+    /// Decode the raw word: values up to ERROR_THRESH are `Ok`, larger ones are negated back into an errno.
+    fn into_result(self) -> Result<Self::Ok, Self::Error> {
+        if self.0 <= SyscallReturn::ERROR_THRESH {
+            Ok(self.0)
+        } else {
+            Err(SyscallError(-(self.0 as i32)))
+        }
+    }
+    /// Encode an errno as its negation.
+    fn from_error(v: Self::Error) -> Self {
+        SyscallReturn(-v.0 as isize as usize)
+    }
+    /// Encode a success value; panics if it would be indistinguishable from an error.
+    fn from_ok(v: Self::Ok) -> Self {
+        assert!(v <= SyscallReturn::ERROR_THRESH);
+        SyscallReturn(v)
     }
 }
-pub const EPERM: i32 = 1;
-pub const ENOENT: i32 = 2;
-pub const ESRCH: i32 = 3;
-pub const EINTR: i32 = 4;
-pub const EIO: i32 = 5;
-pub const ENXIO: i32 = 6;
-pub const E2BIG: i32 = 7;
-pub const ENOEXEC: i32 = 8;
-pub const EBADF: i32 = 9;
-pub const ECHILD: i32 = 10;
-pub const EAGAIN: i32 = 11;
-pub const ENOMEM: i32 = 12;
-pub const EACCES: i32 = 13;
-pub const EFAULT: i32 = 14;
-pub const ENOTBLK: i32 = 15;
-pub const EBUSY: i32 = 16;
-pub const EEXIST: i32 = 17;
-pub const EXDEV: i32 = 18;
-pub const ENODEV: i32 = 19;
-pub const ENOTDIR: i32 = 20;
-pub const EISDIR: i32 = 21;
-pub const EINVAL: i32 = 22;
-pub const ENFILE: i32 = 23;
-pub const EMFILE: i32 = 24;
-pub const ENOTTY: i32 = 25;
-pub const ETXTBSY: i32 = 26;
-pub const EFBIG: i32 = 27;
-pub const ENOSPC: i32 = 28;
-pub const ESPIPE: i32 = 29;
-pub const EROFS: i32 = 30;
-pub const EMLINK: i32 = 31;
-pub const EPIPE: i32 = 32;
-pub const EDOM: i32 = 33;
-pub const ERANGE: i32 = 34;
-pub const EDEADLK: i32 = 35;
-pub const ENAMETOOLONG: i32 = 36;
-pub const ENOLCK: i32 = 37;
-pub const ENOSYS: i32 = 38;
-pub const ENOTEMPTY: i32 = 39;
-pub const ELOOP: i32 = 40;
-pub const EWOULDBLOCK: i32 = EAGAIN;
-pub const ENOMSG: i32 = 42;
-pub const EIDRM: i32 = 43;
-pub const ECHRNG: i32 = 44;
-pub const EL2NSYNC: i32 = 45;
-pub const EL3HLT: i32 = 46;
-pub const EL3RST: i32 = 47;
-pub const ELNRNG: i32 = 48;
-pub const EUNATCH: i32 = 49;
-pub const ENOCSI: i32 = 50;
-pub const EL2HLT: i32 = 51;
-pub const EBADE: i32 = 52;
-pub const EBADR: i32 = 53;
-pub const EXFULL: i32 = 54;
-pub const ENOANO: i32 = 55;
-pub const EBADRQC: i32 = 56;
-pub const EBADSLT: i32 = 57;
-pub const EDEADLOCK: i32 = EDEADLK;
-pub const EBFONT: i32 = 59;
-pub const ENOSTR: i32 = 60;
-pub const ENODATA: i32 = 61;
-pub const ETIME: i32 = 62;
-pub const ENOSR: i32 = 63;
-pub const ENONET: i32 = 64;
-pub const ENOPKG: i32 = 65;
-pub const EREMOTE: i32 = 66;
-pub const ENOLINK: i32 = 67;
-pub const EADV: i32 = 68;
-pub const ESRMNT: i32 = 69;
-pub const ECOMM: i32 = 70;
-pub const EPROTO: i32 = 71;
-pub const EMULTIHOP: i32 = 72;
-pub const EDOTDOT: i32 = 73;
-pub const EBADMSG: i32 = 74;
-pub const EOVERFLOW: i32 = 75;
-pub const ENOTUNIQ: i32 = 76;
-pub const EBADFD: i32 = 77;
-pub const EREMCHG: i32 = 78;
-pub const ELIBACC: i32 = 79;
-pub const ELIBBAD: i32 = 80;
-pub const ELIBSCN: i32 = 81;
-pub const ELIBMAX: i32 = 82;
-pub const ELIBEXEC: i32 = 83;
-pub const EILSEQ: i32 = 84;
-pub const ERESTART: i32 = 85;
-pub const ESTRPIPE: i32 = 86;
-pub const EUSERS: i32 = 87;
-pub const ENOTSOCK: i32 = 88;
-pub const EDESTADDRREQ: i32 = 89;
-pub const EMSGSIZE: i32 = 90;
-pub const EPROTOTYPE: i32 = 91;
-pub const ENOPROTOOPT: i32 = 92;
-pub const EPROTONOSUPPORT: i32 = 93;
-pub const ESOCKTNOSUPPORT: i32 = 94;
-pub const EOPNOTSUPP: i32 = 95;
-pub const ENOTSUP: i32 = EOPNOTSUPP;
-pub const EPFNOSUPPORT: i32 = 96;
-pub const EAFNOSUPPORT: i32 = 97;
-pub const EADDRINUSE: i32 = 98;
-pub const EADDRNOTAVAIL: i32 = 99;
-pub const ENETDOWN: i32 = 100;
-pub const ENETUNREACH: i32 = 101;
-pub const ENETRESET: i32 = 102;
-pub const ECONNABORTED: i32 = 103;
-pub const ECONNRESET: i32 = 104;
-pub const ENOBUFS: i32 = 105;
-pub const EISCONN: i32 = 106;
-pub const ENOTCONN: i32 = 107;
-pub const ESHUTDOWN: i32 = 108;
-pub const ETOOMANYREFS: i32 = 109;
-pub const ETIMEDOUT: i32 = 110;
-pub const ECONNREFUSED: i32 = 111;
-pub const EHOSTDOWN: i32 = 112;
-pub const EHOSTUNREACH: i32 = 113;
-pub const EALREADY: i32 = 114;
-pub const EINPROGRESS: i32 = 115;
-pub const ESTALE: i32 = 116;
-pub const EUCLEAN: i32 = 117;
-pub const ENOTNAM: i32 = 118;
-pub const ENAVAIL: i32 = 119;
-pub const EISNAM: i32 = 120;
-pub const EREMOTEIO: i32 = 121;
-pub const EDQUOT: i32 = 122;
-pub const ENOMEDIUM: i32 = 123;
-pub const EMEDIUMTYPE: i32 = 124;
-pub const ECANCELED: i32 = 125;
-pub const ENOKEY: i32 = 126;
-pub const EKEYEXPIRED: i32 = 127;
-pub const EKEYREVOKED: i32 = 128;
-pub const EKEYREJECTED: i32 = 129;
-pub const EOWNERDEAD: i32 = 130;
-pub const ENOTRECOVERABLE: i32 = 131;
-pub const ERFKILL: i32 = 132;
-pub const EHWPOISON: i32 = 133;
+/// `lookup! { fn_name: Type { NAME = value; ... } }` expands to one `pub const NAME: Type`
+/// per entry plus `pub fn fn_name(&Type) -> &'static str`, which maps a value back to the
+/// NAME of its entry ("????" when the value is not in the table).
+macro_rules! lookup {
+    ($P:ident: $T:ident { $($N:ident = $E:expr; )+ }) => (
+        $(pub const $N: $T = $T($E);)+
+        pub fn $P(val: &$T) -> &'static str {
+            match val {
+                // stringify the entry's name, not its value: the caller already has the number
+                $($T($E) => stringify!($N),)+
+                _ => "????"
+            }
+        }
+    );
+}
+
+/// An errno value, stored as a positive number.
+#[derive(Debug, Eq, PartialEq)]
+#[repr(transparent)]
+pub struct SyscallError(pub i32);
+impl From<i32> for SyscallError {
+    fn from(err: i32) -> SyscallError {
+        SyscallError(err)
+    }
+}
+impl fmt::Display for SyscallError {
+    /// Formats as `errno NAME`, or `errno ????` for a value missing from the table.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "errno {}", lookup_errno(self))
+    }
+}
+impl std::error::Error for SyscallError {}
+
+// Aliases that share a value with another entry are left commented out: each entry becomes
+// a match arm in lookup_errno, and a repeated value would only add an unreachable arm.
+lookup! { lookup_errno: SyscallError {
+    EPERM = 1;
+    ENOENT = 2;
+    ESRCH = 3;
+    EINTR = 4;
+    EIO = 5;
+    ENXIO = 6;
+    E2BIG = 7;
+    ENOEXEC = 8;
+    EBADF = 9;
+    ECHILD = 10;
+    EAGAIN = 11;
+    ENOMEM = 12;
+    EACCES = 13;
+    EFAULT = 14;
+    ENOTBLK = 15;
+    EBUSY = 16;
+    EEXIST = 17;
+    EXDEV = 18;
+    ENODEV = 19;
+    ENOTDIR = 20;
+    EISDIR = 21;
+    EINVAL = 22;
+    ENFILE = 23;
+    EMFILE = 24;
+    ENOTTY = 25;
+    ETXTBSY = 26;
+    EFBIG = 27;
+    ENOSPC = 28;
+    ESPIPE = 29;
+    EROFS = 30;
+    EMLINK = 31;
+    EPIPE = 32;
+    EDOM = 33;
+    ERANGE = 34;
+    EDEADLK = 35;
+    ENAMETOOLONG = 36;
+    ENOLCK = 37;
+    ENOSYS = 38;
+    ENOTEMPTY = 39;
+    ELOOP = 40;
+    // EWOULDBLOCK = EAGAIN;
+    ENOMSG = 42;
+    EIDRM = 43;
+    ECHRNG = 44;
+    EL2NSYNC = 45;
+    EL3HLT = 46;
+    EL3RST = 47;
+    ELNRNG = 48;
+    EUNATCH = 49;
+    ENOCSI = 50;
+    EL2HLT = 51;
+    EBADE = 52;
+    EBADR = 53;
+    EXFULL = 54;
+    ENOANO = 55;
+    EBADRQC = 56;
+    EBADSLT = 57;
+    // EDEADLOCK = EDEADLK;
+    EBFONT = 59;
+    ENOSTR = 60;
+    ENODATA = 61;
+    ETIME = 62;
+    ENOSR = 63;
+    ENONET = 64;
+    ENOPKG = 65;
+    EREMOTE = 66;
+    ENOLINK = 67;
+    EADV = 68;
+    ESRMNT = 69;
+    ECOMM = 70;
+    EPROTO = 71;
+    EMULTIHOP = 72;
+    EDOTDOT = 73;
+    EBADMSG = 74;
+    EOVERFLOW = 75;
+    ENOTUNIQ = 76;
+    EBADFD = 77;
+    EREMCHG = 78;
+    ELIBACC = 79;
+    ELIBBAD = 80;
+    ELIBSCN = 81;
+    ELIBMAX = 82;
+    ELIBEXEC = 83;
+    EILSEQ = 84;
+    ERESTART = 85;
+    ESTRPIPE = 86;
+    EUSERS = 87;
+    ENOTSOCK = 88;
+    EDESTADDRREQ = 89;
+    EMSGSIZE = 90;
+    EPROTOTYPE = 91;
+    ENOPROTOOPT = 92;
+    EPROTONOSUPPORT = 93;
+    ESOCKTNOSUPPORT = 94;
+    EOPNOTSUPP = 95;
+    // ENOTSUP = EOPNOTSUPP;
+    EPFNOSUPPORT = 96;
+    EAFNOSUPPORT = 97;
+    EADDRINUSE = 98;
+    EADDRNOTAVAIL = 99;
+    ENETDOWN = 100;
+    ENETUNREACH = 101;
+    ENETRESET = 102;
+    ECONNABORTED = 103;
+    ECONNRESET = 104;
+    ENOBUFS = 105;
+    EISCONN = 106;
+    ENOTCONN = 107;
+    ESHUTDOWN = 108;
+    ETOOMANYREFS = 109;
+    ETIMEDOUT = 110;
+    ECONNREFUSED = 111;
+    EHOSTDOWN = 112;
+    EHOSTUNREACH = 113;
+    EALREADY = 114;
+    EINPROGRESS = 115;
+    ESTALE = 116;
+    EUCLEAN = 117;
+    ENOTNAM = 118;
+    ENAVAIL = 119;
+    EISNAM = 120;
+    EREMOTEIO = 121;
+    EDQUOT = 122;
+    ENOMEDIUM = 123;
+    EMEDIUMTYPE = 124;
+    ECANCELED = 125;
+    ENOKEY = 126;
+    EKEYEXPIRED = 127;
+    EKEYREVOKED = 128;
+    EKEYREJECTED = 129;
+    EOWNERDEAD = 130;
+    ENOTRECOVERABLE = 131;
+    ERFKILL = 132;
+    EHWPOISON = 133;
+}}
+
+// st_mode: file type mask and file type values
+pub const S_IFMT: u32 = 0o0170000;
+
+pub const S_IFDIR: u32 = 0o0040000;
+pub const S_IFCHR: u32 = 0o0020000;
+pub const S_IFBLK: u32 = 0o0060000;
+pub const S_IFREG: u32 = 0o0100000;
+pub const S_IFIFO: u32 = 0o0010000;
+pub const S_IFLNK: u32 = 0o0120000;
+pub const S_IFSOCK: u32 = 0o0140000;
+
+// st_mode: setuid / setgid / sticky and permission bits
+pub const S_ISUID: u32 = 0o04000;
+pub const S_ISGID: u32 = 0o02000;
+pub const S_ISVTX: u32 = 0o01000;
+pub const S_IRUSR: u32 = 0o0400;
+pub const S_IWUSR: u32 = 0o0200;
+pub const S_IXUSR: u32 = 0o0100;
+pub const S_IRWXU: u32 = 0o0700;
+pub const S_IRGRP: u32 = 0o0040;
+pub const S_IWGRP: u32 = 0o0020;
+pub const S_IXGRP: u32 = 0o0010;
+pub const S_IRWXG: u32 = 0o0070;
+pub const S_IROTH: u32 = 0o0004;
+pub const S_IWOTH: u32 = 0o0002;
+pub const S_IXOTH: u32 = 0o0001;
+pub const S_IRWXO: u32 = 0o0007;
+
+/// Kernel stat object
+#[repr(C)]
+#[derive(Default)]
+pub struct KStat {
+    pub st_dev: u64,
+    pub st_ino: u64,
+    pub st_nlink: u64,
+
+    pub st_mode: u32,
+    pub st_uid: u32,
+    pub st_gid: u32,
+    pub __pad0: u32,
+    pub st_rdev: u64,
+    pub st_size: i64,
+    pub st_blksize: i64,
+    pub st_blocks: i64,
+
+    pub st_atime_sec: i64,
+    pub st_atime_nsec: i64,
+    pub st_mtime_sec: i64,
+    pub st_mtime_nsec: i64,
+    pub st_ctime_sec: i64,
+    pub st_ctime_nsec: i64,
+    pub __unused0: i64,
+    pub __unused1: i64,
+    pub __unused2: i64,
+}
+
+// lseek `whence` values
+pub const SEEK_SET: i32 = 0;
+pub const SEEK_CUR: i32 = 1;
+pub const SEEK_END: i32 = 2;
+
+// open() access mode flags
+pub const O_ACCMODE: i32 = O_PATH | O_RDONLY | O_WRONLY | O_RDWR;
+pub const O_PATH: i32 = 0o010000000;
+pub const O_RDONLY: i32 = 0;
+pub const O_WRONLY: i32 = 1;
+pub const O_RDWR: i32 = 2;
+
+/// One scatter/gather element: a base address and a length in bytes.
+#[repr(C)]
+pub struct Iovec {
+    pub iov_base: usize,
+    pub iov_len: usize,
+}
+impl Iovec {
+    /// View the described memory as a byte slice.
+    /// Safety: `iov_base..iov_base + iov_len` must be readable for the lifetime of the slice.
+    pub unsafe fn slice(&self) -> &[u8] {
+        std::slice::from_raw_parts(self.iov_base as *const u8, self.iov_len)
+    }
+    /// View the described memory as a mutable byte slice.
+    /// Safety: the range must be writable and not aliased by any other live reference; the
+    /// `&self` receiver does not enforce that exclusivity.
+    pub unsafe fn slice_mut(&self) -> &mut [u8] {
+        std::slice::from_raw_parts_mut(self.iov_base as *mut u8, self.iov_len)
+    }
+}
+
+/// A syscall number as issued by the guest.
+#[derive(Debug, Eq, PartialEq)]
+#[repr(transparent)]
+pub struct SyscallNumber(pub usize);
+
+lookup! { lookup_syscall: SyscallNumber {
+    NR_READ = 0;
+    NR_WRITE = 1;
+    NR_OPEN = 2;
+    NR_CLOSE = 3;
+    NR_STAT = 4;
+    NR_FSTAT = 5;
+    NR_LSTAT = 6;
+    NR_POLL = 7;
+    NR_LSEEK = 8;
+    NR_MMAP = 9;
+    NR_MPROTECT = 10;
+    NR_MUNMAP = 11;
+    NR_BRK = 12;
+    NR_RT_SIGACTION = 13;
+    NR_RT_SIGPROCMASK = 14;
+    NR_RT_SIGRETURN = 15;
+    NR_IOCTL = 16;
+    NR_PREAD64 = 17;
+    NR_PWRITE64 = 18;
+    NR_READV = 19;
+    NR_WRITEV = 20;
+    NR_ACCESS = 21;
+    NR_PIPE = 22;
+    NR_SELECT = 23;
+    NR_SCHED_YIELD = 24;
+    NR_MREMAP = 25;
+    NR_MSYNC = 26;
+    NR_MINCORE = 27;
+    NR_MADVISE = 28;
+    NR_SHMGET = 29;
+    NR_SHMAT = 30;
+    NR_SHMCTL = 31;
+    NR_DUP = 32;
+    NR_DUP2 = 33;
+    NR_PAUSE = 34;
+    NR_NANOSLEEP = 35;
+    NR_GETITIMER = 36;
+    NR_ALARM = 37;
+    NR_SETITIMER = 38;
+    NR_GETPID = 39;
+    NR_SENDFILE = 40;
+    NR_SOCKET = 41;
+    NR_CONNECT = 42;
+    NR_ACCEPT = 43;
+    NR_SENDTO = 44;
+    NR_RECVFROM = 45;
+    NR_SENDMSG = 46;
+    NR_RECVMSG = 47;
+    NR_SHUTDOWN = 48;
+    NR_BIND = 49;
+    NR_LISTEN = 50;
+    NR_GETSOCKNAME = 51;
+    NR_GETPEERNAME = 52;
+    NR_SOCKETPAIR = 53;
+    NR_SETSOCKOPT = 54;
+    NR_GETSOCKOPT = 55;
+    NR_CLONE = 56;
+    NR_FORK = 57;
+    NR_VFORK = 58;
+    NR_EXECVE = 59;
+    NR_EXIT = 60;
+    NR_WAIT4 = 61;
+    NR_KILL = 62;
+    NR_UNAME = 63;
+    NR_SEMGET = 64;
+    NR_SEMOP = 65;
+    NR_SEMCTL = 66;
+    NR_SHMDT = 67;
+    NR_MSGGET = 68;
+    NR_MSGSND = 69;
+    NR_MSGRCV = 70;
+    NR_MSGCTL = 71;
+    NR_FCNTL = 72;
+    NR_FLOCK = 73;
+    NR_FSYNC = 74;
+    NR_FDATASYNC = 75;
+    NR_TRUNCATE = 76;
+    NR_FTRUNCATE = 77;
+    NR_GETDENTS = 78;
+    NR_GETCWD = 79;
+    NR_CHDIR = 80;
+    NR_FCHDIR = 81;
+    NR_RENAME = 82;
+    NR_MKDIR = 83;
+    NR_RMDIR = 84;
+    NR_CREAT = 85;
+    NR_LINK = 86;
+    NR_UNLINK = 87;
+    NR_SYMLINK = 88;
+    NR_READLINK = 89;
+    NR_CHMOD = 90;
+    NR_FCHMOD = 91;
+    NR_CHOWN = 92;
+    NR_FCHOWN = 93;
+    NR_LCHOWN = 94;
+    NR_UMASK = 95;
+    NR_GETTIMEOFDAY = 96;
+    NR_GETRLIMIT = 97;
+    NR_GETRUSAGE = 98;
+    NR_SYSINFO = 99;
+    NR_TIMES = 100;
+    NR_PTRACE = 101;
+    NR_GETUID = 102;
+    NR_SYSLOG = 103;
+    NR_GETGID = 104;
+    NR_SETUID = 105;
+    NR_SETGID = 106;
+    NR_GETEUID = 107;
+    NR_GETEGID = 108;
+    NR_SETPGID = 109;
+    NR_GETPPID = 110;
+    NR_GETPGRP = 111;
+    NR_SETSID = 112;
+    NR_SETREUID = 113;
+    NR_SETREGID = 114;
+    NR_GETGROUPS = 115;
+    NR_SETGROUPS = 116;
+    NR_SETRESUID = 117;
+    NR_GETRESUID = 118;
+    NR_SETRESGID = 119;
+    NR_GETRESGID = 120;
+    NR_GETPGID = 121;
+    NR_SETFSUID = 122;
+    NR_SETFSGID = 123;
+    NR_GETSID = 124;
+    NR_CAPGET = 125;
+    NR_CAPSET = 126;
+    NR_RT_SIGPENDING = 127;
+    NR_RT_SIGTIMEDWAIT = 128;
+    NR_RT_SIGQUEUEINFO = 129;
+    NR_RT_SIGSUSPEND = 130;
+    NR_SIGALTSTACK = 131;
+    NR_UTIME = 132;
+    NR_MKNOD = 133;
+    NR_USELIB = 134;
+    NR_PERSONALITY = 135;
+    NR_USTAT = 136;
+    NR_STATFS = 137;
+    NR_FSTATFS = 138;
+    NR_SYSFS = 139;
+    NR_GETPRIORITY = 140;
+    NR_SETPRIORITY = 141;
+    NR_SCHED_SETPARAM = 142;
+    NR_SCHED_GETPARAM = 143;
+    NR_SCHED_SETSCHEDULER = 144;
+    NR_SCHED_GETSCHEDULER = 145;
+    NR_SCHED_GET_PRIORITY_MAX = 146;
+    NR_SCHED_GET_PRIORITY_MIN = 147;
+    NR_SCHED_RR_GET_INTERVAL = 148;
+    NR_MLOCK = 149;
+    NR_MUNLOCK = 150;
+    NR_MLOCKALL = 151;
+    NR_MUNLOCKALL = 152;
+    NR_VHANGUP = 153;
+    NR_MODIFY_LDT = 154;
+    NR_PIVOT_ROOT = 155;
+    NR__SYSCTL = 156;
+    NR_PRCTL = 157;
+    NR_ARCH_PRCTL = 158;
+    NR_ADJTIMEX = 159;
+    NR_SETRLIMIT = 160;
+    NR_CHROOT = 161;
+    NR_SYNC = 162;
+    NR_ACCT = 163;
+    NR_SETTIMEOFDAY = 164;
+    NR_MOUNT = 165;
+    NR_UMOUNT2 = 166;
+    NR_SWAPON = 167;
+    NR_SWAPOFF = 168;
+    NR_REBOOT = 169;
+    NR_SETHOSTNAME = 170;
+    NR_SETDOMAINNAME = 171;
+    NR_IOPL = 172;
+    NR_IOPERM = 173;
+    NR_CREATE_MODULE = 174;
+    NR_INIT_MODULE = 175;
+    NR_DELETE_MODULE = 176;
+    NR_GET_KERNEL_SYMS = 177;
+    NR_QUERY_MODULE = 178;
+    NR_QUOTACTL = 179;
+    NR_NFSSERVCTL = 180;
+    NR_GETPMSG = 181;
+    NR_PUTPMSG = 182;
+    NR_AFS_SYSCALL = 183;
+    NR_TUXCALL = 184;
+    NR_SECURITY = 185;
+    NR_GETTID = 186;
+    NR_READAHEAD = 187;
+    NR_SETXATTR = 188;
+    NR_LSETXATTR = 189;
+    NR_FSETXATTR = 190;
+    NR_GETXATTR = 191;
+    NR_LGETXATTR = 192;
+    NR_FGETXATTR = 193;
+    NR_LISTXATTR = 194;
+    NR_LLISTXATTR = 195;
+    NR_FLISTXATTR = 196;
+    NR_REMOVEXATTR = 197;
+    NR_LREMOVEXATTR = 198;
+    NR_FREMOVEXATTR = 199;
+    NR_TKILL = 200;
+    NR_TIME = 201;
+    NR_FUTEX = 202;
+    NR_SCHED_SETAFFINITY = 203;
+    NR_SCHED_GETAFFINITY = 204;
+    NR_SET_THREAD_AREA = 205;
+    NR_IO_SETUP = 206;
+    NR_IO_DESTROY = 207;
+    NR_IO_GETEVENTS = 208;
+    NR_IO_SUBMIT = 209;
+    NR_IO_CANCEL = 210;
+    NR_GET_THREAD_AREA = 211;
+    NR_LOOKUP_DCOOKIE = 212;
+    NR_EPOLL_CREATE = 213;
+    NR_EPOLL_CTL_OLD = 214;
+    NR_EPOLL_WAIT_OLD = 215;
+    NR_REMAP_FILE_PAGES = 216;
+    NR_GETDENTS64 = 217;
+    NR_SET_TID_ADDRESS = 218;
+    NR_RESTART_SYSCALL = 219;
+    NR_SEMTIMEDOP = 220;
+    NR_FADVISE64 = 221;
+    NR_TIMER_CREATE = 222;
+    NR_TIMER_SETTIME = 223;
+    NR_TIMER_GETTIME = 224;
+    NR_TIMER_GETOVERRUN = 225;
+    NR_TIMER_DELETE = 226;
+    NR_CLOCK_SETTIME = 227;
+    NR_CLOCK_GETTIME = 228;
+    NR_CLOCK_GETRES = 229;
+    NR_CLOCK_NANOSLEEP = 230;
+    NR_EXIT_GROUP = 231;
+    NR_EPOLL_WAIT = 232;
+    NR_EPOLL_CTL = 233;
+    NR_TGKILL = 234;
+    NR_UTIMES = 235;
+    NR_VSERVER = 236;
+    NR_MBIND = 237;
+    NR_SET_MEMPOLICY = 238;
+    NR_GET_MEMPOLICY = 239;
+    NR_MQ_OPEN = 240;
+    NR_MQ_UNLINK = 241;
+    NR_MQ_TIMEDSEND = 242;
+    NR_MQ_TIMEDRECEIVE = 243;
+    NR_MQ_NOTIFY = 244;
+    NR_MQ_GETSETATTR = 245;
+    NR_KEXEC_LOAD = 246;
+    NR_WAITID = 247;
+    NR_ADD_KEY = 248;
+    NR_REQUEST_KEY = 249;
+    NR_KEYCTL = 250;
+    NR_IOPRIO_SET = 251;
+    NR_IOPRIO_GET = 252;
+    NR_INOTIFY_INIT = 253;
+    NR_INOTIFY_ADD_WATCH = 254;
+    NR_INOTIFY_RM_WATCH = 255;
+    NR_MIGRATE_PAGES = 256;
+    NR_OPENAT = 257;
+    NR_MKDIRAT = 258;
+    NR_MKNODAT = 259;
+    NR_FCHOWNAT = 260;
+    NR_FUTIMESAT = 261;
+    NR_NEWFSTATAT = 262;
+    NR_UNLINKAT = 263;
+    NR_RENAMEAT = 264;
+    NR_LINKAT = 265;
+    NR_SYMLINKAT = 266;
+    NR_READLINKAT = 267;
+    NR_FCHMODAT = 268;
+    NR_FACCESSAT = 269;
+    NR_PSELECT6 = 270;
+    NR_PPOLL = 271;
+    NR_UNSHARE = 272;
+    NR_SET_ROBUST_LIST = 273;
+    NR_GET_ROBUST_LIST = 274;
+    NR_SPLICE = 275;
+    NR_TEE = 276;
+    NR_SYNC_FILE_RANGE = 277;
+    NR_VMSPLICE = 278;
+    NR_MOVE_PAGES = 279;
+    NR_UTIMENSAT = 280;
+    NR_EPOLL_PWAIT = 281;
+    NR_SIGNALFD = 282;
+    NR_TIMERFD_CREATE = 283;
+    NR_EVENTFD = 284;
+    NR_FALLOCATE = 285;
+    NR_TIMERFD_SETTIME = 286;
+    NR_TIMERFD_GETTIME = 287;
+    NR_ACCEPT4 = 288;
+    NR_SIGNALFD4 = 289;
+    NR_EVENTFD2 = 290;
+    NR_EPOLL_CREATE1 = 291;
+    NR_DUP3 = 292;
+    NR_PIPE2 = 293;
+    NR_INOTIFY_INIT1 = 294;
+    NR_PREADV = 295;
+    NR_PWRITEV = 296;
+    NR_RT_TGSIGQUEUEINFO = 297;
+    NR_PERF_EVENT_OPEN = 298;
+    NR_RECVMMSG = 299;
+    NR_FANOTIFY_INIT = 300;
+    NR_FANOTIFY_MARK = 301;
+    NR_PRLIMIT64 = 302;
+    NR_NAME_TO_HANDLE_AT = 303;
+    NR_OPEN_BY_HANDLE_AT = 304;
+    NR_CLOCK_ADJTIME = 305;
+    NR_SYNCFS = 306;
+    NR_SENDMMSG = 307;
+    NR_SETNS = 308;
+    NR_GETCPU = 309;
+    NR_PROCESS_VM_READV = 310;
+    NR_PROCESS_VM_WRITEV = 311;
+    NR_KCMP = 312;
+    NR_FINIT_MODULE = 313;
+    NR_SCHED_SETATTR = 314;
+    NR_SCHED_GETATTR = 315;
+    NR_RENAMEAT2 = 316;
+    NR_SECCOMP = 317;
+    NR_GETRANDOM = 318;
+    NR_MEMFD_CREATE = 319;
+    NR_KEXEC_FILE_LOAD = 320;
+    NR_BPF = 321;
+    NR_EXECVEAT = 322;
+    NR_USERFAULTFD = 323;
+    NR_MEMBARRIER = 324;
+    NR_MLOCK2 = 325;
+    NR_COPY_FILE_RANGE = 326;
+    NR_PREADV2 = 327;
+    NR_PWRITEV2 = 328;
+    NR_PKEY_MPROTECT = 329;
+    NR_PKEY_ALLOC = 330;
+    NR_PKEY_FREE = 331;
+    NR_STATX = 332;
+    NR_IO_PGETEVENTS = 333;
+    NR_RSEQ = 334;
+    NR_PIDFD_SEND_SIGNAL = 424;
+    NR_IO_URING_SETUP = 425;
+    NR_IO_URING_ENTER = 426;
+    NR_IO_URING_REGISTER = 427;
+    NR_OPEN_TREE = 428;
+    NR_MOVE_MOUNT = 429;
+    NR_FSOPEN = 430;
+    NR_FSCONFIG = 431;
+    NR_FSMOUNT = 432;
+    NR_FSPICK = 433;
+    NR_PIDFD_OPEN = 434;
+    NR_CLONE3 = 435;
+}}
+
+// mmap: failure return value and flags
+pub const MAP_FAILED: usize = 0xffffffffffffffff;
+
+pub const MAP_SHARED: usize = 0x01;
+pub const MAP_PRIVATE: usize = 0x02;
+pub const MAP_SHARED_VALIDATE: usize = 0x03;
+pub const MAP_TYPE: usize = 0x0f;
+pub const MAP_FIXED: usize = 0x10;
+pub const MAP_ANON: usize = 0x20;
+pub const MAP_32BIT: usize = 0x40;
+pub const MAP_ANONYMOUS: usize = MAP_ANON;
+pub const MAP_NORESERVE: usize = 0x4000;
+pub const MAP_GROWSDOWN: usize = 0x0100;
+pub const MAP_DENYWRITE: usize = 0x0800;
+pub const MAP_EXECUTABLE: usize = 0x1000;
+pub const MAP_LOCKED: usize = 0x2000;
+pub const MAP_POPULATE: usize = 0x8000;
+pub const MAP_NONBLOCK: usize = 0x10000;
+pub const MAP_STACK: usize = 0x20000;
+pub const MAP_HUGETLB: usize = 0x40000;
+pub const MAP_SYNC: usize = 0x80000;
+pub const MAP_FIXED_NOREPLACE: usize = 0x100000;
+pub const MAP_FILE: usize = 0;
+
+pub const MAP_HUGE_SHIFT: usize = 26;
+pub const MAP_HUGE_MASK: usize = 0x3f;
+pub const MAP_HUGE_64KB: usize = 16 << 26;
+pub const MAP_HUGE_512KB: usize = 19 << 26;
+pub const MAP_HUGE_1MB: usize = 20 << 26;
+pub const MAP_HUGE_2MB: usize = 21 << 26;
+pub const MAP_HUGE_8MB: usize = 23 << 26;
+pub const MAP_HUGE_16MB: usize = 24 << 26;
+pub const MAP_HUGE_32MB: usize = 25 << 26;
+pub const MAP_HUGE_256MB: usize = 28 << 26;
+pub const MAP_HUGE_512MB: usize = 29 << 26;
+pub const MAP_HUGE_1GB: usize = 30 << 26;
+pub const MAP_HUGE_2GB: usize = 31 << 26;
+pub const MAP_HUGE_16GB: usize = 34 << 26;
+
+// mmap / mprotect protection bits
+pub const PROT_NONE: usize = 0;
+pub const PROT_READ: usize = 1;
+pub const PROT_WRITE: usize = 2;
+pub const PROT_EXEC: usize = 4;
+pub const PROT_GROWSDOWN: usize = 0x01000000;
+pub const PROT_GROWSUP: usize = 0x02000000;
+
+pub const MS_ASYNC: usize = 1;
+pub const MS_INVALIDATE: usize = 2;
+pub const MS_SYNC: usize = 4;
+
+pub const MCL_CURRENT: usize = 1;
+pub const MCL_FUTURE: usize = 2;
+pub const MCL_ONFAULT: usize = 4;
+
+pub const POSIX_MADV_NORMAL: usize = 0;
+pub const POSIX_MADV_RANDOM: usize = 1;
+pub const POSIX_MADV_SEQUENTIAL: usize = 2;
+pub const POSIX_MADV_WILLNEED: usize = 3;
+pub const POSIX_MADV_DONTNEED: usize = 4;
+
+pub const MADV_NORMAL: usize = 0;
+pub const MADV_RANDOM: usize = 1;
+pub const MADV_SEQUENTIAL: usize = 2;
+pub const MADV_WILLNEED: usize = 3;
+pub const MADV_DONTNEED: usize = 4;
+pub const MADV_FREE: usize = 8;
+pub const MADV_REMOVE: usize = 9;
+pub const MADV_DONTFORK: usize = 10;
+pub const MADV_DOFORK: usize = 11;
+pub const MADV_MERGEABLE: usize = 12;
+pub const MADV_UNMERGEABLE: usize = 13;
+pub const MADV_HUGEPAGE: usize = 14;
+pub const MADV_NOHUGEPAGE: usize = 15;
+pub const MADV_DONTDUMP: usize = 16;
+pub const MADV_DODUMP: usize = 17;
+pub const MADV_WIPEONFORK: usize = 18;
+pub const MADV_KEEPONFORK: usize = 19;
+pub const MADV_COLD: usize = 20;
+pub const MADV_PAGEOUT: usize = 21;
+pub const MADV_HWPOISON: usize = 100;
+pub const MADV_SOFT_OFFLINE: usize = 101;
+
+pub const MREMAP_MAYMOVE: usize = 1;
+pub const MREMAP_FIXED: usize = 2;
+
+pub const MLOCK_ONFAULT: usize = 0x01;
+
+pub const MFD_CLOEXEC: usize = 0x0001;
+pub const MFD_ALLOW_SEALING: usize = 0x0002;
+pub const MFD_HUGETLB: usize = 0x0004;
+
+/// Seconds / nanoseconds pair, laid out as two consecutive i64 fields.
+#[repr(C)]
+pub struct TimeSpec {
+    pub tv_sec: i64,
+    pub tv_nsec: i64,
+}