From bb45466a4e83bbb39510c19a79fab46b7f716153 Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Wed, 6 Jul 2022 09:11:10 +0200 Subject: [PATCH] Implement clone in userspace. --- Cargo.lock | 1 + Cargo.toml | 1 + src/lib.rs | 1 + src/platform/pte.rs | 1 - src/platform/redox/exec.rs | 288 +++++++++++++------------------ src/platform/redox/extra.rs | 335 ++++++++++++++++++++++++++---------- src/platform/redox/mod.rs | 28 +-- 7 files changed, 369 insertions(+), 286 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a4199296a..9c1bc7241 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -333,6 +333,7 @@ dependencies = [ "lazy_static", "memchr", "memoffset", + "plain", "posix-regex", "ralloc", "rand", diff --git a/Cargo.toml b/Cargo.toml index f63fe9fb5..e7c0d7875 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ memoffset = "0.5.1" posix-regex = { path = "posix-regex", features = ["no_std"] } rand = { version = "0.5.5", default-features = false } memchr = { version = "2.2.0", default-features = false } +plain = "0.2" [dependencies.goblin] version = "0.0.21" diff --git a/src/lib.rs b/src/lib.rs index f3883d827..f03ce4c08 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,7 @@ #![allow(non_upper_case_globals)] #![allow(unused_variables)] #![feature(allocator_api)] +#![feature(array_chunks)] #![feature(asm_const)] #![feature(box_into_pin)] #![feature(c_variadic)] diff --git a/src/platform/pte.rs b/src/platform/pte.rs index fb779c6eb..b2a193f15 100644 --- a/src/platform/pte.rs +++ b/src/platform/pte.rs @@ -118,7 +118,6 @@ pub unsafe extern "C" fn pte_osThreadCreate( if stack_base as isize == -1 { return PTE_OS_GENERAL_FAILURE; } - ptr::write_bytes(stack_base as *mut u8, 0, stack_size); let stack_end = stack_base.add(stack_size); let mut stack = stack_end as *mut usize; { diff --git a/src/platform/redox/exec.rs b/src/platform/redox/exec.rs index 1b9904865..f2231068b 100644 --- a/src/platform/redox/exec.rs +++ b/src/platform/redox/exec.rs @@ -1,4 +1,5 @@ use core::convert::TryFrom; +use super::extra::FdGuard; use alloc::{ collections::{btree_map::Entry, BTreeMap}, @@ -7,14 +8,16 @@ use alloc::{ use syscall::{ data::ExecMemRange, - error::{Error, Result, ENOEXEC, ENOMEM}, - flag::{AT_ENTRY, AT_NULL, AT_PHDR, AT_PHENT, AT_PHNUM, MapFlags}, + error::*, + flag::{AT_ENTRY, AT_NULL, AT_PHDR, AT_PHENT, AT_PHNUM, MapFlags, O_WRONLY, SEEK_SET}, }; use crate::fs::File; -fn read_all(fd: usize, offset: u64, buf: &mut [u8]) -> Result<()> { - syscall::lseek(fd, offset as isize, syscall::SEEK_SET).unwrap(); +fn read_all(fd: usize, offset: Option<u64>, buf: &mut [u8]) -> Result<()> { + if let Some(offset) = offset { + syscall::lseek(fd, offset as isize, syscall::SEEK_SET).unwrap(); + } let mut total_bytes_read = 0; @@ -27,12 +30,12 @@ fn read_all(fd: usize, offset: u64, buf: &mut [u8]) -> Result<()> { Ok(()) } -fn find_free_target_addr(tree: &BTreeMap<usize, TreeEntry>, size: usize) -> Option<usize> { +fn find_free_target_addr(tree: &BTreeMap<usize, usize>, size: usize) -> Option<usize> { let mut iterator = tree.iter().peekable(); // Ignore the space between zero and the first region, to avoid null pointers. - while let Some((cur_address, entry)) = iterator.next() { - let end = *cur_address + entry.size; + while let Some((cur_address, entry_size)) = iterator.next() { + let end = *cur_address + entry_size; if let Some((next_address, _)) = iterator.peek() { if **next_address - end > size { @@ -45,20 +48,6 @@ fn find_free_target_addr(tree: &BTreeMap<usize, TreeEntry>, size: usize) -> Opti None } -struct TreeEntry { - size: usize, // always a page-size multiple - flags: MapFlags, - accessible_addr: *mut u8, // also always a page-size multiple -} -impl Drop for TreeEntry { - fn drop(&mut self) { - unsafe { - if !self.accessible_addr.is_null() { - let _ = syscall::funmap(self.accessible_addr as usize, self.size); - } - } - } -} #[cfg(target_arch = "x86_64")] const PAGE_SIZE: usize = 4096; @@ -66,6 +55,8 @@ const PAGE_SIZE: usize = 4096; const FD_ANONYMOUS: usize = !0; pub fn fexec_impl(file: File, path: &[u8], args: &[&[u8]], envs: &[&[u8]], args_envs_size_without_nul: usize) -> Result<usize> { + use goblin::elf64::{header::Header, program_header::program_header64::{ProgramHeader, PT_LOAD, PF_W, PF_X}}; + let fd = *file as usize; let total_args_envs_size = args_envs_size_without_nul + args.len() + envs.len(); @@ -78,198 +69,155 @@ pub fn fexec_impl(file: File, path: &[u8], args: &[&[u8]], envs: &[&[u8]], args_ // TODO: Introduce RAII guards to all owned allocations so that no leaks occur in case of // errors. - use goblin::elf::header::header64::Header; - let mut header_bytes = [0_u8; core::mem::size_of::<Header>()]; - - read_all(fd, 0, &mut header_bytes)?; - + read_all(fd, Some(0), &mut header_bytes)?; let header = Header::from_bytes(&header_bytes); + let grants_fd = { + let current_addrspace_fd = FdGuard::new(syscall::open("thisproc:current/addrspace", 0)?); + FdGuard::new(syscall::dup(*current_addrspace_fd, b"empty")?) + }; + let memory_fd = FdGuard::new(syscall::dup(*grants_fd, b"mem")?); + let instruction_ptr = usize::try_from(header.e_entry).map_err(|_| Error::new(ENOEXEC))?; - let mut tree = BTreeMap::<usize, TreeEntry>::new(); + // Never allow more than 1 MiB of program headers. TODO: Capabilities again? + const MAX_PH_SIZE: usize = 1024 * 1024; + let phentsize = u64::from(header.e_phentsize) as usize; + let phnum = u64::from(header.e_phnum) as usize; + let pheaders_size = phentsize.saturating_mul(phnum); + + if pheaders_size > MAX_PH_SIZE { + return Err(Error::new(E2BIG)); + } + let mut phs = vec! [0_u8; pheaders_size]; - use goblin::elf64::program_header::{self, ProgramHeader}; + let mut tree = BTreeMap::new(); + tree.insert(0, PAGE_SIZE); - let phdrs_size = (header.e_phnum as usize) * (header.e_phentsize as usize); - let phdrs_size_aligned = (phdrs_size + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE; - let phdrs_mem = unsafe { syscall::fmap(FD_ANONYMOUS, &syscall::Map { offset: 0, size: phdrs_size_aligned, address: 0, flags: MapFlags::PROT_WRITE | MapFlags::MAP_PRIVATE })? }; - read_all(fd, header.e_phoff, unsafe { core::slice::from_raw_parts_mut(phdrs_mem as *mut u8, phdrs_size) })?; + const BUFSZ: usize = 16384; + let mut buf = vec! [0_u8; BUFSZ]; - let phdrs = unsafe { core::slice::from_raw_parts(phdrs_mem as *const ProgramHeader, header.e_phnum as usize) }; + read_all(*file as usize, Some(header.e_phoff), &mut phs).map_err(|_| Error::new(EIO))?; - for segment in phdrs { + for ph_idx in 0..phnum { + let ph_bytes = &phs[ph_idx * phentsize..(ph_idx + 1) * phentsize]; + let segment: &ProgramHeader = plain::from_bytes(ph_bytes).map_err(|_| Error::new(EINVAL))?; let mut flags = syscall::PROT_READ; // W ^ X. If it is executable, do not allow it to be writable, even if requested - if segment.p_flags & program_header::PF_X == program_header::PF_X { + if segment.p_flags & PF_X == PF_X { flags |= syscall::PROT_EXEC; - } else if segment.p_flags & program_header::PF_W == program_header::PF_W { + } else if segment.p_flags & PF_W == PF_W { flags |= syscall::PROT_WRITE; } - match segment.p_type { - program_header::PT_LOAD => { - let voff = segment.p_vaddr as usize % PAGE_SIZE; - let vaddr = segment.p_vaddr as usize - voff; - let size = - (segment.p_memsz as usize + voff + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE; - - if segment.p_filesz > segment.p_memsz { - return Err(Error::new(ENOEXEC)); - } - - let mem = match tree - .range_mut(..=vaddr) - .next_back() - .filter(|(other_vaddr, entry)| **other_vaddr + entry.size > vaddr) - { - None => unsafe { - let mem = syscall::fmap( - FD_ANONYMOUS, - &syscall::Map { - offset: 0, - address: 0, - size, - flags: syscall::PROT_WRITE, - }, - ) - .map_err(|_| Error::new(ENOMEM))? - as *mut u8; - tree.insert( - vaddr, - TreeEntry { - size, - flags, - accessible_addr: mem, - }, - ); - mem - }, - Some(( - _, - &mut TreeEntry { - flags: ref mut f, - accessible_addr, - .. - }, - )) => { - *f |= flags; - accessible_addr - } - }; - read_all(fd, segment.p_offset, unsafe { - core::slice::from_raw_parts_mut(mem.add(voff), segment.p_filesz as usize) - })?; + let voff = segment.p_vaddr as usize % PAGE_SIZE; + let vaddr = segment.p_vaddr as usize - voff; + let size = + (segment.p_memsz as usize + voff + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE; + + if segment.p_filesz > segment.p_memsz { + return Err(Error::new(ENOEXEC)); + } + if segment.p_type == PT_LOAD { + mprotect_remote(*grants_fd, vaddr, size, flags)?; + syscall::lseek(*file as usize, segment.p_offset as isize, SEEK_SET).map_err(|_| Error::new(EIO))?; + syscall::lseek(*memory_fd, segment.p_vaddr as isize, SEEK_SET).map_err(|_| Error::new(EIO))?; + + for size in core::iter::repeat(BUFSZ).take((segment.p_filesz as usize) / BUFSZ).chain(Some((segment.p_filesz as usize) % BUFSZ)) { + read_all(*file as usize, None, &mut buf[..size]).map_err(|_| Error::new(EIO))?; + let _ = syscall::write(*memory_fd, &buf[..size]).map_err(|_| Error::new(EIO))?; + } + + if !tree.range(..=vaddr).next_back().filter(|(start, size)| **start + **size > vaddr).is_some() { + tree.insert(vaddr, size); } - _ => (), } } - let (stack_base, mut stack_mem) = unsafe { - let stack_base = syscall::fmap(FD_ANONYMOUS, &syscall::Map { offset: 0, size: STACK_SIZE, address: 0, flags: MapFlags::PROT_WRITE | MapFlags::PROT_READ | MapFlags::MAP_PRIVATE })? as *mut u8; - let stack_mem = stack_base.add(STACK_SIZE).sub(256); - - (stack_base, stack_mem) - }; - - tree.insert(STACK_TOP - STACK_SIZE, TreeEntry { - size: STACK_SIZE, - flags: MapFlags::PROT_READ | MapFlags::PROT_WRITE | MapFlags::MAP_PRIVATE, - accessible_addr: stack_base, - }); - let mut stack_mem = stack_mem.cast::<usize>(); + // Setup a stack starting from the very end of the address space, and then growing downwards. + const STACK_TOP: usize = 1 << 47; + const STACK_SIZE: usize = 1024 * 1024; - let target_phdr_address = find_free_target_addr(&tree, phdrs_size_aligned).ok_or(Error::new(ENOMEM))?; - tree.insert(target_phdr_address, TreeEntry { - size: phdrs_size_aligned, - accessible_addr: phdrs_mem as *mut u8, - flags: MapFlags::PROT_READ | MapFlags::MAP_PRIVATE, - }); + mprotect_remote(*grants_fd, STACK_TOP - STACK_SIZE, STACK_SIZE, MapFlags::PROT_READ | MapFlags::PROT_WRITE)?; + tree.insert(STACK_TOP - STACK_SIZE, STACK_SIZE); let mut sp = STACK_TOP - 256; - let mut push = |word: usize| unsafe { + let mut push = |word: usize| { sp -= core::mem::size_of::<usize>(); - stack_mem = stack_mem.sub(1); - stack_mem.write(word); + let _ = syscall::lseek(*memory_fd, sp as isize, SEEK_SET)?; + let _ = syscall::write(*memory_fd, &usize::to_ne_bytes(word))?; + Ok(()) }; - push(0); - push(AT_NULL); - push(instruction_ptr); - push(AT_ENTRY); - push(target_phdr_address); - push(AT_PHDR); - push(header.e_phnum as usize); - push(AT_PHNUM); - push(header.e_phentsize as usize); - push(AT_PHENT); + let pheaders_size_aligned = (pheaders_size+PAGE_SIZE-1)/PAGE_SIZE*PAGE_SIZE; + let pheaders = find_free_target_addr(&tree, pheaders_size_aligned).ok_or(Error::new(ENOMEM))?; + tree.insert(pheaders, pheaders_size_aligned); + mprotect_remote(*grants_fd, pheaders, pheaders_size_aligned, MapFlags::PROT_READ)?; + + syscall::lseek(*memory_fd, pheaders as isize, SEEK_SET).map_err(|_| Error::new(EIO))?; + syscall::write(*memory_fd, &phs).map_err(|_| Error::new(EIO))?; + + push(0)?; + push(AT_NULL)?; + push(header.e_entry as usize)?; + push(AT_ENTRY)?; + push(pheaders)?; + push(AT_PHDR)?; + push(header.e_phnum as usize)?; + push(AT_PHNUM)?; + push(header.e_phentsize as usize)?; + push(AT_PHENT)?; let args_envs_size_aligned = (total_args_envs_size+PAGE_SIZE-1)/PAGE_SIZE*PAGE_SIZE; let target_args_env_address = find_free_target_addr(&tree, args_envs_size_aligned).ok_or(Error::new(ENOMEM))?; + mprotect_remote(*grants_fd, target_args_env_address, args_envs_size_aligned, MapFlags::PROT_READ | MapFlags::PROT_WRITE)?; + tree.insert(target_args_env_address, args_envs_size_aligned); - unsafe { - let map = syscall::Map { - offset: 0, - flags: MapFlags::PROT_READ | MapFlags::PROT_WRITE | MapFlags::MAP_PRIVATE, - address: 0, - size: args_envs_size_aligned, - }; - let ptr = syscall::fmap(FD_ANONYMOUS, &map)? as *mut u8; - let args_envs_region = core::slice::from_raw_parts_mut(ptr, total_args_envs_size); - let mut offset = 0; - - for collection in &[envs, args] { - push(0); - - for source_slice in collection.iter().rev().copied() { - push(target_args_env_address + offset); - args_envs_region[offset..offset + source_slice.len()].copy_from_slice(source_slice); - offset += source_slice.len() + 1; - } - } + let mut offset = 0; - tree.insert(target_args_env_address, TreeEntry { - accessible_addr: ptr, - size: args_envs_size_aligned, - flags: MapFlags::PROT_READ | MapFlags::MAP_PRIVATE, - }); - } - push(args.len()); + let mut argc = 0; - const STACK_TOP: usize = (1 << 47); - const STACK_SIZE: usize = 1024 * 1024; + for (collection, is_args) in [(envs, false), (args, true)] { + push(0)?; - let memranges = tree - .into_iter() - .map(|(address, mut tree_entry)| { - // Prevent use-after-free - let old_address = core::mem::replace(&mut tree_entry.accessible_addr, core::ptr::null_mut()) as usize; - - ExecMemRange { - address, - size: tree_entry.size, - flags: tree_entry.flags.bits(), - old_address, - } - }) - .collect::<Vec<_>>(); + for source_slice in collection.iter().rev() { + if is_args { argc += 1; } + push(target_args_env_address + offset)?; - /*unsafe { - let stack = &*(stack_mem as *const crate::start::Stack); + syscall::lseek(*memory_fd, (target_args_env_address + offset) as isize, SEEK_SET).map_err(|_| Error::new(EIO))?; + let _ = syscall::write(*memory_fd, source_slice).map_err(|_| Error::new(EIO))?; + offset += source_slice.len() + 1; + } + } - }*/ + push(argc)?; unsafe { crate::ld_so::tcb::Tcb::deactivate(); } // TODO: Restore old name if exec failed? - if let Ok(name_fd) = syscall::open("thisproc:current/name", syscall::O_WRONLY) { + if let Ok(name_fd) = syscall::open("thisproc:current/name", O_WRONLY) { let _ = syscall::write(name_fd, path); let _ = syscall::close(name_fd); } drop(file); - syscall::exec(&memranges, instruction_ptr, sp)?; + let addrspace_selection_fd = FdGuard::new(syscall::open("thisproc:current/current-addrspace", O_WRONLY)?); + + let mut buf = [0_u8; 24]; + buf[..8].copy_from_slice(&usize::to_ne_bytes(*grants_fd)); + buf[8..16].copy_from_slice(&usize::to_ne_bytes(sp)); + buf[16..24].copy_from_slice(&usize::to_ne_bytes(header.e_entry as usize)); + + let _ = syscall::write(*addrspace_selection_fd, &buf); unreachable!(); } +fn mprotect_remote(socket: usize, addr: usize, len: usize, flags: MapFlags) -> Result<()> { + let mut grants_buf = [0_u8; 24]; + grants_buf[..8].copy_from_slice(&usize::to_ne_bytes(addr)); + grants_buf[8..16].copy_from_slice(&usize::to_ne_bytes(len)); + grants_buf[16..24].copy_from_slice(&usize::to_ne_bytes(flags.bits())); + syscall::write(socket, &grants_buf)?; + Ok(()) +} diff --git a/src/platform/redox/extra.rs b/src/platform/redox/extra.rs index 93fa489e4..dc7f46868 100644 --- a/src/platform/redox/extra.rs +++ b/src/platform/redox/extra.rs @@ -1,7 +1,10 @@ -use core::{ptr, slice}; +use core::{mem, ptr, slice}; use core::arch::global_asm; -use syscall::data::CloneInfo; +use syscall::data::Map; +use syscall::flag::{MapFlags, O_CLOEXEC}; +use syscall::error::{Error, Result, EINVAL, ENAMETOOLONG}; +use syscall::SIGCONT; use crate::platform::{sys::e, types::*}; @@ -51,80 +54,248 @@ pub unsafe extern "C" fn redox_physunmap(virtual_address: *mut c_void) -> c_int e(syscall::physunmap(virtual_address as usize)) as c_int } -extern "C" { - pub fn pte_clone_inner(info: *const CloneInfo) -> usize; +pub struct FdGuard { + fd: usize, + taken: bool, } +impl FdGuard { + pub fn new(fd: usize) -> Self { + Self { + fd, taken: false, + } + } + pub fn take(&mut self) -> usize { + self.taken = true; + self.fd + } +} +impl core::ops::Deref for FdGuard { + type Target = usize; + + fn deref(&self) -> &Self::Target { + &self.fd + } +} + +impl Drop for FdGuard { + fn drop(&mut self) { + if !self.taken { + let _ = syscall::close(self.fd); + } + } +} + +fn new_context() -> Result<(FdGuard, usize)> { + // Create a new context (fields such as uid/gid will be inherited from the current context). + let fd = FdGuard::new(syscall::open("thisproc:new/open_via_dup", O_CLOEXEC)?); + // Extract pid. + let mut buffer = [0_u8; 64]; + let len = syscall::fpath(*fd, &mut buffer)?; + let buffer = buffer.get(..len).ok_or(Error::new(ENAMETOOLONG))?; + + let colon_idx = buffer.iter().position(|c| *c == b':').ok_or(Error::new(EINVAL))?; + let slash_idx = buffer.iter().skip(colon_idx).position(|c| *c == b'/').ok_or(Error::new(EINVAL))? + colon_idx; + let pid_bytes = buffer.get(colon_idx + 1..slash_idx).ok_or(Error::new(EINVAL))?; + let pid_str = core::str::from_utf8(pid_bytes).map_err(|_| Error::new(EINVAL))?; + let pid = pid_str.parse::<usize>().map_err(|_| Error::new(EINVAL))?; + + Ok((fd, pid)) +} + +fn copy_str(cur_pid_fd: usize, new_pid_fd: usize, key: &str) -> Result<()> { + let cur_name_fd = FdGuard::new(syscall::dup(cur_pid_fd, key.as_bytes())?); + let new_name_fd = FdGuard::new(syscall::dup(new_pid_fd, key.as_bytes())?); + + let mut buf = [0_u8; 256]; + let len = syscall::read(*cur_name_fd, &mut buf)?; + let buf = buf.get(..len).ok_or(Error::new(ENAMETOOLONG))?; + + syscall::write(*new_name_fd, &buf)?; + + Ok(()) +} #[cfg(target_arch = "x86_64")] -global_asm!(" - .globl pte_clone_inner - .type pte_clone_inner, @function - .p2align 6", - // Parameters: <info_ptr> in RDI -"pte_clone_inner: - mov rax, {SYS_CLONE} - mov rsi, rdi - mov rdi, {CLONE_FLAGS} - mov rdx, {INFO_LEN}", - // Call clone(flags, info_ptr, info_len) syscall - "syscall - - # Check if child or parent - test rax, rax - jnz .parent +fn copy_float_env_regs(cur_pid_fd: usize, new_pid_fd: usize) -> Result<()> { + // Copy environment registers. + { + let cur_env_regs_fd = FdGuard::new(syscall::dup(cur_pid_fd, b"regs/env")?); + let new_env_regs_fd = FdGuard::new(syscall::dup(new_pid_fd, b"regs/env")?); - # Load registers - pop rax - pop rdi - pop rsi - pop rdx - pop rcx - pop r8 - pop r9 + let mut env_regs = syscall::EnvRegisters::default(); + let _ = syscall::read(*cur_env_regs_fd, &mut env_regs)?; + let _ = syscall::write(*new_env_regs_fd, &env_regs)?; + } + // Copy float registers. + { + let cur_float_regs_fd = FdGuard::new(syscall::dup(cur_pid_fd, b"regs/float")?); + let new_float_regs_fd = FdGuard::new(syscall::dup(new_pid_fd, b"regs/float")?); - # Call entry point - call rax + let mut float_regs = syscall::FloatRegisters::default(); + let _ = syscall::read(*cur_float_regs_fd, &mut float_regs)?; + let _ = syscall::write(*new_float_regs_fd, &float_regs)?; + } + + Ok(()) +} + +/// Spawns a new context sharing the same address space as the current one (i.e. a new thread). +pub unsafe fn pte_clone_impl(stack: *mut usize) -> Result<usize> { + let cur_pid_fd = FdGuard::new(syscall::open("thisproc:current/open_via_dup", O_CLOEXEC)?); + let (new_pid_fd, new_pid) = new_context()?; + + // Allocate a new signal stack. + { + let sigstack_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"sigstack")?); + + const SIGSTACK_SIZE: usize = 1024 * 256; + + // TODO: Put sigstack at high addresses? + let target_sigstack = syscall::fmap(!0, &Map { address: 0, flags: MapFlags::PROT_READ | MapFlags::PROT_WRITE | MapFlags::MAP_PRIVATE, offset: 0, size: SIGSTACK_SIZE })? + SIGSTACK_SIZE; + + let _ = syscall::write(*sigstack_fd, &usize::to_ne_bytes(target_sigstack))?; + } + + copy_str(*cur_pid_fd, *new_pid_fd, "name")?; + copy_str(*cur_pid_fd, *new_pid_fd, "cwd")?; + + // Reuse existing address space + { + let cur_addr_space_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"addrspace")?); + let new_addr_space_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-addrspace")?); - # Exit - mov rax, {SYS_EXIT} - xor rdi, rdi - syscall + let buf = create_set_addr_space_buf(*cur_addr_space_fd, pte_clone_ret as usize, stack as usize); + let _ = syscall::write(*new_addr_space_sel_fd, &buf)?; + } + + // Reuse file table + { + let cur_filetable_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"filetable")?); + let new_filetable_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-filetable")?); + + let _ = syscall::write(*new_filetable_sel_fd, &usize::to_ne_bytes(*cur_filetable_fd))?; + } + + + copy_float_env_regs(*cur_pid_fd, *new_pid_fd)?; + + // Unblock context. + syscall::kill(new_pid, SIGCONT); + + Ok(0) +} +fn create_set_addr_space_buf(space: usize, ip: usize, sp: usize) -> [u8; mem::size_of::<usize>() * 3] { + let mut buf = [0_u8; 3 * mem::size_of::<usize>()]; + let mut chunks = buf.array_chunks_mut::<{mem::size_of::<usize>()}>(); + *chunks.next().unwrap() = usize::to_ne_bytes(space); + *chunks.next().unwrap() = usize::to_ne_bytes(sp); + *chunks.next().unwrap() = usize::to_ne_bytes(ip); + buf +} +/// Spawns a new context which will not share the same address space as the current one. File +/// descriptors from other schemes are reobtained with `dup`, and grants referencing such file +/// descriptors are reobtained through `fmap`. Other mappings are kept but duplicated using CoW. +pub fn fork_impl() -> Result<usize> { + unsafe { + Error::demux(fork_wrapper()) + } +} + +fn fork_inner(initial_rsp: *mut usize) -> Result<usize> { + let new_pid = { + let cur_pid_fd = FdGuard::new(syscall::open("thisproc:current/open_via_dup", O_CLOEXEC)?); + let (new_pid_fd, new_pid) = new_context()?; + + // Do not allocate new signal stack, but copy existing address (all memory will be re-mapped + // CoW later). + { + let cur_sigstack_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"sigstack")?); + let new_sigstack_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"sigstack")?); + + let mut sigstack_buf = usize::to_ne_bytes(0); + + let _ = syscall::read(*cur_sigstack_fd, &mut sigstack_buf); + let _ = syscall::write(*new_sigstack_fd, &sigstack_buf); + } + + copy_str(*cur_pid_fd, *new_pid_fd, "name")?; + copy_str(*cur_pid_fd, *new_pid_fd, "cwd")?; + + // CoW-duplicate address space. + { + let cur_addr_space_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"addrspace")?); + + // FIXME: Find mappings which use external file descriptors + + let new_addr_space_fd = FdGuard::new(syscall::dup(*cur_addr_space_fd, b"exclusive")?); + let new_addr_space_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-addrspace")?); - # Invalid instruction on failure to exit - ud2 + let buf = create_set_addr_space_buf(*new_addr_space_fd, fork_ret as usize, initial_rsp as usize); + let _ = syscall::write(*new_addr_space_sel_fd, &buf)?; + } - # Return PID if parent -.parent: + // Copy existing files into new file table, but do not reuse the same file table (i.e. new + // parent FDs will not show up for the child). + { + let cur_filetable_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"filetable")?); + // TODO: Use cross_scheme_links or something similar to avoid copying the file table in the + // kernel. + let new_filetable_fd = FdGuard::new(syscall::dup(*cur_filetable_fd, b"copy")?); + let new_filetable_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-filetable")?); + + let _ = syscall::write(*new_filetable_sel_fd, &usize::to_ne_bytes(*new_filetable_fd)); + } + copy_float_env_regs(*cur_pid_fd, *new_pid_fd)?; + + new_pid + }; + + // Unblock context. + syscall::kill(new_pid, SIGCONT); + + Ok(new_pid) +} +#[no_mangle] +unsafe extern "sysv64" fn __relibc_internal_fork_impl(initial_rsp: *mut usize) -> usize { + Error::mux(fork_inner(initial_rsp)) +} + +core::arch::global_asm!(" + .p2align 6 + .globl fork_wrapper + .type fork_wrapper, @function +fork_wrapper: + push rbp + mov rbp, rsp + + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + + mov rdi, rsp + call __relibc_internal_fork_impl + jmp 2f + +fork_ret: + xor rax, rax +2: + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + + pop rbp ret - ", - SYS_EXIT = const(syscall::SYS_EXIT), - SYS_CLONE = const(syscall::SYS_CLONE), - CLONE_FLAGS = const( - syscall::CLONE_VM.bits() - | syscall::CLONE_FS.bits() - | syscall::CLONE_FILES.bits() - | syscall::CLONE_SIGHAND.bits() - | syscall::CLONE_STACK.bits() - ), - INFO_LEN = const(core::mem::size_of::<CloneInfo>()), -); - -/*global_asm!(" - .globl pte_clone_inner - .type pte_clone_inner, @function - -pte_clone_inner: - # Move the 1st argument `stack` of this function into the second argument to clone. - mov rsi, rdi - mov rax, {SYS_CLONE} - mov rdi, {flags} - - # Call clone syscall - syscall - - # Check if child or parent - test rax, rax - jnz 2f + .size fork_wrapper, . - fork_wrapper + + .globl pte_clone_ret + .type pte_clone_ret, @function +pte_clone_ret: # Load registers pop rax @@ -138,28 +309,12 @@ pte_clone_inner: # Call entry point call rax - # Exit - mov rax, 1 - xor rdi, rdi - syscall - - # Invalid instruction on failure to exit - ud2 - - # Return PID if parent -2: ret + .size pte_clone_ret, . - pte_clone_ret +"); - .size pte_clone_inner, . - pte_clone_inner - - ", - - flags = const( - syscall::CLONE_VM.bits() - | syscall::CLONE_FS.bits() - | syscall::CLONE_FILES.bits() - | syscall::CLONE_SIGHAND.bits() - | syscall::CLONE_STACK.bits() - ), - SYS_CLONE = const(syscall::SYS_CLONE), -);*/ +extern "sysv64" { + fn fork_wrapper() -> usize; + fn fork_ret(); + fn pte_clone_ret(); +} diff --git a/src/platform/redox/mod.rs b/src/platform/redox/mod.rs index 266ffec10..a51b56dce 100644 --- a/src/platform/redox/mod.rs +++ b/src/platform/redox/mod.rs @@ -355,7 +355,7 @@ impl Pal for Sys { // Close all O_CLOEXEC file descriptors. TODO: close_range? { - let name = CStr::from_bytes_with_nul(b"thisproc:current/files\0").expect("string should be valid"); + let name = CStr::from_bytes_with_nul(b"thisproc:current/filetable\0").expect("string should be valid"); let files_fd = match File::open(name, fcntl::O_RDONLY) { Ok(f) => f, Err(_) => return -1, @@ -455,7 +455,7 @@ impl Pal for Sys { } fn fork() -> pid_t { - e(unsafe { syscall::clone(syscall::CloneFlags::empty()) }) as pid_t + e(extra::fork_impl()) as pid_t } fn fstat(fildes: c_int, buf: *mut stat) -> c_int { @@ -938,29 +938,7 @@ impl Pal for Sys { #[cfg(target_arch = "x86_64")] unsafe fn pte_clone(stack: *mut usize) -> pid_t { - let flags = syscall::CLONE_VM - | syscall::CLONE_FS - | syscall::CLONE_FILES - | syscall::CLONE_SIGHAND - | syscall::CLONE_STACK; - let flags = flags.bits(); - - use syscall::{Map, MapFlags}; - - const SIGSTACK_SIZE: usize = 1024 * 256; - - // TODO: Put sigstack at high addresses? - let target_sigstack = match syscall::fmap(!0, &Map { address: 0, flags: MapFlags::PROT_READ | MapFlags::PROT_WRITE | MapFlags::MAP_PRIVATE, offset: 0, size: SIGSTACK_SIZE }) { - Ok(s) => s + SIGSTACK_SIZE, - Err(err) => return e(Err(err)) as pid_t, - }; - - let info = CloneInfo { - target_stack: stack as usize, - target_sigstack, - }; - - e(syscall::Error::demux(extra::pte_clone_inner(&info))) as pid_t + e(extra::pte_clone_impl(stack)) as pid_t } fn read(fd: c_int, buf: &mut [u8]) -> ssize_t { -- GitLab