From bb45466a4e83bbb39510c19a79fab46b7f716153 Mon Sep 17 00:00:00 2001
From: 4lDO2 <4lDO2@protonmail.com>
Date: Wed, 6 Jul 2022 09:11:10 +0200
Subject: [PATCH] Implement clone in userspace.

---
 Cargo.lock                  |   1 +
 Cargo.toml                  |   1 +
 src/lib.rs                  |   1 +
 src/platform/pte.rs         |   1 -
 src/platform/redox/exec.rs  | 288 +++++++++++++------------------
 src/platform/redox/extra.rs | 335 ++++++++++++++++++++++++++----------
 src/platform/redox/mod.rs   |  28 +--
 7 files changed, 369 insertions(+), 286 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index a4199296a..9c1bc7241 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -333,6 +333,7 @@ dependencies = [
  "lazy_static",
  "memchr",
  "memoffset",
+ "plain",
  "posix-regex",
  "ralloc",
  "rand",
diff --git a/Cargo.toml b/Cargo.toml
index f63fe9fb5..e7c0d7875 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,6 +24,7 @@ memoffset = "0.5.1"
 posix-regex = { path = "posix-regex", features = ["no_std"] }
 rand = { version = "0.5.5", default-features = false }
 memchr = { version = "2.2.0", default-features = false }
+plain = "0.2"
 
 [dependencies.goblin]
 version = "0.0.21"
diff --git a/src/lib.rs b/src/lib.rs
index f3883d827..f03ce4c08 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,6 +3,7 @@
 #![allow(non_upper_case_globals)]
 #![allow(unused_variables)]
 #![feature(allocator_api)]
+#![feature(array_chunks)]
 #![feature(asm_const)]
 #![feature(box_into_pin)]
 #![feature(c_variadic)]
diff --git a/src/platform/pte.rs b/src/platform/pte.rs
index fb779c6eb..b2a193f15 100644
--- a/src/platform/pte.rs
+++ b/src/platform/pte.rs
@@ -118,7 +118,6 @@ pub unsafe extern "C" fn pte_osThreadCreate(
     if stack_base as isize == -1 {
         return PTE_OS_GENERAL_FAILURE;
     }
-    ptr::write_bytes(stack_base as *mut u8, 0, stack_size);
     let stack_end = stack_base.add(stack_size);
     let mut stack = stack_end as *mut usize;
     {
diff --git a/src/platform/redox/exec.rs b/src/platform/redox/exec.rs
index 1b9904865..f2231068b 100644
--- a/src/platform/redox/exec.rs
+++ b/src/platform/redox/exec.rs
@@ -1,4 +1,5 @@
 use core::convert::TryFrom;
+use super::extra::FdGuard;
 
 use alloc::{
     collections::{btree_map::Entry, BTreeMap},
@@ -7,14 +8,16 @@ use alloc::{
 
 use syscall::{
     data::ExecMemRange,
-    error::{Error, Result, ENOEXEC, ENOMEM},
-    flag::{AT_ENTRY, AT_NULL, AT_PHDR, AT_PHENT, AT_PHNUM, MapFlags},
+    error::*,
+    flag::{AT_ENTRY, AT_NULL, AT_PHDR, AT_PHENT, AT_PHNUM, MapFlags, O_WRONLY, SEEK_SET},
 };
 
 use crate::fs::File;
 
-fn read_all(fd: usize, offset: u64, buf: &mut [u8]) -> Result<()> {
-    syscall::lseek(fd, offset as isize, syscall::SEEK_SET).unwrap();
+fn read_all(fd: usize, offset: Option<u64>, buf: &mut [u8]) -> Result<()> {
+    if let Some(offset) = offset { // `None`: no seek is performed before reading
+        syscall::lseek(fd, offset as isize, syscall::SEEK_SET)?; // report a failed seek, don't panic
+    }
 
     let mut total_bytes_read = 0;
 
@@ -27,12 +30,12 @@ fn read_all(fd: usize, offset: u64, buf: &mut [u8]) -> Result<()> {
     Ok(())
 }
 
-fn find_free_target_addr(tree: &BTreeMap<usize, TreeEntry>, size: usize) -> Option<usize> {
+fn find_free_target_addr(tree: &BTreeMap<usize, usize>, size: usize) -> Option<usize> {
     let mut iterator = tree.iter().peekable();
 
     // Ignore the space between zero and the first region, to avoid null pointers.
-    while let Some((cur_address, entry)) = iterator.next() {
-        let end = *cur_address + entry.size;
+    while let Some((cur_address, entry_size)) = iterator.next() {
+        let end = *cur_address + entry_size;
 
         if let Some((next_address, _)) = iterator.peek() {
             if **next_address - end > size {
@@ -45,20 +48,6 @@ fn find_free_target_addr(tree: &BTreeMap<usize, TreeEntry>, size: usize) -> Opti
 
     None
 }
-struct TreeEntry {
-    size: usize, // always a page-size multiple
-    flags: MapFlags,
-    accessible_addr: *mut u8, // also always a page-size multiple
-}
-impl Drop for TreeEntry {
-    fn drop(&mut self) {
-        unsafe {
-            if !self.accessible_addr.is_null() {
-                let _ = syscall::funmap(self.accessible_addr as usize, self.size);
-            }
-        }
-    }
-}
 
 #[cfg(target_arch = "x86_64")]
 const PAGE_SIZE: usize = 4096;
@@ -66,6 +55,8 @@ const PAGE_SIZE: usize = 4096;
 const FD_ANONYMOUS: usize = !0;
 
 pub fn fexec_impl(file: File, path: &[u8], args: &[&[u8]], envs: &[&[u8]], args_envs_size_without_nul: usize) -> Result<usize> {
+    use goblin::elf64::{header::Header, program_header::program_header64::{ProgramHeader, PT_LOAD, PF_W, PF_X}};
+
     let fd = *file as usize;
     let total_args_envs_size = args_envs_size_without_nul + args.len() + envs.len();
 
@@ -78,198 +69,155 @@ pub fn fexec_impl(file: File, path: &[u8], args: &[&[u8]], envs: &[&[u8]], args_
     // TODO: Introduce RAII guards to all owned allocations so that no leaks occur in case of
     // errors.
 
-    use goblin::elf::header::header64::Header;
-
     let mut header_bytes = [0_u8; core::mem::size_of::<Header>()];
-
-    read_all(fd, 0, &mut header_bytes)?;
-
+    read_all(fd, Some(0), &mut header_bytes)?;
     let header = Header::from_bytes(&header_bytes);
 
+    let grants_fd = {
+        let current_addrspace_fd = FdGuard::new(syscall::open("thisproc:current/addrspace", 0)?);
+        FdGuard::new(syscall::dup(*current_addrspace_fd, b"empty")?)
+    };
+    let memory_fd = FdGuard::new(syscall::dup(*grants_fd, b"mem")?);
+
     let instruction_ptr = usize::try_from(header.e_entry).map_err(|_| Error::new(ENOEXEC))?;
 
-    let mut tree = BTreeMap::<usize, TreeEntry>::new();
+    // Never allow more than 1 MiB of program headers. TODO: Capabilities again?
+    const MAX_PH_SIZE: usize = 1024 * 1024;
+    let phentsize = u64::from(header.e_phentsize) as usize;
+    let phnum = u64::from(header.e_phnum) as usize;
+    let pheaders_size = phentsize.saturating_mul(phnum);
+
+    if pheaders_size > MAX_PH_SIZE {
+        return Err(Error::new(E2BIG));
+    }
+    let mut phs = vec! [0_u8; pheaders_size];
 
-    use goblin::elf64::program_header::{self, ProgramHeader};
+    let mut tree = BTreeMap::new();
+    tree.insert(0, PAGE_SIZE);
 
-    let phdrs_size = (header.e_phnum as usize) * (header.e_phentsize as usize);
-    let phdrs_size_aligned = (phdrs_size + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE;
-    let phdrs_mem = unsafe { syscall::fmap(FD_ANONYMOUS, &syscall::Map { offset: 0, size: phdrs_size_aligned, address: 0, flags: MapFlags::PROT_WRITE | MapFlags::MAP_PRIVATE })? };
-    read_all(fd, header.e_phoff, unsafe { core::slice::from_raw_parts_mut(phdrs_mem as *mut u8, phdrs_size) })?;
+    const BUFSZ: usize = 16384;
+    let mut buf = vec! [0_u8; BUFSZ];
 
-    let phdrs = unsafe { core::slice::from_raw_parts(phdrs_mem as *const ProgramHeader, header.e_phnum as usize) };
+    read_all(*file as usize, Some(header.e_phoff), &mut phs).map_err(|_| Error::new(EIO))?;
 
-    for segment in phdrs {
+    for ph_idx in 0..phnum {
+        let ph_bytes = &phs[ph_idx * phentsize..(ph_idx + 1) * phentsize];
+        let segment: &ProgramHeader = plain::from_bytes(ph_bytes).map_err(|_| Error::new(EINVAL))?;
         let mut flags = syscall::PROT_READ;
 
         // W ^ X. If it is executable, do not allow it to be writable, even if requested
-        if segment.p_flags & program_header::PF_X == program_header::PF_X {
+        if segment.p_flags & PF_X == PF_X {
             flags |= syscall::PROT_EXEC;
-        } else if segment.p_flags & program_header::PF_W == program_header::PF_W {
+        } else if segment.p_flags & PF_W == PF_W {
             flags |= syscall::PROT_WRITE;
         }
 
-        match segment.p_type {
-            program_header::PT_LOAD => {
-                let voff = segment.p_vaddr as usize % PAGE_SIZE;
-                let vaddr = segment.p_vaddr as usize - voff;
-                let size =
-                    (segment.p_memsz as usize + voff + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE;
-
-                if segment.p_filesz > segment.p_memsz {
-                    return Err(Error::new(ENOEXEC));
-                }
-
-                let mem = match tree
-                    .range_mut(..=vaddr)
-                    .next_back()
-                    .filter(|(other_vaddr, entry)| **other_vaddr + entry.size > vaddr)
-                {
-                    None => unsafe {
-                        let mem = syscall::fmap(
-                            FD_ANONYMOUS,
-                            &syscall::Map {
-                                offset: 0,
-                                address: 0,
-                                size,
-                                flags: syscall::PROT_WRITE,
-                            },
-                        )
-                        .map_err(|_| Error::new(ENOMEM))?
-                            as *mut u8;
-                        tree.insert(
-                            vaddr,
-                            TreeEntry {
-                                size,
-                                flags,
-                                accessible_addr: mem,
-                            },
-                        );
-                        mem
-                    },
-                    Some((
-                        _,
-                        &mut TreeEntry {
-                            flags: ref mut f,
-                            accessible_addr,
-                            ..
-                        },
-                    )) => {
-                        *f |= flags;
-                        accessible_addr
-                    }
-                };
-                read_all(fd, segment.p_offset, unsafe {
-                    core::slice::from_raw_parts_mut(mem.add(voff), segment.p_filesz as usize)
-                })?;
+        let voff = segment.p_vaddr as usize % PAGE_SIZE;
+        let vaddr = segment.p_vaddr as usize - voff;
+        let size =
+            (segment.p_memsz as usize + voff + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE;
+
+        if segment.p_filesz > segment.p_memsz {
+            return Err(Error::new(ENOEXEC));
+        }
+        if segment.p_type == PT_LOAD {
+            mprotect_remote(*grants_fd, vaddr, size, flags)?;
+            syscall::lseek(*file as usize, segment.p_offset as isize, SEEK_SET).map_err(|_| Error::new(EIO))?;
+            syscall::lseek(*memory_fd, segment.p_vaddr as isize, SEEK_SET).map_err(|_| Error::new(EIO))?;
+
+            for size in core::iter::repeat(BUFSZ).take((segment.p_filesz as usize) / BUFSZ).chain(Some((segment.p_filesz as usize) % BUFSZ)) {
+                read_all(*file as usize, None, &mut buf[..size]).map_err(|_| Error::new(EIO))?;
+                let _ = syscall::write(*memory_fd, &buf[..size]).map_err(|_| Error::new(EIO))?;
+            }
+
+            if !tree.range(..=vaddr).next_back().filter(|(start, size)| **start + **size > vaddr).is_some() {
+                tree.insert(vaddr, size);
             }
-            _ => (),
         }
     }
-    let (stack_base, mut stack_mem) = unsafe {
-        let stack_base = syscall::fmap(FD_ANONYMOUS, &syscall::Map { offset: 0, size: STACK_SIZE, address: 0, flags: MapFlags::PROT_WRITE | MapFlags::PROT_READ | MapFlags::MAP_PRIVATE })? as *mut u8;
-        let stack_mem = stack_base.add(STACK_SIZE).sub(256);
-
-        (stack_base, stack_mem)
-    };
-
-    tree.insert(STACK_TOP - STACK_SIZE, TreeEntry {
-        size: STACK_SIZE,
-        flags: MapFlags::PROT_READ | MapFlags::PROT_WRITE | MapFlags::MAP_PRIVATE,
-        accessible_addr: stack_base,
-    });
-    let mut stack_mem = stack_mem.cast::<usize>();
+    // Setup a stack starting from the very end of the address space, and then growing downwards.
+    const STACK_TOP: usize = 1 << 47;
+    const STACK_SIZE: usize = 1024 * 1024;
 
-    let target_phdr_address = find_free_target_addr(&tree, phdrs_size_aligned).ok_or(Error::new(ENOMEM))?;
-    tree.insert(target_phdr_address, TreeEntry {
-        size: phdrs_size_aligned,
-        accessible_addr: phdrs_mem as *mut u8,
-        flags: MapFlags::PROT_READ | MapFlags::MAP_PRIVATE,
-    });
+    mprotect_remote(*grants_fd, STACK_TOP - STACK_SIZE, STACK_SIZE, MapFlags::PROT_READ | MapFlags::PROT_WRITE)?;
+    tree.insert(STACK_TOP - STACK_SIZE, STACK_SIZE);
 
     let mut sp = STACK_TOP - 256;
 
-    let mut push = |word: usize| unsafe {
+    let mut push = |word: usize| {
         sp -= core::mem::size_of::<usize>();
-        stack_mem = stack_mem.sub(1);
-        stack_mem.write(word);
+        let _ = syscall::lseek(*memory_fd, sp as isize, SEEK_SET)?;
+        let _ = syscall::write(*memory_fd, &usize::to_ne_bytes(word))?;
+        Ok::<(), Error>(()) // pin the closure's error type; `?` alone leaves it to inference
     };
 
-    push(0);
-    push(AT_NULL);
-    push(instruction_ptr);
-    push(AT_ENTRY);
-    push(target_phdr_address);
-    push(AT_PHDR);
-    push(header.e_phnum as usize);
-    push(AT_PHNUM);
-    push(header.e_phentsize as usize);
-    push(AT_PHENT);
+    let pheaders_size_aligned = (pheaders_size+PAGE_SIZE-1)/PAGE_SIZE*PAGE_SIZE;
+    let pheaders = find_free_target_addr(&tree, pheaders_size_aligned).ok_or(Error::new(ENOMEM))?;
+    tree.insert(pheaders, pheaders_size_aligned);
+    mprotect_remote(*grants_fd, pheaders, pheaders_size_aligned, MapFlags::PROT_READ)?;
+
+    syscall::lseek(*memory_fd, pheaders as isize, SEEK_SET).map_err(|_| Error::new(EIO))?;
+    syscall::write(*memory_fd, &phs).map_err(|_| Error::new(EIO))?;
+
+    push(0)?;
+    push(AT_NULL)?;
+    push(header.e_entry as usize)?;
+    push(AT_ENTRY)?;
+    push(pheaders)?;
+    push(AT_PHDR)?;
+    push(header.e_phnum as usize)?;
+    push(AT_PHNUM)?;
+    push(header.e_phentsize as usize)?;
+    push(AT_PHENT)?;
 
     let args_envs_size_aligned = (total_args_envs_size+PAGE_SIZE-1)/PAGE_SIZE*PAGE_SIZE;
     let target_args_env_address = find_free_target_addr(&tree, args_envs_size_aligned).ok_or(Error::new(ENOMEM))?;
+    mprotect_remote(*grants_fd, target_args_env_address, args_envs_size_aligned, MapFlags::PROT_READ | MapFlags::PROT_WRITE)?;
+    tree.insert(target_args_env_address, args_envs_size_aligned);
 
-    unsafe {
-        let map = syscall::Map {
-            offset: 0,
-            flags: MapFlags::PROT_READ | MapFlags::PROT_WRITE | MapFlags::MAP_PRIVATE,
-            address: 0,
-            size: args_envs_size_aligned,
-        };
-        let ptr = syscall::fmap(FD_ANONYMOUS, &map)? as *mut u8;
-        let args_envs_region = core::slice::from_raw_parts_mut(ptr, total_args_envs_size);
-        let mut offset = 0;
-
-        for collection in &[envs, args] {
-            push(0);
-
-            for source_slice in collection.iter().rev().copied() {
-                push(target_args_env_address + offset);
-                args_envs_region[offset..offset + source_slice.len()].copy_from_slice(source_slice);
-                offset += source_slice.len() + 1;
-            }
-        }
+    let mut offset = 0;
 
-        tree.insert(target_args_env_address, TreeEntry {
-            accessible_addr: ptr,
-            size: args_envs_size_aligned,
-            flags: MapFlags::PROT_READ | MapFlags::MAP_PRIVATE,
-        });
-    }
-    push(args.len());
+    let mut argc = 0;
 
-    const STACK_TOP: usize = (1 << 47);
-    const STACK_SIZE: usize = 1024 * 1024;
+    for (collection, is_args) in [(envs, false), (args, true)] {
+        push(0)?;
 
-    let memranges = tree
-        .into_iter()
-        .map(|(address, mut tree_entry)| {
-            // Prevent use-after-free
-            let old_address = core::mem::replace(&mut tree_entry.accessible_addr, core::ptr::null_mut()) as usize;
-
-            ExecMemRange {
-                address,
-                size: tree_entry.size,
-                flags: tree_entry.flags.bits(),
-                old_address,
-            }
-        })
-        .collect::<Vec<_>>();
+        for source_slice in collection.iter().rev() {
+            if is_args { argc += 1; }
+            push(target_args_env_address + offset)?;
 
-    /*unsafe {
-        let stack = &*(stack_mem as *const crate::start::Stack);
+            syscall::lseek(*memory_fd, (target_args_env_address + offset) as isize, SEEK_SET).map_err(|_| Error::new(EIO))?;
+            let _ = syscall::write(*memory_fd, source_slice).map_err(|_| Error::new(EIO))?;
+            offset += source_slice.len() + 1;
+        }
+    }
 
-    }*/
+    push(argc)?;
 
     unsafe { crate::ld_so::tcb::Tcb::deactivate(); }
 
     // TODO: Restore old name if exec failed?
-    if let Ok(name_fd) = syscall::open("thisproc:current/name", syscall::O_WRONLY) {
+    if let Ok(name_fd) = syscall::open("thisproc:current/name", O_WRONLY) {
         let _ = syscall::write(name_fd, path);
         let _ = syscall::close(name_fd);
     }
     drop(file);
 
-    syscall::exec(&memranges, instruction_ptr, sp)?;
+    let addrspace_selection_fd = FdGuard::new(syscall::open("thisproc:current/current-addrspace", O_WRONLY)?);
+    // Request layout, three native-endian words: [address space fd, stack pointer, entry point].
+    let mut buf = [0_u8; 24];
+    buf[..8].copy_from_slice(&usize::to_ne_bytes(*grants_fd));
+    buf[8..16].copy_from_slice(&usize::to_ne_bytes(sp));
+    buf[16..24].copy_from_slice(&usize::to_ne_bytes(header.e_entry as usize));
+    // A failed write must surface as an error instead of reaching `unreachable!()` below.
+    syscall::write(*addrspace_selection_fd, &buf)?;
     unreachable!();
 }
+/// Sends one `(addr, len, flags)` request, as three native-endian words, to `socket`.
+fn mprotect_remote(socket: usize, addr: usize, len: usize, flags: MapFlags) -> Result<()> {
+    const WORD: usize = core::mem::size_of::<usize>();
+    let mut request = [0_u8; 3 * WORD];
+    let words = [addr, len, flags.bits()];
+    request.chunks_exact_mut(WORD).zip(words).for_each(|(slot, word)| slot.copy_from_slice(&word.to_ne_bytes()));
+    syscall::write(socket, &request).map(|_| ())
+}
diff --git a/src/platform/redox/extra.rs b/src/platform/redox/extra.rs
index 93fa489e4..dc7f46868 100644
--- a/src/platform/redox/extra.rs
+++ b/src/platform/redox/extra.rs
@@ -1,7 +1,10 @@
-use core::{ptr, slice};
+use core::{mem, ptr, slice};
 use core::arch::global_asm;
 
-use syscall::data::CloneInfo;
+use syscall::data::Map;
+use syscall::flag::{MapFlags, O_CLOEXEC};
+use syscall::error::{Error, Result, EINVAL, ENAMETOOLONG};
+use syscall::SIGCONT;
 
 use crate::platform::{sys::e, types::*};
 
@@ -51,80 +54,248 @@ pub unsafe extern "C" fn redox_physunmap(virtual_address: *mut c_void) -> c_int
     e(syscall::physunmap(virtual_address as usize)) as c_int
 }
 
-extern "C" {
-    pub fn pte_clone_inner(info: *const CloneInfo) -> usize;
+pub struct FdGuard {
+    fd: usize,
+    taken: bool,
 }
+impl FdGuard {
+    /// Wraps `fd`; the descriptor is closed on drop unless `take` has been called.
+    pub fn new(fd: usize) -> Self {
+        Self { fd, taken: false }
+    }
+    /// Marks the descriptor as taken and returns it, so that drop no longer closes it.
+    pub fn take(&mut self) -> usize {
+        self.taken = true;
+        self.fd
+    }
+}
+impl core::ops::Deref for FdGuard {
+    type Target = usize;
+
+    fn deref(&self) -> &Self::Target {
+        &self.fd
+    }
+}
+
+impl Drop for FdGuard {
+    fn drop(&mut self) {
+        if !self.taken {
+            let _ = syscall::close(self.fd);
+        }
+    }
+}
+
+fn new_context() -> Result<(FdGuard, usize)> {
+    // Create a new context (fields such as uid/gid will be inherited from the current context).
+    let fd = FdGuard::new(syscall::open("thisproc:new/open_via_dup", O_CLOEXEC)?);
 
+    // Extract pid.
+    let mut buffer = [0_u8; 64];
+    let len = syscall::fpath(*fd, &mut buffer)?;
+    let buffer = buffer.get(..len).ok_or(Error::new(ENAMETOOLONG))?;
+
+    let colon_idx = buffer.iter().position(|c| *c == b':').ok_or(Error::new(EINVAL))?;
+    let slash_idx = buffer.iter().skip(colon_idx).position(|c| *c == b'/').ok_or(Error::new(EINVAL))? + colon_idx;
+    let pid_bytes = buffer.get(colon_idx + 1..slash_idx).ok_or(Error::new(EINVAL))?;
+    let pid_str = core::str::from_utf8(pid_bytes).map_err(|_| Error::new(EINVAL))?;
+    let pid = pid_str.parse::<usize>().map_err(|_| Error::new(EINVAL))?;
+
+    Ok((fd, pid))
+}
+
+/// Copies the value stored under `key` (e.g. "name" or "cwd") from one context to another.
+///
+/// At most 256 bytes are transferred, using a single `read` followed by a single `write`.
+fn copy_str(cur_pid_fd: usize, new_pid_fd: usize, key: &str) -> Result<()> {
+    let src_fd = FdGuard::new(syscall::dup(cur_pid_fd, key.as_bytes())?);
+    let dst_fd = FdGuard::new(syscall::dup(new_pid_fd, key.as_bytes())?);
+
+    let mut storage = [0_u8; 256];
+    let len = syscall::read(*src_fd, &mut storage)?;
+    let value = storage.get(..len).ok_or(Error::new(ENAMETOOLONG))?;
+    syscall::write(*dst_fd, value).map(|_| ())
+}
 #[cfg(target_arch = "x86_64")]
-global_asm!("
-    .globl pte_clone_inner
-    .type pte_clone_inner, @function
-    .p2align 6",
-    // Parameters: <info_ptr> in RDI
-"pte_clone_inner:
-    mov rax, {SYS_CLONE}
-    mov rsi, rdi
-    mov rdi, {CLONE_FLAGS}
-    mov rdx, {INFO_LEN}",
-    // Call clone(flags, info_ptr, info_len) syscall
-    "syscall
-
-    # Check if child or parent
-    test rax, rax
-    jnz .parent
+fn copy_float_env_regs(cur_pid_fd: usize, new_pid_fd: usize) -> Result<()> {
+    // Copy environment registers.
+    {
+        let cur_env_regs_fd = FdGuard::new(syscall::dup(cur_pid_fd, b"regs/env")?);
+        let new_env_regs_fd = FdGuard::new(syscall::dup(new_pid_fd, b"regs/env")?);
 
-    # Load registers
-    pop rax
-    pop rdi
-    pop rsi
-    pop rdx
-    pop rcx
-    pop r8
-    pop r9
+        let mut env_regs = syscall::EnvRegisters::default();
+        let _ = syscall::read(*cur_env_regs_fd, &mut env_regs)?;
+        let _ = syscall::write(*new_env_regs_fd, &env_regs)?;
+    }
+    // Copy float registers.
+    {
+        let cur_float_regs_fd = FdGuard::new(syscall::dup(cur_pid_fd, b"regs/float")?);
+        let new_float_regs_fd = FdGuard::new(syscall::dup(new_pid_fd, b"regs/float")?);
 
-    # Call entry point
-    call rax
+        let mut float_regs = syscall::FloatRegisters::default();
+        let _ = syscall::read(*cur_float_regs_fd, &mut float_regs)?;
+        let _ = syscall::write(*new_float_regs_fd, &float_regs)?;
+    }
+
+    Ok(())
+}
+
+/// Spawns a new context sharing the same address space as the current one (i.e. a new thread).
+pub unsafe fn pte_clone_impl(stack: *mut usize) -> Result<usize> {
+    let cur_pid_fd = FdGuard::new(syscall::open("thisproc:current/open_via_dup", O_CLOEXEC)?);
+    let (new_pid_fd, new_pid) = new_context()?;
+
+    // Allocate a new signal stack.
+    {
+        let sigstack_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"sigstack")?);
+
+        const SIGSTACK_SIZE: usize = 1024 * 256;
+
+        // TODO: Put sigstack at high addresses?
+        let target_sigstack = syscall::fmap(!0, &Map { address: 0, flags: MapFlags::PROT_READ | MapFlags::PROT_WRITE | MapFlags::MAP_PRIVATE, offset: 0, size: SIGSTACK_SIZE })? + SIGSTACK_SIZE;
+
+        let _ = syscall::write(*sigstack_fd, &usize::to_ne_bytes(target_sigstack))?;
+    }
+
+    copy_str(*cur_pid_fd, *new_pid_fd, "name")?;
+    copy_str(*cur_pid_fd, *new_pid_fd, "cwd")?;
+
+    // Reuse existing address space
+    {
+        let cur_addr_space_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"addrspace")?);
+        let new_addr_space_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-addrspace")?);
 
-    # Exit
-    mov rax, {SYS_EXIT}
-    xor rdi, rdi
-    syscall
+        let buf = create_set_addr_space_buf(*cur_addr_space_fd, pte_clone_ret as usize, stack as usize);
+        let _ = syscall::write(*new_addr_space_sel_fd, &buf)?;
+    }
+
+    // Reuse file table
+    {
+        let cur_filetable_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"filetable")?);
+        let new_filetable_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-filetable")?);
+
+        let _ = syscall::write(*new_filetable_sel_fd, &usize::to_ne_bytes(*cur_filetable_fd))?;
+    }
+
+
+    copy_float_env_regs(*cur_pid_fd, *new_pid_fd)?;
+
+    // Unblock context; a failed signal is reported rather than returning success.
+    syscall::kill(new_pid, SIGCONT)?;
+
+    Ok(0)
+}
+/// Packs `[space, sp, ip]` as native-endian words (stack pointer before instruction pointer).
+fn create_set_addr_space_buf(space: usize, ip: usize, sp: usize) -> [u8; mem::size_of::<usize>() * 3] {
+    const WORD: usize = mem::size_of::<usize>();
+    let mut packed = [0_u8; 3 * WORD];
+    let words = [space, sp, ip];
+    packed.chunks_exact_mut(WORD).zip(words).for_each(|(slot, word)| slot.copy_from_slice(&word.to_ne_bytes()));
+    packed
+}
+/// Spawns a new context which will not share the same address space as the current one. File
+/// descriptors from other schemes are reobtained with `dup`, and grants referencing such file
+/// descriptors are reobtained through `fmap`. Other mappings are kept but duplicated using CoW.
+pub fn fork_impl() -> Result<usize> {
+    unsafe {
+        Error::demux(fork_wrapper())
+    }
+}
+
+fn fork_inner(initial_rsp: *mut usize) -> Result<usize> {
+    let new_pid = {
+        let cur_pid_fd = FdGuard::new(syscall::open("thisproc:current/open_via_dup", O_CLOEXEC)?);
+        let (new_pid_fd, new_pid) = new_context()?;
+
+        // Do not allocate new signal stack, but copy existing address (all memory will be re-mapped
+        // CoW later).
+        {
+            let cur_sigstack_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"sigstack")?);
+            let new_sigstack_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"sigstack")?);
+
+            let mut sigstack_buf = usize::to_ne_bytes(0);
+
+            let _ = syscall::read(*cur_sigstack_fd, &mut sigstack_buf);
+            let _ = syscall::write(*new_sigstack_fd, &sigstack_buf);
+        }
+
+        copy_str(*cur_pid_fd, *new_pid_fd, "name")?;
+        copy_str(*cur_pid_fd, *new_pid_fd, "cwd")?;
+
+        // CoW-duplicate address space.
+        {
+            let cur_addr_space_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"addrspace")?);
+
+            // FIXME: Find mappings which use external file descriptors
+
+            let new_addr_space_fd = FdGuard::new(syscall::dup(*cur_addr_space_fd, b"exclusive")?);
+            let new_addr_space_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-addrspace")?);
 
-    # Invalid instruction on failure to exit
-    ud2
+            let buf = create_set_addr_space_buf(*new_addr_space_fd, fork_ret as usize, initial_rsp as usize);
+            let _ = syscall::write(*new_addr_space_sel_fd, &buf)?;
+        }
 
-    # Return PID if parent
-.parent:
+        // Copy existing files into new file table, but do not reuse the same file table (i.e. new
+        // parent FDs will not show up for the child).
+        {
+            let cur_filetable_fd = FdGuard::new(syscall::dup(*cur_pid_fd, b"filetable")?);
+            // TODO: Use cross_scheme_links or something similar to avoid copying the file table in the
+            // kernel.
+            let new_filetable_fd = FdGuard::new(syscall::dup(*cur_filetable_fd, b"copy")?);
+            let new_filetable_sel_fd = FdGuard::new(syscall::dup(*new_pid_fd, b"current-filetable")?);
+            // Fail the fork if the copied file table cannot be selected for the child.
+            let _ = syscall::write(*new_filetable_sel_fd, &usize::to_ne_bytes(*new_filetable_fd))?;
+        }
+        copy_float_env_regs(*cur_pid_fd, *new_pid_fd)?;
+
+        new_pid
+    };
+
+    // Unblock context; do not report a pid for a child that could not be signalled.
+    syscall::kill(new_pid, SIGCONT)?;
+
+    Ok(new_pid)
+}
+#[no_mangle]
+unsafe extern "sysv64" fn __relibc_internal_fork_impl(initial_rsp: *mut usize) -> usize {
+    Error::mux(fork_inner(initial_rsp))
+}
+
+core::arch::global_asm!("
+    .p2align 6
+    .globl fork_wrapper
+    .type fork_wrapper, @function
+fork_wrapper:
+    push rbp
+    mov rbp, rsp
+
+    push rbx
+    push rbp
+    push r12
+    push r13
+    push r14
+    push r15
+
+    mov rdi, rsp
+    call __relibc_internal_fork_impl
+    jmp 2f
+
+fork_ret:
+    xor rax, rax
+2:
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbp
+    pop rbx
+
+    pop rbp
     ret
-    ",
-    SYS_EXIT = const(syscall::SYS_EXIT),
-    SYS_CLONE = const(syscall::SYS_CLONE),
-    CLONE_FLAGS = const(
-        syscall::CLONE_VM.bits()
-            | syscall::CLONE_FS.bits()
-            | syscall::CLONE_FILES.bits()
-            | syscall::CLONE_SIGHAND.bits()
-            | syscall::CLONE_STACK.bits()
-    ),
-    INFO_LEN = const(core::mem::size_of::<CloneInfo>()),
-);
-
-/*global_asm!("
-    .globl pte_clone_inner
-    .type pte_clone_inner, @function
-
-pte_clone_inner:
-    # Move the 1st argument `stack` of this function into the second argument to clone.
-    mov rsi, rdi
-    mov rax, {SYS_CLONE}
-    mov rdi, {flags}
-
-    # Call clone syscall
-    syscall
-
-    # Check if child or parent
-    test rax, rax
-    jnz 2f
+    .size fork_wrapper, . - fork_wrapper
+
+    .globl pte_clone_ret
+    .type pte_clone_ret, @function
+pte_clone_ret:
 
     # Load registers
     pop rax
@@ -138,28 +309,12 @@ pte_clone_inner:
     # Call entry point
     call rax
 
-    # Exit
-    mov rax, 1
-    xor rdi, rdi
-    syscall
-
-    # Invalid instruction on failure to exit
-    ud2
-
-    # Return PID if parent
-2:
     ret
+    .size pte_clone_ret, . - pte_clone_ret
+");
 
-    .size pte_clone_inner, . - pte_clone_inner
-
-    ",
-
-    flags = const(
-        syscall::CLONE_VM.bits()
-            | syscall::CLONE_FS.bits()
-            | syscall::CLONE_FILES.bits()
-            | syscall::CLONE_SIGHAND.bits()
-            | syscall::CLONE_STACK.bits()
-    ),
-    SYS_CLONE = const(syscall::SYS_CLONE),
-);*/
+extern "sysv64" {
+    fn fork_wrapper() -> usize;
+    fn fork_ret();
+    fn pte_clone_ret();
+}
diff --git a/src/platform/redox/mod.rs b/src/platform/redox/mod.rs
index 266ffec10..a51b56dce 100644
--- a/src/platform/redox/mod.rs
+++ b/src/platform/redox/mod.rs
@@ -355,7 +355,7 @@ impl Pal for Sys {
 
         // Close all O_CLOEXEC file descriptors. TODO: close_range?
         {
-            let name = CStr::from_bytes_with_nul(b"thisproc:current/files\0").expect("string should be valid");
+            let name = CStr::from_bytes_with_nul(b"thisproc:current/filetable\0").expect("string should be valid");
             let files_fd = match File::open(name, fcntl::O_RDONLY) {
                 Ok(f) => f,
                 Err(_) => return -1,
@@ -455,7 +455,7 @@ impl Pal for Sys {
     }
 
     fn fork() -> pid_t {
-        e(unsafe { syscall::clone(syscall::CloneFlags::empty()) }) as pid_t
+        e(extra::fork_impl()) as pid_t
     }
 
     fn fstat(fildes: c_int, buf: *mut stat) -> c_int {
@@ -938,29 +938,7 @@ impl Pal for Sys {
 
     #[cfg(target_arch = "x86_64")]
     unsafe fn pte_clone(stack: *mut usize) -> pid_t {
-        let flags = syscall::CLONE_VM
-            | syscall::CLONE_FS
-            | syscall::CLONE_FILES
-            | syscall::CLONE_SIGHAND
-            | syscall::CLONE_STACK;
-        let flags = flags.bits();
-
-        use syscall::{Map, MapFlags};
-
-        const SIGSTACK_SIZE: usize = 1024 * 256;
-
-        // TODO: Put sigstack at high addresses?
-        let target_sigstack = match syscall::fmap(!0, &Map { address: 0, flags: MapFlags::PROT_READ | MapFlags::PROT_WRITE | MapFlags::MAP_PRIVATE, offset: 0, size: SIGSTACK_SIZE }) {
-            Ok(s) => s + SIGSTACK_SIZE,
-            Err(err) => return e(Err(err)) as pid_t,
-        };
-
-        let info = CloneInfo {
-            target_stack: stack as usize,
-            target_sigstack,
-        };
-
-        e(syscall::Error::demux(extra::pte_clone_inner(&info))) as pid_t
+        e(extra::pte_clone_impl(stack)) as pid_t
     }
 
     fn read(fd: c_int, buf: &mut [u8]) -> ssize_t {
-- 
GitLab