From 3eedbeb14d2a6f9f71fbf2776fba03a0e109c838 Mon Sep 17 00:00:00 2001
From: 4lDO2 <4lDO2@protonmail.com>
Date: Sun, 1 Aug 2021 12:09:22 +0200
Subject: [PATCH] WIP: Let userspace manage fsbase/gsbase and TLS.

---
 src/arch/x86_64/consts.rs            |  21 +--
 src/arch/x86_64/interrupt/handler.rs |   5 +
 src/arch/x86_64/start.rs             |  14 +-
 src/context/arch/x86_64.rs           |  37 ++---
 src/context/context.rs               |  73 ++++++++--
 src/context/list.rs                  |   2 +-
 src/elf.rs                           |   6 +
 src/lib.rs                           |   1 +
 src/scheme/mod.rs                    |   2 +
 src/scheme/proc.rs                   | 129 ++++++++++++++++-
 src/syscall/mod.rs                   |   1 +
 src/syscall/process.rs               | 202 ++++++++-------------------
 syscall                              |   2 +-
 13 files changed, 282 insertions(+), 213 deletions(-)

diff --git a/src/arch/x86_64/consts.rs b/src/arch/x86_64/consts.rs
index 2e1b4627..5589106e 100644
--- a/src/arch/x86_64/consts.rs
+++ b/src/arch/x86_64/consts.rs
@@ -39,13 +39,6 @@
     pub const USER_OFFSET: usize = 0;
     pub const USER_PML4: usize = (USER_OFFSET & PML4_MASK)/PML4_SIZE;
 
-    /// Offset to user TCB
-    /// Each process has 4096 bytes, at an offset of 4096 * PID
-    // TODO: Get a real 64-bit offset, and allow loading ELF sections higher up than the current
-    // limit, iff the processor supports fsgsbase (in which case it is cheap to use 64-bit FS
-    // offsets).
-    pub const USER_TCB_OFFSET: usize = 0xB000_0000;
-
     /// Offset to user arguments
     pub const USER_ARG_OFFSET: usize = USER_OFFSET + PML4_SIZE/2;
 
@@ -69,14 +62,8 @@
     /// Size of user sigstack
     pub const USER_SIGSTACK_SIZE: usize = 256 * 1024; // 256 KB
 
-    /// Offset to user TLS
-    pub const USER_TLS_OFFSET: usize = USER_SIGSTACK_OFFSET + PML4_SIZE;
-    pub const USER_TLS_PML4: usize = (USER_TLS_OFFSET & PML4_MASK)/PML4_SIZE;
-    // Maximum TLS allocated to each PID, should be approximately 8 MB
-    pub const USER_TLS_SIZE: usize = PML4_SIZE / 65536;
-
     /// Offset to user temporary image (used when cloning)
-    pub const USER_TMP_OFFSET: usize = USER_TLS_OFFSET + PML4_SIZE;
+    pub const USER_TMP_OFFSET: usize = USER_SIGSTACK_OFFSET + PML4_SIZE;
     pub const USER_TMP_PML4: usize = (USER_TMP_OFFSET & PML4_MASK)/PML4_SIZE;
 
     /// Offset to user temporary heap (used when cloning)
@@ -95,10 +82,6 @@
     pub const USER_TMP_SIGSTACK_OFFSET: usize = USER_TMP_STACK_OFFSET + PML4_SIZE;
     pub const USER_TMP_SIGSTACK_PML4: usize = (USER_TMP_SIGSTACK_OFFSET & PML4_MASK)/PML4_SIZE;
 
-    /// Offset to user temporary tls (used when cloning)
-    pub const USER_TMP_TLS_OFFSET: usize = USER_TMP_SIGSTACK_OFFSET + PML4_SIZE;
-    pub const USER_TMP_TLS_PML4: usize = (USER_TMP_TLS_OFFSET & PML4_MASK)/PML4_SIZE;
-
     /// Offset for usage in other temporary pages
-    pub const USER_TMP_MISC_OFFSET: usize = USER_TMP_TLS_OFFSET + PML4_SIZE;
+    pub const USER_TMP_MISC_OFFSET: usize = USER_TMP_SIGSTACK_OFFSET + PML4_SIZE;
     pub const USER_TMP_MISC_PML4: usize = (USER_TMP_MISC_OFFSET & PML4_MASK)/PML4_SIZE;
diff --git a/src/arch/x86_64/interrupt/handler.rs b/src/arch/x86_64/interrupt/handler.rs
index ac67cc85..877792d2 100644
--- a/src/arch/x86_64/interrupt/handler.rs
+++ b/src/arch/x86_64/interrupt/handler.rs
@@ -80,6 +80,11 @@ impl IretRegisters {
             println!("RSP:   {:>016X}", { self.rsp });
             println!("SS:    {:>016X}", { self.ss });
         }
+        unsafe {
+            let fsbase = x86::msr::rdmsr(x86::msr::IA32_FS_BASE);
+            let gsbase = x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE);
+            println!("FSBASE {:>016X}\nGSBASE {:>016X}", fsbase, gsbase);
+        }
     }
 }
 
diff --git a/src/arch/x86_64/start.rs b/src/arch/x86_64/start.rs
index 8f82b982..9f94e083 100644
--- a/src/arch/x86_64/start.rs
+++ b/src/arch/x86_64/start.rs
@@ -275,13 +275,13 @@ macro_rules! save_fsgsbase(
             mov ecx, {MSR_FSBASE}
             rdmsr
             shl rdx, 32
-            mov edx, eax
+            or rdx, rax
             mov r14, rdx
 
             mov ecx, {MSR_GSBASE}
             rdmsr
             shl rdx, 32
-            mov edx, eax
+            or rdx, rax
             mov r13, rdx
         "
     }
@@ -354,8 +354,11 @@ pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _is_singl
             mov es, r15d
             mov fs, r15d
             mov gs, r15d
+            ",
 
-            ", restore_fsgsbase!(), "
+            // SS and CS will later be set via sysretq.
+
+            restore_fsgsbase!(), "
 
             // Target instruction pointer
             mov rcx, rdi
@@ -382,14 +385,15 @@ pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _is_singl
             xor r15, r15
 
             fninit
-
+            ",
             // NOTE: Regarding the sysretq vulnerability, this is safe as we cannot modify RCX,
             // even though the caller can give us the wrong address. But, it's marked unsafe, so
             // the caller is responsible for this! (And, the likelihood of rcx being changed in the
             // middle here, is minimal, unless the attacker already has partial control of kernel
             // memory.)
+            "
             sysretq
-        "),
+            "),
 
         flag_interrupts = const(FLAG_INTERRUPTS),
         shift_singlestep = const(SHIFT_SINGLESTEP),
diff --git a/src/context/arch/x86_64.rs b/src/context/arch/x86_64.rs
index bf35ad55..bebc8a2a 100644
--- a/src/context/arch/x86_64.rs
+++ b/src/context/arch/x86_64.rs
@@ -36,10 +36,16 @@ pub struct Context {
     rbp: usize,
     /// Stack pointer
     rsp: usize,
-    /// FSBASE
-    pub fsbase: usize,
-    /// GSBASE
-    gsbase: usize,
+    /// FSBASE.
+    ///
+    /// NOTE: Same fsgsbase behavior as with gsbase.
+    pub(crate) fsbase: usize,
+    /// GSBASE.
+    ///
+    /// NOTE: Without fsgsbase, this register will strictly be equal to the register value when
+    /// running. With fsgsbase, this is neither saved nor restored upon every syscall (there is no
+    /// need to!), and thus it must be re-read from the register before copying this struct.
+    pub(crate) gsbase: usize,
     /// FX valid?
     loadable: AbiCompatBool,
 }
@@ -52,7 +58,7 @@ enum AbiCompatBool {
 }
 
 impl Context {
-    pub fn new(pid: usize) -> Context {
+    pub fn new() -> Context {
         Context {
             loadable: AbiCompatBool::False,
             fx: 0,
@@ -65,13 +71,10 @@ impl Context {
             r15: 0,
             rbp: 0,
             rsp: 0,
-            fsbase: crate::USER_TCB_OFFSET + pid * crate::memory::PAGE_SIZE,
+            fsbase: 0,
             gsbase: 0,
         }
     }
-    pub fn update_tcb(&mut self, pid: usize) {
-        self.fsbase = crate::USER_TCB_OFFSET + pid * crate::memory::PAGE_SIZE;
-    }
 
     pub fn get_page_utable(&mut self) -> usize {
         self.cr3
@@ -147,19 +150,10 @@ impl Context {
     }
 }
 
-macro_rules! switch_msr(
+macro_rules! load_msr(
     ($name:literal, $offset:literal) => {
         concat!("
-            // EDX:EAX <= MSR
-
             mov ecx, {", $name, "}
-            rdmsr
-            shl rdx, 32
-            mov edx, eax
-
-            // Save old, load new.
-
-            mov [rdi + {", $offset, "}], rdx
             mov rdx, [rsi + {", $offset, "}]
             mov eax, edx
             shr rdx, 32
@@ -198,10 +192,9 @@ macro_rules! switch_fsgsbase(
 #[cfg(not(feature = "x86_fsgsbase"))]
 macro_rules! switch_fsgsbase(
     () => {
-        // TODO: Is it faster to perform two 32-bit memory accesses, rather than shifting?
         concat!(
-            switch_msr!("MSR_FSBASE", "off_fsbase"),
-            switch_msr!("MSR_KERNELGSBASE", "off_gsbase"),
+            load_msr!("MSR_FSBASE", "off_fsbase"),
+            load_msr!("MSR_KERNELGSBASE", "off_gsbase"),
         )
     }
 );
diff --git a/src/context/context.rs b/src/context/context.rs
index a6e366cd..bf34057d 100644
--- a/src/context/context.rs
+++ b/src/context/context.rs
@@ -9,6 +9,7 @@ use core::{
     alloc::{GlobalAlloc, Layout},
     cmp::Ordering,
     mem,
+    ptr::NonNull,
 };
 use spin::RwLock;
 
@@ -20,7 +21,9 @@ use crate::context::memory::{UserGrants, Memory, SharedMemory, Tls};
 use crate::ipi::{ipi, IpiKind, IpiTarget};
 use crate::scheme::{SchemeNamespace, FileHandle};
 use crate::sync::WaitMap;
+
 use crate::syscall::data::SigAction;
+use crate::syscall::error::{Result, Error, ENOMEM};
 use crate::syscall::flag::{SIG_DFL, SigActionFlags};
 
 /// Unique identifier for a context (i.e. `pid`).
@@ -203,9 +206,9 @@ pub struct Context {
     /// Current system call
     pub syscall: Option<(usize, usize, usize, usize, usize, usize)>,
     /// Head buffer to use when system call buffers are not page aligned
-    pub syscall_head: Box<[u8]>,
+    pub syscall_head: AlignedBox<[u8; PAGE_SIZE], PAGE_SIZE>,
     /// Tail buffer to use when system call buffers are not page aligned
-    pub syscall_tail: Box<[u8]>,
+    pub syscall_tail: AlignedBox<[u8; PAGE_SIZE], PAGE_SIZE>,
     /// Context is halting parent
     pub vfork: bool,
     /// Context is being waited on
@@ -230,8 +233,6 @@ pub struct Context {
     pub stack: Option<SharedMemory>,
     /// User signal stack
     pub sigstack: Option<Memory>,
-    /// User Thread local storage
-    pub tls: Option<Tls>,
     /// User grants
     pub grants: Arc<RwLock<UserGrants>>,
     /// The name of the context
@@ -253,12 +254,63 @@ pub struct Context {
     pub ptrace_stop: bool
 }
 
+// Necessary because GlobalAlloc::dealloc requires the layout to be the same, and therefore Box
+// cannot be used for increased alignment directly.
+// TODO: move to common?
+pub struct AlignedBox<T, const ALIGN: usize> {
+    inner: Unique<T>,
+}
+pub unsafe trait ValidForZero {}
+unsafe impl<const N: usize> ValidForZero for [u8; N] {}
+
+impl<T, const ALIGN: usize> AlignedBox<T, ALIGN> {
+    const LAYOUT: core::alloc::Layout = {
+        const fn max(a: usize, b: usize) -> usize {
+            if a > b { a } else { b }
+        }
+
+        match core::alloc::Layout::from_size_align(mem::size_of::<T>(), max(mem::align_of::<T>(), ALIGN)) {
+            Ok(l) => l,
+            Err(_) => panic!("layout validation failed at compile time"),
+        }
+    };
+    #[inline(always)]
+    pub fn try_zeroed() -> Result<Self>
+    where
+        T: ValidForZero,
+    {
+        Ok(unsafe {
+            let ptr = crate::ALLOCATOR.alloc_zeroed(Self::LAYOUT);
+            if ptr.is_null() {
+                return Err(Error::new(ENOMEM))?;
+            }
+            Self {
+                inner: Unique::new_unchecked(ptr.cast()),
+            }
+        })
+    }
+}
+
+impl<T, const ALIGN: usize> core::fmt::Debug for AlignedBox<T, ALIGN> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "[aligned box at {:p}, size {} alignment {}]", self.inner.as_ptr(), mem::size_of::<T>(), mem::align_of::<T>())
+    }
+}
+impl<T, const ALIGN: usize> Drop for AlignedBox<T, ALIGN> {
+    fn drop(&mut self) {
+        unsafe {
+            core::ptr::drop_in_place(self.inner.as_ptr());
+            crate::ALLOCATOR.dealloc(self.inner.as_ptr().cast(), Self::LAYOUT);
+        }
+    }
+}
+
 impl Context {
-    pub fn new(id: ContextId) -> Context {
-        let syscall_head = unsafe { Box::from_raw(crate::ALLOCATOR.alloc(Layout::from_size_align_unchecked(PAGE_SIZE, PAGE_SIZE)) as *mut [u8; PAGE_SIZE]) };
-        let syscall_tail = unsafe { Box::from_raw(crate::ALLOCATOR.alloc(Layout::from_size_align_unchecked(PAGE_SIZE, PAGE_SIZE)) as *mut [u8; PAGE_SIZE]) };
+    pub fn new(id: ContextId) -> Result<Context> {
+        let syscall_head = AlignedBox::try_zeroed()?;
+        let syscall_tail = AlignedBox::try_zeroed()?;
 
-        Context {
+        Ok(Context {
             id,
             pgid: id,
             ppid: ContextId::from(0),
@@ -282,7 +334,7 @@ impl Context {
             waitpid: Arc::new(WaitMap::new()),
             pending: VecDeque::new(),
             wake: None,
-            arch: arch::Context::new(id.into()),
+            arch: arch::Context::new(),
             kfx: None,
             kstack: None,
             ksig: None,
@@ -290,7 +342,6 @@ impl Context {
             image: Vec::new(),
             stack: None,
             sigstack: None,
-            tls: None,
             grants: Arc::new(RwLock::new(UserGrants::default())),
             name: Arc::new(RwLock::new(String::new().into_boxed_str())),
             cwd: Arc::new(RwLock::new(String::new())),
@@ -305,7 +356,7 @@ impl Context {
             ); 128])),
             regs: None,
             ptrace_stop: false
-        }
+        })
     }
 
     /// Make a relative path absolute
diff --git a/src/context/list.rs b/src/context/list.rs
index 90dd566f..5ae63cac 100644
--- a/src/context/list.rs
+++ b/src/context/list.rs
@@ -69,7 +69,7 @@ impl ContextList {
         let id = ContextId::from(self.next_id);
         self.next_id += 1;
 
-        assert!(self.map.insert(id, Arc::new(RwLock::new(Context::new(id)))).is_none());
+        assert!(self.map.insert(id, Arc::new(RwLock::new(Context::new(id)?))).is_none());
 
         Ok(self.map.get(&id).expect("Failed to insert new context. ID is out of bounds."))
     }
diff --git a/src/elf.rs b/src/elf.rs
index 4914a392..2a74ea92 100644
--- a/src/elf.rs
+++ b/src/elf.rs
@@ -82,6 +82,12 @@ impl<'a> Elf<'a> {
     pub fn program_headers(&self) -> usize {
         self.header.e_phoff as usize
     }
+    pub fn program_header_count(&self) -> usize {
+        self.header.e_phnum as usize
+    }
+    pub fn program_headers_size(&self) -> usize {
+        self.header.e_phentsize as usize
+    }
 }
 
 pub struct ElfSections<'a> {
diff --git a/src/lib.rs b/src/lib.rs
index 7f54feb9..6a48b969 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -48,6 +48,7 @@
 #![feature(concat_idents)]
 #![feature(const_btree_new)]
 #![feature(const_maybe_uninit_as_ptr)]
+#![feature(const_panic)]
 #![feature(const_ptr_offset_from)]
 #![feature(const_raw_ptr_deref)]
 #![feature(core_intrinsics)]
diff --git a/src/scheme/mod.rs b/src/scheme/mod.rs
index 74405b0f..a42095de 100644
--- a/src/scheme/mod.rs
+++ b/src/scheme/mod.rs
@@ -137,6 +137,7 @@ impl SchemeList {
         //TODO: Only memory: is in the null namespace right now. It should be removed when
         //anonymous mmap's are implemented
         self.insert(ns, "memory", |_| Arc::new(MemoryScheme::new())).unwrap();
+        self.insert(ns, "thisproc", |_| Arc::new(ProcScheme::restricted())).unwrap();
     }
 
     /// Initialize a new namespace
@@ -168,6 +169,7 @@ impl SchemeList {
         self.insert(ns, "initfs", |_| Arc::new(InitFsScheme::new())).unwrap();
         self.insert(ns, "irq", |scheme_id| Arc::new(IrqScheme::new(scheme_id))).unwrap();
         self.insert(ns, "proc", |scheme_id| Arc::new(ProcScheme::new(scheme_id))).unwrap();
+        self.insert(ns, "thisproc", |_| Arc::new(ProcScheme::restricted())).unwrap();
         self.insert(ns, "serio", |scheme_id| Arc::new(SerioScheme::new(scheme_id))).unwrap();
 
         #[cfg(feature = "live")] {
diff --git a/src/scheme/proc.rs b/src/scheme/proc.rs
index 922905a1..d76a1bb9 100644
--- a/src/scheme/proc.rs
+++ b/src/scheme/proc.rs
@@ -6,6 +6,7 @@ use crate::{
     syscall::{
         FloatRegisters,
         IntRegisters,
+        EnvRegisters,
         data::{PtraceEvent, Stat},
         error::*,
         flag::*,
@@ -57,6 +58,9 @@ fn try_stop_context<F, T>(pid: ContextId, mut callback: F) -> Result<T>
 where
     F: FnMut(&mut Context) -> Result<T>,
 {
+    if pid == context::context_id() {
+        return Err(Error::new(EBADF));
+    }
     // Stop process
     let (was_stopped, mut running) = with_context_mut(pid, |context| {
         let was_stopped = context.ptrace_stop;
@@ -88,7 +92,8 @@ where
 #[derive(Clone, Copy, PartialEq, Eq)]
 enum RegsKind {
     Float,
-    Int
+    Int,
+    Env,
 }
 #[derive(Clone, Copy, PartialEq, Eq)]
 enum Operation {
@@ -195,6 +200,12 @@ pub static PROC_SCHEME_ID: AtomicSchemeId = AtomicSchemeId::default();
 pub struct ProcScheme {
     next_id: AtomicUsize,
     handles: RwLock<BTreeMap<usize, Handle>>,
+    access: Access,
+}
+#[derive(PartialEq)]
+pub enum Access {
+    OtherProcesses,
+    Restricted,
 }
 
 impl ProcScheme {
@@ -204,6 +215,14 @@ impl ProcScheme {
         Self {
             next_id: AtomicUsize::new(0),
             handles: RwLock::new(BTreeMap::new()),
+            access: Access::OtherProcesses,
+        }
+    }
+    pub fn restricted() -> Self {
+        Self {
+            next_id: AtomicUsize::new(0),
+            handles: RwLock::new(BTreeMap::new()),
+            access: Access::Restricted,
         }
     }
 }
@@ -211,15 +230,22 @@ impl ProcScheme {
 impl Scheme for ProcScheme {
     fn open(&self, path: &str, flags: usize, uid: u32, gid: u32) -> Result<usize> {
         let mut parts = path.splitn(2, '/');
-        let pid = parts.next()
-            .and_then(|s| s.parse().ok())
-            .map(ContextId::from)
-            .ok_or(Error::new(EINVAL))?;
+        let pid_str = parts.next()
+            .ok_or(Error::new(ENOENT))?;
+
+        let pid = if pid_str == "current" {
+            context::context_id()
+        } else if self.access == Access::Restricted {
+            return Err(Error::new(EACCES));
+        } else {
+            ContextId::from(pid_str.parse().map_err(|_| Error::new(ENOENT))?)
+        };
 
         let operation = match parts.next() {
             Some("mem") => Operation::Memory,
             Some("regs/float") => Operation::Regs(RegsKind::Float),
             Some("regs/int") => Operation::Regs(RegsKind::Int),
+            Some("regs/env") => Operation::Regs(RegsKind::Env),
             Some("trace") => Operation::Trace,
             Some("exe") => Operation::Static("exe"),
             _ => return Err(Error::new(EINVAL))
@@ -382,7 +408,8 @@ impl Scheme for ProcScheme {
             Operation::Regs(kind) => {
                 union Output {
                     float: FloatRegisters,
-                    int: IntRegisters
+                    int: IntRegisters,
+                    env: EnvRegisters,
                 }
 
                 let (output, size) = match kind {
@@ -406,7 +433,37 @@ impl Scheme for ProcScheme {
                             stack.save(&mut regs);
                             Ok((Output { int: regs }, mem::size_of::<IntRegisters>()))
                         }
-                    })?
+                    })?,
+                    RegsKind::Env => {
+                        let (fsbase, gsbase) = if info.pid == context::context_id() {
+                            #[cfg(not(feature = "x86_fsgsbase"))]
+                            unsafe {
+                                (
+                                    x86::msr::rdmsr(x86::msr::IA32_FS_BASE),
+                                    x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE),
+                                )
+                            }
+                            #[cfg(feature = "x86_fsgsbase")]
+                            unsafe {
+                                use x86::bits64::segmentation::*;
+
+                                (
+                                    rdfsbase(),
+                                    {
+                                        swapgs();
+                                        let gsbase = rdgsbase();
+                                        swapgs();
+                                        gsbase
+                                    }
+                                )
+                            }
+                        } else {
+                            try_stop_context(info.pid, |context| {
+                                Ok((context.arch.fsbase as u64, context.arch.gsbase as u64))
+                            })?
+                        };
+                        (Output { env: EnvRegisters { fsbase, gsbase }}, mem::size_of::<EnvRegisters>())
+                    }
                 };
 
                 let bytes = unsafe {
@@ -503,6 +560,9 @@ impl Scheme for ProcScheme {
                     if buf.len() < mem::size_of::<FloatRegisters>() {
                         return Ok(0);
                     }
+                    if (buf.as_ptr() as usize) % mem::align_of::<FloatRegisters>() != 0 {
+                        return Err(Error::new(EINVAL));
+                    }
                     let regs = unsafe {
                         *(buf as *const _ as *const FloatRegisters)
                     };
@@ -521,6 +581,9 @@ impl Scheme for ProcScheme {
                     if buf.len() < mem::size_of::<IntRegisters>() {
                         return Ok(0);
                     }
+                    if (buf.as_ptr() as usize) % mem::align_of::<IntRegisters>() != 0 {
+                        return Err(Error::new(EINVAL));
+                    }
                     let regs = unsafe {
                         *(buf as *const _ as *const IntRegisters)
                     };
@@ -537,6 +600,57 @@ impl Scheme for ProcScheme {
                         }
                     })
                 }
+                RegsKind::Env => {
+                    if buf.len() < mem::size_of::<EnvRegisters>() {
+                        return Ok(0);
+                    }
+                    if (buf.as_ptr() as usize) % mem::align_of::<EnvRegisters>() != 0 {
+                        return Err(Error::new(EINVAL));
+                    }
+                    let regs = unsafe {
+                        *(buf as *const _ as *const EnvRegisters)
+                    };
+                    use rmm::{Arch as _, X8664Arch};
+                    if !(X8664Arch::virt_is_valid(VirtualAddress::new(regs.fsbase as usize)) && X8664Arch::virt_is_valid(VirtualAddress::new(regs.gsbase as usize))) {
+                        return Err(Error::new(EINVAL));
+                    }
+
+                    if info.pid == context::context_id() {
+                        #[cfg(not(feature = "x86_fsgsbase"))]
+                        unsafe {
+                            x86::msr::wrmsr(x86::msr::IA32_FS_BASE, regs.fsbase);
+                            // We have to write to KERNEL_GSBASE, because when the kernel returns to
+                            // userspace, it will have executed SWAPGS first.
+                            x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, regs.gsbase);
+
+                            match context::contexts().current().ok_or(Error::new(ESRCH))?.write().arch {
+                                ref mut arch => {
+                                    arch.fsbase = regs.fsbase as usize;
+                                    arch.gsbase = regs.gsbase as usize;
+                                }
+                            }
+                        }
+                        #[cfg(feature = "x86_fsgsbase")]
+                        unsafe {
+                            use x86::bits64::segmentation::*;
+
+                            wrfsbase(regs.fsbase);
+                            swapgs();
+                            wrgsbase(regs.gsbase);
+                            swapgs();
+
+                            // No need to update the current context; with fsgsbase enabled, these
+                            // registers are automatically saved and restored.
+                        }
+                    } else {
+                        try_stop_context(info.pid, |context| {
+                            context.arch.fsbase = regs.fsbase as usize;
+                            context.arch.gsbase = regs.gsbase as usize;
+                            Ok(())
+                        })?;
+                    }
+                    Ok(mem::size_of::<EnvRegisters>())
+                }
             },
             Operation::Trace => {
                 if buf.len() < mem::size_of::<u64>() {
@@ -621,6 +735,7 @@ impl Scheme for ProcScheme {
             Operation::Memory => "mem",
             Operation::Regs(RegsKind::Float) => "regs/float",
             Operation::Regs(RegsKind::Int) => "regs/int",
+            Operation::Regs(RegsKind::Env) => "regs/env",
             Operation::Trace => "trace",
             Operation::Static(path) => path,
         });
diff --git a/src/syscall/mod.rs b/src/syscall/mod.rs
index b90dd54d..54c66e19 100644
--- a/src/syscall/mod.rs
+++ b/src/syscall/mod.rs
@@ -7,6 +7,7 @@ extern crate syscall;
 pub use self::syscall::{
     FloatRegisters,
     IntRegisters,
+    EnvRegisters,
     data,
     error,
     flag,
diff --git a/src/syscall/process.rs b/src/syscall/process.rs
index e0ff3249..af15f50e 100644
--- a/src/syscall/process.rs
+++ b/src/syscall/process.rs
@@ -27,7 +27,7 @@ use crate::scheme::FileHandle;
 use crate::start::usermode;
 use crate::syscall::data::{SigAction, Stat};
 use crate::syscall::error::*;
-use crate::syscall::flag::{wifcontinued, wifstopped, AT_ENTRY, AT_NULL, AT_PHDR, CloneFlags,
+use crate::syscall::flag::{wifcontinued, wifstopped, AT_ENTRY, AT_NULL, AT_PHDR, AT_PHENT, AT_PHNUM, CloneFlags,
                            CLONE_FILES, CLONE_FS, CLONE_SIGHAND, CLONE_STACK, CLONE_VFORK, CLONE_VM,
                            MapFlags, PROT_EXEC, PROT_READ, PROT_WRITE, PTRACE_EVENT_CLONE,
                            PTRACE_STOP_EXIT, SigActionFlags, SIG_BLOCK, SIG_DFL, SIG_SETMASK, SIG_UNBLOCK,
@@ -57,7 +57,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
         let mut image = vec![];
         let mut stack_opt = None;
         let mut sigstack_opt = None;
-        let mut tls_opt = None;
         let grants;
         let name;
         let cwd;
@@ -202,36 +201,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
                 sigstack_opt = Some(new_sigstack);
             }
 
-            if let Some(ref tls) = context.tls {
-                let mut new_tls = context::memory::Tls {
-                    master: tls.master,
-                    file_size: tls.file_size,
-                    mem: context::memory::Memory::new(
-                        VirtualAddress::new(crate::USER_TMP_TLS_OFFSET),
-                        tls.mem.size(),
-                        PageFlags::new().write(true),
-                        true
-                    ),
-                    offset: tls.offset,
-                };
-
-
-                if flags.contains(CLONE_VM) {
-                    unsafe {
-                        new_tls.load();
-                    }
-                } else {
-                    unsafe {
-                        intrinsics::copy(tls.mem.start_address().data() as *const u8,
-                                        new_tls.mem.start_address().data() as *mut u8,
-                                        tls.mem.size());
-                    }
-                }
-
-                new_tls.mem.remap(tls.mem.flags());
-                tls_opt = Some(new_tls);
-            }
-
             if flags.contains(CLONE_VM) {
                 grants = Arc::clone(&context.grants);
             } else {
@@ -352,6 +321,14 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
 
             context.arch = arch;
 
+            // This is needed because these registers may have changed after this context was
+            // switched to, but before this was called.
+            #[cfg(all(target_arch = "x86_64", feature = "x86_fsgsbase"))]
+            unsafe {
+                context.arch.fsbase = x86::bits64::segmentation::rdfsbase() as usize;
+                context.arch.gsbase = x86::bits64::segmentation::rdgsbase() as usize;
+            }
+
             let mut active_utable = unsafe { ActivePageTable::new(TableKind::User) };
             let mut active_ktable = unsafe { ActivePageTable::new(TableKind::Kernel) };
 
@@ -378,10 +355,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
             let mut new_ktable = unsafe {
                 InactivePageTable::from_address(new_utable.address())
             };
-            #[cfg(target_arch = "x86_64")]
-            {
-                context.arch.update_tcb(pid.into());
-            }
 
             // Copy kernel image mapping
             {
@@ -502,15 +475,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
                 context.sigstack = Some(sigstack);
             }
 
-            // Set up TCB
-            let tcb_addr = crate::USER_TCB_OFFSET + context.id.into() * PAGE_SIZE;
-            let mut tcb = context::memory::Memory::new(
-                VirtualAddress::new(tcb_addr),
-                PAGE_SIZE,
-                PageFlags::new().write(true).user(true),
-                true
-            );
-
             #[cfg(target_arch = "aarch64")]
             {
                 if let Some(stack) = &mut context.kstack {
@@ -534,38 +498,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> {
                 }
             }
 
-            // Setup user TLS
-            if let Some(mut tls) = tls_opt {
-                // Copy TLS mapping
-                {
-                    let frame = active_utable.p4()[crate::USER_TLS_PML4].pointed_frame().expect("user tls not mapped");
-                    let flags = active_utable.p4()[crate::USER_TLS_PML4].flags();
-                    active_utable.with(&mut new_utable, &mut temporary_upage, |mapper| {
-                        mapper.p4_mut()[crate::USER_TLS_PML4].set(frame, flags);
-                    });
-                }
-
-                // TODO: Make sure size is not greater than USER_TLS_SIZE
-                let tls_addr = crate::USER_TLS_OFFSET + context.id.into() * crate::USER_TLS_SIZE;
-                //println!("{}: Copy TLS: address 0x{:x}, size 0x{:x}", context.id.into(), tls_addr, tls.mem.size());
-                tls.mem.move_to(VirtualAddress::new(tls_addr), &mut new_utable, &mut temporary_upage);
-                unsafe {
-                    *(tcb_addr as *mut usize) = tls.mem.start_address().data() + tls.mem.size();
-                }
-                context.tls = Some(tls);
-            } else {
-                //println!("{}: Copy TCB", context.id.into());
-                let parent_tcb_addr = crate::USER_TCB_OFFSET + ppid.into() * PAGE_SIZE;
-                unsafe {
-                    intrinsics::copy(parent_tcb_addr as *const u8,
-                                    tcb_addr as *mut u8,
-                                    tcb.size());
-                }
-            }
-
-            tcb.move_to(VirtualAddress::new(tcb_addr), &mut new_utable, &mut temporary_upage);
-            context.image.push(tcb.to_shared());
-
             context.name = name;
 
             context.cwd = cwd;
@@ -599,13 +531,11 @@ fn empty(context: &mut context::Context, reaping: bool) {
         assert!(context.image.is_empty());
         assert!(context.stack.is_none());
         assert!(context.sigstack.is_none());
-        assert!(context.tls.is_none());
     } else {
-        // Unmap previous image, heap, grants, stack, and tls
+        // Unmap previous image, heap, grants, stack
         context.image.clear();
         drop(context.stack.take());
         drop(context.sigstack.take());
-        drop(context.tls.take());
     }
 
     // NOTE: If we do not replace the grants `Arc`, then a strange situation can appear where the
@@ -651,10 +581,12 @@ impl Drop for ExecFile {
     }
 }
 
+#[allow(clippy::too_many_arguments)]
 fn fexec_noreturn(
     setuid: Option<u32>,
     setgid: Option<u32>,
     name: Box<str>,
+    phdrs_region: core::ops::Range<usize>,
     data: Box<[u8]>,
     args: Box<[Box<[u8]>]>,
     vars: Box<[Box<[u8]>]>,
@@ -664,6 +596,11 @@ fn fexec_noreturn(
     let singlestep;
     let mut sp = crate::USER_STACK_OFFSET + crate::USER_STACK_SIZE - 256;
 
+    let phdrs_len = 4096;
+    let phdrs_base_addr = sp - phdrs_len;
+
+    sp -= phdrs_len;
+
     {
         let (vfork, ppid, files) = {
             let contexts = context::contexts();
@@ -678,6 +615,25 @@ fn fexec_noreturn(
 
             empty(&mut context, false);
 
+            #[cfg(all(target_arch = "x86_64"))]
+            {
+                context.arch.fsbase = 0;
+                context.arch.gsbase = 0;
+
+                #[cfg(feature = "x86_fsgsbase")]
+                unsafe {
+                    x86::bits64::segmentation::wrfsbase(0);
+                    x86::bits64::segmentation::swapgs();
+                    x86::bits64::segmentation::wrgsbase(0);
+                    x86::bits64::segmentation::swapgs();
+                }
+                #[cfg(not(feature = "x86_fsgsbase"))]
+                unsafe {
+                    x86::msr::wrmsr(x86::msr::IA32_FS_BASE, 0);
+                    x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, 0);
+                }
+            }
+
             if let Some(uid) = setuid {
                 context.euid = uid;
             }
@@ -687,20 +643,10 @@ fn fexec_noreturn(
             }
 
             // Map and copy new segments
-            let mut tls_opt = None;
             {
                 let elf = elf::Elf::from(&data).unwrap();
                 entry = elf.entry();
 
-                // Always map TCB
-                let tcb_addr = crate::USER_TCB_OFFSET + context.id.into() * PAGE_SIZE;
-                let tcb_mem = context::memory::Memory::new(
-                    VirtualAddress::new(tcb_addr),
-                    PAGE_SIZE,
-                    PageFlags::new().write(true).user(true),
-                    true
-                );
-
                 for segment in elf.segments() {
                     match segment.p_type {
                         program_header::PT_LOAD => {
@@ -734,45 +680,11 @@ fn fexec_noreturn(
 
                             context.image.push(memory.to_shared());
                         },
-                        program_header::PT_TLS => {
-                            let aligned_size = if segment.p_align > 0 {
-                                ((segment.p_memsz + (segment.p_align - 1))/segment.p_align) * segment.p_align
-                            } else {
-                                segment.p_memsz
-                            } as usize;
-                            let rounded_size = ((aligned_size + PAGE_SIZE - 1)/PAGE_SIZE) * PAGE_SIZE;
-                            let rounded_offset = rounded_size - aligned_size;
-
-                            // TODO: Make sure size is not greater than USER_TLS_SIZE
-                            let tls_addr = crate::USER_TLS_OFFSET + context.id.into() * crate::USER_TLS_SIZE;
-                            let tls = context::memory::Tls {
-                                master: VirtualAddress::new(segment.p_vaddr as usize),
-                                file_size: segment.p_filesz as usize,
-                                mem: context::memory::Memory::new(
-                                    VirtualAddress::new(tls_addr),
-                                    rounded_size as usize,
-                                    PageFlags::new().write(true).user(true),
-                                    true
-                                ),
-                                offset: rounded_offset as usize,
-                            };
-
-                            unsafe {
-                                *(tcb_addr as *mut usize) = tls.mem.start_address().data() + tls.mem.size();
-                            }
-
-                            tls_opt = Some(tls);
-                        },
                         _ => (),
                     }
                 }
-
-                context.image.push(tcb_mem.to_shared());
             }
 
-            // Data no longer required, can deallocate
-            drop(data);
-
             // Map stack
             context.stack = Some(context::memory::Memory::new(
                 VirtualAddress::new(crate::USER_STACK_OFFSET),
@@ -789,20 +701,19 @@ fn fexec_noreturn(
                 true
             ));
 
-            // Map TLS
-            if let Some(mut tls) = tls_opt {
-                unsafe {
-                    tls.load();
-                }
-
-                context.tls = Some(tls);
-            }
-
             let mut push = |arg| {
                 sp -= mem::size_of::<usize>();
                 unsafe { *(sp as *mut usize) = arg; }
             };
 
+            unsafe {
+                let mut source = core::slice::from_raw_parts_mut(phdrs_base_addr as *mut u8, phdrs_len);
+                source[..phdrs_region.len()].copy_from_slice(&data[phdrs_region.clone()]);
+            }
+
+            // Data no longer required, can deallocate
+            drop(data);
+
             // Push auxiliary vector
             push(AT_NULL);
             for &arg in auxv.iter().rev() {
@@ -1019,7 +930,11 @@ pub fn fexec_kernel(fd: FileHandle, args: Box<[Box<[u8]>]>, vars: Box<[Box<[u8]>
         auxv.push(AT_ENTRY);
         auxv.push(elf.entry());
         auxv.push(AT_PHDR);
-        auxv.push(elf.program_headers());
+        auxv.push(crate::USER_STACK_OFFSET + crate::USER_STACK_SIZE - 256 - 4096);
+        auxv.push(AT_PHENT);
+        auxv.push(elf.program_headers_size());
+        auxv.push(AT_PHNUM);
+        auxv.push(elf.program_header_count());
 
         auxv
     };
@@ -1068,26 +983,19 @@ pub fn fexec_kernel(fd: FileHandle, args: Box<[Box<[u8]>]>, vars: Box<[Box<[u8]>
                     Some(auxv),
                 );
             },
-            program_header::PT_LOAD => {
-                let voff = segment.p_vaddr as usize % PAGE_SIZE;
-                let vaddr = segment.p_vaddr as usize - voff;
-
-                // Due to the Userspace and kernel TLS bases being located right above 2GB,
-                // limit any loadable sections to lower than that. Eventually we will need
-                // to replace this with a more intelligent TLS address
-                if vaddr >= 0x8000_0000 {
-                    println!("exec: invalid section address {:X}", segment.p_vaddr);
-                    return Err(Error::new(ENOEXEC));
-                }
-            },
             _ => (),
         }
     }
 
+    let phdr_range = elf.program_headers()..elf.program_headers() + elf.program_headers_size() * elf.program_header_count();
+    if phdr_range.len() > 4096 {
+        return Err(Error::new(ENOMEM));
+    }
+
    // This is the point of no return, quite literally. Any checks for validity need
     // to be done before, and appropriate errors returned. Otherwise, we have nothing
     // to return to.
-    fexec_noreturn(setuid, setgid, name.into_boxed_str(), data.into_boxed_slice(), args, vars, auxv.into_boxed_slice());
+    fexec_noreturn(setuid, setgid, name.into_boxed_str(), phdr_range, data.into_boxed_slice(), args, vars, auxv.into_boxed_slice());
 }
 
 pub fn fexec(fd: FileHandle, arg_ptrs: &[[usize; 2]], var_ptrs: &[[usize; 2]]) -> Result<usize> {
diff --git a/syscall b/syscall
index 841b5f42..519a09e9 160000
--- a/syscall
+++ b/syscall
@@ -1 +1 @@
-Subproject commit 841b5f42216782ce2aee6201c55b849ce5d7ab71
+Subproject commit 519a09e96400309a14375c04815da00f7cf5f526
-- 
GitLab