From 3eedbeb14d2a6f9f71fbf2776fba03a0e109c838 Mon Sep 17 00:00:00 2001 From: 4lDO2 <4lDO2@protonmail.com> Date: Sun, 1 Aug 2021 12:09:22 +0200 Subject: [PATCH] WIP: Let userspace manage fsbase/gsbase and TLS. --- src/arch/x86_64/consts.rs | 21 +-- src/arch/x86_64/interrupt/handler.rs | 5 + src/arch/x86_64/start.rs | 14 +- src/context/arch/x86_64.rs | 37 ++--- src/context/context.rs | 73 ++++++++-- src/context/list.rs | 2 +- src/elf.rs | 6 + src/lib.rs | 1 + src/scheme/mod.rs | 2 + src/scheme/proc.rs | 129 ++++++++++++++++- src/syscall/mod.rs | 1 + src/syscall/process.rs | 202 ++++++++------------------- syscall | 2 +- 13 files changed, 282 insertions(+), 213 deletions(-) diff --git a/src/arch/x86_64/consts.rs b/src/arch/x86_64/consts.rs index 2e1b4627..5589106e 100644 --- a/src/arch/x86_64/consts.rs +++ b/src/arch/x86_64/consts.rs @@ -39,13 +39,6 @@ pub const USER_OFFSET: usize = 0; pub const USER_PML4: usize = (USER_OFFSET & PML4_MASK)/PML4_SIZE; - /// Offset to user TCB - /// Each process has 4096 bytes, at an offset of 4096 * PID - // TODO: Get a real 64-bit offset, and allow loading ELF sections higher up than the current - // limit, iff the processor supports fsgsbase (in which case it is cheap to use 64-bit FS - // offsets). - pub const USER_TCB_OFFSET: usize = 0xB000_0000; - /// Offset to user arguments pub const USER_ARG_OFFSET: usize = USER_OFFSET + PML4_SIZE/2; @@ -69,14 +62,8 @@ /// Size of user sigstack pub const USER_SIGSTACK_SIZE: usize = 256 * 1024; // 256 KB - /// Offset to user TLS - pub const USER_TLS_OFFSET: usize = USER_SIGSTACK_OFFSET + PML4_SIZE; - pub const USER_TLS_PML4: usize = (USER_TLS_OFFSET & PML4_MASK)/PML4_SIZE; - // Maximum TLS allocated to each PID, should be approximately 8 MB - pub const USER_TLS_SIZE: usize = PML4_SIZE / 65536; - /// Offset to user temporary image (used when cloning) - pub const USER_TMP_OFFSET: usize = USER_TLS_OFFSET + PML4_SIZE; + pub const USER_TMP_OFFSET: usize = USER_SIGSTACK_OFFSET + PML4_SIZE; pub const USER_TMP_PML4: usize = (USER_TMP_OFFSET & PML4_MASK)/PML4_SIZE; /// Offset to user temporary heap (used when cloning) @@ -95,10 +82,6 @@ pub const USER_TMP_SIGSTACK_OFFSET: usize = USER_TMP_STACK_OFFSET + PML4_SIZE; pub const USER_TMP_SIGSTACK_PML4: usize = (USER_TMP_SIGSTACK_OFFSET & PML4_MASK)/PML4_SIZE; - /// Offset to user temporary tls (used when cloning) - pub const USER_TMP_TLS_OFFSET: usize = USER_TMP_SIGSTACK_OFFSET + PML4_SIZE; - pub const USER_TMP_TLS_PML4: usize = (USER_TMP_TLS_OFFSET & PML4_MASK)/PML4_SIZE; - /// Offset for usage in other temporary pages - pub const USER_TMP_MISC_OFFSET: usize = USER_TMP_TLS_OFFSET + PML4_SIZE; + pub const USER_TMP_MISC_OFFSET: usize = USER_TMP_SIGSTACK_OFFSET + PML4_SIZE; pub const USER_TMP_MISC_PML4: usize = (USER_TMP_MISC_OFFSET & PML4_MASK)/PML4_SIZE; diff --git a/src/arch/x86_64/interrupt/handler.rs b/src/arch/x86_64/interrupt/handler.rs index ac67cc85..877792d2 100644 --- a/src/arch/x86_64/interrupt/handler.rs +++ b/src/arch/x86_64/interrupt/handler.rs @@ -80,6 +80,11 @@ impl IretRegisters { println!("RSP: {:>016X}", { self.rsp }); println!("SS: {:>016X}", { self.ss }); } + unsafe { + let fsbase = x86::msr::rdmsr(x86::msr::IA32_FS_BASE); + let gsbase = x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE); + println!("FSBASE {:>016X}\nGSBASE {:016X}", fsbase, gsbase); + } } } diff --git a/src/arch/x86_64/start.rs b/src/arch/x86_64/start.rs index 8f82b982..9f94e083 100644 --- a/src/arch/x86_64/start.rs +++ b/src/arch/x86_64/start.rs @@ -275,13 +275,13 @@ macro_rules! 
save_fsgsbase( mov ecx, {MSR_FSBASE} rdmsr shl rdx, 32 - mov edx, eax + or rdx, rax mov r14, rdx mov ecx, {MSR_GSBASE} rdmsr shl rdx, 32 - mov edx, eax + or rdx, rax mov r13, rdx " } @@ -354,8 +354,11 @@ pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _is_singl mov es, r15d mov fs, r15d mov gs, r15d + ", - ", restore_fsgsbase!(), " + // SS and CS will later be set via sysretq. + + restore_fsgsbase!(), " // Target instruction pointer mov rcx, rdi @@ -382,14 +385,15 @@ pub unsafe extern "C" fn usermode(_ip: usize, _sp: usize, _arg: usize, _is_singl xor r15, r15 fninit - + ", // NOTE: Regarding the sysretq vulnerability, this is safe as we cannot modify RCX, // even though the caller can give us the wrong address. But, it's marked unsafe, so // the caller is responsible for this! (And, the likelihood of rcx being changed in the // middle here, is minimal, unless the attacker already has partial control of kernel // memory.) + " sysretq - "), + "), flag_interrupts = const(FLAG_INTERRUPTS), shift_singlestep = const(SHIFT_SINGLESTEP), diff --git a/src/context/arch/x86_64.rs b/src/context/arch/x86_64.rs index bf35ad55..bebc8a2a 100644 --- a/src/context/arch/x86_64.rs +++ b/src/context/arch/x86_64.rs @@ -36,10 +36,16 @@ pub struct Context { rbp: usize, /// Stack pointer rsp: usize, - /// FSBASE - pub fsbase: usize, - /// GSBASE - gsbase: usize, + /// FSBASE. + /// + /// NOTE: Same fsgsbase behavior as with gsbase. + pub(crate) fsbase: usize, + /// GSBASE. + /// + /// NOTE: Without fsgsbase, this register will strictly be equal to the register value when + /// running. With fsgsbase, this is neither saved nor restored upon every syscall (there is no + /// need to!), and thus it must be re-read from the register before copying this struct. + pub(crate) gsbase: usize, /// FX valid? loadable: AbiCompatBool, } @@ -52,7 +58,7 @@ enum AbiCompatBool { } impl Context { - pub fn new(pid: usize) -> Context { + pub fn new() -> Context { Context { loadable: AbiCompatBool::False, fx: 0, @@ -65,13 +71,10 @@ impl Context { r15: 0, rbp: 0, rsp: 0, - fsbase: crate::USER_TCB_OFFSET + pid * crate::memory::PAGE_SIZE, + fsbase: 0, gsbase: 0, } } - pub fn update_tcb(&mut self, pid: usize) { - self.fsbase = crate::USER_TCB_OFFSET + pid * crate::memory::PAGE_SIZE; - } pub fn get_page_utable(&mut self) -> usize { self.cr3 @@ -147,19 +150,10 @@ impl Context { } } -macro_rules! switch_msr( +macro_rules! load_msr( ($name:literal, $offset:literal) => { concat!(" - // EDX:EAX <= MSR - mov ecx, {", $name, "} - rdmsr - shl rdx, 32 - mov edx, eax - - // Save old, load new. - - mov [rdi + {", $offset, "}], rdx mov rdx, [rsi + {", $offset, "}] mov eax, edx shr rdx, 32 @@ -198,10 +192,9 @@ macro_rules! switch_fsgsbase( #[cfg(not(feature = "x86_fsgsbase"))] macro_rules! switch_fsgsbase( () => { - // TODO: Is it faster to perform two 32-bit memory accesses, rather than shifting? 
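+        // Loading the new values is enough here: without fsgsbase, userspace cannot
+        // write these MSRs itself, so the copies saved in `Context` are always
+        // current and nothing needs to be stored on the way out.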
concat!( - switch_msr!("MSR_FSBASE", "off_fsbase"), - switch_msr!("MSR_KERNELGSBASE", "off_gsbase"), + load_msr!("MSR_FSBASE", "off_fsbase"), + load_msr!("MSR_KERNELGSBASE", "off_gsbase"), ) } ); diff --git a/src/context/context.rs b/src/context/context.rs index a6e366cd..bf34057d 100644 --- a/src/context/context.rs +++ b/src/context/context.rs @@ -9,6 +9,7 @@ use core::{ alloc::{GlobalAlloc, Layout}, cmp::Ordering, mem, + ptr::NonNull, }; use spin::RwLock; @@ -20,7 +21,9 @@ use crate::context::memory::{UserGrants, Memory, SharedMemory, Tls}; use crate::ipi::{ipi, IpiKind, IpiTarget}; use crate::scheme::{SchemeNamespace, FileHandle}; use crate::sync::WaitMap; + use crate::syscall::data::SigAction; +use crate::syscall::error::{Result, Error, ENOMEM}; use crate::syscall::flag::{SIG_DFL, SigActionFlags}; /// Unique identifier for a context (i.e. `pid`). @@ -203,9 +206,9 @@ pub struct Context { /// Current system call pub syscall: Option<(usize, usize, usize, usize, usize, usize)>, /// Head buffer to use when system call buffers are not page aligned - pub syscall_head: Box<[u8]>, + pub syscall_head: AlignedBox<[u8; PAGE_SIZE], PAGE_SIZE>, /// Tail buffer to use when system call buffers are not page aligned - pub syscall_tail: Box<[u8]>, + pub syscall_tail: AlignedBox<[u8; PAGE_SIZE], PAGE_SIZE>, /// Context is halting parent pub vfork: bool, /// Context is being waited on @@ -230,8 +233,6 @@ pub struct Context { pub stack: Option<SharedMemory>, /// User signal stack pub sigstack: Option<Memory>, - /// User Thread local storage - pub tls: Option<Tls>, /// User grants pub grants: Arc<RwLock<UserGrants>>, /// The name of the context @@ -253,12 +254,63 @@ pub struct Context { pub ptrace_stop: bool } +// Necessary because GlobalAlloc::dealloc requires the layout to be the same, and therefore Box +// cannot be used for increased alignment directly. +// TODO: move to common? 
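+//
+// Usage mirrors `Box`, but with a guaranteed minimum alignment; e.g. the
+// page-aligned syscall head/tail buffers above:
+//
+//     let head: AlignedBox<[u8; PAGE_SIZE], PAGE_SIZE> = AlignedBox::try_zeroed()?;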
+pub struct AlignedBox<T, const ALIGN: usize> {
+    inner: NonNull<T>,
+}
+pub unsafe trait ValidForZero {}
+unsafe impl<const N: usize> ValidForZero for [u8; N] {}
+
+impl<T, const ALIGN: usize> AlignedBox<T, ALIGN> {
+    const LAYOUT: core::alloc::Layout = {
+        const fn max(a: usize, b: usize) -> usize {
+            if a > b { a } else { b }
+        }
+
+        match core::alloc::Layout::from_size_align(mem::size_of::<T>(), max(mem::align_of::<T>(), ALIGN)) {
+            Ok(l) => l,
+            Err(_) => panic!("layout validation failed at compile time"),
+        }
+    };
+    #[inline(always)]
+    pub fn try_zeroed() -> Result<Self>
+    where
+        T: ValidForZero,
+    {
+        Ok(unsafe {
+            let ptr = crate::ALLOCATOR.alloc_zeroed(Self::LAYOUT);
+            if ptr.is_null() {
+                return Err(Error::new(ENOMEM));
+            }
+            Self {
+                inner: NonNull::new_unchecked(ptr.cast()),
+            }
+        })
+    }
+}
+
+impl<T, const ALIGN: usize> core::fmt::Debug for AlignedBox<T, ALIGN> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(f, "[aligned box at {:p}, size {} alignment {}]", self.inner.as_ptr(), mem::size_of::<T>(), mem::align_of::<T>())
+    }
+}
+impl<T, const ALIGN: usize> Drop for AlignedBox<T, ALIGN> {
+    fn drop(&mut self) {
+        unsafe {
+            core::ptr::drop_in_place(self.inner.as_ptr());
+            crate::ALLOCATOR.dealloc(self.inner.as_ptr().cast(), Self::LAYOUT);
+        }
+    }
+}
+
 impl Context {
-    pub fn new(id: ContextId) -> Context {
-        let syscall_head = unsafe { Box::from_raw(crate::ALLOCATOR.alloc(Layout::from_size_align_unchecked(PAGE_SIZE, PAGE_SIZE)) as *mut [u8; PAGE_SIZE]) };
-        let syscall_tail = unsafe { Box::from_raw(crate::ALLOCATOR.alloc(Layout::from_size_align_unchecked(PAGE_SIZE, PAGE_SIZE)) as *mut [u8; PAGE_SIZE]) };
+    pub fn new(id: ContextId) -> Result<Context> {
+        let syscall_head = AlignedBox::try_zeroed()?;
+        let syscall_tail = AlignedBox::try_zeroed()?;
 
-        Context {
+        Ok(Context {
             id,
             pgid: id,
             ppid: ContextId::from(0),
@@ -282,7 +334,7 @@ impl Context {
             waitpid: Arc::new(WaitMap::new()),
             pending: VecDeque::new(),
             wake: None,
-            arch: arch::Context::new(id.into()),
+            arch: arch::Context::new(),
             kfx: None,
             kstack: None,
             ksig: None,
@@ -290,7 +342,6 @@ impl Context {
             image: Vec::new(),
             stack: None,
             sigstack: None,
-            tls: None,
             grants: Arc::new(RwLock::new(UserGrants::default())),
             name: Arc::new(RwLock::new(String::new().into_boxed_str())),
             cwd: Arc::new(RwLock::new(String::new())),
@@ -305,7 +356,7 @@ impl Context {
             ); 128])),
             regs: None,
             ptrace_stop: false
-        }
+        })
     }
 
     /// Make a relative path absolute
diff --git a/src/context/list.rs b/src/context/list.rs
index 90dd566f..5ae63cac 100644
--- a/src/context/list.rs
+++ b/src/context/list.rs
@@ -69,7 +69,7 @@ impl ContextList {
         let id = ContextId::from(self.next_id);
         self.next_id += 1;
 
-        assert!(self.map.insert(id, Arc::new(RwLock::new(Context::new(id)))).is_none());
+        assert!(self.map.insert(id, Arc::new(RwLock::new(Context::new(id)?))).is_none());
 
         Ok(self.map.get(&id).expect("Failed to insert new context. 
ID is out of bounds.")) } diff --git a/src/elf.rs b/src/elf.rs index 4914a392..2a74ea92 100644 --- a/src/elf.rs +++ b/src/elf.rs @@ -82,6 +82,12 @@ impl<'a> Elf<'a> { pub fn program_headers(&self) -> usize { self.header.e_phoff as usize } + pub fn program_header_count(&self) -> usize { + self.header.e_phnum as usize + } + pub fn program_headers_size(&self) -> usize { + self.header.e_phentsize as usize + } } pub struct ElfSections<'a> { diff --git a/src/lib.rs b/src/lib.rs index 7f54feb9..6a48b969 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -48,6 +48,7 @@ #![feature(concat_idents)] #![feature(const_btree_new)] #![feature(const_maybe_uninit_as_ptr)] +#![feature(const_panic)] #![feature(const_ptr_offset_from)] #![feature(const_raw_ptr_deref)] #![feature(core_intrinsics)] diff --git a/src/scheme/mod.rs b/src/scheme/mod.rs index 74405b0f..a42095de 100644 --- a/src/scheme/mod.rs +++ b/src/scheme/mod.rs @@ -137,6 +137,7 @@ impl SchemeList { //TODO: Only memory: is in the null namespace right now. It should be removed when //anonymous mmap's are implemented self.insert(ns, "memory", |_| Arc::new(MemoryScheme::new())).unwrap(); + self.insert(ns, "thisproc", |_| Arc::new(ProcScheme::restricted())).unwrap(); } /// Initialize a new namespace @@ -168,6 +169,7 @@ impl SchemeList { self.insert(ns, "initfs", |_| Arc::new(InitFsScheme::new())).unwrap(); self.insert(ns, "irq", |scheme_id| Arc::new(IrqScheme::new(scheme_id))).unwrap(); self.insert(ns, "proc", |scheme_id| Arc::new(ProcScheme::new(scheme_id))).unwrap(); + self.insert(ns, "thisproc", |_| Arc::new(ProcScheme::restricted())).unwrap(); self.insert(ns, "serio", |scheme_id| Arc::new(SerioScheme::new(scheme_id))).unwrap(); #[cfg(feature = "live")] { diff --git a/src/scheme/proc.rs b/src/scheme/proc.rs index 922905a1..d76a1bb9 100644 --- a/src/scheme/proc.rs +++ b/src/scheme/proc.rs @@ -6,6 +6,7 @@ use crate::{ syscall::{ FloatRegisters, IntRegisters, + EnvRegisters, data::{PtraceEvent, Stat}, error::*, flag::*, @@ -57,6 +58,9 @@ fn try_stop_context<F, T>(pid: ContextId, mut callback: F) -> Result<T> where F: FnMut(&mut Context) -> Result<T>, { + if pid == context::context_id() { + return Err(Error::new(EBADF)); + } // Stop process let (was_stopped, mut running) = with_context_mut(pid, |context| { let was_stopped = context.ptrace_stop; @@ -88,7 +92,8 @@ where #[derive(Clone, Copy, PartialEq, Eq)] enum RegsKind { Float, - Int + Int, + Env, } #[derive(Clone, Copy, PartialEq, Eq)] enum Operation { @@ -195,6 +200,12 @@ pub static PROC_SCHEME_ID: AtomicSchemeId = AtomicSchemeId::default(); pub struct ProcScheme { next_id: AtomicUsize, handles: RwLock<BTreeMap<usize, Handle>>, + access: Access, +} +#[derive(PartialEq)] +pub enum Access { + OtherProcesses, + Restricted, } impl ProcScheme { @@ -204,6 +215,14 @@ impl ProcScheme { Self { next_id: AtomicUsize::new(0), handles: RwLock::new(BTreeMap::new()), + access: Access::OtherProcesses, + } + } + pub fn restricted() -> Self { + Self { + next_id: AtomicUsize::new(0), + handles: RwLock::new(BTreeMap::new()), + access: Access::Restricted, } } } @@ -211,15 +230,22 @@ impl ProcScheme { impl Scheme for ProcScheme { fn open(&self, path: &str, flags: usize, uid: u32, gid: u32) -> Result<usize> { let mut parts = path.splitn(2, '/'); - let pid = parts.next() - .and_then(|s| s.parse().ok()) - .map(ContextId::from) - .ok_or(Error::new(EINVAL))?; + let pid_str = parts.next() + .ok_or(Error::new(ENOENT))?; + + let pid = if pid_str == "current" { + context::context_id() + } else if self.access == Access::Restricted { + 
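+            // Restricted handles (the `thisproc` scheme) may only name the calling
+            // process, via the "current" alias handled above.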
return Err(Error::new(EACCES));
+        } else {
+            ContextId::from(pid_str.parse().map_err(|_| Error::new(ENOENT))?)
+        };
 
         let operation = match parts.next() {
             Some("mem") => Operation::Memory,
             Some("regs/float") => Operation::Regs(RegsKind::Float),
             Some("regs/int") => Operation::Regs(RegsKind::Int),
+            Some("regs/env") => Operation::Regs(RegsKind::Env),
             Some("trace") => Operation::Trace,
             Some("exe") => Operation::Static("exe"),
             _ => return Err(Error::new(EINVAL))
@@ -382,7 +408,8 @@ impl Scheme for ProcScheme {
             Operation::Regs(kind) => {
                 union Output {
                     float: FloatRegisters,
-                    int: IntRegisters
+                    int: IntRegisters,
+                    env: EnvRegisters,
                 }
 
                 let (output, size) = match kind {
@@ -406,7 +433,37 @@ impl Scheme for ProcScheme {
                             stack.save(&mut regs);
                             Ok((Output { int: regs }, mem::size_of::<IntRegisters>()))
                         }
-                    })?
+                    })?,
+                    RegsKind::Env => {
+                        let (fsbase, gsbase) = if info.pid == context::context_id() {
+                            #[cfg(not(feature = "x86_fsgsbase"))]
+                            unsafe {
+                                (
+                                    x86::msr::rdmsr(x86::msr::IA32_FS_BASE),
+                                    x86::msr::rdmsr(x86::msr::IA32_KERNEL_GSBASE),
+                                )
+                            }
+                            #[cfg(feature = "x86_fsgsbase")]
+                            unsafe {
+                                use x86::bits64::segmentation::*;
+
+                                (
+                                    rdfsbase(),
+                                    {
+                                        swapgs();
+                                        let gsbase = rdgsbase();
+                                        swapgs();
+                                        gsbase
+                                    }
+                                )
+                            }
+                        } else {
+                            try_stop_context(info.pid, |context| {
+                                Ok((context.arch.fsbase as u64, context.arch.gsbase as u64))
+                            })?
+                        };
+                        (Output { env: EnvRegisters { fsbase, gsbase }}, mem::size_of::<EnvRegisters>())
+                    }
                 };
 
                 let bytes = unsafe {
@@ -503,6 +560,9 @@ impl Scheme for ProcScheme {
                         if buf.len() < mem::size_of::<FloatRegisters>() {
                             return Ok(0);
                         }
+                        if (buf.as_ptr() as usize) % mem::align_of::<FloatRegisters>() != 0 {
+                            return Err(Error::new(EINVAL));
+                        }
                         let regs = unsafe {
                             *(buf as *const _ as *const FloatRegisters)
                         };
@@ -521,6 +581,9 @@ impl Scheme for ProcScheme {
                         if buf.len() < mem::size_of::<IntRegisters>() {
                             return Ok(0);
                         }
+                        if (buf.as_ptr() as usize) % mem::align_of::<IntRegisters>() != 0 {
+                            return Err(Error::new(EINVAL));
+                        }
                         let regs = unsafe {
                             *(buf as *const _ as *const IntRegisters)
                         };
@@ -537,6 +600,57 @@ impl Scheme for ProcScheme {
                             }
                         })
                     }
+                    RegsKind::Env => {
+                        if buf.len() < mem::size_of::<EnvRegisters>() {
+                            return Ok(0);
+                        }
+                        if (buf.as_ptr() as usize) % mem::align_of::<EnvRegisters>() != 0 {
+                            return Err(Error::new(EINVAL));
+                        }
+                        let regs = unsafe {
+                            *(buf as *const _ as *const EnvRegisters)
+                        };
+                        use rmm::{Arch as _, X8664Arch};
+                        if !(X8664Arch::virt_is_valid(VirtualAddress::new(regs.fsbase as usize)) && X8664Arch::virt_is_valid(VirtualAddress::new(regs.gsbase as usize))) {
+                            return Err(Error::new(EINVAL));
+                        }
+
+                        if info.pid == context::context_id() {
+                            #[cfg(not(feature = "x86_fsgsbase"))]
+                            unsafe {
+                                x86::msr::wrmsr(x86::msr::IA32_FS_BASE, regs.fsbase);
+                                // We have to write to KERNEL_GSBASE, because when the kernel returns to
+                                // userspace, it will have executed SWAPGS first.
+                                x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, regs.gsbase);
+
+                                match context::contexts().current().ok_or(Error::new(ESRCH))?.write().arch {
+                                    ref mut arch => {
+                                        arch.fsbase = regs.fsbase as usize;
+                                        arch.gsbase = regs.gsbase as usize;
+                                    }
+                                }
+                            }
+                            #[cfg(feature = "x86_fsgsbase")]
+                            unsafe {
+                                use x86::bits64::segmentation::*;
+
+                                wrfsbase(regs.fsbase);
+                                swapgs();
+                                wrgsbase(regs.gsbase);
+                                swapgs();
+
+                                // No need to update the current context; with fsgsbase enabled, these
+                                // registers are automatically saved and restored.
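+                                // (The context switch path and `clone` instead re-read
+                                // them from the live registers, as the notes on
+                                // `Context::fsbase`/`gsbase` describe.)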
+ } + } else { + try_stop_context(info.pid, |context| { + context.arch.fsbase = regs.fsbase as usize; + context.arch.gsbase = regs.gsbase as usize; + Ok(()) + })?; + } + Ok(mem::size_of::<EnvRegisters>()) + } }, Operation::Trace => { if buf.len() < mem::size_of::<u64>() { @@ -621,6 +735,7 @@ impl Scheme for ProcScheme { Operation::Memory => "mem", Operation::Regs(RegsKind::Float) => "regs/float", Operation::Regs(RegsKind::Int) => "regs/int", + Operation::Regs(RegsKind::Env) => "regs/env", Operation::Trace => "trace", Operation::Static(path) => path, }); diff --git a/src/syscall/mod.rs b/src/syscall/mod.rs index b90dd54d..54c66e19 100644 --- a/src/syscall/mod.rs +++ b/src/syscall/mod.rs @@ -7,6 +7,7 @@ extern crate syscall; pub use self::syscall::{ FloatRegisters, IntRegisters, + EnvRegisters, data, error, flag, diff --git a/src/syscall/process.rs b/src/syscall/process.rs index e0ff3249..af15f50e 100644 --- a/src/syscall/process.rs +++ b/src/syscall/process.rs @@ -27,7 +27,7 @@ use crate::scheme::FileHandle; use crate::start::usermode; use crate::syscall::data::{SigAction, Stat}; use crate::syscall::error::*; -use crate::syscall::flag::{wifcontinued, wifstopped, AT_ENTRY, AT_NULL, AT_PHDR, CloneFlags, +use crate::syscall::flag::{wifcontinued, wifstopped, AT_ENTRY, AT_NULL, AT_PHDR, AT_PHENT, AT_PHNUM, CloneFlags, CLONE_FILES, CLONE_FS, CLONE_SIGHAND, CLONE_STACK, CLONE_VFORK, CLONE_VM, MapFlags, PROT_EXEC, PROT_READ, PROT_WRITE, PTRACE_EVENT_CLONE, PTRACE_STOP_EXIT, SigActionFlags, SIG_BLOCK, SIG_DFL, SIG_SETMASK, SIG_UNBLOCK, @@ -57,7 +57,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> { let mut image = vec![]; let mut stack_opt = None; let mut sigstack_opt = None; - let mut tls_opt = None; let grants; let name; let cwd; @@ -202,36 +201,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> { sigstack_opt = Some(new_sigstack); } - if let Some(ref tls) = context.tls { - let mut new_tls = context::memory::Tls { - master: tls.master, - file_size: tls.file_size, - mem: context::memory::Memory::new( - VirtualAddress::new(crate::USER_TMP_TLS_OFFSET), - tls.mem.size(), - PageFlags::new().write(true), - true - ), - offset: tls.offset, - }; - - - if flags.contains(CLONE_VM) { - unsafe { - new_tls.load(); - } - } else { - unsafe { - intrinsics::copy(tls.mem.start_address().data() as *const u8, - new_tls.mem.start_address().data() as *mut u8, - tls.mem.size()); - } - } - - new_tls.mem.remap(tls.mem.flags()); - tls_opt = Some(new_tls); - } - if flags.contains(CLONE_VM) { grants = Arc::clone(&context.grants); } else { @@ -352,6 +321,14 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> { context.arch = arch; + // This is needed because these registers may have changed after this context was + // switched to, but before this was called. 
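+        // (With fsgsbase enabled, userspace may have updated FSBASE/GSBASE
+        // directly, without the kernel seeing it.)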
+ #[cfg(all(target_arch = "x86_64", feature = "x86_fsgsbase"))] + unsafe { + context.arch.fsbase = x86::bits64::segmentation::rdfsbase() as usize; + context.arch.gsbase = x86::bits64::segmentation::rdgsbase() as usize; + } + let mut active_utable = unsafe { ActivePageTable::new(TableKind::User) }; let mut active_ktable = unsafe { ActivePageTable::new(TableKind::Kernel) }; @@ -378,10 +355,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> { let mut new_ktable = unsafe { InactivePageTable::from_address(new_utable.address()) }; - #[cfg(target_arch = "x86_64")] - { - context.arch.update_tcb(pid.into()); - } // Copy kernel image mapping { @@ -502,15 +475,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> { context.sigstack = Some(sigstack); } - // Set up TCB - let tcb_addr = crate::USER_TCB_OFFSET + context.id.into() * PAGE_SIZE; - let mut tcb = context::memory::Memory::new( - VirtualAddress::new(tcb_addr), - PAGE_SIZE, - PageFlags::new().write(true).user(true), - true - ); - #[cfg(target_arch = "aarch64")] { if let Some(stack) = &mut context.kstack { @@ -534,38 +498,6 @@ pub fn clone(flags: CloneFlags, stack_base: usize) -> Result<ContextId> { } } - // Setup user TLS - if let Some(mut tls) = tls_opt { - // Copy TLS mapping - { - let frame = active_utable.p4()[crate::USER_TLS_PML4].pointed_frame().expect("user tls not mapped"); - let flags = active_utable.p4()[crate::USER_TLS_PML4].flags(); - active_utable.with(&mut new_utable, &mut temporary_upage, |mapper| { - mapper.p4_mut()[crate::USER_TLS_PML4].set(frame, flags); - }); - } - - // TODO: Make sure size is not greater than USER_TLS_SIZE - let tls_addr = crate::USER_TLS_OFFSET + context.id.into() * crate::USER_TLS_SIZE; - //println!("{}: Copy TLS: address 0x{:x}, size 0x{:x}", context.id.into(), tls_addr, tls.mem.size()); - tls.mem.move_to(VirtualAddress::new(tls_addr), &mut new_utable, &mut temporary_upage); - unsafe { - *(tcb_addr as *mut usize) = tls.mem.start_address().data() + tls.mem.size(); - } - context.tls = Some(tls); - } else { - //println!("{}: Copy TCB", context.id.into()); - let parent_tcb_addr = crate::USER_TCB_OFFSET + ppid.into() * PAGE_SIZE; - unsafe { - intrinsics::copy(parent_tcb_addr as *const u8, - tcb_addr as *mut u8, - tcb.size()); - } - } - - tcb.move_to(VirtualAddress::new(tcb_addr), &mut new_utable, &mut temporary_upage); - context.image.push(tcb.to_shared()); - context.name = name; context.cwd = cwd; @@ -599,13 +531,11 @@ fn empty(context: &mut context::Context, reaping: bool) { assert!(context.image.is_empty()); assert!(context.stack.is_none()); assert!(context.sigstack.is_none()); - assert!(context.tls.is_none()); } else { - // Unmap previous image, heap, grants, stack, and tls + // Unmap previous image, heap, grants, stack context.image.clear(); drop(context.stack.take()); drop(context.sigstack.take()); - drop(context.tls.take()); } // NOTE: If we do not replace the grants `Arc`, then a strange situation can appear where the @@ -651,10 +581,12 @@ impl Drop for ExecFile { } } +#[allow(clippy::too_many_arguments)] fn fexec_noreturn( setuid: Option<u32>, setgid: Option<u32>, name: Box<str>, + phdrs_region: core::ops::Range<usize>, data: Box<[u8]>, args: Box<[Box<[u8]>]>, vars: Box<[Box<[u8]>]>, @@ -664,6 +596,11 @@ fn fexec_noreturn( let singlestep; let mut sp = crate::USER_STACK_OFFSET + crate::USER_STACK_SIZE - 256; + let phdrs_len = 4096; + let phdrs_base_addr = sp - phdrs_len; + + sp -= phdrs_len; + { let (vfork, ppid, files) = { let contexts = 
context::contexts();
@@ -678,6 +615,25 @@ fn fexec_noreturn(
 
             empty(&mut context, false);
 
+            #[cfg(target_arch = "x86_64")]
+            {
+                context.arch.fsbase = 0;
+                context.arch.gsbase = 0;
+
+                #[cfg(feature = "x86_fsgsbase")]
+                unsafe {
+                    x86::bits64::segmentation::wrfsbase(0);
+                    x86::bits64::segmentation::swapgs();
+                    x86::bits64::segmentation::wrgsbase(0);
+                    x86::bits64::segmentation::swapgs();
+                }
+                #[cfg(not(feature = "x86_fsgsbase"))]
+                unsafe {
+                    x86::msr::wrmsr(x86::msr::IA32_FS_BASE, 0);
+                    x86::msr::wrmsr(x86::msr::IA32_KERNEL_GSBASE, 0);
+                }
+            }
+
             if let Some(uid) = setuid {
                 context.euid = uid;
             }
@@ -687,20 +643,10 @@
             }
 
             // Map and copy new segments
-            let mut tls_opt = None;
             {
                 let elf = elf::Elf::from(&data).unwrap();
                 entry = elf.entry();
 
-                // Always map TCB
-                let tcb_addr = crate::USER_TCB_OFFSET + context.id.into() * PAGE_SIZE;
-                let tcb_mem = context::memory::Memory::new(
-                    VirtualAddress::new(tcb_addr),
-                    PAGE_SIZE,
-                    PageFlags::new().write(true).user(true),
-                    true
-                );
-
                 for segment in elf.segments() {
                     match segment.p_type {
                         program_header::PT_LOAD => {
@@ -734,45 +680,11 @@
 
                             context.image.push(memory.to_shared());
                         },
-                        program_header::PT_TLS => {
-                            let aligned_size = if segment.p_align > 0 {
-                                ((segment.p_memsz + (segment.p_align - 1))/segment.p_align) * segment.p_align
-                            } else {
-                                segment.p_memsz
-                            } as usize;
-                            let rounded_size = ((aligned_size + PAGE_SIZE - 1)/PAGE_SIZE) * PAGE_SIZE;
-                            let rounded_offset = rounded_size - aligned_size;
-
-                            // TODO: Make sure size is not greater than USER_TLS_SIZE
-                            let tls_addr = crate::USER_TLS_OFFSET + context.id.into() * crate::USER_TLS_SIZE;
-                            let tls = context::memory::Tls {
-                                master: VirtualAddress::new(segment.p_vaddr as usize),
-                                file_size: segment.p_filesz as usize,
-                                mem: context::memory::Memory::new(
-                                    VirtualAddress::new(tls_addr),
-                                    rounded_size as usize,
-                                    PageFlags::new().write(true).user(true),
-                                    true
-                                ),
-                                offset: rounded_offset as usize,
-                            };
-
-                            unsafe {
-                                *(tcb_addr as *mut usize) = tls.mem.start_address().data() + tls.mem.size();
-                            }
-
-                            tls_opt = Some(tls);
-                        },
                         _ => (),
                     }
                 }
-
-                context.image.push(tcb_mem.to_shared());
             }
 
-            // Data no longer required, can deallocate
-            drop(data);
-
             // Map stack
             context.stack = Some(context::memory::Memory::new(
                 VirtualAddress::new(crate::USER_STACK_OFFSET),
@@ -789,20 +701,19 @@
                 true
             ));
 
-            // Map TLS
-            if let Some(mut tls) = tls_opt {
-                unsafe {
-                    tls.load();
-                }
-
-                context.tls = Some(tls);
-            }
-
             let mut push = |arg| {
                 sp -= mem::size_of::<usize>();
                 unsafe { *(sp as *mut usize) = arg; }
             };
 
+            unsafe {
+                let dest = core::slice::from_raw_parts_mut(phdrs_base_addr as *mut u8, phdrs_len);
+                dest[..phdrs_region.len()].copy_from_slice(&data[phdrs_region.clone()]);
+            }
+
+            // Data no longer required, can deallocate
+            drop(data);
+
             // Push auxiliary vector
             push(AT_NULL);
             for &arg in auxv.iter().rev() {
@@ -1019,7 +930,11 @@ pub fn fexec_kernel(fd: FileHandle, args: Box<[Box<[u8]>]>, vars: Box<[Box<[u8]>
         auxv.push(AT_ENTRY);
         auxv.push(elf.entry());
         auxv.push(AT_PHDR);
-        auxv.push(elf.program_headers());
+        auxv.push(crate::USER_STACK_OFFSET + crate::USER_STACK_SIZE - 256 - 4096);
+        auxv.push(AT_PHENT);
+        auxv.push(elf.program_headers_size());
+        auxv.push(AT_PHNUM);
+        auxv.push(elf.program_header_count());
 
         auxv
     };
@@ -1068,26 +983,19 @@ pub fn fexec_kernel(fd: FileHandle, args: Box<[Box<[u8]>]>, vars: Box<[Box<[u8]>
                     Some(auxv),
                 );
             },
-            program_header::PT_LOAD => {
-                let voff = segment.p_vaddr as usize % PAGE_SIZE;
-                let vaddr = 
segment.p_vaddr as usize - voff;
-
-                // Due to the Userspace and kernel TLS bases being located right above 2GB,
-                // limit any loadable sections to lower than that. Eventually we will need
-                // to replace this with a more intelligent TLS address
-                if vaddr >= 0x8000_0000 {
-                    println!("exec: invalid section address {:X}", segment.p_vaddr);
-                    return Err(Error::new(ENOEXEC));
-                }
-            },
             _ => (),
         }
     }
 
+    let phdr_range = elf.program_headers()..elf.program_headers() + elf.program_headers_size() * elf.program_header_count();
+    if phdr_range.len() > 4096 {
+        return Err(Error::new(ENOMEM));
+    }
+
     // This is the point of no return, quite literally. Any checks for validity need
     // to be done before, and appropriate errors returned. Otherwise, we have nothing
     // to return to.
-    fexec_noreturn(setuid, setgid, name.into_boxed_str(), data.into_boxed_slice(), args, vars, auxv.into_boxed_slice());
+    fexec_noreturn(setuid, setgid, name.into_boxed_str(), phdr_range, data.into_boxed_slice(), args, vars, auxv.into_boxed_slice());
 }
 
 pub fn fexec(fd: FileHandle, arg_ptrs: &[[usize; 2]], var_ptrs: &[[usize; 2]]) -> Result<usize> {
diff --git a/syscall b/syscall
index 841b5f42..519a09e9 160000
--- a/syscall
+++ b/syscall
@@ -1 +1 @@
-Subproject commit 841b5f42216782ce2aee6201c55b849ce5d7ab71
+Subproject commit 519a09e96400309a14375c04815da00f7cf5f526
-- 
GitLab
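
For illustration, a sketch of how userspace might drive the new regs/env file once this lands. This is hypothetical consumer code, not part of the patch: it assumes the bumped syscall submodule exposes EnvRegisters with u64 fsbase/gsbase fields (matching the kernel struct above) along with the usual open/read/write/close wrappers, and the set_fsbase helper name is invented here.

    use core::mem;
    use syscall::{EnvRegisters, O_RDWR};

    fn set_fsbase(new_fsbase: u64) -> syscall::Result<()> {
        // "current" resolves to the calling process, and is the only pid the
        // restricted `thisproc` scheme accepts.
        let fd = syscall::open("thisproc:current/regs/env", O_RDWR)?;

        // Fetch the current pair; the scheme reads and writes the raw struct
        // bytes, so the buffer must be the size (and alignment) of EnvRegisters.
        let mut regs = EnvRegisters { fsbase: 0, gsbase: 0 };
        unsafe {
            let buf = core::slice::from_raw_parts_mut(
                &mut regs as *mut EnvRegisters as *mut u8,
                mem::size_of::<EnvRegisters>(),
            );
            syscall::read(fd, buf)?;
        }

        regs.fsbase = new_fsbase;

        // Write it back; the kernel validates both addresses and rejects
        // invalid ones with EINVAL before touching the live registers.
        unsafe {
            let buf = core::slice::from_raw_parts(
                &regs as *const EnvRegisters as *const u8,
                mem::size_of::<EnvRegisters>(),
            );
            syscall::write(fd, buf)?;
        }

        syscall::close(fd)?;
        Ok(())
    }

On fsgsbase-capable hardware the same write takes the wrfsbase/wrgsbase fast path shown in the scheme, so this file interface stays uniform whether or not the feature is compiled in.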