diff --git a/src/header/sys_auxv/mod.rs b/src/header/sys_auxv/mod.rs index 184eba1a76cbd1662c16a55e392b5c8363532275..6c4f9ccae1f6ae529ee332933a0580fc031f7526 100644 --- a/src/header/sys_auxv/mod.rs +++ b/src/header/sys_auxv/mod.rs @@ -2,33 +2,7 @@ use crate::platform::types::*; -pub const AT_NULL: usize = 0; /* End of vector */ -pub const AT_IGNORE: usize = 1; /* Entry should be ignored */ -pub const AT_EXECFD: usize = 2; /* File descriptor of program */ -pub const AT_PHDR: usize = 3; /* Program headers for program */ -pub const AT_PHENT: usize = 4; /* Size of program header entry */ -pub const AT_PHNUM: usize = 5; /* Number of program headers */ -pub const AT_PAGESZ: usize = 6; /* System page size */ -pub const AT_BASE: usize = 7; /* Base address of interpreter */ -pub const AT_FLAGS: usize = 8; /* Flags */ -pub const AT_ENTRY: usize = 9; /* Entry point of program */ -pub const AT_NOTELF: usize = 10; /* Program is not ELF */ -pub const AT_UID: usize = 11; /* Real uid */ -pub const AT_EUID: usize = 12; /* Effective uid */ -pub const AT_GID: usize = 13; /* Real gid */ -pub const AT_EGID: usize = 14; /* Effective gid */ -pub const AT_CLKTCK: usize = 17; /* Frequency of times() */ -pub const AT_PLATFORM: usize = 15; /* String identifying platform. */ -pub const AT_HWCAP: usize = 16; /* Machine-dependent hints about */ -pub const AT_FPUCW: usize = 18; /* Used FPU control word. */ -pub const AT_DCACHEBSIZE: usize = 19; /* Data cache block size. */ -pub const AT_ICACHEBSIZE: usize = 20; /* Instruction cache block size. */ -pub const AT_UCACHEBSIZE: usize = 21; /* Unified cache block size. */ -pub const AT_IGNOREPPC: usize = 22; /* Entry should be ignored. */ -pub const AT_BASE_PLATFORM: usize = 24; /* String identifying real platforms.*/ -pub const AT_RANDOM: usize = 25; /* Address of 16 random bytes. */ -pub const AT_HWCAP2: usize = 26; /* More machine-dependent hints about*/ -pub const AT_EXECFN: usize = 31; /* Filename of executable. 
*/ +pub use crate::platform::auxv_defs::*; #[no_mangle] pub extern "C" fn getauxval(_t: c_ulong) -> c_ulong { diff --git a/src/platform/auxv_defs.rs b/src/platform/auxv_defs.rs new file mode 100644 index 0000000000000000000000000000000000000000..30a59962bb691f85b04a0eddef66ba813e2ea685 --- /dev/null +++ b/src/platform/auxv_defs.rs @@ -0,0 +1,27 @@ +pub const AT_NULL: usize = 0; /* End of vector */ +pub const AT_IGNORE: usize = 1; /* Entry should be ignored */ +pub const AT_EXECFD: usize = 2; /* File descriptor of program */ +pub const AT_PHDR: usize = 3; /* Program headers for program */ +pub const AT_PHENT: usize = 4; /* Size of program header entry */ +pub const AT_PHNUM: usize = 5; /* Number of program headers */ +pub const AT_PAGESZ: usize = 6; /* System page size */ +pub const AT_BASE: usize = 7; /* Base address of interpreter */ +pub const AT_FLAGS: usize = 8; /* Flags */ +pub const AT_ENTRY: usize = 9; /* Entry point of program */ +pub const AT_NOTELF: usize = 10; /* Program is not ELF */ +pub const AT_UID: usize = 11; /* Real uid */ +pub const AT_EUID: usize = 12; /* Effective uid */ +pub const AT_GID: usize = 13; /* Real gid */ +pub const AT_EGID: usize = 14; /* Effective gid */ +pub const AT_CLKTCK: usize = 17; /* Frequency of times() */ +pub const AT_PLATFORM: usize = 15; /* String identifying platform. */ +pub const AT_HWCAP: usize = 16; /* Machine-dependent hints about */ +pub const AT_FPUCW: usize = 18; /* Used FPU control word. */ +pub const AT_DCACHEBSIZE: usize = 19; /* Data cache block size. */ +pub const AT_ICACHEBSIZE: usize = 20; /* Instruction cache block size. */ +pub const AT_UCACHEBSIZE: usize = 21; /* Unified cache block size. */ +pub const AT_IGNOREPPC: usize = 22; /* Entry should be ignored. */ +pub const AT_BASE_PLATFORM: usize = 24; /* String identifying real platforms.*/ +pub const AT_RANDOM: usize = 25; /* Address of 16 random bytes. 
*/ +pub const AT_HWCAP2: usize = 26; /* More machine-dependent hints about*/ +pub const AT_EXECFN: usize = 31; /* Filename of executable. */ diff --git a/src/platform/mod.rs b/src/platform/mod.rs index 890f83245dbfbca3fa55b45acd3ceee698447687..6169a3b5919a57c8704c842994f02f0910cf1a25 100644 --- a/src/platform/mod.rs +++ b/src/platform/mod.rs @@ -34,6 +34,12 @@ mod pte; pub use self::rlb::{Line, RawLineBuffer}; pub mod rlb; +#[cfg(target_os = "linux")] +pub mod auxv_defs; + +#[cfg(target_os = "redox")] +pub use redox_exec::auxv_defs; + use self::types::*; pub mod types; diff --git a/src/platform/redox/exec.rs b/src/platform/redox/exec.rs index ac3672bb37d98547313994cf13ad3f1b82ff1c01..c551eb3133db6967cab5dd83eefe2ab9fcd4dd94 100644 --- a/src/platform/redox/exec.rs +++ b/src/platform/redox/exec.rs @@ -1,20 +1,260 @@ +use crate::c_str::{CStr, CString}; +use crate::core_io::{BufReader, prelude::*, SeekFrom}; use crate::fs::File; +use crate::header::{fcntl, string::strlen}; +use crate::platform::{sys::{S_ISUID, S_ISGID}, types::*}; -use syscall::error::Result; -use redox_exec::FdGuard; +use syscall::data::Stat; +use syscall::flag::*; +use syscall::error::*; +use redox_exec::{FdGuard, FexecResult}; -pub fn fexec_impl(file: File, path: &[u8], args: &[&[u8]], envs: &[&[u8]], args_envs_size_without_nul: usize) -> Result<usize> { +fn fexec_impl(file: File, path: &[u8], args: &[&[u8]], envs: &[&[u8]], total_args_envs_size: usize, interp_override: Option<redox_exec::InterpOverride>) -> Result<usize> { let fd = *file; core::mem::forget(file); let image_file = FdGuard::new(fd as usize); let open_via_dup = FdGuard::new(syscall::open("thisproc:current/open_via_dup", 0)?); - let total_args_envs_size = args_envs_size_without_nul + args.len() + envs.len(); - let addrspace_selection_fd = redox_exec::fexec_impl(image_file, open_via_dup, path, args.iter().rev(), envs.iter().rev(), total_args_envs_size)?; + let addrspace_selection_fd = match redox_exec::fexec_impl(image_file, 
open_via_dup, path, args.iter().rev(), envs.iter().rev(), total_args_envs_size, interp_override)? { + FexecResult::Normal { addrspace_handle } => addrspace_handle, + FexecResult::Interp { image_file, open_via_dup, path, interp_override: new_interp_override } => { + drop(image_file); + drop(open_via_dup); + + // According to elf(5), PT_INTERP requires that the interpreter path be + // null-terminated. Violating this should therefore give the "format error" ENOEXEC. + let path_cstr = CStr::from_bytes_with_nul(&path).map_err(|_| Error::new(ENOEXEC))?; + + return execve(path_cstr, ArgEnv::Parsed { total_args_envs_size, args, envs }, Some(new_interp_override)); + } + }; // Dropping this FD will cause the address space switch. drop(addrspace_selection_fd); unreachable!(); } +/// Source of the argument and environment vectors handed to [`execve`]. +pub enum ArgEnv<'a> { + /// NULL-terminated C arrays of NUL-terminated strings. + C { argv: *const *mut c_char, envp: *const *mut c_char }, + /// Already-split byte slices; `total_args_envs_size` counts every item plus one NUL each. + Parsed { args: &'a [&'a [u8]], envs: &'a [&'a [u8]], total_args_envs_size: usize }, +} +/// Executes the file at `path` in place of the current process image. +/// +/// `#!` scripts are run through the interpreter named on their first line; setuid/setgid +/// binaries that are not scripts are handed to the `escalate:` scheme. `interp_override` is +/// `Some` when `fexec_impl` re-enters for a PT_INTERP image. Any return is an `Err`. +pub fn execve(path: &CStr, arg_env: ArgEnv, interp_override: Option<redox_exec::InterpOverride>) -> Result<usize> { + // NOTE: We must omit O_CLOEXEC and close manually, otherwise it will be closed before we + // have even read it! + let mut image_file = File::open(path, O_RDONLY as c_int).map_err(|_| Error::new(ENOENT))?; + + // With execve now being implemented in userspace, we need to check ourselves that this + // file is actually executable. While checking for read permission is unnecessary as the + // scheme will not allow us to read otherwise, the execute bit is completely unenforced. We + // have the permission to mmap executable memory and fill it with the program even if it is + // unset, so the best we can do is check that nothing is executed by accident. + // + // TODO: At some point we might have capabilities limiting the ability to allocate + // executable memory, and in that case we might use the `escalate:` scheme as we already do + // when the binary needs setuid/setgid. 
+ + let mut stat = Stat::default(); + syscall::fstat(*image_file as usize, &mut stat)?; + let uid = syscall::getuid()?; + let gid = syscall::getgid()?; + + let mode = if uid == stat.st_uid as usize { + (stat.st_mode >> 3 * 2) & 0o7 + } else if gid == stat.st_gid as usize { + (stat.st_mode >> 3 * 1) & 0o7 + } else { + stat.st_mode & 0o7 + }; + + if mode & 0o1 == 0o0 { + return Err(Error::new(EPERM)); + } + let wants_setugid = stat.st_mode & ((S_ISUID | S_ISGID) as u16) != 0; + + // Count arguments + let mut len = 0; + + match arg_env { + ArgEnv::C { argv, .. } => unsafe { + while !(*argv.add(len)).is_null() { + len += 1; + } + } + ArgEnv::Parsed { args, .. } => len = args.len(), + } + + let mut args: Vec<&[u8]> = Vec::with_capacity(len); + + // Read shebang (for example #!/bin/sh) + let mut _interpreter_path = None; + let is_interpreted = { + let mut read = 0; + let mut shebang = [0; 2]; + + while read < 2 { + // Fill only the unread tail so that a short read cannot clobber earlier bytes. + match image_file.read(&mut shebang[read..]).map_err(|_| Error::new(ENOEXEC))? { + 0 => break, + i => read += i, + } + } + shebang == *b"#!" + }; + // Since the fexec implementation is almost fully done in userspace, the kernel can no + // longer set UID/GID accordingly, and this code checking for them before using + // hypothetical interfaces to upgrade UID/GID, can not be trusted. So we ask the + // `escalate:` scheme for help. Note that `escalate:` can be deliberately excluded from the + // scheme namespace to deny privilege escalation (such as su/sudo/doas) for untrusted + // processes. + // + // According to execve(2), Linux and most other UNIXes ignore setuid/setgid for interpreted + // executables and thereby simply keep the privileges as is. For compatibility we do that + // too. + + if is_interpreted { + // TODO: Does this support prepending args to the interpreter? E.g. + // #!/usr/bin/env python3 + + // So, this file is interpreted. 
+ // Then, read the actual interpreter: + let mut interpreter = Vec::new(); + BufReader::new(&mut image_file).read_until(b'\n', &mut interpreter).map_err(|_| Error::new(EIO))?; + if interpreter.ends_with(&[b'\n']) { + interpreter.pop().unwrap(); + } + let cstring = CString::new(interpreter).map_err(|_| Error::new(ENOEXEC))?; + image_file = File::open(&cstring, O_RDONLY as c_int).map_err(|_| Error::new(ENOENT))?; + + // Make sure path is kept alive long enough, and push it to the arguments + _interpreter_path = Some(cstring); + let path_ref = _interpreter_path.as_ref().unwrap(); + args.push(path_ref.as_bytes()); + } else { + image_file.seek(SeekFrom::Start(0)).map_err(|_| Error::new(EIO))?; + } + + let (total_args_envs_size, args, envs): (usize, Vec<_>, Vec<_>) = match arg_env { + ArgEnv::C { mut argv, mut envp } => unsafe { + // Start from the interpreter path (if any) that was already pushed to `args` above. + let mut args_envs_size_without_nul: usize = args.iter().map(|arg| arg.len()).sum(); + + // Arguments + while !argv.read().is_null() { + let arg = argv.read(); + + let len = strlen(arg); + args.push(core::slice::from_raw_parts(arg as *const u8, len)); + args_envs_size_without_nul += len; + argv = argv.add(1); + } + + // Environment variables + let mut len = 0; + while !envp.add(len).read().is_null() { + len += 1; + } + + let mut envs: Vec<&[u8]> = Vec::with_capacity(len); + while !envp.read().is_null() { + let env = envp.read(); + + let len = strlen(env); + envs.push(core::slice::from_raw_parts(env as *const u8, len)); + args_envs_size_without_nul += len; + envp = envp.add(1); + } + (args_envs_size_without_nul + args.len() + envs.len(), args, envs) + } + ArgEnv::Parsed { args: new_args, envs, total_args_envs_size } => { + // Items pushed above were not part of `total_args_envs_size`; count them with their NUL. + let prev_size: usize = args.iter().map(|a| a.len() + 1).sum(); + args.extend(new_args); + (total_args_envs_size + prev_size, args, Vec::from(envs)) + } + }; + + + // Close all O_CLOEXEC file descriptors. TODO: close_range? + { + // NOTE: This approach of implementing O_CLOEXEC will not work in multithreaded + // scenarios. 
While execve() is undefined according to POSIX if there exist sibling + // threads, it could still be allowed by keeping certain file descriptors and instead + // set the active file table. + let files_fd = File::new(syscall::open("thisproc:current/filetable", O_RDONLY)? as c_int); + for line in BufReader::new(files_fd).lines() { + let line = match line { + Ok(l) => l, + Err(_) => break, + }; + let fd = match line.parse::<usize>() { + Ok(f) => f, + Err(_) => continue, + }; + + // Best effort: a descriptor that cannot be queried is skipped rather than failing the exec. + let flags = match syscall::fcntl(fd, F_GETFD, 0) { + Ok(flags) => flags, + Err(_) => continue, + }; + + if flags & O_CLOEXEC == O_CLOEXEC { + let _ = syscall::close(fd); + } + } + } + + if !is_interpreted && wants_setugid { + // Make sure the last file descriptor not covered by O_CLOEXEC is not leaked. + drop(image_file); + + // We are now going to invoke `escalate:` rather than loading the program ourselves. + let escalate_fd = FdGuard::new(syscall::open("escalate:", O_WRONLY)?); + + // First, we write the path. + // + // TODO: For improved security, use a hypothetical SYS_DUP_FORWARD syscall to give the + // scheme our file descriptor. It can check through the kernel-overwritten stat.st_dev + // field that it pertains to a "trusted" scheme (i.e. of at least the privilege the + // new uid/gid has), although for now only root can open schemes. Passing a file + // descriptor and not a path will allow escalated to run in a limited namespace. + // + // TODO: Plus, at this point fexecve is not implemented (but specified in + // POSIX.1-2008), and to avoid bad syscalls such as fpath, passing a file descriptor + // would be better. + let _ = syscall::write(*escalate_fd, path.to_bytes())?; + + // Second, we write the flattened args and envs with NUL characters separating + // individual items. This can be copied directly into the new executable's memory. 
+ let _ = syscall::write(*escalate_fd, &flatten_with_nul(args))?; + let _ = syscall::write(*escalate_fd, &flatten_with_nul(envs))?; + + // Closing will notify the scheme, and from that point we will no longer have control + // over this process (unless it fails). We do this manually since drop cannot handle + // errors. + let fd = *escalate_fd as usize; + core::mem::forget(escalate_fd); + + syscall::close(fd)?; + + unreachable!() + } else { + fexec_impl(image_file, path.to_bytes(), &args, &envs, total_args_envs_size, interp_override) + } +} +fn flatten_with_nul<T>(iter: impl IntoIterator<Item = T>) -> Box<[u8]> where T: AsRef<[u8]> { + let mut vec = Vec::new(); + for item in iter { + vec.extend(item.as_ref()); + vec.push(b'\0'); + } + vec.into_boxed_slice() +} diff --git a/src/platform/redox/mod.rs b/src/platform/redox/mod.rs index b456e85d75c790b146838d40c746c923e7a67c9a..67c9392e718994e90960ef367c315c28533ce0f3 100644 --- a/src/platform/redox/mod.rs +++ b/src/platform/redox/mod.rs @@ -34,6 +34,11 @@ use super::{errno, types::*, Pal, Read}; static mut BRK_CUR: *mut c_void = ptr::null_mut(); static mut BRK_END: *mut c_void = ptr::null_mut(); +const PAGE_SIZE: usize = 4096; +fn round_up_to_page_size(val: usize) -> usize { + (val + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE +} + mod clone; mod epoll; mod exec; @@ -67,14 +72,6 @@ pub fn e(sys: Result<usize>) -> usize { } } } -fn flatten_with_nul<T>(iter: impl IntoIterator<Item = T>) -> Box<[u8]> where T: AsRef<[u8]> { - let mut vec = Vec::new(); - for item in iter { - vec.extend(item.as_ref()); - vec.push(b'\0'); - } - vec.into_boxed_slice() -} pub struct Sys; @@ -216,223 +213,10 @@ impl Pal for Sys { unsafe fn execve( path: &CStr, - mut argv: *const *mut c_char, - mut envp: *const *mut c_char, + argv: *const *mut c_char, + envp: *const *mut c_char, ) -> c_int { - // NOTE: We must omit O_CLOEXEC and close manually, otherwise it will be closed before we - // have even read it! 
- let mut file = match File::open(path, fcntl::O_RDONLY) { - Ok(file) => file, - Err(_) => return -1, - }; - let fd = *file as usize; - - // With execve now being implemented in userspace, we need to check ourselves that this - // file is actually executable. While checking for read permission is unnecessary as the - // scheme will not allow us to read otherwise, the execute bit is completely unenforced. We - // have the permission to mmap executable memory and fill it with the program even if it is - // unset, so the best we can do is check that nothing is executed by accident. - // - // TODO: At some point we might have capabilities limiting the ability to allocate - // executable memory, and in that case we might use the `escalate:` scheme as we already do - // when the binary needs setuid/setgid. - - let mut stat = redox_stat::default(); - if e(syscall::fstat(fd, &mut stat)) == !0 { - return -1; - } - let uid = e(syscall::getuid()); - if uid == !0 { - return -1; - } - let gid = e(syscall::getuid()); - if gid == !0 { - return -1; - } - - let mode = if uid == stat.st_uid as usize { - (stat.st_mode >> 3 * 2) & 0o7 - } else if gid == stat.st_gid as usize { - (stat.st_mode >> 3 * 1) & 0o7 - } else { - stat.st_mode & 0o7 - }; - - if mode & 0o1 == 0o0 { - errno = EPERM; - return -1; - } - let wants_setugid = stat.st_mode & ((S_ISUID | S_ISGID) as u16) != 0; - - // Count arguments - let mut len = 0; - while !(*argv.add(len)).is_null() { - len += 1; - } - - let mut args: Vec<&[u8]> = Vec::with_capacity(len); - - // Read shebang (for example #!/bin/sh) - let mut _interpreter_path = None; - let is_interpreted = { - let mut read = 0; - let mut shebang = [0; 2]; - - while read < 2 { - match file.read(&mut shebang) { - Ok(0) => break, - Ok(i) => read += i, - Err(_) => return -1, - } - } - shebang == *b"#!" 
- }; - // Since the fexec implementation is almost fully done in userspace, the kernel can no - // longer set UID/GID accordingly, and this code checking for them before using - // hypothetical interfaces to upgrade UID/GID, can not be trusted. So we ask the - // `escalate:` scheme for help. Note that `escalate:` can be deliberately excluded from the - // scheme namespace to deny privilege escalation (such as su/sudo/doas) for untrusted - // processes. - // - // According to execve(2), Linux and most other UNIXes ignore setuid/setgid for interpreted - // executables and thereby simply keep the privileges as is. For compatibility we do that - // too. - - if is_interpreted { - // So, this file is interpreted. - // Then, read the actual interpreter: - let mut interpreter = Vec::new(); - if BufReader::new(&mut file).read_until(b'\n', &mut interpreter).is_err() { - return -1; - } - if interpreter.ends_with(&[b'\n']) { - interpreter.pop().unwrap(); - } - let cstring = match CString::new(interpreter) { - Ok(cstring) => cstring, - Err(_) => return -1, - }; - file = match File::open(&cstring, fcntl::O_RDONLY) { - Ok(file) => file, - Err(_) => return -1, - }; - - // Make sure path is kept alive long enough, and push it to the arguments - _interpreter_path = Some(cstring); - let path_ref = _interpreter_path.as_ref().unwrap(); - args.push(path_ref.as_bytes()); - } else { - if file.seek(SeekFrom::Start(0)).is_err() { - return -1; - } - } - let mut args_envs_size_without_nul = 0; - - // Arguments - while !argv.read().is_null() { - let arg = argv.read(); - - let len = strlen(arg); - args.push(core::slice::from_raw_parts(arg as *const u8, len)); - args_envs_size_without_nul += len; - argv = argv.add(1); - } - - // Environment variables - let mut len = 0; - while !envp.add(len).read().is_null() { - len += 1; - } - - let mut envs: Vec<&[u8]> = Vec::with_capacity(len); - while !envp.read().is_null() { - let env = envp.read(); - - let len = strlen(env); - 
envs.push(core::slice::from_raw_parts(env as *const u8, len)); - args_envs_size_without_nul += len; - envp = envp.add(1); - } - - // Close all O_CLOEXEC file descriptors. TODO: close_range? - { - // NOTE: This approach of implementing O_CLOEXEC will not work in multithreaded - // scenarios. While execve() is undefined according to POSIX if there exist sibling - // threads, it could still be allowed by keeping certain file descriptors and instead - // set the active file table. - let name = CStr::from_bytes_with_nul(b"thisproc:current/filetable\0").expect("string should be valid"); - let files_fd = match File::open(name, fcntl::O_RDONLY) { - Ok(f) => f, - Err(_) => return -1, - }; - for line in BufReader::new(files_fd).lines() { - let line = match line { - Ok(l) => l, - Err(_) => break, - }; - let fd = match line.parse::<usize>() { - Ok(f) => f, - Err(_) => continue, - }; - - let flags = Self::fcntl(fd as c_int, fcntl::F_GETFD, 0); - if flags != -1 { - if flags & fcntl::O_CLOEXEC == fcntl::O_CLOEXEC { - let _ = Self::close(fd as c_int); - } - } - } - } - - if !is_interpreted && wants_setugid { - // Make sure the last file descriptor not covered by O_CLOEXEC is not leaked. - drop(file); - - let name = CStr::from_bytes_with_nul(b"escalate:\0").expect("string should be valid"); - // We are now going to invoke `escalate:` rather than loading the program ourselves. - let mut escalate_fd = match File::open(name, fcntl::O_WRONLY) { - Ok(f) => f, - Err(_) => return -1, - }; - - // First, we write the path. - // - // TODO: For improved security, use a hypothetical SYS_DUP_FORWARD syscall to give the - // scheme our file descriptor. It can check through the kernel-overwritten stat.st_dev - // field that it pertains to a "trusted" scheme (i.e. of at least the privilege the - // new uid/gid has), although for now only root can open schemes. Passing a file - // descriptor and not a path will allow escalated to run in a limited namespace. 
- // - // TODO: Plus, at this point fexecve is not implemented (but specified in - // POSIX.1-2008), and to avoid bad syscalls such as fpath, passing a file descriptor - // would be better. - if escalate_fd.write_all(path.to_bytes()).is_err() { - return -1; - } - - // Second, we write the flattened args and envs with NUL characters separating - // individual items. This can be copied directly into the new executable's memory. - if escalate_fd.write_all(&flatten_with_nul(args)).is_err() { - return -1; - } - if escalate_fd.write_all(&flatten_with_nul(envs)).is_err() { - return -1; - } - - // Closing will notify the scheme, and from that point we will no longer have control - // over this process (unless it fails). We do this manually since drop cannot handle - // errors. - let fd = *escalate_fd as usize; - core::mem::forget(escalate_fd); - - if let Err(err) = syscall::close(fd) { - return e(Err(err)) as c_int; - } - - unreachable!() - } else { - e(self::exec::fexec_impl(file, path.to_bytes(), &args, &envs, args_envs_size_without_nul)) as c_int - } + e(self::exec::execve(path, self::exec::ArgEnv::C { argv, envp }, None)) as c_int } fn fchdir(fd: c_int) -> c_int { @@ -701,7 +485,7 @@ impl Pal for Sys { } fn getpagesize() -> usize { - 4096 + PAGE_SIZE } fn getpgid(pid: pid_t) -> pid_t { @@ -848,7 +632,7 @@ impl Pal for Sys { ) -> *mut c_void { let map = Map { offset: off as usize, - size: len, + size: round_up_to_page_size(len), flags: syscall::MapFlags::from_bits_truncate( ((prot as usize) << 16) | ((flags as usize) & 0xFFFF), ), @@ -865,7 +649,7 @@ impl Pal for Sys { unsafe fn mprotect(addr: *mut c_void, len: usize, prot: c_int) -> c_int { e(syscall::mprotect( addr as usize, - len, + round_up_to_page_size(len), syscall::MapFlags::from_bits((prot as usize) << 16) .expect("mprotect: invalid bit pattern"), )) as c_int @@ -877,7 +661,7 @@ impl Pal for Sys { /* TODO e(syscall::msync( addr as usize, - len, + round_up_to_page_size(len), flags )) as c_int */ @@ -894,7 +678,7 
@@ impl Pal for Sys { } unsafe fn munmap(addr: *mut c_void, len: usize) -> c_int { - if e(syscall::funmap(addr as usize, len)) == !0 { + if e(syscall::funmap(addr as usize, round_up_to_page_size(len))) == !0 { return !0; } 0 diff --git a/src/platform/redox/redox-exec/src/lib.rs b/src/platform/redox/redox-exec/src/lib.rs index 13f0c10f49109753308da703c7b2369a40820caf..1146b0ede05cba6713f260baf9a00a5791e1feff 100644 --- a/src/platform/redox/redox-exec/src/lib.rs +++ b/src/platform/redox/redox-exec/src/lib.rs @@ -1,34 +1,46 @@ #![no_std] -#![feature(array_chunks)] +#![feature(array_chunks, map_first_last)] extern crate alloc; -use core::convert::TryFrom; use core::mem::size_of; use alloc::{ - collections::{btree_map::Entry, BTreeMap}, - vec::Vec, + boxed::Box, + collections::BTreeMap, vec, }; use syscall::{ error::*, - flag::{AT_ENTRY, AT_NULL, AT_PHDR, AT_PHENT, AT_PHNUM, MapFlags, O_WRONLY, SEEK_SET}, + flag::{MapFlags, SEEK_SET}, }; #[cfg(target_arch = "x86_64")] const PAGE_SIZE: usize = 4096; -pub fn fexec_impl<A, E>(image_file: FdGuard, open_via_dup: FdGuard, path: &[u8], args: A, envs: E, total_args_envs_size: usize) -> Result<FdGuard> +pub enum FexecResult { + Normal { addrspace_handle: FdGuard }, + Interp { path: Box<[u8]>, image_file: FdGuard, open_via_dup: FdGuard, interp_override: InterpOverride }, +} +pub struct InterpOverride { + phs: Box<[u8]>, + at_entry: usize, + at_phnum: usize, + at_phent: usize, + name: Box<[u8]>, + tree: BTreeMap<usize, usize>, +} + +pub fn fexec_impl<A, E>(image_file: FdGuard, open_via_dup: FdGuard, path: &[u8], args: A, envs: E, total_args_envs_size: usize, mut interp_override: Option<InterpOverride>) -> Result<FexecResult> where A: IntoIterator, E: IntoIterator, A::Item: AsRef<[u8]>, E::Item: AsRef<[u8]>, { - use goblin::elf64::{header::Header, program_header::program_header64::{ProgramHeader, PT_LOAD, PF_W, PF_X}}; + use goblin::elf64::{header::Header, program_header::program_header64::{ProgramHeader, PT_LOAD, PT_INTERP, PF_W, 
PF_X}}; // Here, we do the minimum part of loading an application, which is what the kernel used to do. // We load the executable into memory (albeit at different offsets in this executable), fix @@ -49,20 +61,24 @@ where const MAX_PH_SIZE: usize = 1024 * 1024; let phentsize = u64::from(header.e_phentsize) as usize; let phnum = u64::from(header.e_phnum) as usize; - let pheaders_size = phentsize.saturating_mul(phnum); + let pheaders_size = phentsize.saturating_mul(phnum).saturating_add(size_of::<Header>()); if pheaders_size > MAX_PH_SIZE { return Err(Error::new(E2BIG)); } - let mut phs = vec! [0_u8; pheaders_size]; + let mut phs_raw = vec! [0_u8; pheaders_size]; + phs_raw[..size_of::<Header>()].copy_from_slice(&header_bytes); + let phs = &mut phs_raw[size_of::<Header>()..]; - let mut tree = BTreeMap::new(); - tree.insert(0, PAGE_SIZE); + // TODO: Remove clone, but this would require more as_refs and as_muts + let mut tree = interp_override.as_mut().map_or_else(|| { + core::iter::once((0, PAGE_SIZE)).collect::<BTreeMap<_, _>>() + }, |o| core::mem::take(&mut o.tree)); - const BUFSZ: usize = 65536; + const BUFSZ: usize = 1024 * 256; let mut buf = vec! 
[0_u8; BUFSZ]; - read_all(*image_file as usize, Some(header.e_phoff), &mut phs).map_err(|_| Error::new(EIO))?; + read_all(*image_file as usize, Some(header.e_phoff), phs).map_err(|_| Error::new(EIO))?; for ph_idx in 0..phnum { let ph_bytes = &phs[ph_idx * phentsize..(ph_idx + 1) * phentsize]; @@ -84,19 +100,42 @@ where if segment.p_filesz > segment.p_memsz { return Err(Error::new(ENOEXEC)); } - if segment.p_type == PT_LOAD { - mprotect_remote(*grants_fd, vaddr, size, flags)?; - syscall::lseek(*image_file as usize, segment.p_offset as isize, SEEK_SET).map_err(|_| Error::new(EIO))?; - syscall::lseek(*memory_fd, segment.p_vaddr as isize, SEEK_SET).map_err(|_| Error::new(EIO))?; - - for size in core::iter::repeat(buf.len()).take((segment.p_filesz as usize) / buf.len()).chain(Some((segment.p_filesz as usize) % buf.len())) { - read_all(*image_file as usize, None, &mut buf[..size]).map_err(|_| Error::new(EIO))?; - let _ = syscall::write(*memory_fd, &buf[..size]).map_err(|_| Error::new(EIO))?; + #[forbid(unreachable_patterns)] + match segment.p_type { + // PT_INTERP must come before any PT_LOAD, so we don't have to iterate twice. + PT_INTERP => { + let mut interp = vec! 
[0_u8; segment.p_filesz as usize]; + read_all(*image_file as usize, Some(segment.p_offset), &mut interp)?; + + return Ok(FexecResult::Interp { + path: interp.into_boxed_slice(), + image_file, + open_via_dup, + interp_override: InterpOverride { + at_entry: header.e_entry as usize, + at_phnum: phnum, + at_phent: phentsize, + phs: phs_raw.into_boxed_slice(), + name: path.into(), + tree, + } + }); } - - if !tree.range(..=vaddr).next_back().filter(|(start, size)| **start + **size > vaddr).is_some() { - tree.insert(vaddr, size); + PT_LOAD => { + mprotect_remote(*grants_fd, vaddr, size, flags)?; + syscall::lseek(*image_file as usize, segment.p_offset as isize, SEEK_SET).map_err(|_| Error::new(EIO))?; + syscall::lseek(*memory_fd, segment.p_vaddr as isize, SEEK_SET).map_err(|_| Error::new(EIO))?; + + for size in core::iter::repeat(buf.len()).take((segment.p_filesz as usize) / buf.len()).chain(Some((segment.p_filesz as usize) % buf.len())) { + read_all(*image_file as usize, None, &mut buf[..size]).map_err(|_| Error::new(EIO))?; + let _ = syscall::write(*memory_fd, &buf[..size]).map_err(|_| Error::new(EIO))?; + } + + if !tree.range(..=vaddr).next_back().filter(|(start, size)| **start + **size > vaddr).is_some() { + tree.insert(vaddr, size); + } } + _ => continue, } } // Setup a stack starting from the very end of the address space, and then growing downwards. 
@@ -113,22 +152,31 @@ where write_all(*memory_fd, Some(sp as u64), &usize::to_ne_bytes(word)) }; - let pheaders_size_aligned = (pheaders_size+PAGE_SIZE-1)/PAGE_SIZE*PAGE_SIZE; + let pheaders_to_convey = if let Some(ref r#override) = interp_override { + &*r#override.phs + } else { + &*phs_raw + }; + let pheaders_size_aligned = (pheaders_to_convey.len()+PAGE_SIZE-1)/PAGE_SIZE*PAGE_SIZE; let pheaders = find_free_target_addr(&tree, pheaders_size_aligned).ok_or(Error::new(ENOMEM))?; tree.insert(pheaders, pheaders_size_aligned); mprotect_remote(*grants_fd, pheaders, pheaders_size_aligned, MapFlags::PROT_READ)?; - write_all(*memory_fd, Some(pheaders as u64), &phs)?; + write_all(*memory_fd, Some(pheaders as u64), &pheaders_to_convey)?; push(0)?; push(AT_NULL)?; push(header.e_entry as usize)?; + if let Some(ref r#override) = interp_override { + push(AT_BASE)?; + push(r#override.at_entry)?; + } push(AT_ENTRY)?; - push(pheaders)?; + push(pheaders + size_of::<Header>())?; push(AT_PHDR)?; - push(header.e_phnum as usize)?; + push(interp_override.as_ref().map_or(header.e_phnum as usize, |o| o.at_phnum))?; push(AT_PHNUM)?; - push(header.e_phentsize as usize)?; + push(interp_override.as_ref().map_or(header.e_phentsize as usize, |o| o.at_phent))?; push(AT_PHENT)?; let args_envs_size_aligned = (total_args_envs_size+PAGE_SIZE-1)/PAGE_SIZE*PAGE_SIZE; @@ -176,14 +224,18 @@ where // TODO: Restore old name if exec failed? 
if let Ok(name_fd) = syscall::dup(*open_via_dup, b"name").map(FdGuard::new) { - let _ = syscall::write(*name_fd, path); + let _ = syscall::write(*name_fd, interp_override.as_ref().map_or(path, |o| &o.name)); + } + { + let mmap_min_fd = FdGuard::new(syscall::dup(*open_via_dup, b"mmap-min-addr")?); + let _ = syscall::write(*mmap_min_fd, &usize::to_ne_bytes(tree.iter().rev().nth(1).map_or(0, |(off, len)| *off + *len))); } let addrspace_selection_fd = FdGuard::new(syscall::dup(*open_via_dup, b"current-addrspace")?); let _ = syscall::write(*addrspace_selection_fd, &create_set_addr_space_buf(*grants_fd, header.e_entry as usize, sp)); - Ok(addrspace_selection_fd) + Ok(FexecResult::Normal { addrspace_handle: addrspace_selection_fd }) } fn mprotect_remote(socket: usize, addr: usize, len: usize, flags: MapFlags) -> Result<()> { let mut grants_buf = [0_u8; 24]; @@ -295,3 +347,8 @@ pub fn create_set_addr_space_buf(space: usize, ip: usize, sp: usize) -> [u8; siz *chunks.next().unwrap() = usize::to_ne_bytes(ip); buf } + +#[path = "../../../auxv_defs.rs"] +pub mod auxv_defs; + +use auxv_defs::*;