#[cfg(not(target_os = "redox"))]
#[cfg(all(not(target_os = "redox"), not(fuzzing)))]
mod fuse;
#[cfg(all(not(target_os = "redox"), fuzzing))]
pub mod fuse;
#[cfg(not(target_os = "redox"))]
pub use self::fuse::mount;
......
use std::fs::File;
use std::io::{self, Read, Write};
use redox_scheme::{RequestKind, SignalBehavior, Socket, V2};
use std::io;
use std::path::Path;
use std::sync::atomic::Ordering;
use syscall::{Packet, SchemeMut};
use crate::{Disk, FileSystem, IS_UMT};
use crate::{Disk, FileSystem, Transaction, IS_UMT};
use self::scheme::FileScheme;
@@ -15,41 +14,37 @@ pub fn mount<D, P, T, F>(filesystem: FileSystem<D>, mountpoint: P, mut callback:
where
D: Disk,
P: AsRef<Path>,
F: FnMut(&Path) -> T,
F: FnOnce(&Path) -> T,
{
let mountpoint = mountpoint.as_ref();
let socket_path = format!(":{}", mountpoint.display());
let mut socket = File::create(&socket_path)?;
let socket = Socket::<V2>::create(&format!("{}", mountpoint.display()))?;
let mounted_path = format!("{}:", mountpoint.display());
let res = callback(Path::new(&mounted_path));
let mut scheme = FileScheme::new(format!("{}", mountpoint.display()), filesystem);
loop {
if IS_UMT.load(Ordering::SeqCst) > 0 {
break Ok(res);
}
let mut packet = Packet::default();
match socket.read(&mut packet) {
Ok(0) => break Ok(res),
Ok(_ok) => (),
Err(err) => {
if err.kind() == io::ErrorKind::Interrupted {
continue;
while IS_UMT.load(Ordering::SeqCst) == 0 {
let req = match socket.next_request(SignalBehavior::Restart)? {
None => break,
Some(req) => {
if let RequestKind::Call(r) = req.kind() {
r
} else {
break Err(err);
// TODO: Redoxfs does not yet support asynchronous file IO. It might still make
// sense to implement cancellation for huge buffers, e.g. dd bs=1G
continue;
}
}
}
};
let response = req.handle_scheme_mut(&mut scheme);
scheme.handle(&mut packet);
match socket.write(&packet) {
Ok(_ok) => (),
Err(err) => {
break Err(err);
}
if !socket.write_response(response, SignalBehavior::Restart)? {
break;
}
}
// Squash allocations and sync on unmount
let _ = Transaction::new(&mut scheme.fs).commit(true);
Ok(res)
}
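The rewritten loop replaces raw `Packet` reads and writes with `redox_scheme`'s typed request API, and the final `Transaction::commit(true)` squashes the allocator log on unmount. For context, a minimal sketch of how a daemon might drive this `mount` API; `DiskFile` and the exact `FileSystem::open` arguments are assumptions for illustration, not taken from this diff:

```rust
// Hedged sketch: open a disk image and serve it until unmounted.
// DiskFile and the FileSystem::open signature are assumed here.
use redoxfs::{mount, DiskFile, FileSystem};

fn main() -> std::io::Result<()> {
    let disk = DiskFile::open("redoxfs.img").expect("failed to open image");
    let fs = FileSystem::open(disk, None, None, false).expect("failed to open filesystem");
    // The callback runs once the scheme socket exists; mount() then blocks
    // in the request loop until IS_UMT is set or the socket closes.
    mount(fs, "example", |real_path| {
        println!("redoxfs serving at {}", real_path.display());
    })?;
    Ok(())
}
```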
use std::cmp::{max, min};
use std::slice;
use std::time::{SystemTime, UNIX_EPOCH};
use alloc::collections::BTreeMap;
use libredox::call::MmapArgs;
use range_tree::RangeTree;
use syscall::{MAP_PRIVATE, PAGE_SIZE, EBADFD};
use syscall::data::{Map, Stat, TimeSpec};
use syscall::error::{Error, Result, EBADF, EINVAL, EISDIR, ENOMEM, EPERM};
use syscall::data::{Stat, TimeSpec};
use syscall::error::{Error, Result, EBADF, EINVAL, EISDIR, EPERM};
use syscall::flag::{
MapFlags, F_GETFL, F_SETFL, MODE_PERM, O_ACCMODE, O_APPEND, O_RDONLY, O_RDWR, O_WRONLY,
PROT_READ, PROT_WRITE, SEEK_CUR, SEEK_END, SEEK_SET,
PROT_READ, PROT_WRITE,
};
use syscall::{EBADFD, PAGE_SIZE};
use crate::{Disk, Node, Transaction, TreePtr};
@@ -28,15 +28,28 @@ pub trait Resource<D: Disk> {
fn set_path(&mut self, path: &str);
fn read(&mut self, buf: &mut [u8], tx: &mut Transaction<D>) -> Result<usize>;
fn read(&mut self, buf: &mut [u8], offset: u64, tx: &mut Transaction<D>) -> Result<usize>;
fn write(&mut self, buf: &[u8], tx: &mut Transaction<D>) -> Result<usize>;
fn write(&mut self, buf: &[u8], offset: u64, tx: &mut Transaction<D>) -> Result<usize>;
fn seek(&mut self, offset: isize, whence: usize, tx: &mut Transaction<D>) -> Result<isize>;
fn fsize(&mut self, tx: &mut Transaction<D>) -> Result<u64>;
fn fmap(&mut self, fmaps: &mut Fmaps, flags: MapFlags, size: usize, offset: u64, tx: &mut Transaction<D>) -> Result<usize>;
fn fmap(
&mut self,
fmaps: &mut Fmaps,
flags: MapFlags,
size: usize,
offset: u64,
tx: &mut Transaction<D>,
) -> Result<usize>;
fn funmap(&mut self, fmaps: &mut Fmaps, offset: u64, size: usize, tx: &mut Transaction<D>) -> Result<usize>;
fn funmap(
&mut self,
fmaps: &mut Fmaps,
offset: u64,
size: usize,
tx: &mut Transaction<D>,
) -> Result<usize>;
fn fchmod(&mut self, mode: u16, tx: &mut Transaction<D>) -> Result<usize> {
let mut node = tx.read_tree(self.node_ptr())?;
@@ -129,7 +142,6 @@ pub struct DirResource {
parent_ptr_opt: Option<TreePtr<Node>>,
node_ptr: TreePtr<Node>,
data: Option<Vec<u8>>,
seek: isize,
uid: u32,
}
@@ -146,7 +158,6 @@ impl DirResource {
parent_ptr_opt,
node_ptr,
data,
seek: 0,
uid,
}
}
@@ -171,7 +182,6 @@ impl<D: Disk> Resource<D> for DirResource {
parent_ptr_opt: self.parent_ptr_opt,
node_ptr: self.node_ptr,
data: self.data.clone(),
seek: self.seek,
uid: self.uid,
}))
}
@@ -180,38 +190,43 @@ impl<D: Disk> Resource<D> for DirResource {
self.path = path.to_string();
}
fn read(&mut self, buf: &mut [u8], _tx: &mut Transaction<D>) -> Result<usize> {
fn read(&mut self, buf: &mut [u8], offset: u64, _tx: &mut Transaction<D>) -> Result<usize> {
let data = self.data.as_ref().ok_or(Error::new(EISDIR))?;
let size = data.len() as isize;
let mut i = 0;
while i < buf.len() && self.seek < size {
buf[i] = data[self.seek as usize];
i += 1;
self.seek += 1;
}
Ok(i)
let src = usize::try_from(offset)
.ok()
.and_then(|o| data.get(o..))
.unwrap_or(&[]);
let byte_count = core::cmp::min(src.len(), buf.len());
buf[..byte_count].copy_from_slice(&src[..byte_count]);
Ok(byte_count)
}
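The rewritten `read` is stateless: the caller supplies the offset, an out-of-range offset degrades to an empty source slice, and the copy length is clamped to the shorter of source and destination. The same pattern in isolation, with its edge cases checked (`read_at` is a hypothetical helper, not part of this diff):

```rust
/// Sketch of the clamped offset-read above: copy from `data` at `offset`
/// into `buf`, returning the byte count; reads past the end return 0.
fn read_at(data: &[u8], offset: u64, buf: &mut [u8]) -> usize {
    let src = usize::try_from(offset)
        .ok()
        .and_then(|o| data.get(o..))
        .unwrap_or(&[]);
    let n = src.len().min(buf.len());
    buf[..n].copy_from_slice(&src[..n]);
    n
}

#[test]
fn read_at_clamps() {
    let mut buf = [0u8; 4];
    assert_eq!(read_at(b"hello", 0, &mut buf), 4); // limited by the buffer
    assert_eq!(read_at(b"hello", 4, &mut buf), 1); // limited by the data
    assert_eq!(read_at(b"hello", 9, &mut buf), 0); // past the end
}
```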
fn write(&mut self, _buf: &[u8], _tx: &mut Transaction<D>) -> Result<usize> {
fn write(&mut self, _buf: &[u8], _offset: u64, _tx: &mut Transaction<D>) -> Result<usize> {
Err(Error::new(EBADF))
}
fn seek(&mut self, offset: isize, whence: usize, _tx: &mut Transaction<D>) -> Result<isize> {
let data = self.data.as_ref().ok_or(Error::new(EBADF))?;
let size = data.len() as isize;
self.seek = match whence {
SEEK_SET => max(0, min(size, offset)),
SEEK_CUR => max(0, min(size, self.seek + offset)),
SEEK_END => max(0, min(size, size + offset)),
_ => return Err(Error::new(EINVAL)),
};
Ok(self.seek)
fn fsize(&mut self, _tx: &mut Transaction<D>) -> Result<u64> {
Ok(self.data.as_ref().ok_or(Error::new(EBADF))?.len() as u64)
}
fn fmap(&mut self, _fmaps: &mut Fmaps, _flags: MapFlags, _size: usize, _offset: u64, _tx: &mut Transaction<D>) -> Result<usize> {
fn fmap(
&mut self,
_fmaps: &mut Fmaps,
_flags: MapFlags,
_size: usize,
_offset: u64,
_tx: &mut Transaction<D>,
) -> Result<usize> {
Err(Error::new(EBADF))
}
fn funmap(&mut self, _fmaps: &mut Fmaps, _offset: u64, _size: usize, _tx: &mut Transaction<D>) -> Result<usize> {
fn funmap(
&mut self,
_fmaps: &mut Fmaps,
_offset: u64,
_size: usize,
_tx: &mut Transaction<D>,
) -> Result<usize> {
Err(Error::new(EBADF))
}
@@ -263,16 +278,11 @@ impl Fmap {
let buf = slice::from_raw_parts_mut(address, unaligned_size);
let count = match tx.read_node(
node_ptr,
offset,
buf,
atime.as_secs(),
atime.subsec_nanos(),
) {
let count = match tx.read_node(node_ptr, offset, buf, atime.as_secs(), atime.subsec_nanos())
{
Ok(ok) => ok,
Err(err) => {
let _ = syscall::funmap(address as usize, aligned_size);
let _ = libredox::call::munmap(address.cast(), aligned_size);
return Err(err);
}
};
@@ -287,7 +297,14 @@ impl Fmap {
})
}
pub unsafe fn sync<D: Disk>(&mut self, node_ptr: TreePtr<Node>, base: *mut u8, offset: u64, size: usize, tx: &mut Transaction<D>) -> Result<()> {
pub unsafe fn sync<D: Disk>(
&mut self,
node_ptr: TreePtr<Node>,
base: *mut u8,
offset: u64,
size: usize,
tx: &mut Transaction<D>,
) -> Result<()> {
if self.flags & PROT_WRITE == PROT_WRITE {
let mtime = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
tx.write_node(
@@ -307,11 +324,12 @@ pub struct FileResource {
parent_ptr_opt: Option<TreePtr<Node>>,
node_ptr: TreePtr<Node>,
flags: usize,
seek: isize,
uid: u32,
}
#[derive(Debug)]
pub struct FileMmapInfo {
base: *mut u8,
size: usize,
ranges: RangeTree<Fmap>,
pub open_fds: usize,
}
@@ -319,6 +337,7 @@ impl Default for FileMmapInfo {
fn default() -> Self {
Self {
base: core::ptr::null_mut(),
size: 0,
ranges: RangeTree::new(),
open_fds: 0,
}
@@ -338,7 +357,6 @@ impl FileResource {
parent_ptr_opt,
node_ptr,
flags,
seek: 0,
uid,
}
}
@@ -363,7 +381,6 @@ impl<D: Disk> Resource<D> for FileResource {
parent_ptr_opt: self.parent_ptr_opt,
node_ptr: self.node_ptr,
flags: self.flags,
seek: self.seek,
uid: self.uid,
}))
}
@@ -372,58 +389,53 @@ impl<D: Disk> Resource<D> for FileResource {
self.path = path.to_string();
}
fn read(&mut self, buf: &mut [u8], tx: &mut Transaction<D>) -> Result<usize> {
if self.flags & O_ACCMODE == O_RDWR || self.flags & O_ACCMODE == O_RDONLY {
let atime = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
let count = tx.read_node(
self.node_ptr,
self.seek as u64,
buf,
atime.as_secs(),
atime.subsec_nanos(),
)?;
self.seek += count as isize;
Ok(count)
} else {
Err(Error::new(EBADF))
fn read(&mut self, buf: &mut [u8], offset: u64, tx: &mut Transaction<D>) -> Result<usize> {
if self.flags & O_ACCMODE != O_RDWR && self.flags & O_ACCMODE != O_RDONLY {
return Err(Error::new(EBADF));
}
let atime = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
tx.read_node(
self.node_ptr,
offset,
buf,
atime.as_secs(),
atime.subsec_nanos(),
)
}
fn write(&mut self, buf: &[u8], tx: &mut Transaction<D>) -> Result<usize> {
if self.flags & O_ACCMODE == O_RDWR || self.flags & O_ACCMODE == O_WRONLY {
if self.flags & O_APPEND == O_APPEND {
let node = tx.read_tree(self.node_ptr)?;
self.seek = node.data().size() as isize;
}
let mtime = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
let count = tx.write_node(
self.node_ptr,
self.seek as u64,
buf,
mtime.as_secs(),
mtime.subsec_nanos(),
)?;
self.seek += count as isize;
Ok(count)
} else {
Err(Error::new(EBADF))
fn write(&mut self, buf: &[u8], offset: u64, tx: &mut Transaction<D>) -> Result<usize> {
if self.flags & O_ACCMODE != O_RDWR && self.flags & O_ACCMODE != O_WRONLY {
return Err(Error::new(EBADF));
}
let effective_offset = if self.flags & O_APPEND == O_APPEND {
let node = tx.read_tree(self.node_ptr)?;
node.data().size()
} else {
offset
};
let mtime = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
tx.write_node(
self.node_ptr,
effective_offset,
buf,
mtime.as_secs(),
mtime.subsec_nanos(),
)
}
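With per-file seek state gone, `O_APPEND` is resolved at each write: the caller's offset is discarded and the node's current size is used instead, matching POSIX append semantics. The offset rule on its own (`effective_offset` is a hypothetical reduction of the logic above):

```rust
// Sketch of the append rule; `file_size` stands in for the node size
// read from the tree inside the transaction.
fn effective_offset(append: bool, requested: u64, file_size: u64) -> u64 {
    if append { file_size } else { requested }
}

#[test]
fn append_ignores_requested_offset() {
    assert_eq!(effective_offset(true, 0, 1024), 1024); // O_APPEND writes at EOF
    assert_eq!(effective_offset(false, 512, 1024), 512); // plain positional write
}
```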
fn seek(&mut self, offset: isize, whence: usize, tx: &mut Transaction<D>) -> Result<isize> {
self.seek = match whence {
SEEK_SET => max(0, offset),
SEEK_CUR => max(0, self.seek + offset),
SEEK_END => {
let node = tx.read_tree(self.node_ptr)?;
max(0, node.data().size() as isize + offset)
}
_ => return Err(Error::new(EINVAL)),
};
Ok(self.seek)
fn fsize(&mut self, tx: &mut Transaction<D>) -> Result<u64> {
let node = tx.read_tree(self.node_ptr)?;
Ok(node.data().size())
}
fn fmap(&mut self, fmaps: &mut Fmaps, flags: MapFlags, unaligned_size: usize, offset: u64, tx: &mut Transaction<D>) -> Result<usize> {
fn fmap(
&mut self,
fmaps: &mut Fmaps,
flags: MapFlags,
unaligned_size: usize,
offset: u64,
tx: &mut Transaction<D>,
) -> Result<usize> {
//dbg!(&self.fmaps);
let accmode = self.flags & O_ACCMODE;
if flags.contains(PROT_READ) && !(accmode == O_RDWR || accmode == O_RDONLY) {
@@ -441,33 +453,44 @@ impl<D: Disk> Resource<D> for FileResource {
// TODO: Pass entry directory to Resource trait functions, since the node_ptr can be
// obtained by the caller.
let fmap_info = fmaps.get_mut(&self.node_ptr.id()).ok_or(Error::new(EBADFD))?;
let max_offset = fmap_info.ranges.end();
if offset + aligned_size as u64 > max_offset {
if fmap_info.base.is_null() {
fmap_info.base = unsafe {
syscall::fmap(!0, &Map {
size: offset as usize + aligned_size,
let fmap_info = fmaps
.get_mut(&self.node_ptr.id())
.ok_or(Error::new(EBADFD))?;
let new_size = (offset as usize + aligned_size).next_multiple_of(PAGE_SIZE);
if new_size > fmap_info.size {
fmap_info.base = if fmap_info.base.is_null() {
unsafe {
libredox::call::mmap(MmapArgs {
length: new_size,
// PRIVATE/SHARED doesn't matter once the pages are passed in the fmap
// handler.
flags: MapFlags::PROT_READ | MapFlags::PROT_WRITE | MapFlags::MAP_PRIVATE,
prot: libredox::flag::PROT_READ | libredox::flag::PROT_WRITE,
flags: libredox::flag::MAP_PRIVATE,
offset: 0,
address: 0,
fd: !0,
addr: core::ptr::null_mut(),
})? as *mut u8
};
}
} else {
let new_size = (offset as usize + aligned_size).next_multiple_of(PAGE_SIZE);
let old_size = max_offset as usize;
fmap_info.base = unsafe {
syscall::syscall5(syscall::SYS_MREMAP, fmap_info.base as usize, old_size, 0, new_size, syscall::MremapFlags::empty().bits() | (PROT_READ | PROT_WRITE).bits())? as *mut u8
};
}
unsafe {
syscall::syscall5(
syscall::SYS_MREMAP,
fmap_info.base as usize,
fmap_info.size,
0,
new_size,
syscall::MremapFlags::empty().bits() | (PROT_READ | PROT_WRITE).bits(),
)? as *mut u8
}
};
fmap_info.size = new_size;
}
let affected_fmaps = fmap_info.ranges.remove_and_unused(offset..offset + aligned_size as u64);
let affected_fmaps = fmap_info
.ranges
.remove_and_unused(offset..offset + aligned_size as u64);
for (range, v_opt) in affected_fmaps {
//dbg!(&range);
@@ -475,9 +498,20 @@ impl<D: Disk> Resource<D> for FileResource {
fmap.rc += 1;
fmap.flags |= flags;
fmap_info.ranges.insert(range.start, range.end - range.start, fmap);
fmap_info
.ranges
.insert(range.start, range.end - range.start, fmap);
} else {
let map = unsafe { Fmap::new(self.node_ptr, flags, unaligned_size, offset, fmap_info.base, tx)? };
let map = unsafe {
Fmap::new(
self.node_ptr,
flags,
unaligned_size,
offset,
fmap_info.base,
tx,
)?
};
fmap_info.ranges.insert(offset, aligned_size as u64, map);
}
}
@@ -486,11 +520,20 @@ impl<D: Disk> Resource<D> for FileResource {
Ok(fmap_info.base as usize + offset as usize)
}
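The backing map grows lazily: the required end (`offset + aligned_size`) is rounded up to a whole page, and the code only calls mmap, or mremap for an existing map, when that exceeds the tracked `size`. The rounding in isolation (`PAGE_SIZE` assumed to be 4096 for the check):

```rust
// Sketch of the size computation used by fmap above.
const PAGE_SIZE: usize = 4096; // assumed page size

fn required_map_size(offset: u64, size: usize) -> usize {
    (offset as usize + size).next_multiple_of(PAGE_SIZE)
}

#[test]
fn map_size_is_page_rounded() {
    assert_eq!(required_map_size(0, 1), PAGE_SIZE); // one byte still costs a page
    assert_eq!(required_map_size(4096, 4096), 2 * PAGE_SIZE); // exact multiple kept
    assert_eq!(required_map_size(4096, 4097), 3 * PAGE_SIZE); // spill rounds up
}
```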
fn funmap(&mut self, fmaps: &mut Fmaps, offset: u64, size: usize, tx: &mut Transaction<D>) -> Result<usize> {
let fmap_info = fmaps.get_mut(&self.node_ptr.id()).ok_or(Error::new(EBADFD))?;
fn funmap(
&mut self,
fmaps: &mut Fmaps,
offset: u64,
size: usize,
tx: &mut Transaction<D>,
) -> Result<usize> {
let fmap_info = fmaps
.get_mut(&self.node_ptr.id())
.ok_or(Error::new(EBADFD))?;
//dbg!(&self.fmaps);
//dbg!(self.fmaps.conflicts(offset..offset + size as u64).collect::<Vec<_>>());
#[allow(unused_mut)]
let mut affected_fmaps = fmap_info.ranges.remove(offset..offset + size as u64);
for (range, mut fmap) in affected_fmaps {
@@ -498,11 +541,19 @@ impl<D: Disk> Resource<D> for FileResource {
//log::info!("SYNCING {}..{}", range.start, range.end);
unsafe {
fmap.sync(self.node_ptr, fmap_info.base, range.start, (range.end - range.start) as usize, tx)?;
fmap.sync(
self.node_ptr,
fmap_info.base,
range.start,
(range.end - range.start) as usize,
tx,
)?;
}
if fmap.rc > 0 {
fmap_info.ranges.insert(range.start, range.end - range.start, fmap);
fmap_info
.ranges
.insert(range.start, range.end - range.start, fmap);
}
}
//dbg!(&self.fmaps);
@@ -529,7 +580,13 @@ impl<D: Disk> Resource<D> for FileResource {
if let Some(fmap_info) = fmaps.get_mut(&self.node_ptr.id()) {
for (range, fmap) in fmap_info.ranges.iter_mut() {
unsafe {
fmap.sync(self.node_ptr, fmap_info.base, range.start, (range.end - range.start) as usize, tx)?;
fmap.sync(
self.node_ptr,
fmap_info.base,
range.start,
(range.end - range.start) as usize,
tx,
)?;
}
}
}
@@ -615,9 +672,15 @@ impl range_tree::Value for Fmap {
Err(self)
}
}
fn split(self, prev_range: Option<core::ops::Range<Self::K>>, range: core::ops::Range<Self::K>, next_range: Option<core::ops::Range<Self::K>>) -> (Option<Self>, Self, Option<Self>) {
#[allow(unused_variables)]
fn split(
self,
prev_range: Option<core::ops::Range<Self::K>>,
range: core::ops::Range<Self::K>,
next_range: Option<core::ops::Range<Self::K>>,
) -> (Option<Self>, Self, Option<Self>) {
(
prev_range.map(|range| Fmap {
prev_range.map(|_range| Fmap {
rc: self.rc,
flags: self.flags,
last_page_tail: 0,
@@ -625,9 +688,13 @@ impl range_tree::Value for Fmap {
Fmap {
rc: self.rc,
flags: self.flags,
last_page_tail: if next_range.is_none() { self.last_page_tail } else { 0 },
last_page_tail: if next_range.is_none() {
self.last_page_tail
} else {
0
},
},
next_range.map(|range| Fmap {
next_range.map(|_range| Fmap {
rc: self.rc,
flags: self.flags,
last_page_tail: self.last_page_tail,
......
@@ -3,17 +3,23 @@ use std::str;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::{SystemTime, UNIX_EPOCH};
use syscall::{EBADFD, MunmapFlags};
use syscall::data::{Map, Stat, StatVfs, TimeSpec};
use redox_scheme::{CallerCtx, OpenResult, SchemeMut};
use syscall::data::{Stat, StatVfs, TimeSpec};
use syscall::error::{
Error, Result, EACCES, EBADF, EBUSY, EEXIST, EINVAL, EISDIR, ELOOP, ENOENT, ENOTDIR, ENOTEMPTY,
EPERM, EXDEV,
};
use syscall::flag::{
EventFlags, MapFlags, MODE_PERM, O_ACCMODE, O_CREAT, O_DIRECTORY, O_EXCL, O_NOFOLLOW, O_RDONLY,
O_RDWR, O_STAT, O_SYMLINK, O_TRUNC, O_WRONLY,
EventFlags, MapFlags, O_ACCMODE, O_CREAT, O_DIRECTORY, O_EXCL, O_NOFOLLOW, O_RDONLY, O_RDWR,
O_STAT, O_SYMLINK, O_TRUNC, O_WRONLY,
};
use syscall::schemev2::NewFdFlags;
use syscall::{MunmapFlags, EBADFD};
use redox_path::{
canonicalize_to_standard, canonicalize_using_cwd, canonicalize_using_scheme, scheme_path,
RedoxPath,
};
use syscall::scheme::SchemeMut;
use crate::{Disk, FileSystem, Node, Transaction, TreeData, TreePtr, BLOCK_SIZE};
@@ -21,7 +27,7 @@ use super::resource::{DirResource, FileResource, Resource};
pub struct FileScheme<D: Disk> {
name: String,
fs: FileSystem<D>,
pub(crate) fs: FileSystem<D>,
next_id: AtomicUsize,
files: BTreeMap<usize, Box<dyn Resource<D>>>,
fmap: super::resource::Fmaps,
@@ -43,15 +49,21 @@ impl<D: Disk> FileScheme<D> {
tx: &mut Transaction<D>,
uid: u32,
gid: u32,
url: &str,
full_path: &str,
node: TreeData<Node>,
nodes: &mut Vec<(TreeData<Node>, String)>,
) -> Result<Vec<u8>> {
) -> Result<String> {
let atime = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
// symbolic link is relative to this part of the url
let mut working_dir =
dirname(full_path).unwrap_or(scheme_path(scheme_name).ok_or(Error::new(EINVAL))?);
// node of the link
let mut node = node;
for _ in 0..32 {
// XXX What should the limit be?
assert!(node.data().is_symlink());
let mut buf = [0; 4096];
let count = tx.read_node(
node.ptr(),
@@ -60,24 +72,30 @@ impl<D: Disk> FileScheme<D> {
atime.as_secs(),
atime.subsec_nanos(),
)?;
let scheme = format!("{}:", scheme_name);
let canon = canonicalize(url.as_bytes(), &buf[0..count]);
let path = str::from_utf8(&canon[scheme.len()..])
.unwrap_or("")
.trim_matches('/');
let target = canonicalize_to_standard(
Some(&working_dir),
str::from_utf8(&buf[..count]).or(Err(Error::new(EINVAL)))?,
)
.ok_or(Error::new(EINVAL))?;
let target_as_path = RedoxPath::from_absolute(&target).ok_or(Error::new(EINVAL))?;
let (scheme, reference) = target_as_path.as_parts().ok_or(Error::new(EINVAL))?;
if scheme.as_ref() != scheme_name {
return Err(Error::new(EXDEV));
}
let target_reference = reference.to_string();
nodes.clear();
if let Some((next_node, next_node_name)) =
Self::path_nodes(scheme_name, tx, path, uid, gid, nodes)?
Self::path_nodes(scheme_name, tx, &target_reference, uid, gid, nodes)?
{
if !next_node.data().is_symlink() {
if canon.starts_with(scheme.as_bytes()) {
nodes.push((next_node, next_node_name));
return Ok(canon[scheme.len()..].to_vec());
} else {
return Err(Error::new(EXDEV));
}
nodes.push((next_node, next_node_name));
return Ok(target_reference);
}
node = next_node;
working_dir = dirname(&target).ok_or(Error::new(EINVAL))?.to_string();
} else {
return Err(Error::new(ENOENT));
}
@@ -141,78 +159,15 @@ impl<D: Disk> FileScheme<D> {
}
}
/// Make a relative path absolute
/// Given a cwd of "scheme:/path"
/// This function will turn "foo" into "scheme:/path/foo"
/// "/foo" will turn into "scheme:/foo"
/// "bar:/foo" will be used directly, as it is already absolute
pub fn canonicalize(current: &[u8], path: &[u8]) -> Vec<u8> {
// This function is modified from a version in the kernel
let mut canon = if path.iter().position(|&b| b == b':').is_none() {
let cwd = &current[0..current.iter().rposition(|x| *x == '/' as u8).unwrap_or(0)];
let mut canon = if !path.starts_with(b"/") {
let mut c = cwd.to_vec();
if !c.ends_with(b"/") {
c.push(b'/');
}
c
} else {
cwd[..cwd.iter().position(|&b| b == b':').map_or(1, |i| i + 1)].to_vec()
};
canon.extend_from_slice(&path);
canon
} else {
path.to_vec()
};
// NOTE: assumes the scheme does not include anything like "../" or "./"
let mut result = {
let parts = canon
.split(|&c| c == b'/')
.filter(|&part| part != b".")
.rev()
.scan(0, |nskip, part| {
if part == b"." {
Some(None)
} else if part == b".." {
*nskip += 1;
Some(None)
} else {
if *nskip > 0 {
*nskip -= 1;
Some(None)
} else {
Some(Some(part))
}
}
})
.filter_map(|x| x)
.collect::<Vec<_>>();
parts.iter().rev().fold(Vec::new(), |mut vec, &part| {
vec.extend_from_slice(part);
vec.push(b'/');
vec
})
};
result.pop(); // remove extra '/'
// replace with the root of the scheme if it's empty
if result.len() == 0 {
let pos = canon
.iter()
.position(|&b| b == b':')
.map_or(canon.len(), |p| p + 1);
canon.truncate(pos);
canon
} else {
result
}
/// given a path with a scheme, return the containing directory (or scheme)
fn dirname(path: &str) -> Option<String> {
canonicalize_using_cwd(Some(path), "..")
}
impl<D: Disk> SchemeMut for FileScheme<D> {
fn open(&mut self, url: &str, flags: usize, uid: u32, gid: u32) -> Result<usize> {
fn xopen(&mut self, url: &str, flags: usize, ctx: &CallerCtx) -> Result<OpenResult> {
let CallerCtx { uid, gid, .. } = *ctx;
let path = url.trim_matches('/');
// println!("Open '{}' {:X}", path, flags);
@@ -272,20 +227,20 @@ impl<D: Disk> SchemeMut for FileScheme<D> {
&& flags & O_SYMLINK != O_SYMLINK
{
let mut resolve_nodes = Vec::new();
let full_path =
canonicalize_using_scheme(scheme_name, url).ok_or(Error::new(EINVAL))?;
let resolved = self.fs.tx(|tx| {
Self::resolve_symlink(
scheme_name,
tx,
uid,
gid,
&format!("{}:/{}", scheme_name, url),
&full_path,
node,
&mut resolve_nodes,
)
})?;
let resolved_utf8 =
str::from_utf8(&resolved).map_err(|_| Error::new(EINVAL))?;
return self.open(resolved_utf8, flags, uid, gid);
return self.xopen(&resolved, flags, ctx);
} else if !node.data().is_symlink() && flags & O_SYMLINK == O_SYMLINK {
return Err(Error::new(EINVAL));
} else {
@@ -401,41 +356,17 @@ impl<D: Disk> SchemeMut for FileScheme<D> {
}
}
};
self.fmap.entry(resource.node_ptr().id()).or_insert_with(Default::default).open_fds += 1;
self.fmap
.entry(resource.node_ptr().id())
.or_insert_with(Default::default)
.open_fds += 1;
let id = self.next_id.fetch_add(1, Ordering::SeqCst);
let id = self.next_id.fetch_add(1, Ordering::Relaxed);
self.files.insert(id, resource);
Ok(id)
}
fn chmod(&mut self, url: &str, mode: u16, uid: u32, gid: u32) -> Result<usize> {
let path = url.trim_matches('/');
// println!("Chmod '{}'", path);
let scheme_name = &self.name;
self.fs.tx(|tx| {
let mut nodes = Vec::new();
if let Some((mut node, _node_name)) =
Self::path_nodes(scheme_name, tx, path, uid, gid, &mut nodes)?
{
if node.data().uid() == uid || uid == 0 {
let old_mode = node.data().mode();
let new_mode = (old_mode & !MODE_PERM) | (mode & MODE_PERM);
if old_mode != new_mode {
node.data_mut().set_mode(new_mode);
tx.sync_tree(node)?;
}
Ok(0)
} else {
Err(Error::new(EPERM))
}
} else {
Err(Error::new(ENOENT))
}
Ok(OpenResult::ThisScheme {
number: id,
flags: NewFdFlags::POSITIONED,
})
}
@@ -447,31 +378,32 @@ impl<D: Disk> SchemeMut for FileScheme<D> {
let scheme_name = &self.name;
self.fs.tx(|tx| {
let mut nodes = Vec::new();
if let Some((child, child_name)) =
let Some((child, child_name)) =
Self::path_nodes(scheme_name, tx, path, uid, gid, &mut nodes)?
{
if let Some((parent, _parent_name)) = nodes.last() {
if !parent.data().permission(uid, gid, Node::MODE_WRITE) {
// println!("dir not writable {:o}", parent.1.mode);
return Err(Error::new(EACCES));
}
else {
return Err(Error::new(ENOENT));
};
if child.data().is_dir() {
if !child.data().permission(uid, gid, Node::MODE_WRITE) {
// println!("dir not writable {:o}", parent.1.mode);
return Err(Error::new(EACCES));
}
let Some((parent, _parent_name)) = nodes.last() else {
return Err(Error::new(EPERM));
};
tx.remove_node(parent.ptr(), &child_name, Node::MODE_DIR)
.and(Ok(0))
} else {
Err(Error::new(ENOTDIR))
}
} else {
Err(Error::new(EPERM))
if !parent.data().permission(uid, gid, Node::MODE_WRITE) {
// println!("dir not writable {:o}", parent.1.mode);
return Err(Error::new(EACCES));
}
if child.data().is_dir() {
if !child.data().permission(uid, gid, Node::MODE_WRITE) {
// println!("dir not writable {:o}", parent.1.mode);
return Err(Error::new(EACCES));
}
tx.remove_node(parent.ptr(), &child_name, Node::MODE_DIR)
.and(Ok(0))
} else {
Err(Error::new(ENOENT))
Err(Error::new(ENOTDIR))
}
})
}
@@ -485,38 +417,36 @@ impl<D: Disk> SchemeMut for FileScheme<D> {
self.fs.tx(|tx| {
let mut nodes = Vec::new();
// TODO: Clean up indentation using let-else, possibly elsewhere too.
if let Some((child, child_name)) =
let Some((child, child_name)) =
Self::path_nodes(scheme_name, tx, path, uid, gid, &mut nodes)?
{
if let Some((parent, _parent_name)) = nodes.last() {
if !parent.data().permission(uid, gid, Node::MODE_WRITE) {
// println!("dir not writable {:o}", parent.1.mode);
return Err(Error::new(EACCES));
}
else {
return Err(Error::new(ENOENT));
};
if !child.data().is_dir() {
if child.data().uid() != uid && uid != 0 {
// println!("file not owned by current user {}", parent.1.uid);
return Err(Error::new(EACCES));
}
let Some((parent, _parent_name)) = nodes.last() else {
return Err(Error::new(EPERM));
};
if child.data().is_symlink() {
tx.remove_node(parent.ptr(), &child_name, Node::MODE_SYMLINK)
.and(Ok(0))
} else {
tx.remove_node(parent.ptr(), &child_name, Node::MODE_FILE)
.and(Ok(0))
}
} else {
Err(Error::new(EISDIR))
}
if !parent.data().permission(uid, gid, Node::MODE_WRITE) {
// println!("dir not writable {:o}", parent.1.mode);
return Err(Error::new(EACCES));
}
if !child.data().is_dir() {
if child.data().uid() != uid && uid != 0 {
// println!("file not owned by current user {}", parent.1.uid);
return Err(Error::new(EACCES));
}
if child.data().is_symlink() {
tx.remove_node(parent.ptr(), &child_name, Node::MODE_SYMLINK)
.and(Ok(0))
} else {
Err(Error::new(EPERM))
tx.remove_node(parent.ptr(), &child_name, Node::MODE_FILE)
.and(Ok(0))
}
} else {
Err(Error::new(ENOENT))
Err(Error::new(EISDIR))
}
})
}
@@ -536,39 +466,32 @@ impl<D: Disk> SchemeMut for FileScheme<D> {
return Err(Error::new(EBADF));
};
self.fmap.get_mut(&resource.node_ptr().id()).ok_or(Error::new(EBADFD))?.open_fds += 1;
self.fmap
.get_mut(&resource.node_ptr().id())
.ok_or(Error::new(EBADFD))?
.open_fds += 1;
let id = self.next_id.fetch_add(1, Ordering::SeqCst);
self.files.insert(id, resource);
Ok(id)
}
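Both `xopen` and `dup` hand out file ids from the same atomic counter; `Relaxed` ordering (used in `xopen` above) is sufficient for uniqueness because nothing else synchronizes through the counter. The pattern reduced to a sketch:

```rust
// Sketch of the id-allocation pattern: fetch_add yields unique,
// monotonically increasing ids even under concurrent callers.
use std::sync::atomic::{AtomicUsize, Ordering};

static NEXT_ID: AtomicUsize = AtomicUsize::new(0);

fn allocate_id() -> usize {
    NEXT_ID.fetch_add(1, Ordering::Relaxed)
}
```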
#[allow(unused_variables)]
fn read(&mut self, id: usize, buf: &mut [u8]) -> Result<usize> {
fn read(&mut self, id: usize, buf: &mut [u8], offset: u64, _fcntl_flags: u32) -> Result<usize> {
// println!("Read {}, {:X} {}", id, buf.as_ptr() as usize, buf.len());
if let Some(file) = self.files.get_mut(&id) {
self.fs.tx(|tx| file.read(buf, tx))
} else {
Err(Error::new(EBADF))
}
let file = self.files.get_mut(&id).ok_or(Error::new(EBADF))?;
self.fs.tx(|tx| file.read(buf, offset, tx))
}
fn write(&mut self, id: usize, buf: &[u8]) -> Result<usize> {
fn write(&mut self, id: usize, buf: &[u8], offset: u64, _fcntl_flags: u32) -> Result<usize> {
// println!("Write {}, {:X} {}", id, buf.as_ptr() as usize, buf.len());
if let Some(file) = self.files.get_mut(&id) {
self.fs.tx(|tx| file.write(buf, tx))
} else {
Err(Error::new(EBADF))
}
let file = self.files.get_mut(&id).ok_or(Error::new(EBADF))?;
self.fs.tx(|tx| file.write(buf, offset, tx))
}
fn seek(&mut self, id: usize, pos: isize, whence: usize) -> Result<isize> {
fn fsize(&mut self, id: usize) -> Result<u64> {
// println!("Seek {}, {} {}", id, pos, whence);
if let Some(file) = self.files.get_mut(&id) {
self.fs.tx(|tx| file.seek(pos, whence, tx))
} else {
Err(Error::new(EBADF))
}
let file = self.files.get_mut(&id).ok_or(Error::new(EBADF))?;
self.fs.tx(|tx| file.fsize(tx))
}
fn fchmod(&mut self, id: usize, mode: u16) -> Result<usize> {
@@ -784,14 +707,13 @@ impl<D: Disk> SchemeMut for FileScheme<D> {
}
fn mmap_prep(&mut self, id: usize, offset: u64, size: usize, flags: MapFlags) -> Result<usize> {
println!("Mmap {}, {:?} {} {}", id, flags, size, offset);
let file = self.files.get_mut(&id).ok_or(Error::new(EBADF))?;
let fmaps = &mut self.fmap;
self.fs.tx(|tx| file.fmap(fmaps, flags, size, offset, tx))
}
#[allow(unused_variables)]
fn munmap(&mut self, id: usize, offset: u64, size: usize, flags: MunmapFlags) -> Result<usize> {
println!("Munmap {}, {} {}", id, size, offset);
let file = self.files.get_mut(&id).ok_or(Error::new(EBADF))?;
let fmaps = &mut self.fmap;
@@ -801,9 +723,15 @@ impl<D: Disk> SchemeMut for FileScheme<D> {
fn close(&mut self, id: usize) -> Result<usize> {
// println!("Close {}", id);
let file = self.files.remove(&id).ok_or(Error::new(EBADF))?;
let file_info = self.fmap.get_mut(&file.node_ptr().id()).ok_or(Error::new(EBADFD))?;
file_info.open_fds = file_info.open_fds.checked_sub(1).expect("open_fds not tracked correctly");
let file_info = self
.fmap
.get_mut(&file.node_ptr().id())
.ok_or(Error::new(EBADFD))?;
file_info.open_fds = file_info
.open_fds
.checked_sub(1)
.expect("open_fds not tracked correctly");
// TODO: If open_fds reaches zero and there are no hardlinks (directory entries) to any
// particular inode, remove that inode here.
......
use core::{fmt, mem, ops, slice};
use simple_endian::*;
use endian_num::Le;
use crate::{BlockList, BlockPtr, BlockRaw};
use crate::{BlockLevel, BlockList, BlockPtr, BlockTrait, RecordRaw, BLOCK_SIZE, RECORD_LEVEL};
/// An index into a [`Node`]'s block table.
pub enum NodeLevel {
L0(usize),
L1(usize, usize),
@@ -12,61 +13,66 @@ pub enum NodeLevel {
}
impl NodeLevel {
// Warning: this uses constant block offsets, make sure to sync with Node
pub fn new(mut block_offset: u64) -> Option<Self> {
// Warning: this uses constant record offsets, make sure to sync with Node
/// Return the [`NodeLevel`] of the record with the given index.
/// - the first 128 are level 0,
/// - the next 64*256 are level 1,
/// - ...and so on.
pub fn new(mut record_offset: u64) -> Option<Self> {
// 1 << 8 = 256, this is the number of entries in a BlockList
const SHIFT: u64 = 8;
const NUM: u64 = 1 << SHIFT;
const MASK: u64 = NUM - 1;
const L0: u64 = 128;
if block_offset < L0 {
return Some(Self::L0((block_offset & MASK) as usize));
if record_offset < L0 {
return Some(Self::L0((record_offset & MASK) as usize));
} else {
block_offset -= L0;
record_offset -= L0;
}
const L1: u64 = 64 * NUM;
if block_offset < L1 {
if record_offset < L1 {
return Some(Self::L1(
((block_offset >> SHIFT) & MASK) as usize,
(block_offset & MASK) as usize,
((record_offset >> SHIFT) & MASK) as usize,
(record_offset & MASK) as usize,
));
} else {
block_offset -= L1;
record_offset -= L1;
}
const L2: u64 = 32 * NUM * NUM;
if block_offset < L2 {
if record_offset < L2 {
return Some(Self::L2(
((block_offset >> (2 * SHIFT)) & MASK) as usize,
((block_offset >> SHIFT) & MASK) as usize,
(block_offset & MASK) as usize,
((record_offset >> (2 * SHIFT)) & MASK) as usize,
((record_offset >> SHIFT) & MASK) as usize,
(record_offset & MASK) as usize,
));
} else {
block_offset -= L2;
record_offset -= L2;
}
const L3: u64 = 16 * NUM * NUM * NUM;
if block_offset < L3 {
if record_offset < L3 {
return Some(Self::L3(
((block_offset >> (3 * SHIFT)) & MASK) as usize,
((block_offset >> (2 * SHIFT)) & MASK) as usize,
((block_offset >> SHIFT) & MASK) as usize,
(block_offset & MASK) as usize,
((record_offset >> (3 * SHIFT)) & MASK) as usize,
((record_offset >> (2 * SHIFT)) & MASK) as usize,
((record_offset >> SHIFT) & MASK) as usize,
(record_offset & MASK) as usize,
));
} else {
block_offset -= L3;
record_offset -= L3;
}
const L4: u64 = 12 * NUM * NUM * NUM * NUM;
if block_offset < L4 {
if record_offset < L4 {
Some(Self::L4(
((block_offset >> (4 * SHIFT)) & MASK) as usize,
((block_offset >> (3 * SHIFT)) & MASK) as usize,
((block_offset >> (2 * SHIFT)) & MASK) as usize,
((block_offset >> SHIFT) & MASK) as usize,
(block_offset & MASK) as usize,
((record_offset >> (4 * SHIFT)) & MASK) as usize,
((record_offset >> (3 * SHIFT)) & MASK) as usize,
((record_offset >> (2 * SHIFT)) & MASK) as usize,
((record_offset >> SHIFT) & MASK) as usize,
(record_offset & MASK) as usize,
))
} else {
None
@@ -74,38 +80,86 @@ impl NodeLevel {
}
}
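Concretely, the first 128 record offsets stay at level 0, offset 128 rolls over into the first level-one table, and each deeper level multiplies capacity by 256. A few boundary checks derived from the constants above:

```rust
#[test]
fn node_level_boundaries() {
    assert!(matches!(NodeLevel::new(0), Some(NodeLevel::L0(0))));
    assert!(matches!(NodeLevel::new(127), Some(NodeLevel::L0(127))));
    // 128 is the first record behind a level-one table.
    assert!(matches!(NodeLevel::new(128), Some(NodeLevel::L1(0, 0))));
    // After 128 direct and 64 * 256 level-one records, level two begins.
    assert!(matches!(NodeLevel::new(128 + 64 * 256), Some(NodeLevel::L2(0, 0, 0))));
}
```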
type BlockListL1 = BlockList<BlockRaw>;
type BlockListL1 = BlockList<RecordRaw>;
type BlockListL2 = BlockList<BlockListL1>;
type BlockListL3 = BlockList<BlockListL2>;
type BlockListL4 = BlockList<BlockListL3>;
/// A file/folder node
#[repr(packed)]
#[repr(C, packed)]
pub struct Node {
pub mode: u16le,
pub uid: u32le,
pub gid: u32le,
pub links: u32le,
pub size: u64le,
pub ctime: u64le,
pub ctime_nsec: u32le,
pub mtime: u64le,
pub mtime_nsec: u32le,
pub atime: u64le,
pub atime_nsec: u32le,
pub padding: [u8; 6],
// 128 * BLOCK_SIZE (512 KiB, 4 KiB each)
pub level0: [BlockPtr<BlockRaw>; 128],
// 64 * 256 * BLOCK_SIZE (64 MiB, 1 MiB each)
/// This node's type & permissions.
/// - the first four bits are the node type
/// - the remaining twelve bits are the permission bits (`MODE_PERM`):
///   rwx triplets for the file's user, group, and everyone else,
///   plus the setuid/setgid/sticky bits
pub mode: Le<u16>,
/// The uid that owns this file
pub uid: Le<u32>,
/// The gid that owns this file
pub gid: Le<u32>,
/// The number of links to this file
/// (directory entries, symlinks, etc)
pub links: Le<u32>,
/// The length of this file, in bytes
pub size: Le<u64>,
pub ctime: Le<u64>,
pub ctime_nsec: Le<u32>,
pub mtime: Le<u64>,
pub mtime_nsec: Le<u32>,
pub atime: Le<u64>,
pub atime_nsec: Le<u32>,
pub record_level: Le<u32>,
pub padding: [u8; BLOCK_SIZE as usize - 4094],
/// The first 128 blocks of this file.
///
/// Total size: 128 * RECORD_SIZE (16 MiB, 128 KiB each)
pub level0: [BlockPtr<RecordRaw>; 128],
/// The next 64 * 256 blocks of this file,
/// stored behind 64 level one tables.
///
/// Total size: 64 * 256 * RECORD_SIZE (2 GiB, 32 MiB each)
pub level1: [BlockPtr<BlockListL1>; 64],
// 32 * 256 * 256 * BLOCK_SIZE (8 GiB, 256 MiB each)
/// The next 32 * 256 * 256 blocks of this file,
/// stored behind 32 level two tables.
/// Each level two table points to 256 level one tables.
///
/// Total size: 32 * 256 * 256 * RECORD_SIZE (256 GiB, 8 GiB each)
pub level2: [BlockPtr<BlockListL2>; 32],
// 16 * 256 * 256 * 256 * BLOCK_SIZE (1 TiB, 64 GiB each)
/// The next 16 * 256 * 256 * 256 blocks of this file,
/// stored behind 16 level three tables.
///
/// Total size: 16 * 256 * 256 * 256 * RECORD_SIZE (32 TiB, 2 TiB each)
pub level3: [BlockPtr<BlockListL3>; 16],
// 12 * 256 * 256 * 256 * 256 * BLOCK_SIZE (192 TiB, 16 TiB each)
/// The next 12 * 256 * 256 * 256 * 256 blocks of this file,
/// stored behind 12 level four tables.
///
/// Total size: 12 * 256 * 256 * 256 * 256 * RECORD_SIZE (6 PiB, 512 TiB each)
pub level4: [BlockPtr<BlockListL4>; 12],
}
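Because a `Node` is written to disk verbatim, the layout must fill one block exactly: 62 bytes of metadata, the 2 bytes of padding implied by `BLOCK_SIZE as usize - 4094`, and 252 block pointers at 16 bytes each (assuming the two-`u64` `BlockPtr` of this revision) sum to 4096. A layout check in the spirit of the crate's other size tests:

```rust
#[test]
fn node_size_test() {
    // A Node must occupy exactly one block on disk.
    assert_eq!(core::mem::size_of::<Node>(), BLOCK_SIZE as usize);
}
```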
unsafe impl BlockTrait for Node {
fn empty(level: BlockLevel) -> Option<Self> {
if level.0 == 0 {
Some(Self::default())
} else {
None
}
}
}
impl Default for Node {
fn default() -> Self {
Self {
@@ -120,7 +174,8 @@ impl Default for Node {
mtime_nsec: 0.into(),
atime: 0.into(),
atime_nsec: 0.into(),
padding: [0; 6],
record_level: 0.into(),
padding: [0; BLOCK_SIZE as usize - 4094],
level0: [BlockPtr::default(); 128],
level1: [BlockPtr::default(); 64],
level2: [BlockPtr::default(); 32],
@@ -136,11 +191,13 @@ impl Node {
pub const MODE_DIR: u16 = 0x4000;
pub const MODE_SYMLINK: u16 = 0xA000;
/// Mask for node permission bits
pub const MODE_PERM: u16 = 0x0FFF;
pub const MODE_EXEC: u16 = 0o1;
pub const MODE_WRITE: u16 = 0o2;
pub const MODE_READ: u16 = 0o4;
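Putting the masks together: the high nibble selects the node type and the low twelve bits carry the permissions, so a mode decomposes with the constants above. A small check, assuming `MODE_TYPE` masks the top four bits as its uses below imply:

```rust
#[test]
fn mode_layout() {
    let mode: u16 = Node::MODE_FILE | 0o644; // regular file, rw-r--r--
    assert_eq!(mode & Node::MODE_TYPE, Node::MODE_FILE); // top nibble: type
    assert_eq!(mode & Node::MODE_PERM, 0o644); // low 12 bits: permissions
    assert_ne!(mode & (Node::MODE_READ << 6), 0); // owner read bit (0o400)
}
```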
/// Create a new, empty node with the given metadata
pub fn new(mode: u16, uid: u32, gid: u32, ctime: u64, ctime_nsec: u32) -> Self {
Self {
mode: mode.into(),
@@ -153,40 +210,62 @@ impl Node {
mtime_nsec: ctime_nsec.into(),
atime: ctime.into(),
atime_nsec: ctime_nsec.into(),
record_level: if mode & Self::MODE_TYPE == Self::MODE_FILE {
// Files take on record level
RECORD_LEVEL as u32
} else {
// Folders do not
0
}
.into(),
..Default::default()
}
}
/// This node's type & permissions.
/// - the first four bits are the node type
/// - the remaining twelve bits are the permission bits (`MODE_PERM`):
///   rwx triplets for the file's user, group, and everyone else,
///   plus the setuid/setgid/sticky bits
pub fn mode(&self) -> u16 {
{ self.mode }.to_native()
self.mode.to_ne()
}
/// The uid that owns this file
pub fn uid(&self) -> u32 {
{ self.uid }.to_native()
self.uid.to_ne()
}
/// The gid that owns this file
pub fn gid(&self) -> u32 {
{ self.gid }.to_native()
self.gid.to_ne()
}
/// The number of links to this file
/// (directory entries, symlinks, etc)
pub fn links(&self) -> u32 {
{ self.links }.to_native()
self.links.to_ne()
}
/// The length of this file, in bytes.
pub fn size(&self) -> u64 {
{ self.size }.to_native()
self.size.to_ne()
}
pub fn ctime(&self) -> (u64, u32) {
({ self.ctime }.to_native(), { self.ctime_nsec }.to_native())
(self.ctime.to_ne(), self.ctime_nsec.to_ne())
}
pub fn mtime(&self) -> (u64, u32) {
({ self.mtime }.to_native(), { self.mtime_nsec }.to_native())
(self.mtime.to_ne(), self.mtime_nsec.to_ne())
}
pub fn atime(&self) -> (u64, u32) {
({ self.atime }.to_native(), { self.atime_nsec }.to_native())
(self.atime.to_ne(), self.atime_nsec.to_ne())
}
pub fn record_level(&self) -> BlockLevel {
BlockLevel(self.record_level.to_ne() as usize)
}
pub fn set_mode(&mut self, mode: u16) {
......
use alloc::{boxed::Box, vec};
use core::ops;
use crate::{BlockLevel, BlockTrait, RECORD_LEVEL};
//TODO: this is a box to prevent stack overflows
pub struct RecordRaw(Box<[u8]>);
unsafe impl BlockTrait for RecordRaw {
fn empty(level: BlockLevel) -> Option<Self> {
if level.0 <= RECORD_LEVEL {
Some(Self(vec![0; level.bytes() as usize].into_boxed_slice()))
} else {
None
}
}
}
impl Clone for RecordRaw {
fn clone(&self) -> Self {
Self(self.0.clone())
}
}
impl ops::Deref for RecordRaw {
type Target = [u8];
fn deref(&self) -> &[u8] {
&self.0
}
}
impl ops::DerefMut for RecordRaw {
fn deref_mut(&mut self) -> &mut [u8] {
&mut self.0
}
}
#[test]
fn record_raw_size_test() {
for level_i in 0..RECORD_LEVEL {
let level = BlockLevel(level_i);
assert_eq!(
RecordRaw::empty(level).unwrap().len(),
level.bytes() as usize
);
}
}
use std::ops::DerefMut;
use crate::{unmount_path, DiskSparse, FileSystem, Node, TreePtr, ALLOC_GC_THRESHOLD};
use std::path::Path;
use std::process::Command;
use std::{fs, sync, thread, time};
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering::Relaxed;
use std::{fs, thread, time};
use crate::{unmount_path, DiskSparse, FileSystem};
static IMAGE_SEQ: AtomicUsize = AtomicUsize::new(0);
fn with_redoxfs<T, F>(callback: F) -> T
where
T: Send + Sync + 'static,
F: FnMut(&Path) -> T + Send + Sync + 'static,
F: FnOnce(FileSystem<DiskSparse>) -> T + Send + Sync + 'static,
{
let disk_path = "image.bin";
let mount_path = "image";
let disk_path = format!("image{}.bin", IMAGE_SEQ.fetch_add(1, Relaxed));
let res = {
let disk = DiskSparse::create(dbg!(disk_path), 1024 * 1024 * 1024).unwrap();
if cfg!(not(target_os = "redox")) {
if !Path::new(mount_path).exists() {
dbg!(fs::create_dir(dbg!(mount_path))).unwrap();
}
}
let disk = DiskSparse::create(dbg!(&disk_path), 1024 * 1024 * 1024).unwrap();
let ctime = dbg!(time::SystemTime::now().duration_since(time::UNIX_EPOCH)).unwrap();
let fs = FileSystem::create(disk, None, ctime.as_secs(), ctime.subsec_nanos()).unwrap();
let callback_mutex = sync::Arc::new(sync::Mutex::new(callback));
callback(fs)
};
dbg!(fs::remove_file(dbg!(disk_path))).unwrap();
res
}
fn with_mounted<T, F>(callback: F) -> T
where
T: Send + Sync + 'static,
F: FnOnce(&Path) -> T + Send + Sync + 'static,
{
let mount_path_o = format!("image{}", IMAGE_SEQ.fetch_add(1, Relaxed));
let mount_path = mount_path_o.clone();
let res = with_redoxfs(move |fs| {
if cfg!(not(target_os = "redox")) {
if !Path::new(&mount_path).exists() {
dbg!(fs::create_dir(dbg!(&mount_path))).unwrap();
}
}
let join_handle = crate::mount(fs, dbg!(mount_path), move |real_path| {
let callback_mutex = callback_mutex.clone();
let real_path = real_path.to_owned();
thread::spawn(move || {
let res = {
let mut callback_guard = callback_mutex.lock().unwrap();
let callback = callback_guard.deref_mut();
callback(&real_path)
};
let res = callback(&real_path);
let real_path = real_path.to_str().unwrap();
if cfg!(target_os = "redox") {
dbg!(fs::remove_file(dbg!(format!(":{}", mount_path)))).unwrap();
dbg!(fs::remove_file(dbg!(format!(":{}", real_path)))).unwrap();
} else {
if !dbg!(Command::new("sync").status()).unwrap().success() {
panic!("sync failed");
}
if !unmount_path(mount_path).is_ok() {
if !unmount_path(real_path).is_ok() {
panic!("umount failed");
}
}
@@ -54,12 +66,10 @@ where
.unwrap();
join_handle.join().unwrap()
};
dbg!(fs::remove_file(dbg!(disk_path))).unwrap();
});
if cfg!(not(target_os = "redox")) {
dbg!(fs::remove_dir(dbg!(mount_path))).unwrap();
dbg!(fs::remove_dir(dbg!(mount_path_o))).unwrap();
}
res
@@ -67,7 +77,7 @@ where
#[test]
fn simple() {
with_redoxfs(|path| {
with_mounted(|path| {
dbg!(fs::create_dir(&path.join("test"))).unwrap();
})
}
@@ -78,36 +88,36 @@ fn mmap() {
use syscall;
//TODO
with_redoxfs(|path| {
with_mounted(|path| {
use std::slice;
let path = dbg!(path.join("test"));
let mmap_inner = |write: bool| {
let fd = dbg!(syscall::open(
let fd = dbg!(libredox::call::open(
path.to_str().unwrap(),
syscall::O_CREAT | syscall::O_RDWR | syscall::O_CLOEXEC
libredox::flag::O_CREAT | libredox::flag::O_RDWR | libredox::flag::O_CLOEXEC,
0,
))
.unwrap();
let map = unsafe {
slice::from_raw_parts_mut(
dbg!(syscall::fmap(
dbg!(libredox::call::mmap(libredox::call::MmapArgs {
fd,
&syscall::Map {
offset: 0,
size: 128,
flags: syscall::PROT_READ | syscall::PROT_WRITE,
address: 0,
}
))
offset: 0,
length: 128,
prot: libredox::flag::PROT_READ | libredox::flag::PROT_WRITE,
flags: libredox::flag::MAP_SHARED,
addr: core::ptr::null_mut(),
}))
.unwrap() as *mut u8,
128,
)
};
// Maps should be available after closing
assert_eq!(dbg!(syscall::close(fd)), Ok(0));
assert_eq!(dbg!(libredox::call::close(fd)), Ok(()));
for i in 0..128 {
if write {
@@ -119,8 +129,8 @@ fn mmap() {
//TODO: add msync
unsafe {
assert_eq!(
dbg!(syscall::funmap(map.as_mut_ptr() as usize, map.len())),
Ok(0)
dbg!(libredox::call::munmap(map.as_mut_ptr().cast(), map.len())),
Ok(())
);
}
};
@@ -129,3 +139,54 @@ fn mmap() {
mmap_inner(false);
})
}
#[test]
fn create_remove_should_not_increase_size() {
with_redoxfs(|mut fs| {
let initially_free = fs.allocator().free();
let tree_ptr = TreePtr::<Node>::root();
let name = "test";
let _ = fs
.tx(|tx| {
tx.create_node(tree_ptr, name, Node::MODE_FILE | 0o644, 1, 0)?;
tx.remove_node(tree_ptr, name, Node::MODE_FILE)
})
.unwrap();
assert_eq!(fs.allocator().free(), initially_free);
});
}
#[test]
fn many_create_remove_should_not_increase_size() {
with_redoxfs(|mut fs| {
let initially_free = fs.allocator().free();
let tree_ptr = TreePtr::<Node>::root();
let name = "test";
// Iterate over 255 times to prove deleted files don't retain space within the node tree
// Iterate to an ALLOC_GC_THRESHOLD boundary to ensure the allocator GC reclaims space
let start = fs.header.generation.to_ne();
let end = start + ALLOC_GC_THRESHOLD;
let end = end - (end % ALLOC_GC_THRESHOLD) + 1 + ALLOC_GC_THRESHOLD;
for i in start..end {
let _ = fs
.tx(|tx| {
tx.create_node(
tree_ptr,
&format!("{}{}", name, i),
Node::MODE_FILE | 0o644,
1,
0,
)?;
tx.remove_node(tree_ptr, &format!("{}{}", name, i), Node::MODE_FILE)
})
.unwrap();
}
// Any value greater than 0 indicates a storage leak
let diff = initially_free - fs.allocator().free();
assert_eq!(diff, 0);
});
}
use alloc::{
boxed::Box,
collections::{BTreeMap, VecDeque},
vec::Vec,
};
@@ -12,9 +13,9 @@ use syscall::error::{
};
use crate::{
AllocEntry, AllocList, Allocator, BlockData, BlockPtr, BlockRaw, DirEntry, DirList, Disk,
FileSystem, Header, Node, NodeLevel, TreeData, TreePtr, ALLOC_LIST_ENTRIES, BLOCK_SIZE,
HEADER_RING,
AllocEntry, AllocList, Allocator, BlockAddr, BlockData, BlockLevel, BlockPtr, BlockTrait,
DirEntry, DirList, Disk, FileSystem, Header, Node, NodeLevel, RecordRaw, TreeData, TreePtr,
ALLOC_GC_THRESHOLD, ALLOC_LIST_ENTRIES, DIR_ENTRY_MAX_LENGTH, HEADER_RING,
};
pub struct Transaction<'a, D: Disk> {
@@ -25,13 +26,13 @@ pub struct Transaction<'a, D: Disk> {
pub header_changed: bool,
allocator: Allocator,
allocator_log: VecDeque<AllocEntry>,
deallocate: Vec<u64>,
write_cache: BTreeMap<u64, BlockRaw>,
deallocate: Vec<BlockAddr>,
write_cache: BTreeMap<BlockAddr, Box<[u8]>>,
}
impl<'a, D: Disk> Transaction<'a, D> {
pub(crate) fn new(fs: &'a mut FileSystem<D>) -> Self {
let header = fs.header.clone();
let header = fs.header;
let allocator = fs.allocator.clone();
Self {
fs,
@@ -51,30 +52,38 @@ impl<'a, D: Disk> Transaction<'a, D> {
Ok(())
}
// Unsafe because order must be done carefully and changes must be flushed to disk
unsafe fn allocate(&mut self) -> Result<u64> {
match self.allocator.allocate() {
//
// MARK: block operations
//
/// Allocate a new block of size `level`, returning its address.
/// - returns `Err(ENOSPC)` if a block of this size could not be allocated.
/// - unsafe because order must be done carefully and changes must be flushed to disk
unsafe fn allocate(&mut self, level: BlockLevel) -> Result<BlockAddr> {
match self.allocator.allocate(level) {
Some(addr) => {
self.allocator_log.push_back(AllocEntry::new(addr, -1));
self.allocator_log.push_back(AllocEntry::allocate(addr));
Ok(addr)
}
None => Err(Error::new(ENOSPC)),
}
}
// Unsafe because order must be done carefully and changes must be flushed to disk
unsafe fn deallocate(&mut self, addr: u64) {
/// Deallocate the given block.
/// - unsafe because order must be done carefully and changes must be flushed to disk
unsafe fn deallocate(&mut self, addr: BlockAddr) {
//TODO: should we use some sort of not-null abstraction?
assert!(addr != 0);
assert!(!addr.is_null());
// Remove from write_cache if it is there, since it no longer needs to be written
//TODO: for larger blocks do we need to check for sub-blocks in here?
self.write_cache.remove(&addr);
// Search and remove the last matching entry in allocator_log
let mut found = false;
for i in (0..self.allocator_log.len()).rev() {
let entry = self.allocator_log[i];
if entry.addr() == addr && entry.count() == -1 {
if entry.index() == addr.index() && entry.count() == -addr.level().blocks() {
found = true;
self.allocator_log.remove(i);
break;
@@ -90,7 +99,7 @@ impl<'a, D: Disk> Transaction<'a, D> {
}
}
fn deallocate_block<T>(&mut self, ptr: BlockPtr<T>) {
fn deallocate_block<T: BlockTrait>(&mut self, ptr: BlockPtr<T>) {
if !ptr.is_null() {
unsafe {
self.deallocate(ptr.addr());
@@ -98,28 +107,39 @@ impl<'a, D: Disk> Transaction<'a, D> {
}
}
fn sync_allocator(&mut self, squash: bool) -> Result<bool> {
/// Drain `self.allocator_log` and `self.deallocate`,
/// updating the [`AllocList`] with the resulting state.
///
/// This method does not write anything to disk,
/// all writes are cached.
///
/// To keep the allocator log from growing excessively, it will
/// periodically be fully rebuilt using the state of `self.allocator`.
/// This rebuild can be forced by setting `force_squash` to `true`.
fn sync_allocator(&mut self, force_squash: bool) -> Result<bool> {
let mut prev_ptr = BlockPtr::default();
if squash {
let should_gc = self.header.generation() % ALLOC_GC_THRESHOLD == 0
&& self.header.generation() >= ALLOC_GC_THRESHOLD;
if force_squash || should_gc {
// Clear and rebuild alloc log
self.allocator_log.clear();
let levels = self.allocator.levels();
for level in (0..levels.len()).rev() {
let count = (1 << level) as i64;
'addrs: for &addr in levels[level].iter() {
'indexs: for &index in levels[level].iter() {
for entry in self.allocator_log.iter_mut() {
if addr + count as u64 == entry.addr() {
if index + count as u64 == entry.index() {
// New entry is at start of existing entry
*entry = AllocEntry::new(addr, count + entry.count());
continue 'addrs;
} else if entry.addr() + entry.count() as u64 == addr {
*entry = AllocEntry::new(index, count + entry.count());
continue 'indexs;
} else if entry.index() + entry.count() as u64 == index {
// New entry is at end of existing entry
*entry = AllocEntry::new(entry.addr(), entry.count() + count);
continue 'addrs;
*entry = AllocEntry::new(entry.index(), entry.count() + count);
continue 'indexs;
}
}
self.allocator_log.push_back(AllocEntry::new(addr, count));
self.allocator_log.push_back(AllocEntry::new(index, count));
}
}
@@ -127,7 +147,7 @@ impl<'a, D: Disk> Transaction<'a, D> {
let mut alloc_ptr = self.header.alloc;
while !alloc_ptr.is_null() {
let alloc = self.read_block(alloc_ptr)?;
self.deallocate_block(alloc_ptr);
self.deallocate.push(alloc.addr());
alloc_ptr = alloc.data().prev;
}
} else {
@@ -147,9 +167,7 @@ impl<'a, D: Disk> Transaction<'a, D> {
}
// Prepare to deallocate old alloc block
unsafe {
self.deallocate(alloc.addr());
}
self.deallocate.push(alloc.addr());
// Link to previous alloc block
prev_ptr = alloc.data().prev;
@@ -160,18 +178,18 @@ impl<'a, D: Disk> Transaction<'a, D> {
while new_blocks.len() * ALLOC_LIST_ENTRIES
<= self.allocator_log.len() + self.deallocate.len()
{
new_blocks.push(unsafe { self.allocate()? });
new_blocks.push(unsafe { self.allocate(BlockLevel::default())? });
}
// De-allocate old blocks (after allocation to prevent re-use)
//TODO: optimize allocator log in memory
while let Some(addr) = self.deallocate.pop() {
self.allocator.deallocate(addr);
self.allocator_log.push_back(AllocEntry::new(addr, 1));
self.allocator_log.push_back(AllocEntry::deallocate(addr));
}
for new_block in new_blocks {
let mut alloc = BlockData::new(new_block, AllocList::default());
let mut alloc = BlockData::<AllocList>::empty(new_block).unwrap();
alloc.data_mut().prev = prev_ptr;
for entry in alloc.data_mut().entries.iter_mut() {
if let Some(log_entry) = self.allocator_log.pop_front() {
@@ -189,17 +207,20 @@ impl<'a, D: Disk> Transaction<'a, D> {
Ok(true)
}
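The `should_gc` test near the top of `sync_allocator` fires once every `ALLOC_GC_THRESHOLD` generations but never on a freshly created filesystem, so the log is periodically rebuilt from the allocator's live state even without `force_squash`. The condition in isolation, with an illustrative threshold:

```rust
// Sketch of the periodic-squash rule; 256 is a stand-in value,
// not the real ALLOC_GC_THRESHOLD.
fn should_gc(generation: u64, threshold: u64) -> bool {
    generation % threshold == 0 && generation >= threshold
}

#[test]
fn gc_fires_on_threshold_boundaries() {
    let t = 256;
    assert!(!should_gc(0, t)); // fresh filesystem: skip
    assert!(should_gc(t, t)); // first boundary
    assert!(!should_gc(t + 1, t)); // between boundaries
    assert!(should_gc(4 * t, t)); // and every boundary after
}
```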
//TODO: change this function, provide another way to squash, only write header in commit
pub fn sync(&mut self, squash: bool) -> Result<bool> {
/// Write all changes cached in this [`Transaction`] to disk.
pub fn sync(&mut self, force_squash: bool) -> Result<bool> {
// Make sure alloc is synced
self.sync_allocator(squash)?;
self.sync_allocator(force_squash)?;
// Write all items in write cache
for (addr, raw) in self.write_cache.iter_mut() {
// sync_alloc must have changed alloc block pointer
// if we have any blocks to write
assert!(self.header_changed);
self.fs.encrypt(raw);
let count = unsafe { self.fs.disk.write_at(self.fs.block + addr, &raw)? };
if count != mem::size_of::<BlockRaw>() {
let count = unsafe { self.fs.disk.write_at(self.fs.block + addr.index(), raw)? };
if count != raw.len() {
// Wrote wrong number of bytes
#[cfg(feature = "log")]
log::error!("SYNC WRITE_CACHE: WRONG NUMBER OF BYTES");
......@@ -208,6 +229,10 @@ impl<'a, D: Disk> Transaction<'a, D> {
}
self.write_cache.clear();
// Do nothing if there are no changes to write.
//
// This only happens if `self.write_cache` was empty,
// and the fs header wasn't changed by another operation.
if !self.header_changed {
return Ok(false);
}
@@ -208,6 +229,10 @@ impl<'a, D: Disk> Transaction<'a, D> {
Ok(true)
}
pub fn read_block<T: Default + DerefMut<Target = [u8]>>(
pub fn read_block<T: BlockTrait + DerefMut<Target = [u8]>>(
&mut self,
ptr: BlockPtr<T>,
) -> Result<BlockData<T>> {
@@ -233,7 +258,7 @@ impl<'a, D: Disk> Transaction<'a, D> {
return Err(Error::new(ENOENT));
}
let mut data = T::default();
let mut data = match T::empty(ptr.addr().level()) {
Some(some) => some,
None => {
#[cfg(feature = "log")]
log::error!("READ_BLOCK: INVALID BLOCK LEVEL FOR TYPE");
return Err(Error::new(ENOENT));
}
};
if let Some(raw) = self.write_cache.get(&ptr.addr()) {
data.copy_from_slice(raw);
} else {
let count = unsafe {
self.fs
.disk
.read_at(self.fs.block + ptr.addr(), &mut data)?
.read_at(self.fs.block + ptr.addr().index(), &mut data)?
};
if count != mem::size_of::<T>() {
if count != data.len() {
// Read wrong number of bytes
#[cfg(feature = "log")]
log::error!("READ_BLOCK: WRONG NUMBER OF BYTES");
@@ -244,16 +269,23 @@ impl<'a, D: Disk> Transaction<'a, D> {
// Incorrect hash
#[cfg(feature = "log")]
log::error!(
"READ_BLOCK: INCORRECT HASH {} != {} for block {}",
"READ_BLOCK: INCORRECT HASH 0x{:X} != 0x{:X} for block 0x{:X}",
block_ptr.hash(),
ptr.hash(),
ptr.addr()
ptr.addr().index()
);
return Err(Error::new(EIO));
}
@@ -268,10 +300,10 @@ impl<'a, D: Disk> Transaction<'a, D> {
///
/// # Safety
/// Unsafe because it creates strange BlockData types that must be swapped before use
unsafe fn read_block_or_default<T: Default + DerefMut<Target = [u8]>>(
unsafe fn read_block_or_empty<T: BlockTrait + DerefMut<Target = [u8]>>(
&mut self,
ptr: BlockPtr<T>,
) -> Result<BlockData<T>> {
if ptr.is_null() {
Ok(BlockData::new(0, T::default()))
match T::empty(ptr.addr().level()) {
Some(empty) => Ok(BlockData::new(BlockAddr::default(), empty)),
None => {
#[cfg(feature = "log")]
log::error!("READ_BLOCK_OR_EMPTY: INVALID BLOCK LEVEL FOR TYPE");
Err(Error::new(ENOENT))
}
}
} else {
self.read_block(ptr)
}
}
unsafe fn read_record<T: BlockTrait + DerefMut<Target = [u8]>>(
&mut self,
ptr: BlockPtr<T>,
level: BlockLevel,
) -> Result<BlockData<T>> {
let record = unsafe { self.read_block_or_empty(ptr)? };
if record.addr().level() >= level {
// Return record if it is larger than or equal to requested level
return Ok(record);
}
// If a larger level was requested,
// create a fake record with the requested level
// and fill it with the data in the original record.
let (_old_addr, old_raw) = unsafe { record.into_parts() };
let mut raw = match T::empty(level) {
Some(empty) => empty,
None => {
#[cfg(feature = "log")]
log::error!("READ_RECORD: INVALID BLOCK LEVEL FOR TYPE");
return Err(Error::new(ENOENT));
}
};
let len = min(raw.len(), old_raw.len());
raw[..len].copy_from_slice(&old_raw[..len]);
Ok(BlockData::new(BlockAddr::null(level), raw))
}
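A record written at a smaller level is therefore served in a zero-padded buffer of the requested level, under a null address so that a later sync allocates fresh space for it. The copy rule by itself (`upgrade_record` is a hypothetical reduction):

```rust
// Sketch of the record-upgrade copy above: old bytes move to the front
// of a zeroed buffer of the new, larger size.
fn upgrade_record(old: &[u8], new_len: usize) -> Vec<u8> {
    let mut new = vec![0u8; new_len];
    let len = old.len().min(new_len);
    new[..len].copy_from_slice(&old[..len]);
    new
}

#[test]
fn upgrade_pads_with_zeroes() {
    assert_eq!(upgrade_record(b"ab", 4), vec![b'a', b'b', 0, 0]);
}
```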
/// Write block data to a new address, returning new address
pub fn sync_block<T: Deref<Target = [u8]>>(
pub fn sync_block<T: BlockTrait + Deref<Target = [u8]>>(
&mut self,
mut block: BlockData<T>,
) -> Result<BlockPtr<T>> {
// Swap block to new address
let old_addr = block.swap_addr(unsafe { self.allocate()? });
let level = block.addr().level();
let old_addr = block.swap_addr(unsafe { self.allocate(level)? });
// Deallocate old address (will only take effect after sync_allocator, which helps to
// prevent re-use before a new header is written
if old_addr != 0 {
if !old_addr.is_null() {
unsafe {
self.deallocate(old_addr);
}
@@ -282,27 +314,63 @@ impl<'a, D: Disk> Transaction<'a, D> {
///
/// # Safety
/// Unsafe to encourage CoW semantics
pub(crate) unsafe fn write_block<T: Deref<Target = [u8]>>(
pub(crate) unsafe fn write_block<T: BlockTrait + Deref<Target = [u8]>>(
&mut self,
block: BlockData<T>,
) -> Result<BlockPtr<T>> {
if block.addr() == 0 {
if block.addr().is_null() {
// Pointer is invalid
#[cfg(feature = "log")]
log::error!("WRITE_BLOCK: POINTER IS NULL");
return Err(Error::new(ENOENT));
}
//TODO: transmute?
let mut raw = BlockRaw::default();
raw.copy_from_slice(block.data());
self.write_cache.insert(block.addr(), raw);
//TODO: do not convert to boxed slice if it already is one
self.write_cache.insert(
block.addr(),
block.data().deref().to_vec().into_boxed_slice(),
);
Ok(block.create_ptr())
}
pub fn read_tree<T: Default + DerefMut<Target = [u8]>>(
//
// MARK: tree operations
//
/// Walk the tree and return the contents and address
/// of the data block that `ptr` points to.
fn read_tree_and_addr<T: BlockTrait + DerefMut<Target = [u8]>>(
&mut self,
ptr: TreePtr<T>,
) -> Result<TreeData<T>> {
) -> Result<(TreeData<T>, BlockAddr)> {
if ptr.is_null() {
// ID is invalid (should this return None?)
#[cfg(feature = "log")]
@@ -315,29 +383,36 @@ impl<'a, D: Disk> Transaction<'a, D> {
let raw = self.read_block(l0.data().ptrs[i0])?;
//TODO: transmute instead of copy?
let mut data = T::default();
let mut data = match T::empty(BlockLevel::default()) {
Some(some) => some,
None => {
#[cfg(feature = "log")]
log::error!("READ_TREE: INVALID BLOCK LEVEL FOR TYPE");
return Err(Error::new(ENOENT));
}
};
data.copy_from_slice(raw.data());
Ok(TreeData::new(ptr.id(), data))
Ok((TreeData::new(ptr.id(), data), raw.addr()))
}
//TODO: improve performance, reduce writes
/// Walk the tree and return the contents of the data block that `ptr` points to.
pub fn read_tree<T: BlockTrait + DerefMut<Target = [u8]>>(
&mut self,
ptr: TreePtr<T>,
) -> Result<TreeData<T>> {
Ok(self.read_tree_and_addr(ptr)?.0)
}
/// Insert `block_ptr` into the first free slot in the tree,
/// returning a pointer to that slot.
pub fn insert_tree<T: Deref<Target = [u8]>>(
&mut self,
block_ptr: BlockPtr<T>,
) -> Result<TreePtr<T>> {
// TODO: improve performance, reduce writes
// Remember that if there is a free slot at any level, it will always sync when it
// allocates at the lowest level, so we could save a write by not writing each level
// as it is allocated.
unsafe {
let mut l3 = self.read_block(self.header.tree)?;
for i3 in 0..l3.data().ptrs.len() {
let mut l2 = self.read_block_or_default(l3.data().ptrs[i3])?;
let mut l2 = self.read_block_or_empty(l3.data().ptrs[i3])?;
for i2 in 0..l2.data().ptrs.len() {
let mut l1 = self.read_block_or_default(l2.data().ptrs[i2])?;
let mut l1 = self.read_block_or_empty(l2.data().ptrs[i2])?;
for i1 in 0..l1.data().ptrs.len() {
let mut l0 = self.read_block_or_default(l1.data().ptrs[i1])?;
let mut l0 = self.read_block_or_empty(l1.data().ptrs[i1])?;
for i0 in 0..l0.data().ptrs.len() {
let pn = l0.data().ptrs[i0];
......@@ -390,6 +483,7 @@ impl<'a, D: Disk> Transaction<'a, D> {
continue;
}
// TODO: do we need to write all of these?
// Write updates to newly allocated blocks
l0.data_mut().ptrs[i0] = block_ptr.cast();
l1.data_mut().ptrs[i1] = self.sync_block(l0)?;
......@@ -408,6 +502,36 @@ impl<'a, D: Disk> Transaction<'a, D> {
Err(Error::new(ENOSPC))
}
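// For scale: with four nested levels of TREE_LIST_ENTRIES = 256 pointers,
// the walk above can address 256^4 = 2^32 slots (slot 0 is reserved as
// null). A one-line check of that arithmetic:

fn main() {
    const TREE_LIST_ENTRIES: u64 = 256;
    assert_eq!(TREE_LIST_ENTRIES.pow(4), 1u64 << 32);
}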
/// Clear the previously claimed slot in the tree for the given `ptr`. Note that this
/// should only be called after the corresponding node block has already been deallocated.
fn remove_tree<T: BlockTrait + DerefMut<Target = [u8]>>(
&mut self,
ptr: TreePtr<T>,
) -> Result<()> {
if ptr.is_null() {
// ID is invalid
#[cfg(feature = "log")]
log::error!("REMOVE_TREE: ID IS NULL");
return Err(Error::new(ENOENT));
}
let (i3, i2, i1, i0) = ptr.indexes();
let mut l3 = self.read_block(self.header.tree)?;
let mut l2 = self.read_block(l3.data().ptrs[i3])?;
let mut l1 = self.read_block(l2.data().ptrs[i2])?;
let mut l0 = self.read_block(l1.data().ptrs[i1])?;
// Clear the value in the tree, but do not deallocate the block, as that should already
// have been done at the node level.
l0.data_mut().ptrs[i0] = BlockPtr::default();
l1.data_mut().ptrs[i1] = self.sync_block(l0)?;
l2.data_mut().ptrs[i2] = self.sync_block(l1)?;
l3.data_mut().ptrs[i3] = self.sync_block(l2)?;
self.header.tree = self.sync_block(l3)?;
self.header_changed = true;
Ok(())
}
pub fn sync_trees<T: Deref<Target = [u8]>>(&mut self, nodes: &[TreeData<T>]) -> Result<()> {
for node in nodes.iter().rev() {
let ptr = node.ptr();
......@@ -451,18 +575,27 @@ impl<'a, D: Disk> Transaction<'a, D> {
self.sync_trees(&[node])
}
//TODO: use more efficient methods for reading directories
//
// MARK: node operations
//
// TODO: use more efficient methods for reading directories
/// Write all children of `parent_ptr` to `children`.
/// `parent_ptr` must point to a directory node.
pub fn child_nodes(
&mut self,
parent_ptr: TreePtr<Node>,
children: &mut Vec<DirEntry>,
) -> Result<()> {
let parent = self.read_tree(parent_ptr)?;
for block_offset in 0..(parent.data().size() / BLOCK_SIZE) {
let block_ptr = self.node_block_ptr(&parent, block_offset)?;
let record_level = parent.data().record_level();
for record_offset in 0..(parent.data().size() / record_level.bytes()) {
let block_ptr = self.node_record_ptr(&parent, record_offset)?;
// TODO: is this safe? what if child_nodes is called on
// a node that isn't a directory?
let dir_ptr: BlockPtr<DirList> = unsafe { block_ptr.cast() };
let dir = self.read_block(dir_ptr)?;
for entry in dir.data().entries {
for entry in dir.data().entries.iter() {
let node_ptr = entry.node_ptr();
// Skip empty entries
......@@ -470,7 +603,7 @@ impl<'a, D: Disk> Transaction<'a, D> {
continue;
}
children.push(entry);
children.push(*entry);
}
}
......@@ -478,13 +611,16 @@ impl<'a, D: Disk> Transaction<'a, D> {
}
//TODO: improve performance (h-tree?)
/// Find a node that is a child of the `parent_ptr` and is named `name`.
/// Returns ENOENT if this node is not found.
pub fn find_node(&mut self, parent_ptr: TreePtr<Node>, name: &str) -> Result<TreeData<Node>> {
let parent = self.read_tree(parent_ptr)?;
for block_offset in 0..(parent.data().size() / BLOCK_SIZE) {
let block_ptr = self.node_block_ptr(&parent, block_offset)?;
let record_level = parent.data().record_level();
for block_offset in 0..(parent.data().size() / record_level.bytes()) {
let block_ptr = self.node_record_ptr(&parent, block_offset)?;
let dir_ptr: BlockPtr<DirList> = unsafe { block_ptr.cast() };
let dir = self.read_block(dir_ptr)?;
for entry in dir.data().entries {
for entry in dir.data().entries.iter() {
let node_ptr = entry.node_ptr();
// Skip empty entries
......@@ -505,7 +641,8 @@ impl<'a, D: Disk> Transaction<'a, D> {
Err(Error::new(ENOENT))
}
//TODO: improve performance (h-tree?)
// TODO: improve performance (h-tree?)
/// Create a new node in the tree with the given parameters.
pub fn create_node(
&mut self,
parent_ptr: TreePtr<Node>,
......@@ -514,18 +651,12 @@ impl<'a, D: Disk> Transaction<'a, D> {
ctime: u64,
ctime_nsec: u32,
) -> Result<TreeData<Node>> {
if name.contains(':') {
return Err(Error::new(EINVAL));
}
if self.find_node(parent_ptr, name).is_ok() {
return Err(Error::new(EEXIST));
}
self.check_name(&parent_ptr, name)?;
unsafe {
let parent = self.read_tree(parent_ptr)?;
let node_block_data = BlockData::new(
self.allocate()?,
self.allocate(BlockLevel::default())?,
Node::new(
mode,
parent.data().uid(),
......@@ -550,57 +681,57 @@ impl<'a, D: Disk> Transaction<'a, D> {
name: &str,
node_ptr: TreePtr<Node>,
) -> Result<()> {
if name.contains(':') {
return Err(Error::new(EINVAL));
}
if self.find_node(parent_ptr, name).is_ok() {
return Err(Error::new(EEXIST));
}
self.check_name(&parent_ptr, name)?;
let mut parent = self.read_tree(parent_ptr)?;
let mut node = self.read_tree(node_ptr)?;
// Increment node reference counter
let links = node.data().links();
node.data_mut().set_links(links + 1);
let entry = DirEntry::new(node_ptr, name).ok_or(Error::new(EINVAL))?;
let entry = DirEntry::new(node_ptr, name);
let block_end = parent.data().size() / BLOCK_SIZE;
for block_offset in 0..block_end {
let mut dir_block_ptr = self.node_block_ptr(&parent, block_offset)?;
let mut dir_ptr: BlockPtr<DirList> = unsafe { dir_block_ptr.cast() };
let record_level = parent.data().record_level();
let record_end = parent.data().size() / record_level.bytes();
for record_offset in 0..record_end {
let mut dir_record_ptr = self.node_record_ptr(&parent, record_offset)?;
let mut dir_ptr: BlockPtr<DirList> = unsafe { dir_record_ptr.cast() };
let mut dir = self.read_block(dir_ptr)?;
let mut dir_changed = false;
for old_entry in dir.data_mut().entries.iter_mut() {
// Skip filled entries
if !old_entry.node_ptr().is_null() {
continue;
}
// Write our new entry into the first
// free slot in this directory
*old_entry = entry;
dir_changed = true;
break;
}
if dir_changed {
dir_ptr = self.sync_block(dir)?;
dir_block_ptr = unsafe { dir_ptr.cast() };
self.sync_node_block_ptr(&mut parent, block_offset, dir_block_ptr)?;
// Write updated blocks
dir_ptr = self.sync_block(dir)?;
dir_record_ptr = unsafe { dir_ptr.cast() };
self.sync_node_record_ptr(&mut parent, record_offset, dir_record_ptr)?;
self.sync_trees(&[parent, node])?;
return Ok(());
}
}
// Append a new dirlist, with first entry set to new entry
let mut dir = BlockData::new(unsafe { self.allocate()? }, DirList::default());
// We couldn't find a free direntry slot; this directory is full.
// We now need to add a new dirlist block to the parent node,
// with `entry` as its first member.
let mut dir =
BlockData::<DirList>::empty(unsafe { self.allocate(BlockLevel::default())? }).unwrap();
dir.data_mut().entries[0] = entry;
let dir_ptr = unsafe { self.write_block(dir)? };
let dir_block_ptr: BlockPtr<BlockRaw> = unsafe { dir_ptr.cast() };
let dir_record_ptr = unsafe { dir_ptr.cast() };
self.sync_node_block_ptr(&mut parent, block_end, dir_block_ptr)?;
parent.data_mut().set_size((block_end + 1) * BLOCK_SIZE);
self.sync_node_record_ptr(&mut parent, record_end, dir_record_ptr)?;
parent
.data_mut()
.set_size((record_end + 1) * record_level.bytes());
self.sync_trees(&[parent, node])?;
Ok(())
......@@ -608,10 +739,11 @@ impl<'a, D: Disk> Transaction<'a, D> {
pub fn remove_node(&mut self, parent_ptr: TreePtr<Node>, name: &str, mode: u16) -> Result<()> {
let mut parent = self.read_tree(parent_ptr)?;
let blocks = parent.data().size() / BLOCK_SIZE;
for block_offset in 0..blocks {
let mut dir_block_ptr = self.node_block_ptr(&parent, block_offset)?;
let mut dir_ptr: BlockPtr<DirList> = unsafe { dir_block_ptr.cast() };
let record_level = parent.data().record_level();
let records = parent.data().size() / record_level.bytes();
for record_offset in 0..records {
let mut dir_record_ptr = self.node_record_ptr(&parent, record_offset)?;
let mut dir_ptr: BlockPtr<DirList> = unsafe { dir_record_ptr.cast() };
let mut dir = self.read_block(dir_ptr)?;
let mut node_opt = None;
for entry in dir.data_mut().entries.iter_mut() {
......@@ -626,15 +758,18 @@ impl<'a, D: Disk> Transaction<'a, D> {
if let Some(entry_name) = entry.name() {
if entry_name == name {
// Read node and test type against requested type
let node = self.read_tree(node_ptr)?;
let (node, addr) = self.read_tree_and_addr(node_ptr)?;
if node.data().mode() & Node::MODE_TYPE == mode {
if node.data().is_dir() && node.data().size() > 0 && node.data().links() == 1 {
if node.data().is_dir()
&& node.data().size() > 0
&& node.data().links() == 1
{
// Tried to remove directory that still has entries
return Err(Error::new(ENOTEMPTY));
}
// Save node and clear entry
node_opt = Some(node);
node_opt = Some((entry.node_ptr(), node, addr));
*entry = DirEntry::default();
break;
} else if node.data().is_dir() {
......@@ -648,28 +783,55 @@ impl<'a, D: Disk> Transaction<'a, D> {
}
}
if let Some(mut node) = node_opt {
if let Some((node_tree_ptr, mut node, addr)) = node_opt {
let links = node.data().links();
if links > 1 {
let remove_node = if links > 1 {
node.data_mut().set_links(links - 1);
false
} else {
node.data_mut().set_links(0);
self.truncate_node_inner(&mut node, 0)?;
}
true
};
if block_offset == blocks - 1 && dir.data().is_empty() {
// Remove empty parent block, if it is at the end
self.remove_node_block_ptr(&mut parent, block_offset)?;
parent.data_mut().set_size(block_offset * BLOCK_SIZE);
if record_offset == records - 1 && dir.data().is_empty() {
let mut remove_record = record_offset;
loop {
// Remove empty parent record, if it is at the end
self.remove_node_record_ptr(&mut parent, remove_record)?;
parent
.data_mut()
.set_size(remove_record * record_level.bytes());
// Keep going for any other empty records
if remove_record > 0 {
remove_record -= 1;
dir_record_ptr = self.node_record_ptr(&parent, remove_record)?;
dir_ptr = unsafe { dir_record_ptr.cast() };
dir = self.read_block(dir_ptr)?;
if dir.data().is_empty() {
continue;
}
}
break;
}
} else {
// Save new parent block
// Save new parent record
dir_ptr = self.sync_block(dir)?;
dir_block_ptr = unsafe { dir_ptr.cast() };
self.sync_node_block_ptr(&mut parent, block_offset, dir_block_ptr)?;
dir_record_ptr = unsafe { dir_ptr.cast() };
self.sync_node_record_ptr(&mut parent, record_offset, dir_record_ptr)?;
}
// Sync both parent and node at the same time
self.sync_trees(&[parent, node])?;
if remove_node {
self.sync_tree(parent)?;
self.remove_tree(node_tree_ptr)?;
unsafe {
self.deallocate(addr);
}
} else {
// Sync both parent and node at the same time
self.sync_trees(&[parent, node])?;
}
return Ok(());
}
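// The link-count rule above, in isolation: removing a name either drops one
// hard link or, at links == 1, removes the node itself. A hedged sketch
// (`on_unlink` is illustrative, not part of this crate):

fn on_unlink(links: u32) -> (u32, bool) {
    if links > 1 {
        (links - 1, false) // other hard links remain
    } else {
        (0, true) // last name: truncate and remove the node
    }
}

fn main() {
    assert_eq!(on_unlink(3), (2, false));
    assert_eq!(on_unlink(1), (0, true));
}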
......@@ -687,7 +849,7 @@ impl<'a, D: Disk> Transaction<'a, D> {
) -> Result<()> {
let orig = self.find_node(orig_parent_ptr, orig_name)?;
//TODO: only allow ENOENT as an error?
// TODO: only allow ENOENT as an error?
if let Ok(new) = self.find_node(new_parent_ptr, new_name) {
// Move to same name, return
if new.id() == orig.id() {
......@@ -695,6 +857,7 @@ impl<'a, D: Disk> Transaction<'a, D> {
}
// Remove new name
// (we renamed to a node that already exists, so overwrite it)
self.remove_node(
new_parent_ptr,
new_name,
......@@ -715,153 +878,176 @@ impl<'a, D: Disk> Transaction<'a, D> {
Ok(())
}
fn node_block_ptr(
fn check_name(&mut self, parent_ptr: &TreePtr<Node>, name: &str) -> Result<()> {
if name.contains(':') {
return Err(Error::new(EINVAL));
}
if name.len() > DIR_ENTRY_MAX_LENGTH {
return Err(Error::new(EINVAL));
}
if self.find_node(parent_ptr.clone(), name).is_ok() {
return Err(Error::new(EEXIST));
}
Ok(())
}
/// Get a pointer to the record of `node` at the given offset
/// (i.e., to the `n`th record of `node`).
fn node_record_ptr(
&mut self,
node: &TreeData<Node>,
block_offset: u64,
) -> Result<BlockPtr<BlockRaw>> {
match NodeLevel::new(block_offset).ok_or(Error::new(ERANGE))? {
NodeLevel::L0(i0) => Ok(node.data().level0[i0]),
NodeLevel::L1(i1, i0) => {
let l0 = self.read_block(node.data().level1[i1])?;
Ok(l0.data().ptrs[i0])
}
NodeLevel::L2(i2, i1, i0) => {
let l1 = self.read_block(node.data().level2[i2])?;
let l0 = self.read_block(l1.data().ptrs[i1])?;
Ok(l0.data().ptrs[i0])
}
NodeLevel::L3(i3, i2, i1, i0) => {
let l2 = self.read_block(node.data().level3[i3])?;
let l1 = self.read_block(l2.data().ptrs[i2])?;
let l0 = self.read_block(l1.data().ptrs[i1])?;
Ok(l0.data().ptrs[i0])
}
NodeLevel::L4(i4, i3, i2, i1, i0) => {
let l3 = self.read_block(node.data().level4[i4])?;
let l2 = self.read_block(l3.data().ptrs[i3])?;
let l1 = self.read_block(l2.data().ptrs[i2])?;
let l0 = self.read_block(l1.data().ptrs[i1])?;
Ok(l0.data().ptrs[i0])
record_offset: u64,
) -> Result<BlockPtr<RecordRaw>> {
unsafe {
match NodeLevel::new(record_offset).ok_or(Error::new(ERANGE))? {
NodeLevel::L0(i0) => Ok(node.data().level0[i0]),
NodeLevel::L1(i1, i0) => {
let l0 = self.read_block_or_empty(node.data().level1[i1])?;
Ok(l0.data().ptrs[i0])
}
NodeLevel::L2(i2, i1, i0) => {
let l1 = self.read_block_or_empty(node.data().level2[i2])?;
let l0 = self.read_block_or_empty(l1.data().ptrs[i1])?;
Ok(l0.data().ptrs[i0])
}
NodeLevel::L3(i3, i2, i1, i0) => {
let l2 = self.read_block_or_empty(node.data().level3[i3])?;
let l1 = self.read_block_or_empty(l2.data().ptrs[i2])?;
let l0 = self.read_block_or_empty(l1.data().ptrs[i1])?;
Ok(l0.data().ptrs[i0])
}
NodeLevel::L4(i4, i3, i2, i1, i0) => {
let l3 = self.read_block_or_empty(node.data().level4[i4])?;
let l2 = self.read_block_or_empty(l3.data().ptrs[i3])?;
let l1 = self.read_block_or_empty(l2.data().ptrs[i2])?;
let l0 = self.read_block_or_empty(l1.data().ptrs[i1])?;
Ok(l0.data().ptrs[i0])
}
}
}
}
fn remove_node_block_ptr(
fn remove_node_record_ptr(
&mut self,
node: &mut TreeData<Node>,
block_offset: u64,
record_offset: u64,
) -> Result<()> {
match NodeLevel::new(block_offset).ok_or(Error::new(ERANGE))? {
NodeLevel::L0(i0) => {
self.deallocate_block(node.data_mut().level0[i0].clear());
}
NodeLevel::L1(i1, i0) => {
let mut l0 = self.read_block(node.data().level1[i1])?;
self.deallocate_block(l0.data_mut().ptrs[i0].clear());
if l0.data().is_empty() {
self.deallocate_block(node.data_mut().level1[i1].clear());
} else {
node.data_mut().level1[i1] = self.sync_block(l0)?;
}
}
NodeLevel::L2(i2, i1, i0) => {
let mut l1 = self.read_block(node.data().level2[i2])?;
let mut l0 = self.read_block(l1.data().ptrs[i1])?;
self.deallocate_block(l0.data_mut().ptrs[i0].clear());
if l0.data().is_empty() {
self.deallocate_block(l1.data_mut().ptrs[i1].clear());
} else {
l1.data_mut().ptrs[i1] = self.sync_block(l0)?;
}
if l1.data().is_empty() {
self.deallocate_block(node.data_mut().level2[i2].clear());
} else {
node.data_mut().level2[i2] = self.sync_block(l1)?;
}
}
NodeLevel::L3(i3, i2, i1, i0) => {
let mut l2 = self.read_block(node.data().level3[i3])?;
let mut l1 = self.read_block(l2.data().ptrs[i2])?;
let mut l0 = self.read_block(l1.data().ptrs[i1])?;
self.deallocate_block(l0.data_mut().ptrs[i0].clear());
if l0.data().is_empty() {
self.deallocate_block(l1.data_mut().ptrs[i1].clear());
} else {
l1.data_mut().ptrs[i1] = self.sync_block(l0)?;
}
if l1.data().is_empty() {
self.deallocate_block(l2.data_mut().ptrs[i2].clear());
} else {
l2.data_mut().ptrs[i2] = self.sync_block(l1)?;
}
if l2.data().is_empty() {
self.deallocate_block(node.data_mut().level3[i3].clear());
} else {
node.data_mut().level3[i3] = self.sync_block(l2)?;
unsafe {
match NodeLevel::new(record_offset).ok_or(Error::new(ERANGE))? {
NodeLevel::L0(i0) => {
self.deallocate_block(node.data_mut().level0[i0].clear());
}
}
NodeLevel::L4(i4, i3, i2, i1, i0) => {
let mut l3 = self.read_block(node.data().level4[i4])?;
let mut l2 = self.read_block(l3.data().ptrs[i3])?;
let mut l1 = self.read_block(l2.data().ptrs[i2])?;
let mut l0 = self.read_block(l1.data().ptrs[i1])?;
self.deallocate_block(l0.data_mut().ptrs[i0].clear());
if l0.data().is_empty() {
self.deallocate_block(l1.data_mut().ptrs[i1].clear());
} else {
l1.data_mut().ptrs[i1] = self.sync_block(l0)?;
NodeLevel::L1(i1, i0) => {
let mut l0 = self.read_block_or_empty(node.data().level1[i1])?;
self.deallocate_block(l0.data_mut().ptrs[i0].clear());
if l0.data().is_empty() {
self.deallocate_block(node.data_mut().level1[i1].clear());
} else {
node.data_mut().level1[i1] = self.sync_block(l0)?;
}
}
if l1.data().is_empty() {
self.deallocate_block(l2.data_mut().ptrs[i2].clear());
} else {
l2.data_mut().ptrs[i2] = self.sync_block(l1)?;
NodeLevel::L2(i2, i1, i0) => {
let mut l1 = self.read_block_or_empty(node.data().level2[i2])?;
let mut l0 = self.read_block_or_empty(l1.data().ptrs[i1])?;
self.deallocate_block(l0.data_mut().ptrs[i0].clear());
if l0.data().is_empty() {
self.deallocate_block(l1.data_mut().ptrs[i1].clear());
} else {
l1.data_mut().ptrs[i1] = self.sync_block(l0)?;
}
if l1.data().is_empty() {
self.deallocate_block(node.data_mut().level2[i2].clear());
} else {
node.data_mut().level2[i2] = self.sync_block(l1)?;
}
}
if l2.data().is_empty() {
self.deallocate_block(l3.data_mut().ptrs[i3].clear());
} else {
l3.data_mut().ptrs[i3] = self.sync_block(l2)?;
NodeLevel::L3(i3, i2, i1, i0) => {
let mut l2 = self.read_block_or_empty(node.data().level3[i3])?;
let mut l1 = self.read_block_or_empty(l2.data().ptrs[i2])?;
let mut l0 = self.read_block_or_empty(l1.data().ptrs[i1])?;
self.deallocate_block(l0.data_mut().ptrs[i0].clear());
if l0.data().is_empty() {
self.deallocate_block(l1.data_mut().ptrs[i1].clear());
} else {
l1.data_mut().ptrs[i1] = self.sync_block(l0)?;
}
if l1.data().is_empty() {
self.deallocate_block(l2.data_mut().ptrs[i2].clear());
} else {
l2.data_mut().ptrs[i2] = self.sync_block(l1)?;
}
if l2.data().is_empty() {
self.deallocate_block(node.data_mut().level3[i3].clear());
} else {
node.data_mut().level3[i3] = self.sync_block(l2)?;
}
}
if l3.data().is_empty() {
self.deallocate_block(node.data_mut().level4[i4].clear());
} else {
node.data_mut().level4[i4] = self.sync_block(l3)?;
NodeLevel::L4(i4, i3, i2, i1, i0) => {
let mut l3 = self.read_block_or_empty(node.data().level4[i4])?;
let mut l2 = self.read_block_or_empty(l3.data().ptrs[i3])?;
let mut l1 = self.read_block_or_empty(l2.data().ptrs[i2])?;
let mut l0 = self.read_block_or_empty(l1.data().ptrs[i1])?;
self.deallocate_block(l0.data_mut().ptrs[i0].clear());
if l0.data().is_empty() {
self.deallocate_block(l1.data_mut().ptrs[i1].clear());
} else {
l1.data_mut().ptrs[i1] = self.sync_block(l0)?;
}
if l1.data().is_empty() {
self.deallocate_block(l2.data_mut().ptrs[i2].clear());
} else {
l2.data_mut().ptrs[i2] = self.sync_block(l1)?;
}
if l2.data().is_empty() {
self.deallocate_block(l3.data_mut().ptrs[i3].clear());
} else {
l3.data_mut().ptrs[i3] = self.sync_block(l2)?;
}
if l3.data().is_empty() {
self.deallocate_block(node.data_mut().level4[i4].clear());
} else {
node.data_mut().level4[i4] = self.sync_block(l3)?;
}
}
}
}
Ok(())
Ok(())
}
}
fn sync_node_block_ptr(
/// Store `ptr` as the record at `record_offset` of `node`.
fn sync_node_record_ptr(
&mut self,
node: &mut TreeData<Node>,
block_offset: u64,
ptr: BlockPtr<BlockRaw>,
record_offset: u64,
ptr: BlockPtr<RecordRaw>,
) -> Result<()> {
unsafe {
match NodeLevel::new(block_offset).ok_or(Error::new(ERANGE))? {
match NodeLevel::new(record_offset).ok_or(Error::new(ERANGE))? {
NodeLevel::L0(i0) => {
node.data_mut().level0[i0] = ptr;
}
NodeLevel::L1(i1, i0) => {
let mut l0 = self.read_block_or_default(node.data().level1[i1])?;
let mut l0 = self.read_block_or_empty(node.data().level1[i1])?;
l0.data_mut().ptrs[i0] = ptr;
node.data_mut().level1[i1] = self.sync_block(l0)?;
}
NodeLevel::L2(i2, i1, i0) => {
let mut l1 = self.read_block_or_default(node.data().level2[i2])?;
let mut l0 = self.read_block_or_default(l1.data().ptrs[i1])?;
let mut l1 = self.read_block_or_empty(node.data().level2[i2])?;
let mut l0 = self.read_block_or_empty(l1.data().ptrs[i1])?;
l0.data_mut().ptrs[i0] = ptr;
l1.data_mut().ptrs[i1] = self.sync_block(l0)?;
node.data_mut().level2[i2] = self.sync_block(l1)?;
}
NodeLevel::L3(i3, i2, i1, i0) => {
let mut l2 = self.read_block_or_default(node.data().level3[i3])?;
let mut l1 = self.read_block_or_default(l2.data().ptrs[i2])?;
let mut l0 = self.read_block_or_default(l1.data().ptrs[i1])?;
let mut l2 = self.read_block_or_empty(node.data().level3[i3])?;
let mut l1 = self.read_block_or_empty(l2.data().ptrs[i2])?;
let mut l0 = self.read_block_or_empty(l1.data().ptrs[i1])?;
l0.data_mut().ptrs[i0] = ptr;
l1.data_mut().ptrs[i1] = self.sync_block(l0)?;
......@@ -869,10 +1055,10 @@ impl<'a, D: Disk> Transaction<'a, D> {
node.data_mut().level3[i3] = self.sync_block(l2)?;
}
NodeLevel::L4(i4, i3, i2, i1, i0) => {
let mut l3 = self.read_block_or_default(node.data().level4[i4])?;
let mut l2 = self.read_block_or_default(l3.data().ptrs[i3])?;
let mut l1 = self.read_block_or_default(l2.data().ptrs[i2])?;
let mut l0 = self.read_block_or_default(l1.data().ptrs[i1])?;
let mut l3 = self.read_block_or_empty(node.data().level4[i4])?;
let mut l2 = self.read_block_or_empty(l3.data().ptrs[i3])?;
let mut l1 = self.read_block_or_empty(l2.data().ptrs[i2])?;
let mut l0 = self.read_block_or_empty(l1.data().ptrs[i1])?;
l0.data_mut().ptrs[i0] = ptr;
l1.data_mut().ptrs[i1] = self.sync_block(l0)?;
......@@ -893,22 +1079,36 @@ impl<'a, D: Disk> Transaction<'a, D> {
buf: &mut [u8],
) -> Result<usize> {
let node_size = node.data().size();
let mut i = 0;
while i < buf.len() && offset < node_size {
let block_ptr = self.node_block_ptr(&node, offset / BLOCK_SIZE)?;
let block = self.read_block(block_ptr)?;
let record_level = node.data().record_level();
let j = (offset % BLOCK_SIZE) as usize;
let mut bytes_read = 0;
while bytes_read < buf.len() && offset < node_size {
// How many bytes we've read into the next record
let j = (offset % record_level.bytes()) as usize;
// Number of bytes to read in this iteration
let len = min(
buf.len() - i,
min(BLOCK_SIZE - j as u64, node_size - offset) as usize,
buf.len() - bytes_read, // number of bytes we have left in `buf`
min(
record_level.bytes() - j as u64, // number of bytes we haven't read in this record
node_size - offset, // number of bytes left in this node
) as usize,
);
buf[i..i + len].copy_from_slice(&block.data()[j..j + len]);
i += len;
let record_idx = offset / record_level.bytes();
let record_ptr = self.node_record_ptr(node, record_idx)?;
// The level of the record to read.
// This is at most `record_level` due to the way `len` is computed.
let level = BlockLevel::for_bytes((j + len) as u64);
let record = unsafe { self.read_record(record_ptr, level)? };
buf[bytes_read..bytes_read + len].copy_from_slice(&record.data()[j..j + len]);
bytes_read += len;
offset += len as u64;
}
Ok(i)
Ok(bytes_read)
}
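// Worked example of the per-iteration arithmetic above, assuming 4 KiB
// (level 0) records; the sizes here are hypothetical:

fn main() {
    let record_bytes: u64 = 4096;
    let node_size: u64 = 10_000;
    let buf_len: usize = 1000;
    let mut offset: u64 = 4000; // start 96 bytes before a record boundary

    let mut bytes_read = 0;
    while bytes_read < buf_len && offset < node_size {
        let j = (offset % record_bytes) as usize;
        let len = (buf_len - bytes_read)
            .min((record_bytes - j as u64).min(node_size - offset) as usize);
        // First pass: record 0, j = 4000, len = 96 (finish the record).
        // Second pass: record 1, j = 0, len = 904 (rest of the buffer).
        bytes_read += len;
        offset += len as u64;
    }
    assert_eq!(bytes_read, 1000);
}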
pub fn read_node(
......@@ -943,6 +1143,7 @@ impl<'a, D: Disk> Transaction<'a, D> {
pub fn truncate_node_inner(&mut self, node: &mut TreeData<Node>, size: u64) -> Result<bool> {
let old_size = node.data().size();
let record_level = node.data().record_level();
// Size already matches, return
if old_size == size {
......@@ -950,24 +1151,34 @@ impl<'a, D: Disk> Transaction<'a, D> {
}
if old_size < size {
// If size is smaller, write zeroes until the size matches
let zeroes = [0; BLOCK_SIZE as usize];
// If we're "truncating" to a larger size,
// write zeroes until the size matches
let zeroes = RecordRaw::empty(record_level).unwrap();
let mut offset = old_size;
while offset < size {
let start = offset % BLOCK_SIZE;
let end = if offset / BLOCK_SIZE == size / BLOCK_SIZE {
size % BLOCK_SIZE
let start = offset % record_level.bytes();
if start == 0 {
// We don't have to write completely zero records as read will interpret
// null record pointers as zero records
offset = size;
break;
}
let end = if offset / record_level.bytes() == size / record_level.bytes() {
size % record_level.bytes()
} else {
BLOCK_SIZE
record_level.bytes()
};
self.write_node_inner(node, &mut offset, &zeroes[start as usize..end as usize])?;
}
assert_eq!(offset, size);
} else {
// Deallocate blocks
for block in ((size + BLOCK_SIZE - 1) / BLOCK_SIZE..old_size / BLOCK_SIZE).rev() {
self.remove_node_block_ptr(node, block)?;
// Deallocate records
for record in ((size + record_level.bytes() - 1) / record_level.bytes()
..old_size / record_level.bytes())
.rev()
{
self.remove_node_record_ptr(node, record)?;
}
}
......@@ -977,6 +1188,10 @@ impl<'a, D: Disk> Transaction<'a, D> {
Ok(true)
}
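// Consequence of the fast path above: growing a file only zero-pads the tail
// of the last allocated record; everything past the next record boundary
// stays sparse. A standalone sketch of that plan, assuming 4 KiB records
// (`sparse_bytes` is illustrative only):

fn sparse_bytes(old_size: u64, new_size: u64, record_bytes: u64) -> u64 {
    let mut offset = old_size;
    while offset < new_size {
        if offset % record_bytes == 0 {
            // Null record pointers read back as zeroes, so the
            // remaining records never need to be written.
            return new_size - offset;
        }
        // Zero-pad only to the end of the current record.
        offset = (((offset / record_bytes) + 1) * record_bytes).min(new_size);
    }
    0
}

fn main() {
    // Growing 5000 bytes to 1 MiB writes ~3 KiB of zeroes (up to the
    // 8192-byte boundary); the remaining bytes stay sparse.
    assert_eq!(sparse_bytes(5000, 1 << 20, 4096), (1 << 20) - 8192);
}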
/// Truncate the given node to the given size.
///
/// If `size` is larger than the node's current size,
/// expand the node with zeroes.
pub fn truncate_node(
&mut self,
node_ptr: TreePtr<Node>,
......@@ -1005,32 +1220,41 @@ impl<'a, D: Disk> Transaction<'a, D> {
) -> Result<bool> {
let mut node_changed = false;
let node_blocks = (node.data().size() + BLOCK_SIZE - 1) / BLOCK_SIZE;
let record_level = node.data().record_level();
let node_records = (node.data().size() + record_level.bytes() - 1) / record_level.bytes();
let mut i = 0;
while i < buf.len() {
let mut block_ptr = if node_blocks > (*offset / BLOCK_SIZE) {
self.node_block_ptr(node, *offset / BLOCK_SIZE)?
let j = (*offset % record_level.bytes()) as usize;
let len = min(buf.len() - i, record_level.bytes() as usize - j);
let level = BlockLevel::for_bytes((j + len) as u64);
let mut record_ptr = if node_records > (*offset / record_level.bytes()) {
self.node_record_ptr(node, *offset / record_level.bytes())?
} else {
BlockPtr::default()
BlockPtr::null(level)
};
let mut block = unsafe { self.read_block_or_default(block_ptr)? };
let mut record = unsafe { self.read_record(record_ptr, level)? };
let j = (*offset % BLOCK_SIZE) as usize;
let len = min(buf.len() - i, BLOCK_SIZE as usize - j);
if block_ptr.is_null() || buf[i..i + len] != block.data()[j..j + len] {
if buf[i..i + len] != record.data()[j..j + len] {
unsafe {
let old_addr = block.swap_addr(self.allocate()?);
// CoW record using its current level
let mut old_addr = record.swap_addr(self.allocate(record.addr().level())?);
// If the record was resized we need to dealloc the original ptr
if old_addr.is_null() {
old_addr = record_ptr.addr();
}
block.data_mut()[j..j + len].copy_from_slice(&buf[i..i + len]);
block_ptr = self.write_block(block)?;
record.data_mut()[j..j + len].copy_from_slice(&buf[i..i + len]);
record_ptr = self.write_block(record)?;
if old_addr != 0 {
if !old_addr.is_null() {
self.deallocate(old_addr);
}
}
self.sync_node_block_ptr(node, *offset / BLOCK_SIZE, block_ptr)?;
self.sync_node_record_ptr(node, *offset / record_level.bytes(), record_ptr)?;
node_changed = true;
}
......@@ -1046,6 +1270,7 @@ impl<'a, D: Disk> Transaction<'a, D> {
Ok(node_changed)
}
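// The write path above is copy-on-write: a modified record is rewritten at a
// freshly allocated address, and the old address is retired only after the
// tree syncs. A minimal standalone sketch of that pattern (the types are
// illustrative, not this crate's):

#[derive(Clone)]
struct Rec {
    addr: u64,
    data: Vec<u8>,
}

fn cow_write(rec: &Rec, at: usize, bytes: &[u8], fresh_addr: u64) -> (Rec, u64) {
    let mut new = rec.clone();
    new.addr = fresh_addr; // swap in the newly allocated address
    new.data[at..at + bytes.len()].copy_from_slice(bytes);
    (new, rec.addr) // old address is deallocated after the sync
}

fn main() {
    let rec = Rec { addr: 7, data: vec![0; 16] };
    let (new, freed) = cow_write(&rec, 4, b"abcd", 8);
    assert_eq!((new.addr, freed), (8, 7));
    assert_eq!(&new.data[4..8], b"abcd");
}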
/// Write the bytes from `buf` to `node`, starting at `offset`.
pub fn write_node(
&mut self,
node_ptr: TreePtr<Node>,
......
use core::{marker::PhantomData, mem, ops, slice};
use simple_endian::*;
use endian_num::Le;
use crate::{BlockPtr, BlockRaw};
use crate::{BlockLevel, BlockPtr, BlockRaw, BlockTrait};
// 1 << 8 = 256: the number of entries in a TreeList
const TREE_LIST_SHIFT: u32 = 8;
const TREE_LIST_ENTRIES: usize = 1 << TREE_LIST_SHIFT;
// Tree with 4 levels
/// A tree with 4 levels
pub type Tree = TreeList<TreeList<TreeList<TreeList<BlockRaw>>>>;
/// A [`TreePtr`] and the contents of the block it references.
#[derive(Clone, Copy, Debug, Default)]
pub struct TreeData<T> {
/// The value of the [`TreePtr`]
id: u32,
/// The data
data: T,
}
......@@ -44,15 +49,21 @@ impl<T> TreeData<T> {
}
}
#[repr(packed)]
/// A list of pointers to blocks of type `T`.
/// This is one level of a [`Tree`], defined above.
#[repr(C, packed)]
pub struct TreeList<T> {
pub ptrs: [BlockPtr<T>; 1 << TREE_LIST_SHIFT],
pub ptrs: [BlockPtr<T>; TREE_LIST_ENTRIES],
}
impl<T> Default for TreeList<T> {
fn default() -> Self {
Self {
ptrs: [BlockPtr::default(); 1 << TREE_LIST_SHIFT],
unsafe impl<T> BlockTrait for TreeList<T> {
fn empty(level: BlockLevel) -> Option<Self> {
if level.0 == 0 {
Some(Self {
ptrs: [BlockPtr::default(); TREE_LIST_ENTRIES],
})
} else {
None
}
}
}
......@@ -80,13 +91,16 @@ impl<T> ops::DerefMut for TreeList<T> {
}
}
#[repr(packed)]
/// A pointer to an entry in a [`Tree`].
#[repr(C, packed)]
pub struct TreePtr<T> {
id: u32le,
id: Le<u32>,
phantom: PhantomData<T>,
}
impl<T> TreePtr<T> {
/// Get a [`TreePtr`] to the filesystem root
/// directory's node.
pub fn root() -> Self {
Self::new(1)
}
......@@ -98,6 +112,11 @@ impl<T> TreePtr<T> {
}
}
/// Create a [`TreePtr`] from [`Tree`] indices,
/// where `indexes` is `(i3, i2, i1, i0)`:
/// - `i3` is the index into the level 3 table,
/// - `i2` is the index into the level 2 table at `i3`,
/// - ...and so on.
pub fn from_indexes(indexes: (usize, usize, usize, usize)) -> Self {
const SHIFT: u32 = TREE_LIST_SHIFT;
let id = ((indexes.0 << (3 * SHIFT)) as u32)
......@@ -111,33 +130,36 @@ impl<T> TreePtr<T> {
}
pub fn id(&self) -> u32 {
{ self.id }.to_native()
self.id.to_ne()
}
pub fn is_null(&self) -> bool {
self.id() == 0
}
/// Get the indices of this [`TreePtr`] in a [`Tree`].
/// Returns `(i3, i2, i1, i0)`:
/// - `i3` is the index into the level 3 table,
/// - `i2` is the index into the level 2 table at `i3`
/// - ...and so on.
pub fn indexes(&self) -> (usize, usize, usize, usize) {
const SHIFT: u32 = TREE_LIST_SHIFT;
const NUM: u32 = 1 << SHIFT;
const MASK: u32 = NUM - 1;
let id = self.id();
(
((id >> (3 * SHIFT)) & MASK) as usize,
((id >> (2 * SHIFT)) & MASK) as usize,
((id >> SHIFT) & MASK) as usize,
(id & MASK) as usize,
)
let i3 = ((id >> (3 * SHIFT)) & MASK) as usize;
let i2 = ((id >> (2 * SHIFT)) & MASK) as usize;
let i1 = ((id >> SHIFT) & MASK) as usize;
let i0 = (id & MASK) as usize;
(i3, i2, i1, i0)
}
}
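// Round-trip of the id packing used by `from_indexes` and `indexes` above,
// re-implemented standalone (SHIFT = 8, matching TREE_LIST_SHIFT):

fn main() {
    const SHIFT: u32 = 8;
    const MASK: u32 = (1 << SHIFT) - 1;
    let (i3, i2, i1, i0) = (1u32, 2, 3, 4);
    let id = (i3 << (3 * SHIFT)) | (i2 << (2 * SHIFT)) | (i1 << SHIFT) | i0;
    assert_eq!(id, 0x0102_0304);
    let back = (
        (id >> (3 * SHIFT)) & MASK,
        (id >> (2 * SHIFT)) & MASK,
        (id >> SHIFT) & MASK,
        id & MASK,
    );
    assert_eq!(back, (i3, i2, i1, i0));
}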
impl<T> Clone for TreePtr<T> {
fn clone(&self) -> Self {
Self {
id: self.id,
phantom: PhantomData,
}
*self
}
}
......
use failure::Error;
use std::{
fs,
io::{self},
......@@ -33,7 +31,7 @@ fn unmount_linux_path(mount_path: &str) -> io::Result<ExitStatus> {
))
}
pub fn unmount_path(mount_path: &str) -> Result<(), Error> {
pub fn unmount_path(mount_path: &str) -> Result<(), io::Error> {
if cfg!(target_os = "redox") {
fs::remove_file(format!(":{}", mount_path))?
} else {
......@@ -45,7 +43,10 @@ pub fn unmount_path(mount_path: &str) -> Result<(), Error> {
let status = status_res?;
if !status.success() {
return Err(io::Error::new(io::ErrorKind::Other, "redoxfs umount failed").into());
return Err(io::Error::new(
io::ErrorKind::Other,
"redoxfs umount failed",
));
}
}
......
......@@ -34,15 +34,22 @@ ls -lah image
mkdir image/test
time cp -r src image/test/src
dd if=/dev/urandom of=image/test/random bs=1M count=256
dd if=image/test/random of=/dev/null bs=1M count=256
time truncate --size=256M image/test/sparse
dd if=image/test/sparse of=/dev/null bs=1M count=256
dd if=/dev/zero of=image/test/zero bs=1M count=256
dd if=image/test/zero of=/dev/null bs=1M count=256
ls -lah image/test
df -h image
rm image/test/random
rm image/test/sparse
rm image/test/zero
rm -rf image/test/src
rmdir image/test
......