diff --git a/src/externs.rs b/src/externs.rs
index 4968e3c9142abd23ed2ad14746d00a1fd1df8b5f..42f111dc4183b8c0676921b1c29c3a63bd036e2c 100644
--- a/src/externs.rs
+++ b/src/externs.rs
@@ -9,23 +9,34 @@ const WORD_SIZE: usize = mem::size_of::<usize>();
 /// This faster implementation works by copying bytes not one-by-one, but in
 /// groups of 8 bytes (or 4 bytes in the case of 32-bit architectures).
 #[no_mangle]
-pub unsafe extern fn memcpy(dest: *mut u8, src: *const u8,
-                            n: usize) -> *mut u8 {
-
-    let n_usize: usize = n/WORD_SIZE; // Number of word sized groups
-    let mut i: usize = 0;
-
-    // Copy `WORD_SIZE` bytes at a time
-    let n_fast = n_usize*WORD_SIZE;
-    while i < n_fast {
-        *((dest as usize + i) as *mut usize) =
-            *((src as usize + i) as *const usize);
+pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, len: usize) -> *mut u8 {
+    // TODO: Alignment? Some sources claim that even on relatively modern µ-arches, unaligned
+    // accesses that span two pages can take dozens of cycles. That means a chunk-based memcpy
+    // can even be slower for small lengths if alignment is not taken into account.
+    //
+    // TODO: Optimize away the byte loop by first checking whether len < WORD_SIZE (and possibly
+    // whether dest + WORD_SIZE spans two pages), then doing one unaligned word copy, aligning
+    // dest up, and finishing with one last unaligned word copy?
+    //
+    // TODO: While we use the -fno-builtin equivalent, can we guarantee LLVM won't insert a
+    // memcpy call inside here? Maybe write it in assembly?
+
+    let mut i = 0_usize;
+
+    // First we copy len / WORD_SIZE chunks...
+
+    let chunks = len / WORD_SIZE;
+
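+    // dest and src are not guaranteed to be word-aligned, so go through read_unaligned /
+    // write_unaligned; dereferencing a misaligned *const usize would be undefined behaviour.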
+    while i < chunks * WORD_SIZE {
+        dest.add(i)
+            .cast::<usize>()
+            .write_unaligned(src.add(i).cast::<usize>().read_unaligned());
         i += WORD_SIZE;
     }
 
-    // Copy 1 byte at a time
-    while i < n {
-        *((dest as usize + i) as *mut u8) = *((src as usize + i) as *const u8);
+    // ... then we copy the remaining len % WORD_SIZE bytes
+    while i < len {
+        dest.add(i).write(src.add(i).read());
         i += 1;
     }
 
@@ -39,43 +50,42 @@ pub unsafe extern fn memcpy(dest: *mut u8, src: *const u8,
 /// This faster implementation works by copying bytes not one-by-one, but in
 /// groups of 8 bytes (or 4 bytes in the case of 32-bit architectures).
 #[no_mangle]
-pub unsafe extern fn memmove(dest: *mut u8, src: *const u8,
-                             n: usize) -> *mut u8 {
+pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, len: usize) -> *mut u8 {
+    let chunks = len / WORD_SIZE;
+
+    // TODO: also require dest - src < len (i.e. the regions actually overlap) before choosing to
+    // copy backwards?
     if src < dest as *const u8 {
-        let n_usize: usize = n/WORD_SIZE; // Number of word sized groups
-        let mut i: usize = n_usize*WORD_SIZE;
+        // We have to copy backwards when copying upwards (dest is above src).
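+        // Otherwise, with overlapping buffers, the tail of src would be overwritten before it
+        // has been read.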
 
-        // Copy `WORD_SIZE` bytes at a time
-        while i != 0 {
-            i -= WORD_SIZE;
-            *((dest as usize + i) as *mut usize) =
-                *((src as usize + i) as *const usize);
+        let mut i = len;
+
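+        // Copy the trailing len % WORD_SIZE bytes first, then the word-sized chunks, walking
+        // down from the end of the buffers.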
+        while i != chunks * WORD_SIZE {
+            i -= 1;
+            dest.add(i).write(src.add(i).read());
         }
 
-        let mut i: usize = n;
+        while i > 0 {
+            i -= WORD_SIZE;
 
-        // Copy 1 byte at a time
-        while i != n_usize*WORD_SIZE {
-            i -= 1;
-            *((dest as usize + i) as *mut u8) =
-                *((src as usize + i) as *const u8);
+            dest.add(i)
+                .cast::<usize>()
+                .write_unaligned(src.add(i).cast::<usize>().read_unaligned());
         }
     } else {
-        let n_usize: usize = n/WORD_SIZE; // Number of word sized groups
-        let mut i: usize = 0;
-
-        // Copy `WORD_SIZE` bytes at a time
-        let n_fast = n_usize*WORD_SIZE;
-        while i < n_fast {
-            *((dest as usize + i) as *mut usize) =
-                *((src as usize + i) as *const usize);
+        // We can safely copy forwards when copying downwards (dest is at or below src).
+
+        let mut i = 0_usize;
+
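+        // Same forward chunk-then-byte copy as in memcpy above.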
+        while i < chunks * WORD_SIZE {
+            dest.add(i)
+                .cast::<usize>()
+                .write_unaligned(src.add(i).cast::<usize>().read_unaligned());
+
             i += WORD_SIZE;
         }
 
-        // Copy 1 byte at a time
-        while i < n {
-            *((dest as usize + i) as *mut u8) =
-                *((src as usize + i) as *const u8);
+        while i < len {
+            dest.add(i).write(src.add(i).read());
             i += 1;
         }
     }
@@ -90,23 +100,21 @@ pub unsafe extern fn memmove(dest: *mut u8, src: *const u8,
 /// This faster implementation works by setting bytes not one-by-one, but in
 /// groups of 8 bytes (or 4 bytes in the case of 32-bit architectures).
 #[no_mangle]
-pub unsafe extern fn memset(dest: *mut u8, c: i32, n: usize) -> *mut u8 {
-    let c: usize = mem::transmute([c as u8; WORD_SIZE]);
-    let n_usize: usize = n/WORD_SIZE;
-    let mut i: usize = 0;
-
-    // Set `WORD_SIZE` bytes at a time
-    let n_fast = n_usize*WORD_SIZE;
-    while i < n_fast {
-        *((dest as usize + i) as *mut usize) = c;
+pub unsafe extern "C" fn memset(dest: *mut u8, byte: i32, len: usize) -> *mut u8 {
+    let byte = byte as u8;
+
+    let mut i = 0;
+
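+    // Repeat the byte in every byte position of a word so that a single word-sized store fills
+    // WORD_SIZE destination bytes at once.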
+    let broadcasted = usize::from_ne_bytes([byte; WORD_SIZE]);
+    let chunks = len / WORD_SIZE;
+
+    while i < chunks * WORD_SIZE {
+        dest.add(i).cast::<usize>().write_unaligned(broadcasted);
         i += WORD_SIZE;
     }
 
-    let c = c as u8;
-
-    // Set 1 byte at a time
-    while i < n {
-        *((dest as usize + i) as *mut u8) = c;
+    while i < len {
+        dest.add(i).write(byte);
         i += 1;
     }
 
@@ -120,34 +128,34 @@ pub unsafe extern fn memset(dest: *mut u8, c: i32, n: usize) -> *mut u8 {
 /// This faster implementation works by comparing bytes not one-by-one, but in
 /// groups of 8 bytes (or 4 bytes in the case of 32-bit architectures).
 #[no_mangle]
-pub unsafe extern fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
-    let n_usize: usize = n/WORD_SIZE;
-    let mut i: usize = 0;
-
-    let n_fast = n_usize*WORD_SIZE;
-    while i < n_fast {
-        let a = *((s1 as usize + i) as *const usize);
-        let b = *((s2 as usize + i) as *const usize);
+pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, len: usize) -> i32 {
+    let mut i = 0_usize;
+
+    // First compare WORD_SIZE-byte chunks...
+    let chunks = len / WORD_SIZE;
+
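+    // As in memcpy, s1 and s2 may be misaligned, so the words are loaded with read_unaligned.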
+    while i < chunks * WORD_SIZE {
+        let a = s1.add(i).cast::<usize>().read_unaligned();
+        let b = s2.add(i).cast::<usize>().read_unaligned();
+
         if a != b {
-            let n: usize = i + WORD_SIZE;
-            // Find the one byte that is not equal
-            while i < n {
-                let a = *((s1 as usize + i) as *const u8);
-                let b = *((s2 as usize + i) as *const u8);
-                if a != b {
-                    return a as i32 - b as i32;
-                }
-                i += 1;
-            }
+            // Byte-swapping to big-endian makes an unsigned word comparison equivalent to a
+            // lexicographic byte comparison. x86 has had bswap since the 80486, and the compiler
+            // will likely use the faster movbe. AArch64 has the REV instruction, which I think is
+            // universally available.
+            //
+            // Note that subtracting the two words and taking the sign of the difference would be
+            // wrong: the unsigned difference can exceed isize::MAX and flip the sign when cast.
+            return if usize::from_be(a) < usize::from_be(b) { -1 } else { 1 };
         }
         i += WORD_SIZE;
     }
 
-    while i < n {
-        let a = *((s1 as usize + i) as *const u8);
-        let b = *((s2 as usize + i) as *const u8);
+    // ... and then compare bytes.
+    while i < len {
+        let a = s1.add(i).read();
+        let b = s2.add(i).read();
+
         if a != b {
-            return a as i32 - b as i32;
+            return i32::from(a) - i32::from(b);
         }
         i += 1;
     }