From c4fc76f8441b15a6e74094ae5a9423dbdf869cfa Mon Sep 17 00:00:00 2001
From: pi_pi3 <walter.szymon.98@gmail.com>
Date: Fri, 14 Apr 2017 14:37:32 +0200
Subject: [PATCH] A faster implementation of the memcpy family

The default implementation of the memcpy, memmove, memset and memcmp
functions in the kernel file `externs.rs` is naive: it copies, assigns
or compares bytes one by one. This can be slow. This commit
reimplements those functions so that they copy, assign or compare in
groups of 8 bytes, using the u64 type and its corresponding pointer
types instead of u8. Alternative versions for 32-bit architectures
(based on u32) are also supplied for future compatibility with x86.
Both versions first process whatever they can with wide word types;
the tail, i.e. the final few bytes that do not fit in a dword or
qword, is then handled byte by byte.
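
In sketch form, the 64-bit memcpy boils down to the following split
(simplified: the #[no_mangle]/#[cfg] attributes and the 32-bit variant
are omitted, and the name wide_copy is used only for illustration).
For example, n = 100 splits into twelve 8-byte groups (96 bytes)
followed by a 4-byte tail:

    unsafe fn wide_copy(dest: *mut u8, src: *const u8, n: usize) {
        let n_64 = n / 8; // number of full 8-byte groups
        let mut i = 0;
        while i < n_64 { // copy one u64 per iteration
            *((dest as usize + i*8) as *mut u64) =
                *((src as usize + i*8) as *const u64);
            i += 1;
        }
        let mut i = i * 8; // first byte not yet copied
        while i < n { // copy the remaining n % 8 tail bytes
            *((dest as usize + i) as *mut u8) =
                *((src as usize + i) as *const u8);
            i += 1;
        }
    }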

Here is a comparison of copying 64 KiB (65536 bytes) on the stack:

x86_64-unknown-linux-gnu: (64-bit)
       | naive (ns) | fast (ns) | speedup (x)
-------|------------|-----------|------------
memcpy |   204430   |   32994   |   ~6.20
memmove|   202540   |   33186   |   ~6.10
memset |   163391   |   23884   |   ~6.84
memcmp |   205663   |   34385   |   ~5.98

i686-unknown-linux-gnu: (32-bit)
       | naive (ns) | fast (ns) | speedup (x)
-------|------------|-----------|------------
memcpy |   206297   |   66858   |   ~3.09
memmove|   204576   |   70326   |   ~2.91
memset |   165599   |   50227   |   ~3.30
memcmp |   204262   |   70572   |   ~2.89

Copying on the heap behaves similarly.

All tests were performed on an Intel i5 6600K (4x4.2 GHz),
Arch Linux kernel 4.8.12-3, x86_64.
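
For reference, a minimal userspace harness along the following lines
can produce this kind of measurement. The benchmark code itself is not
part of this patch; naive_memcpy below is only a stand-in for the old
byte-by-byte implementation, and the fast functions from externs.rs
are timed the same way. It uses heap buffers, which the note above
says behave similarly to the stack.

    use std::time::Instant;

    const N: usize = 65536; // 64 KiB, as in the tables above
    const RUNS: u32 = 1_000;

    // Byte-by-byte copy, standing in for the old implementation
    unsafe fn naive_memcpy(dest: *mut u8, src: *const u8,
                           n: usize) -> *mut u8 {
        let mut i = 0;
        while i < n {
            *((dest as usize + i) as *mut u8) =
                *((src as usize + i) as *const u8);
            i += 1;
        }
        dest
    }

    // Time `copy` over RUNS copies of an N-byte buffer and report the
    // average time per copy in nanoseconds
    fn bench(name: &str,
             copy: unsafe fn(*mut u8, *const u8, usize) -> *mut u8) {
        let src = vec![0xA5u8; N];
        let mut dest = vec![0u8; N];

        let start = Instant::now();
        for _ in 0..RUNS {
            unsafe { copy(dest.as_mut_ptr(), src.as_ptr(), N); }
        }
        let avg_ns = start.elapsed().as_nanos() / u128::from(RUNS);

        assert_eq!(src, dest); // sanity check
        println!("{}: ~{} ns per {}-byte copy", name, avg_ns, N);
    }

    fn main() {
        bench("naive memcpy", naive_memcpy);
    }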
---
 src/externs.rs | 247 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 238 insertions(+), 9 deletions(-)

diff --git a/src/externs.rs b/src/externs.rs
index d92f8c44..9db9ea3a 100644
--- a/src/externs.rs
+++ b/src/externs.rs
@@ -1,10 +1,50 @@
 /// Memcpy
 ///
 /// Copy N bytes of memory from one location to another.
+///
+/// This faster implementation works by copying bytes not one-by-one, but in
+/// groups of 8 bytes (or 4 bytes in the case of 32-bit architectures).
+#[cfg(target_pointer_width = "64")]
 #[no_mangle]
 pub unsafe extern fn memcpy(dest: *mut u8, src: *const u8,
                             n: usize) -> *mut u8 {
-    let mut i = 0;
+    let n_64: usize = n/8; // Number of 64-bit groups
+    let mut i: usize = 0;
+
+    // Copy 8 bytes at a time
+    while i < n_64 {
+        *((dest as usize + i*8) as *mut u64) =
+            *((src as usize + i*8) as *const u64);
+        i += 1;
+    }
+
+    let mut i: usize = i*8;
+
+    // Copy 1 byte at a time
+    while i < n {
+        *((dest as usize + i) as *mut u8) = *((src as usize + i) as *const u8);
+        i += 1;
+    }
+
+    dest
+}
+
+// 32-bit version of the function above
+#[cfg(target_pointer_width = "32")]
+#[no_mangle]
+pub unsafe extern fn memcpy(dest: *mut u8, src: *const u8,
+                            n: usize) -> *mut u8 {
+    let n_32: usize = n/4; // Number of 32-bit groups
+    let mut i: usize = 0;
+
+    // Copy 4 bytes at a time
+    while i < n_32 {
+        *((dest as usize + i*4) as *mut u32) =
+            *((src as usize + i*4) as *const u32);
+        i += 1;
+    }
+
+    let mut i: usize = i*4;
     while i < n {
         *((dest as usize + i) as *mut u8) = *((src as usize + i) as *const u8);
         i += 1;
@@ -16,19 +56,97 @@ pub unsafe extern fn memcpy(dest: *mut u8, src: *const u8,
 /// Memmove
 ///
 /// Copy N bytes of memory from src to dest. The memory areas may overlap.
+///
+/// This faster implementation works by copying bytes not one-by-one, but in
+/// groups of 8 bytes (or 4 bytes in the case of 32-bit architectures).
+#[cfg(target_pointer_width = "64")]
 #[no_mangle]
 pub unsafe extern fn memmove(dest: *mut u8, src: *const u8,
                              n: usize) -> *mut u8 {
     if src < dest as *const u8 {
-        let mut i = n;
+        let n_64: usize = n/8; // Number of 64-bit groups
+        let mut i: usize = n % 8; // Length of the byte-sized tail
+
+        // Copy the tail first, so the overlapping region is not clobbered
         while i != 0 {
             i -= 1;
-            *((dest as usize + i) as *mut u8) = *((src as usize + i) as *const u8);
+            *((dest as usize + n_64*8 + i) as *mut u8) =
+                *((src as usize + n_64*8 + i) as *const u8);
+        }
+
+        let mut i: usize = n_64;
+
+        // Then copy 8 bytes at a time, also from the end backwards
+        while i != 0 {
+            i -= 1;
+            *((dest as usize + i*8) as *mut u64) =
+                *((src as usize + i*8) as *const u64);
         }
     } else {
-        let mut i = 0;
+        let n_64: usize = n/8; // Number of 64-bit groups
+        let mut i: usize = 0;
+
+        // Copy 8 bytes at a time
+        while i < n_64 {
+            *((dest as usize + i*8) as *mut u64) =
+                *((src as usize + i*8) as *const u64);
+            i += 1;
+        }
+
+        let mut i: usize = i*8;
+
+        // Copy 1 byte at a time
         while i < n {
-            *((dest as usize + i) as *mut u8) = *((src as usize + i) as *const u8);
+            *((dest as usize + i) as *mut u8) =
+                *((src as usize + i) as *const u8);
+            i += 1;
+        }
+    }
+
+    dest
+}
+
+// 32-bit version of the function above
+#[cfg(target_pointer_width = "32")]
+#[no_mangle]
+pub unsafe extern fn memmove(dest: *mut u8, src: *const u8,
+                             n: usize) -> *mut u8 {
+    if src < dest as *const u8 {
+        let n_32: usize = n/4; // Number of 32-bit groups
+        let mut i: usize = n % 4; // Length of the byte-sized tail
+
+        // Copy the tail first, so the overlapping region is not clobbered
+        while i != 0 {
+            i -= 1;
+            *((dest as usize + n_32*4 + i) as *mut u8) =
+                *((src as usize + n_32*4 + i) as *const u8);
+        }
+
+        let mut i: usize = n_32;
+
+        // Then copy 4 bytes at a time, also from the end backwards
+        while i != 0 {
+            i -= 1;
+            *((dest as usize + i*4) as *mut u32) =
+                *((src as usize + i*4) as *const u32);
+        }
+    } else {
+        let n_32: usize = n/4; // Number of 32-bit groups
+        let mut i: usize = 0;
+
+        // Copy 4 bytes at a time
+        while i < n_32 {
+            *((dest as usize + i*4) as *mut u32) =
+                *((src as usize + i*4) as *const u32);
+            i += 1;
+        }
+
+        let mut i: usize = i*4;
+
+        // Copy 1 byte at a time
+        while i < n {
+            *((dest as usize + i) as *mut u8) =
+                *((src as usize + i) as *const u8);
             i += 1;
         }
     }
@@ -39,11 +157,57 @@ pub unsafe extern fn memmove(dest: *mut u8, src: *const u8,
 /// Memset
 ///
 /// Fill a block of memory with a specified value.
+///
+/// This faster implementation works by setting bytes not one-by-one, but in
+/// groups of 8 bytes (or 4 bytes in the case of 32-bit architectures).
+#[cfg(target_pointer_width = "64")]
+#[no_mangle]
+pub unsafe extern fn memset(dest: *mut u8, c: i32, n: usize) -> *mut u8 {
+    let c = c as u8 as u64; // memset uses only the low byte of c
+    let c = (c << 56) | (c << 48) | (c << 40) | (c << 32)
+          | (c << 24) | (c << 16) | (c << 8)  | c;
+    let n_64: usize = n/8;
+    let mut i: usize = 0;
+
+    // Set 8 bytes at a time
+    while i < n_64 {
+        *((dest as usize + i*8) as *mut u64) = c;
+        i += 1;
+    }
+
+    let c = c as u8;
+    let mut i: usize = i*8;
+
+    // Set 1 byte at a time
+    while i < n {
+        *((dest as usize + i) as *mut u8) = c;
+        i += 1;
+    }
+
+    dest
+}
+
+// 32-bit version of the function above
+#[cfg(target_pointer_width = "32")]
 #[no_mangle]
 pub unsafe extern fn memset(dest: *mut u8, c: i32, n: usize) -> *mut u8 {
-    let mut i = 0;
+    let c = c as u8 as u32; // memset uses only the low byte of c
+    let c = (c << 24) | (c << 16) | (c << 8)  | c;
+    let n_32: usize = n/4;
+    let mut i: usize = 0;
+
+    // Set 4 bytes at a time
+    while i < n_32 {
+        *((dest as usize + i*4) as *mut u32) = c;
+        i += 1;
+    }
+
+    let c = c as u8;
+    let mut i: usize = i*4;
+
+    // Set 1 byte at a time
     while i < n {
-        *((dest as usize + i) as *mut u8) = c as u8;
+        *((dest as usize + i) as *mut u8) = c;
         i += 1;
     }
 
@@ -53,15 +217,80 @@ pub unsafe extern fn memset(dest: *mut u8, c: i32, n: usize) -> *mut u8 {
 /// Memcmp
 ///
 /// Compare two blocks of memory.
+///
+/// This faster implementation works by comparing bytes not one-by-one, but in
+/// groups of 8 bytes (or 4 bytes in the case of 32-bit architectures).
+#[cfg(target_pointer_width = "64")]
 #[no_mangle]
 pub unsafe extern fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
-    let mut i = 0;
+    let n_64: usize = n/8;
+    let mut i: usize = 0;
+
+    while i < n_64 {
+        let a = *((s1 as usize + i*8) as *const u64);
+        let b = *((s2 as usize + i*8) as *const u64);
+        if a != b {
+            let mut i: usize = i*8;
+            let n: usize = i + 8;
+            // Find the one byte that is not equal
+            while i < n {
+                let a = *((s1 as usize + i) as *const u8);
+                let b = *((s2 as usize + i) as *const u8);
+                if a != b {
+                    return a as i32 - b as i32;
+                }
+                i += 1;
+            }
+        }
+        i += 1;
+    }
+
+    let mut i: usize = i*8;
+
+    while i < n {
+        let a = *((s1 as usize + i) as *const u8);
+        let b = *((s2 as usize + i) as *const u8);
+        if a != b {
+            return a as i32 - b as i32;
+        }
+        i += 1;
+    }
+
+    0
+}
+
+#[cfg(target_pointer_width = "32")]
+#[no_mangle]
+pub unsafe extern fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
+    let n_32: usize = n/4;
+    let mut i: usize = 0;
+
+    while i < n_32 {
+        let a = *((s1 as usize + i*4) as *const u32);
+        let b = *((s2 as usize + i*4) as *const u32);
+        if a != b {
+            let mut i: usize = i*4;
+            let n: usize = i + 4;
+            // Find the one byte that is not equal
+            while i < n {
+                let a = *((s1 as usize + i) as *const u8);
+                let b = *((s2 as usize + i) as *const u8);
+                if a != b {
+                    return a as i32 - b as i32;
+                }
+                i += 1;
+            }
+        }
+        i += 1;
+    }
+
+    let mut i: usize = i*4;
 
     while i < n {
         let a = *((s1 as usize + i) as *const u8);
         let b = *((s2 as usize + i) as *const u8);
         if a != b {
-            return a as i32 - b as i32
+            return a as i32 - b as i32;
         }
         i += 1;
     }
-- 
GitLab