diff --git a/Cargo.lock b/Cargo.lock
index 706ed5e221c38ee01a600266b2812eecef66d449..6184caf09d028029a907ad56d7548c0b370e90c3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -75,7 +75,6 @@ dependencies = [
  "linked_list_allocator 0.9.0",
  "log",
  "memoffset",
- "paste",
  "raw-cpuid 8.1.2",
  "redox_syscall",
  "rmm",
@@ -131,12 +130,6 @@ dependencies = [
  "autocfg",
 ]
 
-[[package]]
-name = "paste"
-version = "1.0.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "acbf547ad0c65e31259204bd90935776d1c693cec2f4ff7abb7a1bbbd40dfe58"
-
 [[package]]
 name = "plain"
 version = "0.2.3"
diff --git a/Cargo.toml b/Cargo.toml
index 9f651a64b3f3b1b06ecf385521e874c6e12b5838..c58a7ec9a62339f8e7079db7e15f66a3d28379ac 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,7 +22,6 @@ memoffset = { version = "0.6", features = ["unstable_const"] }
 redox_syscall = { path = "syscall" }
 slab_allocator = { path = "slab_allocator", optional = true }
 spin = "0.9"
-paste = "1"
 rmm = { path = "rmm", default-features = false }
 
 [dependencies.goblin]
diff --git a/src/arch/x86_64/consts.rs b/src/arch/x86_64/consts.rs
index 5589106e1441e5dd95f672e9f3a3efbf5c2b9e0e..2d9ac4014859ab73895e2f2478cf2514c891f704 100644
--- a/src/arch/x86_64/consts.rs
+++ b/src/arch/x86_64/consts.rs
@@ -28,7 +28,8 @@
     pub const KERNEL_PERCPU_OFFSET: usize = KERNEL_TMP_MISC_OFFSET - PML4_SIZE;
     pub const KERNEL_PERCPU_PML4: usize = (KERNEL_PERCPU_OFFSET & PML4_MASK)/PML4_SIZE;
     /// Size of kernel percpu variables
-    pub const KERNEL_PERCPU_SIZE: usize = 64 * 1024; // 64 KB
+    pub const KERNEL_PERCPU_SHIFT: u8 = 16; // 2^16 = 64 KiB
+    pub const KERNEL_PERCPU_SIZE: usize = 1_usize << KERNEL_PERCPU_SHIFT;
 
     /// Offset of physmap
     // This needs to match RMM's PHYS_OFFSET
diff --git a/src/arch/x86_64/gdt.rs b/src/arch/x86_64/gdt.rs
index 94ad795baa15d00877ec8066ed5e9076476f56a9..5e974c3db628b16d9090e636ad76cf506c8577fe 100644
--- a/src/arch/x86_64/gdt.rs
+++ b/src/arch/x86_64/gdt.rs
@@ -52,7 +52,7 @@ static mut INIT_GDT: [GdtEntry; 4] = [
 ];
 
 #[thread_local]
-pub static mut GDT: [GdtEntry; 9] = [
+pub static mut GDT: [GdtEntry; 10] = [
     // Null
     GdtEntry::new(0, 0, 0, 0),
     // Kernel code
@@ -71,6 +71,9 @@ pub static mut GDT: [GdtEntry; 9] = [
     GdtEntry::new(0, 0, GDT_A_PRESENT | GDT_A_RING_3 | GDT_A_TSS_AVAIL, 0),
     // TSS must be 16 bytes long, twice the normal size
     GdtEntry::new(0, 0, 0, 0),
+    // Unused entry which stores the CPU ID. This is necessary for paranoid interrupts as they have
+    // no other way of determining it.
+    GdtEntry::new(0, 0, 0, 0),
 ];
 
 #[repr(C, align(16))]
diff --git a/src/arch/x86_64/interrupt/exception.rs b/src/arch/x86_64/interrupt/exception.rs
index 48c0353da135ff842565595cede31d6e9f7ef6cf..242c7cf3b5d0ef34e9fcf2aadc95989b40018aa0 100644
--- a/src/arch/x86_64/interrupt/exception.rs
+++ b/src/arch/x86_64/interrupt/exception.rs
@@ -18,7 +18,7 @@ interrupt_stack!(divide_by_zero, |stack| {
     ksignal(SIGFPE);
 });
 
-interrupt_stack!(debug, super_atomic: swapgs_iff_ring3_slow!, |stack| {
+interrupt_stack!(debug, @paranoid, |stack| {
     let mut handled = false;
 
     // Disable singlestep before there is a breakpoint, since the breakpoint
@@ -41,7 +41,7 @@ interrupt_stack!(debug, super_atomic: swapgs_iff_ring3_slow!, |stack| {
     }
 });
 
-interrupt_stack!(non_maskable, super_atomic: swapgs_iff_ring3_slow!, |stack| {
+interrupt_stack!(non_maskable, @paranoid, |stack| {
     println!("Non-maskable interrupt");
     stack.dump();
 });
@@ -158,7 +158,7 @@ interrupt_error!(alignment_check, |stack| {
     ksignal(SIGBUS);
 });
 
-interrupt_stack!(machine_check, super_atomic: swapgs_iff_ring3_slow!, |stack| {
+interrupt_stack!(machine_check, @paranoid, |stack| {
     println!("Machine check fault");
     stack.dump();
     stack_trace();
diff --git a/src/arch/x86_64/interrupt/handler.rs b/src/arch/x86_64/interrupt/handler.rs
index d3921f0c7229cc4677f7f865752eecc287065a20..58ee4dd7d0bad535af5f071810377df9e8240597 100644
--- a/src/arch/x86_64/interrupt/handler.rs
+++ b/src/arch/x86_64/interrupt/handler.rs
@@ -303,38 +303,125 @@ macro_rules! swapgs_iff_ring3_fast_errorcode {
         1:
     " };
 }
-#[macro_export]
-macro_rules! swapgs_iff_ring3_slow {
+
+#[cfg(feature = "x86_fsgsbase")]
+macro_rules! save_gsbase_paranoid {
+    () => { "
+        // Unused: {IA32_GS_BASE}
+        rdgsbase rax
+        push rax
+    " }
+}
+#[cfg(feature = "x86_fsgsbase")]
+macro_rules! restore_gsbase_paranoid {
     () => { "
-    push rax
-    push rdx
-    push rcx
-    mov ecx, 0xC0000102
-    rdmsr
-    shl rdx, 32
-    or eax, edx
-    test rdx, rdx
-    jnz 1f
-    swapgs
-    1:
-    pop rcx
-    pop rdx
-    pop rax
+        // Unused: {IA32_GS_BASE}
+        pop rax
+        wrgsbase rax
     " }
 }
+#[cfg(not(feature = "x86_fsgsbase"))]
+macro_rules! save_gsbase_paranoid {
+    () => { "
+        mov ecx, {IA32_GS_BASE}
+        rdmsr
+        shl rdx, 32
+        or rax, rdx
+
+        push rax
+    " }
+}
+#[cfg(not(feature = "x86_fsgsbase"))]
+macro_rules! restore_gsbase_paranoid {
+    () => { "
+        pop rdx
+
+        mov ecx, {IA32_GS_BASE}
+        mov eax, edx
+        shr rdx, 32
+        wrmsr
+    " }
+}
+
+#[cfg(feature = "x86_fsgsbase")]
+macro_rules! set_gsbase_paranoid {
+    () => { "
+        // Unused: {IA32_GS_BASE}
+        wrgsbase rax
+    " }
+}
+#[cfg(not(feature = "x86_fsgsbase"))]
+macro_rules! set_gsbase_paranoid {
+    () => { "
+        mov ecx, {IA32_GS_BASE}
+        mov rdx, rax
+        shr rdx, 32
+        wrmsr
+    " }
+}
+
+macro_rules! save_and_set_gsbase_paranoid {
+    // For paranoid interrupt entries, we have to be extremely careful with how we use IA32_GS_BASE
+    // and IA32_KERNEL_GS_BASE. If FSGSBASE is enabled, then we have no way to differentiate these
+    // two, as paranoid interrupts (e.g. NMIs) can occur even in kernel mode. In fact, they can
+    // even occur within another IRQ, so we cannot check the the privilege level via the stack.
+    //
+    // TODO: Linux uses the Interrupt Stack Table to figure out which NMIs were nested. Perhaps
+    // this could be done here, because if nested (sp > initial_sp), that means the NMI could not
+    // have come from userspace. But then, knowing the initial sp would somehow have to involve
+    // percpu, which brings us back to square one. But it might be useful if we would allow faults
+    // in NMIs.
+    //
+    // What we do instead, is using a special entry in the GDT, since we know that the GDT will
+    // always be thread local, as it contains the TSS. This gives us more than 32 bits to work
+    // with, which already is the largest x2APIC ID that an x86 CPU can handle. Luckily we can also
+    // use the stack, even though there might be interrupts in between.
+    () => { concat!(
+        save_gsbase_paranoid!(),
+
+        // Allocate stack space for 8 bytes GDT base and 2 bytes size (ignored).
+        "sub rsp, 16\n",
+        // Set it to the GDT base.
+        "sgdt [rsp + 6]\n",
+        // Get the base pointer
+        "
+        mov rax, [rsp + 8]
+        add rsp, 16
+        ",
+        // Load the lower 32 bits of that GDT entry.
+        "mov eax, [rax]\n",
+        // Calculate the percpu offset.
+        "
+        mov rbx, {KERNEL_PERCPU_OFFSET}
+        shl rax, {KERNEL_PERCPU_SHIFT}
+        add rax, rbx
+        ",
+        // Set GSBASE to RAX accordingly
+        set_gsbase_paranoid!(),
+    ) }
+}
+macro_rules! nop {
+    () => { "
+        // Unused: {IA32_GS_BASE} {KERNEL_PERCPU_OFFSET} {KERNEL_PERCPU_SHIFT}
+        " }
+}
 
 #[macro_export]
 macro_rules! interrupt_stack {
     // XXX: Apparently we cannot use $expr and check for bool exhaustiveness, so we will have to
     // use idents directly instead.
-    ($name:ident, super_atomic: $is_super_atomic:ident!, |$stack:ident| $code:block) => {
-        paste::item! {
-            #[no_mangle]
-            unsafe extern "C" fn [<__interrupt_ $name>]($stack: &mut $crate::arch::x86_64::interrupt::InterruptStack) {
-                // Deadlock safety: interrupts are not normally enabled in the kernel, except in
-                // kmain. However, no locks for context list nor even individual context locks, are
-                // ever meant to be acquired there.
-                let _guard = $crate::ptrace::set_process_regs($stack);
+    ($name:ident, $save1:ident!, $save2:ident!, $rstor2:ident!, $rstor1:ident!, is_paranoid: $is_paranoid:expr, |$stack:ident| $code:block) => {
+        #[naked]
+        pub unsafe extern "C" fn $name() {
+            unsafe extern "C" fn inner($stack: &mut $crate::arch::x86_64::interrupt::InterruptStack) {
+                let _guard;
+
+                if !$is_paranoid {
+                    // Deadlock safety: (non-paranoid) interrupts are not normally enabled in the
+                    // kernel, except in kmain. However, no locks for context list nor even
+                    // individual context locks, are ever meant to be acquired there.
+                    _guard = $crate::ptrace::set_process_regs($stack);
+                }
 
                 // TODO: Force the declarations to specify unsafe?
 
@@ -343,46 +430,61 @@ macro_rules! interrupt_stack {
                     $code
                 }
             }
-
-            function!($name => {
+            asm!(concat!(
                 // Backup all userspace registers to stack
-                $is_super_atomic!(),
+                $save1!(),
                 "push rax\n",
                 push_scratch!(),
                 push_preserved!(),
 
+                $save2!(),
+
                 // TODO: Map PTI
                 // $crate::arch::x86_64::pti::map();
 
                 // Call inner function with pointer to stack
-                "mov rdi, rsp\n",
-                "call __interrupt_", stringify!($name), "\n",
+                "
+                mov rdi, rsp
+                call {inner}
+                ",
 
                 // TODO: Unmap PTI
                 // $crate::arch::x86_64::pti::unmap();
 
+                $rstor2!(),
+
                 // Restore all userspace registers
                 pop_preserved!(),
                 pop_scratch!(),
 
-                $is_super_atomic!(),
+                $rstor1!(),
                 "iretq\n",
-            });
+            ),
+
+            inner = sym inner,
+            IA32_GS_BASE = const(x86::msr::IA32_GS_BASE),
+            KERNEL_PERCPU_SHIFT = const(crate::KERNEL_PERCPU_SHIFT),
+            KERNEL_PERCPU_OFFSET = const(crate::KERNEL_PERCPU_OFFSET),
+
+            options(noreturn),
+
+            );
         }
     };
-    ($name:ident, |$stack:ident| $code:block) => { interrupt_stack!($name, super_atomic: swapgs_iff_ring3_fast!, |$stack| $code); };
+    ($name:ident, |$stack:ident| $code:block) => { interrupt_stack!($name, swapgs_iff_ring3_fast!, nop!, nop!, swapgs_iff_ring3_fast!, is_paranoid: false, |$stack| $code); };
+    ($name:ident, @paranoid, |$stack:ident| $code:block) => { interrupt_stack!($name, nop!, save_and_set_gsbase_paranoid!, restore_gsbase_paranoid!, nop!, is_paranoid: true, |$stack| $code); }
 }
 
 #[macro_export]
 macro_rules! interrupt {
     ($name:ident, || $code:block) => {
-        paste::item! {
-            #[no_mangle]
-            unsafe extern "C" fn [<__interrupt_ $name>]() {
+        #[naked]
+        pub unsafe extern "C" fn $name() {
+            unsafe extern "C" fn inner() {
                 $code
             }
 
-            function!($name => {
+            asm!(concat!(
                 // Backup all userspace registers to stack
                 swapgs_iff_ring3_fast!(),
                 "push rax\n",
@@ -392,7 +494,7 @@ macro_rules! interrupt {
                 // $crate::arch::x86_64::pti::map();
 
                 // Call inner function with pointer to stack
-                "call __interrupt_", stringify!($name), "\n",
+                "call {inner}\n",
 
                 // TODO: Unmap PTI
                 // $crate::arch::x86_64::pti::unmap();
@@ -402,7 +504,12 @@ macro_rules! interrupt {
 
                 swapgs_iff_ring3_fast!(),
                 "iretq\n",
-            });
+            ),
+
+            inner = sym inner,
+
+            options(noreturn),
+            );
         }
     };
 }
@@ -410,9 +517,9 @@ macro_rules! interrupt {
 #[macro_export]
 macro_rules! interrupt_error {
     ($name:ident, |$stack:ident| $code:block) => {
-        paste::item! {
-            #[no_mangle]
-            unsafe extern "C" fn [<__interrupt_ $name>]($stack: &mut $crate::arch::x86_64::interrupt::handler::InterruptErrorStack) {
+        #[naked]
+        pub unsafe extern "C" fn $name() {
+            unsafe extern "C" fn inner($stack: &mut $crate::arch::x86_64::interrupt::handler::InterruptErrorStack) {
                 let _guard;
 
                 // Only set_ptrace_process_regs if this error occured from userspace. If this fault
@@ -432,7 +539,7 @@ macro_rules! interrupt_error {
                 }
             }
 
-            function!($name => {
+            asm!(concat!(
                 swapgs_iff_ring3_fast_errorcode!(),
                 // Move rax into code's place, put code in last instead (to be
                 // compatible with InterruptStack)
@@ -449,8 +556,10 @@ macro_rules! interrupt_error {
                 // $crate::arch::x86_64::pti::map();
 
                 // Call inner function with pointer to stack
-                "mov rdi, rsp\n",
-                "call __interrupt_", stringify!($name), "\n",
+                "
+                mov rdi, rsp
+                call {inner}
+                ",
 
                 // TODO: Unmap PTI
                 // $crate::arch::x86_64::pti::unmap();
@@ -462,9 +571,14 @@ macro_rules! interrupt_error {
                 pop_preserved!(),
                 pop_scratch!(),
 
-                swapgs_iff_ring3_fast_errorcode!(),
+                // The error code has already been popped, so use the regular macro.
+                swapgs_iff_ring3_fast!(),
                 "iretq\n",
-            });
+            ),
+
+            inner = sym inner,
+
+            options(noreturn));
         }
     };
 }