diff --git a/redox-rt/src/arch/i686.rs b/redox-rt/src/arch/i686.rs
index d2912ae8c4b246f1f622ee699b465295f1496c0e..27f1e55bce87fd910e6c306fd703aa60ffba311a 100644
--- a/redox-rt/src/arch/i686.rs
+++ b/redox-rt/src/arch/i686.rs
@@ -10,6 +10,7 @@ pub(crate) const STACK_TOP: usize = 1 << 31;
 pub(crate) const STACK_SIZE: usize = 1024 * 1024;
 
 #[derive(Debug, Default)]
+#[repr(C)]
 pub struct SigArea {
     pub altstack_top: usize,
     pub altstack_bottom: usize,
diff --git a/redox-rt/src/arch/x86_64.rs b/redox-rt/src/arch/x86_64.rs
index 13bfc5264047ad5649023619a4259c1c80d19d4a..33d52167e25e4b91e7ee6391f5bbfae79da6f70d 100644
--- a/redox-rt/src/arch/x86_64.rs
+++ b/redox-rt/src/arch/x86_64.rs
@@ -6,6 +6,7 @@ use syscall::error::*;
 use syscall::flag::*;
 
 use crate::proc::{fork_inner, FdGuard};
+use crate::signal::SigStack;
 use crate::signal::{inner_c, RtSigarea};
 
 // Setup a stack starting from the very end of the address space, and then growing downwards.
@@ -13,6 +14,7 @@ pub(crate) const STACK_TOP: usize = 1 << 47;
 pub(crate) const STACK_SIZE: usize = 1024 * 1024;
 
 #[derive(Debug, Default)]
+#[repr(C)]
 pub struct SigArea {
     pub tmp_rip: usize,
     pub tmp_rsp: usize,
@@ -25,6 +27,31 @@ pub struct SigArea {
     pub disable_signals_depth: u64,
 }
 
+#[repr(C, align(64))] // NOTE(review): align(64) rounds size_of up to 192; confirm intended
+#[derive(Debug, Default)]
+pub struct ArchIntRegs { // saved integer registers; this is the type of `SigStack::regs`
+    _pad: [usize; 2], // ensure size is divisible by 32 (2 pad + 18 register words = 160 bytes)
+    // Lowest address first, i.e. the reverse of sigentry's push order (rsp, rip, rflags, rdi, ..).
+    pub r15: usize,
+    pub r14: usize,
+    pub r13: usize,
+    pub r12: usize,
+    pub rbp: usize,
+    pub rbx: usize,
+    pub r11: usize,
+    pub r10: usize,
+    pub r9: usize,
+    pub r8: usize,
+    pub rax: usize,
+    pub rcx: usize,
+    pub rdx: usize,
+    pub rsi: usize,
+    pub rdi: usize,
+    pub rflags: usize, // the trampoline restores this with `popfq`
+    pub rip: usize, // interrupted instruction pointer; `arch_pre` may rewrite it
+    pub rsp: usize, // interrupted stack pointer; `arch_pre` may rewrite it
+}
+
 /// Deactive TLS, used before exec() on Redox to not trick target executable into thinking TLS
 /// is already initialized as if it was a thread.
 pub unsafe fn deactivate_tcb(open_via_dup: usize) -> Result<()> {
@@ -185,11 +212,9 @@ asmfunction!(__relibc_internal_sigentry: ["
 4:
     // Now that we have a stack, we can finally start initializing the signal stack!
 
-    push 0x23 // SS
     push fs:[{tcb_sa_off} + {sa_tmp_rsp}]
-    push fs:[{tcb_sc_off} + {sc_saved_rflags}]
-    push 0x2b // CS
     push fs:[{tcb_sc_off} + {sc_saved_rip}]
+    push fs:[{tcb_sc_off} + {sc_saved_rflags}]
 
     push rdi
     push rsi
@@ -250,14 +275,12 @@ asmfunction!(__relibc_internal_sigentry: ["
     pop rsi
     pop rdi
 
-    iretq
-    /*
-    pop qword ptr fs:[{tcb_sa_off} + {sa_tmp_rip}]
-    add rsp, 8
     popfq
+    pop qword ptr fs:[{tcb_sa_off} + {sa_tmp_rip}]
+__relibc_internal_sigentry_crit_first:
     pop rsp
+__relibc_internal_sigentry_crit_second:
     jmp qword ptr fs:[{tcb_sa_off} + {sa_tmp_rip}]
-    */
 6:
     fxsave64 [rsp]
 
@@ -292,4 +315,29 @@ asmfunction!(__relibc_internal_sigentry: ["
     STACK_ALIGN = const 64, // if xsave is used
 ]);
 
+extern "C" { // labels inside the `__relibc_internal_sigentry` asm; `arch_pre` compares against them
+    fn __relibc_internal_sigentry_crit_first(); // placed on the trampoline's `pop rsp`
+    fn __relibc_internal_sigentry_crit_second(); // placed on the final `jmp` through tmp_rip
+} // NOTE(review): no `.globl` is visible for these labels in the asm; confirm they link
+pub unsafe fn arch_pre(stack: &mut SigStack, area: &mut SigArea) {
+    // Called from `signal::inner` right after it bumps `sig_num`, before the sigaction lookup.
+    // x86_64 cannot update RSP and RIP atomically without IRETQ, which is almost as slow as a
+    // SIGRETURN syscall would be. Instead, we abuse the fact that signals are disabled in the
+    // prologue of the signal trampoline, which lets us emulate atomicity for the critical section:
+    // one instruction at 'crit_first' (`pop rsp`) and one at 'crit_second' (the `jmp`), see asm.
+    // Safety: in the 'crit_first' case `stack.regs.rsp` and the word below it are read raw.
+    if stack.regs.rip == __relibc_internal_sigentry_crit_first as usize {
+        // Reexecute pop rsp and jump steps. This case needs to be different from the one below,
+        // since rsp has not been overwritten with the previous context's stack, just yet. At this
+        // point, we know [rsp+0] contains the saved RSP, and [rsp-8] contains the saved RIP.
+        let stack_ptr = stack.regs.rsp as *const usize;
+        stack.regs.rsp = stack_ptr.read(); // the value `pop rsp` was about to load
+        stack.regs.rip = stack_ptr.sub(1).read(); // the word the trampoline just popped to tmp_rip
+    } else if stack.regs.rip == __relibc_internal_sigentry_crit_second as usize {
+        // Almost finished, just reexecute the jump before tmp_rip is overwritten by this
+        // deeper-level signal.
+        stack.regs.rip = area.tmp_rip; // the target of the interrupted `jmp`
+    }
+}
+
 static SUPPORTS_XSAVE: AtomicU8 = AtomicU8::new(1); // FIXME
diff --git a/redox-rt/src/signal.rs b/redox-rt/src/signal.rs
index 63fcbd460e7b9adaf26a79814574d21621381b21..d328896d62fb73df93a7bcfbac9e62f6fe284ff2 100644
--- a/redox-rt/src/signal.rs
+++ b/redox-rt/src/signal.rs
@@ -2,7 +2,7 @@ use core::cell::{Cell, UnsafeCell};
 use core::ffi::c_int;
 use core::sync::atomic::Ordering;
 
-use syscall::{Error, IntRegisters, Result, SetSighandlerData, SigProcControl, Sigcontrol, SigcontrolFlags, EINVAL, SIGCHLD, SIGCONT, SIGKILL, SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU, SIGURG, SIGW0_NOCLDSTOP_BIT, SIGW0_TSTP_IS_STOP_BIT, SIGW0_TTIN_IS_STOP_BIT, SIGW0_TTOU_IS_STOP_BIT, SIGWINCH, data::AtomicU64};
+use syscall::{Error, Result, SetSighandlerData, SigProcControl, Sigcontrol, SigcontrolFlags, EINVAL, SIGCHLD, SIGCONT, SIGKILL, SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU, SIGURG, SIGW0_NOCLDSTOP_BIT, SIGW0_TSTP_IS_STOP_BIT, SIGW0_TTIN_IS_STOP_BIT, SIGW0_TTOU_IS_STOP_BIT, SIGWINCH, data::AtomicU64};
 
 use crate::{arch::*, Tcb};
 use crate::sync::Mutex;
@@ -41,16 +41,18 @@ pub struct SigStack {
     _pad: [usize; 2], // pad to 192 = 3 * 64 = 168 + 24 bytes
 
     sig_num: usize,
-    regs: IntRegisters, // 160 bytes currently
+    pub regs: ArchIntRegs, // 160 bytes currently
 }
 
 #[inline(always)]
 unsafe fn inner(stack: &mut SigStack) {
-    // TODO: Set procmask based on SA_NODEFER, SA_RESETHAND, and sa_mask.
+    let os = &Tcb::current().unwrap().os_specific;
 
     // asm counts from 0
     stack.sig_num += 1;
 
+    arch_pre(stack, &mut *os.arch.get());
+
     let sigaction = {
         let mut guard = SIGACTIONS.lock();
         let action = guard[stack.sig_num];
@@ -75,8 +77,6 @@ unsafe fn inner(stack: &mut SigStack) {
         sigallow_inside &= !sig_bit(stack.sig_num);
     }
 
-    let os = &Tcb::current().unwrap().os_specific;
-
     // Set sigmask to sa_mask and unmark the signal as pending.
     let lo = os.control.word[0].load(Ordering::Relaxed) >> 32;
     let hi = os.control.word[1].load(Ordering::Relaxed) >> 32;