From 9a7cfaeaf16d4c7586fe70fd490d1e85afb11b0f Mon Sep 17 00:00:00 2001
From: 4lDO2 <4lDO2@protonmail.com>
Date: Sat, 3 Aug 2024 01:14:07 +0200
Subject: [PATCH] Simplify aarch64 asm slightly.

---
 redox-rt/src/arch/aarch64.rs | 99 +++++++++++++++++++++++-------------
 1 file changed, 63 insertions(+), 36 deletions(-)

diff --git a/redox-rt/src/arch/aarch64.rs b/redox-rt/src/arch/aarch64.rs
index e6bf9827c..d72ef9cec 100644
--- a/redox-rt/src/arch/aarch64.rs
+++ b/redox-rt/src/arch/aarch64.rs
@@ -26,7 +26,8 @@ pub struct SigArea {
     pub pctl: usize, // TODO: remove
     pub last_sig_was_restart: bool,
     pub last_sigstack: Option<NonNull<SigStack>>,
-    pub tmp_inf: RtSigInfo,
+    pub tmp_rt_inf: RtSigInfo,
+    pub tmp_id_inf: u64,
 }
 #[repr(C)]
 #[derive(Debug, Default)]
@@ -176,65 +177,73 @@ asmfunction!(__relibc_internal_sigentry: ["
 
     ldr x6, [x0, #{tcb_sa_off} + {sa_pctl}]
 1:
-    // Load x1 and x2 with each signal group's bits (tearing between x1 and x2 can occur) with
-    // acquire ordering
-    add x2, x0, #{tcb_sc_off} + {sc_word}
-    ldaxp x1, x2, [x2]
+    // Load x1 with the thread's bits
+    add x5, x0, #{tcb_sc_off} + {sc_word}
+    ldaxr x1, [x5]
 
     // First check if there are standard thread signals,
     and x4, x1, x1, lsr #32 // x4 := x1 & (x1 >> 32)
     cbnz x4, 3f // jump if x4 != 0
+    clrex
 
     // and if not, load process pending bitset.
-    add x3, x6, #{pctl_pending}
-    ldaxr x3, [x3]
+    add x1, x6, #{pctl_pending}
+    ldaxr x2, [x1]
 
     // Check if there are standard proc signals:
-    lsr x4, x1, #32 // mask
-    and w4, w4, w3 // pending unblocked proc
-    cbz w4, 4f // skip 'fetch_andn' step if zero
+    lsr x3, x1, #32 // mask
+    and w3, w3, w3 // pending unblocked proc
+    cbz w3, 4f // skip 'fetch_andn' step if zero
 
     // If there was one, find which one, and try clearing the bit (last value in x3, addr in x6)
     // this picks the MSB rather than the LSB, unlike x86. POSIX does not require any specific
     // ordering though.
-    clz x4, x4
-    mov x5, #32
-    sub x4, x5, x4
+    clz x3, x3
+    mov x4, #32
+    sub x3, x4, x3
+    // x3 now contains the sig_idx
 
-    mov x5, #1
-    lsl x5, x5, x4 // bit to remove
+    mov x4, #1
+    lsl x4, x4, x3 // bit to remove
 
-    sub x5, x3, x5 // bit was certainly set, so sub is allowed
-    add x3, x6, #{pctl_pending}
+    sub x4, x2, x4 // bit was certainly set, so sub is allowed
+    // x4 is now the new mask to be set
+    add x5, x6, #{pctl_pending}
 
     // Try clearing the bit, retrying on failure.
-    add x3, x6, #{pctl_pending}
-    stxr w1, x5, [x3] // try setting [x3] to x5, set x5 := 0 on success
-    cbnz x1, 1b
-    mov x1, x4
+    stxr w1, x4, [x1] // try setting pending set to x4, set w1 := 0 on success
+    cbnz x1, 1b // retry everything if this fails
+    mov x1, x3
     b 2f
 4:
-    // Check for realtime signals, thread/proc. x1 is now free real estate (but needs to contain
-    // the selected signal number when entering Rust).
+    // Check for realtime signals, thread/proc.
+    clrex
 
-    b .
+    // Load the pending set again. TODO: optimize this?
+    add x1, x6, #{pctl_pending}
+    ldaxr x2, [x1]
+    lsr x2, x2, #32
 
-    mov w1, w2
-    orr w1, w1, w3
-    and x1, x1, x1, lsr #32
+    add x5, x0, #{tcb_sc_off} + {sc_word} + 8
+    ldar x1, [x5]
 
-    rbit x1, x1
-    clz x1, x1
-    mov x2, #32
-    sub x1, x2, x1
-    mov x2, #1
-    lsl x2, x2, x1
+    orr x2, x1, x2
+    and x2, x2, x2, lsr #32
+
+    rbit x3, x2
+    clz x3, x3
+    mov x4, #32
+    sub x2, x4, x3
+    // x2 now contains sig_idx - 32
+
+    // If realtime signal was directed at thread, handle it as an idempotent signal.
+    tbnz x1, x2, 5f
 
     mov x5, x0
     mov x4, x8
     mov x8, {SYS_SIGDEQUEUE}
     // x1 contains signal
-    add x2, x0, #{tcb_sa_off} + {sa_tmp_inf}
+    add x2, x0, #{tcb_sa_off} + {sa_tmp_rt_inf}
     svc 0
     cbnz x0, 1b
     mov x0, x5
@@ -254,10 +263,26 @@ asmfunction!(__relibc_internal_sigentry: ["
 
     // skip sigaltstack step if SA_ONSTACK is clear
     // tbz x2, #{SA_ONSTACK_BIT}, 2f
+    b 2f
+5:
+    add x1, x2, 32
+    b 3b
 3:
+    // A signal was sent to this thread, try clearing its bit.
     clz x1, x1
-    mov x2, #32 + 64
+    mov x2, #32
     sub x1, x2, x1
+
+    add x2, x0, #{tcb_sc_off} + {sc_sender_infos}
+    add x2, x2, w1, utxb #3
+    ldar x2, [x2]
+
+    stxr w3, x1, [x5]
+    cbnz w3, 1b
+
+    str x3, [x0, #{tcb_sa_off} + {sa_tmp_id_inf}]
+    add x1, x1, #64
+    b 2f
 2:
     ldr x2, [x0, #{tcb_sc_off} + {sc_saved_pc}]
     ldr x3, [x0, #{tcb_sc_off} + {sc_saved_x0}]
@@ -325,10 +350,12 @@ asmfunction!(__relibc_internal_sigentry: ["
     sa_tmp_x3_x4 = const offset_of!(SigArea, tmp_x3_x4),
     sa_tmp_x5_x6 = const offset_of!(SigArea, tmp_x5_x6),
     sa_tmp_sp = const offset_of!(SigArea, tmp_sp),
-    sa_tmp_inf = const offset_of!(SigArea, tmp_inf),
+    sa_tmp_rt_inf = const offset_of!(SigArea, tmp_rt_inf),
+    sa_tmp_id_inf = const offset_of!(SigArea, tmp_id_inf),
     sa_pctl = const offset_of!(SigArea, pctl),
     sc_saved_pc = const offset_of!(Sigcontrol, saved_ip),
     sc_saved_x0 = const offset_of!(Sigcontrol, saved_archdep_reg),
+    sc_sender_infos = const offset_of!(Sigcontrol, sender_infos),
     sc_word = const offset_of!(Sigcontrol, word),
     inner = sym inner_c,
 
-- 
GitLab