swscanf and vswscanf implementation proposal

198c5502 · Nicolás Antinori · Jeremy Soller · ce818157 · 198c5502 · 198c5502
Commit 198c5502 authored 7 months ago by Nicolás Antinori Committed by Jeremy Soller 7 months ago
--- a/src/header/wchar/lookaheadreader.rs
+++ b/src/header/wchar/lookaheadreader.rs
+use super::{fseek_locked, ftell_locked, FILE, SEEK_SET};
+use crate::{
+    io::Read,
+    platform::types::{off_t, wint_t},
+};
+/// Cursor over a NUL-terminated wide-character string that can peek ahead of the
+/// position it has committed to.
+struct LookAheadBuffer {
+    /// Start of the string being scanned.
+    buf: *const wint_t,
+    /// Offset recorded by the most recent `commit`.
+    pos: isize,
+    /// Offset of the next character `look_ahead` will hand out.
+    look_ahead: isize,
+}
+impl LookAheadBuffer {
+    /// Returns the character at the look-ahead offset and steps past it, or `Ok(None)`
+    /// (without moving) when that character is the NUL terminator.
+    fn look_ahead(&mut self) -> Result<Option<wint_t>, i32> {
+        match unsafe { *self.buf.offset(self.look_ahead) } {
+            0 => Ok(None),
+            next => {
+                self.look_ahead += 1;
+                Ok(Some(next))
+            }
+        }
+    }
+
+    /// Records the current look-ahead offset as the committed position.
+    fn commit(&mut self) {
+        self.pos = self.look_ahead;
+    }
+}
+
+impl From<*const wint_t> for LookAheadBuffer {
+    /// Starts a cursor at the beginning of `buff` with nothing consumed yet.
+    fn from(buff: *const wint_t) -> LookAheadBuffer {
+        Self {
+            pos: 0,
+            look_ahead: 0,
+            buf: buff,
+        }
+    }
+}
+
+/// Public wrapper around `LookAheadBuffer`, the input source handed to the scanner.
+pub struct LookAheadReader(LookAheadBuffer);
+
+impl LookAheadReader {
+    /// Peeks at the next wide character; `Ok(None)` means the NUL terminator was reached.
+    pub fn lookahead1(&mut self) -> Result<Option<wint_t>, i32> {
+        let Self(inner) = self;
+        inner.look_ahead()
+    }
+
+    /// Marks everything peeked so far as consumed.
+    pub fn commit(&mut self) {
+        let Self(inner) = self;
+        inner.commit();
+    }
+}
+
+impl From<*const wint_t> for LookAheadReader {
+    /// Wraps a pointer to a wide string in a reader positioned at its start.
+    fn from(buff: *const wint_t) -> LookAheadReader {
+        Self(LookAheadBuffer::from(buff))
+    }
+}
--- a/src/header/wchar/mod.rs
+++ b/src/header/wchar/mod.rs
@@ -15,8 +15,10 @@ use crate::{
    platform::{self, types::*, ERRNO},
 };

+mod lookaheadreader;
 mod utf8;
 mod wprintf;
+mod wscanf;

 #[repr(C)]
 #[derive(Clone, Copy)]
@@ -277,9 +279,23 @@ pub unsafe extern "C" fn putwchar(wc: wchar_t) -> wint_t {
    fputwc(wc, &mut *stdout)
 }

-// #[no_mangle]
-pub extern "C" fn swscanf(s: *const wchar_t, format: *const wchar_t, ap: va_list) -> c_int {
-    unimplemented!();
+/// Scans the wide string `s` according to the wide format string `format`, storing the
+/// results through the pointers supplied in `__valist`.
+///
+/// The input pointer is reinterpreted as `*const wint_t`, wrapped in a look-ahead reader
+/// and handed to `wscanf::scanf`, whose result is returned unchanged.
+#[no_mangle]
+pub unsafe extern "C" fn vswscanf(
+    s: *const wchar_t,
+    format: *const wchar_t,
+    __valist: va_list,
+) -> c_int {
+    wscanf::scanf((s as *const wint_t).into(), format, __valist)
+}
+
+/// Variadic front end for `vswscanf`: turns the trailing C arguments into a `va_list`
+/// and forwards `s` and `format` untouched.
+#[no_mangle]
+pub unsafe extern "C" fn swscanf(
+    s: *const wchar_t,
+    format: *const wchar_t,
+    mut __valist: ...
+) -> c_int {
+    let ap = __valist.as_va_list();
+    vswscanf(s, format, ap)
 }

 /// Push wide character `wc` back onto `stream` so it'll be read next

--- a/src/header/wchar/wscanf.rs
+++ b/src/header/wchar/wscanf.rs
+use super::lookaheadreader::LookAheadReader;
+use crate::platform::types::*;
+use alloc::{string::String, vec::Vec};
+use core::ffi::VaList as va_list;
+
+#[derive(PartialEq, Eq)]
+enum IntKind { // Integer width selected by the length modifier of a conversion
+    Byte, // `hh`
+    Short, // `h`
+    Int, // no modifier (the default)
+    Long, // `l`
+    LongLong, // `ll`, `q` or `L`
+    IntMax, // `j`
+    PtrDiff, // `t`
+    Size, // `z`
+}
+
+#[derive(PartialEq, Eq)]
+enum CharKind { // Element type written by the `c` and `s` conversions
+    Ascii, // no `l` modifier: the destination is a `c_char` buffer
+    Wide, // `l` seen before `c`/`s`: the destination is a `wchar_t` buffer
+}
+
+/// Reads the wide character `string` currently points at and moves the pointer one
+/// element forward.
+///
+/// The pointer is advanced even when the character read is the NUL terminator, in which
+/// case `Err(-1)` is returned instead of the character.
+unsafe fn next_char(string: &mut *const wchar_t) -> Result<wint_t, c_int> {
+    let current = (*string).read() as wint_t;
+    *string = (*string).add(1);
+    match current {
+        0 => Err(-1),
+        ch => Ok(ch),
+    }
+}
+
+macro_rules! wc_as_char { // Converts a wide character to `char`; on failure `?` makes the enclosing function return `Err(-1)`
+    ($c:ident) => {
+        char::try_from($c).map_err(|_| -1)?
+    };
+}
+
+/// Wide-character scanning engine behind `scanf`.
+///
+/// Walks the NUL-terminated wide format string `format`, pulling input characters from `r`
+/// and storing each converted value through the next pointer taken from `ap`.
+///
+/// Both `Ok` and `Err` carry the value the C caller should see:
+/// * `Ok(n)` - `n` conversions were stored, or `-1` when the input ended while fetching the
+///   first character for a format element and nothing had been stored yet.
+/// * `Err(-1)` - the format string is malformed (it ends inside a directive, has an
+///   unparsable width or an unknown conversion) or a character is not a valid `char`.
+/// * `Err(0)` - the collected digits could not be parsed as the requested numeric type.
+/// * `Err(matched)` - a `%%` directive met something other than `%` in the input.
+///
+/// # Safety
+///
+/// `format` must point to a readable NUL-terminated wide string, the string behind `r` must
+/// be readable up to its NUL terminator, and `ap` must supply a valid, writable pointer of
+/// the matching type for every non-suppressed conversion.
+unsafe fn inner_scanf(
+    mut r: LookAheadReader,
+    mut format: *const wchar_t,
+    mut ap: va_list,
+) -> Result<c_int, c_int> {
+    // Narrows a wide character to a byte for comparison against ASCII syntax characters.
+    // Anything outside 0..=0x7F maps to 0, which never equals a directive byte. A plain
+    // `as u8` cast would keep only the low byte and let e.g. U+0125 pass for `%`.
+    fn ascii(w: wint_t) -> u8 {
+        if (w as u32) < 0x80 {
+            w as u8
+        } else {
+            0
+        }
+    }
+
+    // Number of conversions stored so far; this is the normal return value.
+    let mut matched = 0;
+    // Most recently read input character.
+    let mut wchar = 0;
+    // When set, `wchar` has been read but not consumed, so the next read is skipped.
+    let mut skip_read = false;
+    // Number of input characters read so far, reported by `%n`.
+    let mut count = 0;
+
+    // Reads one character into `wchar`; evaluates to `false` at end of input.
+    macro_rules! read {
+        () => {{
+            match r.lookahead1() {
+                Ok(None) => false,
+                Ok(Some(b)) => {
+                    wchar = b;
+                    count += 1;
+                    true
+                }
+                Err(x) => return Err(x),
+            }
+        }};
+    }
+
+    // Reads a character unless one is still pending; returns from the function at end of
+    // input. The plain form also clears `skip_read`, the `noreset` form leaves it alone.
+    macro_rules! maybe_read {
+        () => {
+            maybe_read!(inner false);
+        };
+        (noreset) => {
+            maybe_read!(inner);
+        };
+        (inner $($placeholder:expr)*) => {
+            if !skip_read && !read!() {
+                match matched {
+                    0 => return Ok(-1),
+                    a => return Ok(a),
+                }
+            }
+            $(else {
+                // Hacky way of having this optional
+                skip_read = $placeholder;
+            })*
+        }
+    }
+
+    while *format != 0 {
+        let mut c = next_char(&mut format)?;
+
+        if ascii(c) == b' ' {
+            // A space in the format skips any run of whitespace in the input.
+            maybe_read!(noreset);
+
+            while (wc_as_char!(wchar)).is_whitespace() {
+                if !read!() {
+                    return Ok(matched);
+                }
+            }
+
+            skip_read = true;
+        } else if ascii(c) != b'%' {
+            // Literal character: the input must contain exactly this wide character.
+            maybe_read!();
+            if c != wchar {
+                return Ok(matched);
+            }
+            r.commit();
+        } else {
+            c = next_char(&mut format)?;
+
+            // `*` suppresses the assignment.
+            let mut ignore = false;
+            if ascii(c) == b'*' {
+                ignore = true;
+                c = next_char(&mut format)?;
+            }
+
+            // Optional maximum field width.
+            let mut width = String::new();
+            while ascii(c).is_ascii_digit() {
+                width.push(wc_as_char!(c));
+                c = next_char(&mut format)?;
+            }
+            let mut width = if width.is_empty() {
+                None
+            } else {
+                match width.parse::<usize>() {
+                    Ok(n) => Some(n),
+                    Err(_) => return Err(-1),
+                }
+            };
+
+            // When an EOF occurs, eof is set, stuff is marked matched
+            // as usual, and finally it is returned
+            let mut eof = false;
+
+            // Length modifiers.
+            let mut kind = IntKind::Int;
+            let mut c_kind = CharKind::Ascii;
+            loop {
+                match ascii(c) {
+                    b'h' => {
+                        if kind == IntKind::Short || kind == IntKind::Byte {
+                            kind = IntKind::Byte;
+                        } else {
+                            kind = IntKind::Short;
+                        }
+                    }
+                    b'j' => kind = IntKind::IntMax,
+                    b'l' => {
+                        if kind == IntKind::Long || kind == IntKind::LongLong {
+                            kind = IntKind::LongLong;
+                        } else {
+                            kind = IntKind::Long;
+                        }
+                    }
+                    b'q' | b'L' => kind = IntKind::LongLong,
+                    b't' => kind = IntKind::PtrDiff,
+                    b'z' => kind = IntKind::Size,
+                    // If kind is Long, means we found a 'l' before finding 'c' or 's'. In this
+                    // case the format corresponds to a wide char/string
+                    b'c' | b's' if kind == IntKind::Long => {
+                        c_kind = CharKind::Wide;
+                        break;
+                    }
+                    _ => break,
+                }
+
+                c = next_char(&mut format)?;
+            }
+
+            // `%n` consumes no input; every other conversion needs a current character.
+            if ascii(c) != b'n' {
+                maybe_read!(noreset);
+            }
+            match ascii(c) {
+                b'%' => {
+                    while (wc_as_char!(wchar)).is_whitespace() {
+                        if !read!() {
+                            return Ok(matched);
+                        }
+                    }
+
+                    if ascii(wchar) != b'%' {
+                        return Err(matched);
+                    } else if !read!() {
+                        return Ok(matched);
+                    }
+                }
+
+                b'd' | b'i' | b'o' | b'u' | b'x' | b'X' | b'f' | b'e' | b'g' | b'E' | b'a'
+                | b'p' => {
+                    while (wc_as_char!(wchar)).is_whitespace() {
+                        if !read!() {
+                            return Ok(matched);
+                        }
+                    }
+
+                    let pointer = ascii(c) == b'p';
+                    // Pointers aren't automatic, but we do want to parse "0x"
+                    let auto = ascii(c) == b'i' || pointer;
+                    let float = matches!(ascii(c), b'f' | b'e' | b'g' | b'E' | b'a');
+
+                    let mut radix = match ascii(c) {
+                        b'o' => 8,
+                        b'x' | b'X' | b'p' => 16,
+                        _ => 10,
+                    };
+
+                    // Digits collected for the final parse.
+                    let mut n = String::new();
+                    let mut dot = false;
+
+                    while width.map(|w| w > 0).unwrap_or(true)
+                        && ((ascii(wchar) >= b'0' && ascii(wchar) <= b'7')
+                            || (radix >= 10 && (ascii(wchar) >= b'8' && ascii(wchar) <= b'9'))
+                            || (float && !dot && ascii(wchar) == b'.')
+                            || (radix == 16
+                                && ((ascii(wchar) >= b'a' && ascii(wchar) <= b'f')
+                                    || (ascii(wchar) >= b'A' && ascii(wchar) <= b'F'))))
+                    {
+                        // Leading "0" / "0x" prefix selects the radix for `%i` and `%p`.
+                        if auto
+                            && n.is_empty()
+                            && ascii(wchar) == b'0'
+                            && width.map(|w| w > 0).unwrap_or(true)
+                        {
+                            if !pointer {
+                                radix = 8;
+                            }
+                            width = width.map(|w| w - 1);
+                            if !read!() {
+                                return Ok(matched);
+                            }
+                            if width.map(|w| w > 0).unwrap_or(true)
+                                && (ascii(wchar) == b'x' || ascii(wchar) == b'X')
+                            {
+                                radix = 16;
+                                width = width.map(|w| w - 1);
+                                if width.map(|w| w > 0).unwrap_or(true) && !read!() {
+                                    return Ok(matched);
+                                }
+                            }
+                            continue;
+                        }
+                        if ascii(wchar) == b'.' {
+                            // Don't allow another dot
+                            dot = true;
+                        }
+                        n.push(wc_as_char!(wchar));
+                        r.commit();
+                        width = width.map(|w| w - 1);
+                        if width.map(|w| w > 0).unwrap_or(true) && !read!() {
+                            break;
+                        }
+                    }
+
+                    // Parses `n` as the given type and stores it through the next argument
+                    // unless the assignment is suppressed. An empty `n` yields zero.
+                    macro_rules! parse_type {
+                        (noformat $type:ident) => {{
+                            let n = if n.is_empty() {
+                                0 as $type
+                            } else {
+                                n.parse::<$type>().map_err(|_| 0)?
+                            };
+                            if !ignore {
+                                *ap.arg::<*mut $type>() = n;
+                                matched += 1;
+                            }
+                        }};
+                        (c_double) => {
+                            parse_type!(noformat c_double)
+                        };
+                        (c_float) => {
+                            parse_type!(noformat c_float)
+                        };
+                        ($type:ident) => {
+                            parse_type!($type, $type)
+                        };
+                        ($type:ident, $final:ty) => {{
+                            let n = if n.is_empty() {
+                                0 as $type
+                            } else {
+                                $type::from_str_radix(&n, radix).map_err(|_| 0)?
+                            };
+                            if !ignore {
+                                *ap.arg::<*mut $final>() = n as $final;
+                                matched += 1;
+                            }
+                        }};
+                    }
+
+                    if float {
+                        if kind == IntKind::Long || kind == IntKind::LongLong {
+                            parse_type!(c_double);
+                        } else {
+                            parse_type!(c_float);
+                        }
+                    } else if pointer {
+                        parse_type!(size_t, *mut c_void);
+                    } else {
+                        let unsigned = matches!(ascii(c), b'o' | b'u' | b'x' | b'X');
+
+                        match kind {
+                            IntKind::Byte => {
+                                if unsigned {
+                                    parse_type!(c_uchar);
+                                } else {
+                                    parse_type!(c_char);
+                                }
+                            }
+                            IntKind::Short => {
+                                if unsigned {
+                                    parse_type!(c_ushort)
+                                } else {
+                                    parse_type!(c_short)
+                                }
+                            }
+                            IntKind::Int => {
+                                if unsigned {
+                                    parse_type!(c_uint)
+                                } else {
+                                    parse_type!(c_int)
+                                }
+                            }
+                            IntKind::Long => {
+                                if unsigned {
+                                    parse_type!(c_ulong)
+                                } else {
+                                    parse_type!(c_long)
+                                }
+                            }
+                            IntKind::LongLong => {
+                                if unsigned {
+                                    parse_type!(c_ulonglong)
+                                } else {
+                                    parse_type!(c_longlong)
+                                }
+                            }
+                            IntKind::IntMax => {
+                                if unsigned {
+                                    parse_type!(uintmax_t)
+                                } else {
+                                    parse_type!(intmax_t)
+                                }
+                            }
+                            IntKind::PtrDiff => parse_type!(ptrdiff_t),
+                            IntKind::Size => {
+                                if unsigned {
+                                    parse_type!(size_t)
+                                } else {
+                                    parse_type!(ssize_t)
+                                }
+                            }
+                        }
+                    }
+                }
+
+                b's' => {
+                    // Skips leading whitespace, then copies characters up to the next
+                    // whitespace (or the width) and NUL-terminates the destination.
+                    macro_rules! parse_string_type {
+                        ($type:ident) => {
+                            while (wc_as_char!(wchar)).is_whitespace() {
+                                if !read!() {
+                                    return Ok(matched);
+                                }
+                            }
+
+                            let mut ptr: Option<*mut $type> =
+                                if ignore { None } else { Some(ap.arg()) };
+
+                            while width.map(|w| w > 0).unwrap_or(true)
+                                && !(wc_as_char!(wchar)).is_whitespace()
+                            {
+                                if let Some(ref mut ptr) = ptr {
+                                    **ptr = wchar as $type;
+                                    *ptr = ptr.offset(1);
+                                }
+                                width = width.map(|w| w - 1);
+                                if width.map(|w| w > 0).unwrap_or(true) && !read!() {
+                                    eof = true;
+                                    break;
+                                }
+                            }
+
+                            if let Some(ptr) = ptr {
+                                *ptr = 0;
+                                matched += 1;
+                                r.commit();
+                            }
+                        };
+                    }
+
+                    if c_kind == CharKind::Ascii {
+                        parse_string_type!(c_char);
+                    } else {
+                        parse_string_type!(wchar_t);
+                    }
+                }
+
+                b'c' => {
+                    // Copies exactly `width` characters (one by default); no terminator
+                    // is written.
+                    macro_rules! parse_char_type {
+                        ($type:ident) => {
+                            let ptr: Option<*mut $type> =
+                                if ignore { None } else { Some(ap.arg()) };
+
+                            for i in 0..width.unwrap_or(1) {
+                                if let Some(ptr) = ptr {
+                                    *ptr.add(i) = wchar as $type;
+                                }
+                                width = width.map(|w| w - 1);
+                                if width.map(|w| w > 0).unwrap_or(true) && !read!() {
+                                    eof = true;
+                                    break;
+                                }
+                            }
+
+                            if ptr.is_some() {
+                                matched += 1;
+                                r.commit();
+                            }
+                        };
+                    }
+
+                    if c_kind == CharKind::Ascii {
+                        parse_char_type!(c_char);
+                    } else {
+                        parse_char_type!(wchar_t);
+                    }
+                }
+
+                b'[' => {
+                    c = next_char(&mut format)?;
+
+                    // Characters listed in the scan set, with ranges expanded.
+                    let mut matches = Vec::new();
+                    let invert = if ascii(c) == b'^' {
+                        c = next_char(&mut format)?;
+                        true
+                    } else {
+                        false
+                    };
+
+                    let mut prev;
+                    loop {
+                        matches.push(c);
+                        prev = c;
+                        c = next_char(&mut format)?;
+                        if ascii(c) == b'-' {
+                            if ascii(prev) == b']' {
+                                continue;
+                            }
+                            c = next_char(&mut format)?;
+                            if ascii(c) == b']' {
+                                matches.push('-' as wint_t);
+                                break;
+                            }
+                            prev += 1;
+                            while prev < c {
+                                matches.push(prev);
+                                prev += 1;
+                            }
+                        } else if ascii(c) == b']' {
+                            break;
+                        }
+                    }
+
+                    let mut ptr: Option<*mut c_char> = if ignore { None } else { Some(ap.arg()) };
+
+                    // While we haven't used up all the width, and it matches
+                    let mut data_stored = false;
+                    while width.map(|w| w > 0).unwrap_or(true)
+                        && !invert == matches.contains(&wchar)
+                    {
+                        if let Some(ref mut ptr) = ptr {
+                            **ptr = wchar as c_char;
+                            *ptr = ptr.offset(1);
+                            data_stored = true;
+                        }
+                        r.commit();
+                        // Decrease the width, and read a new character unless the width is 0
+                        width = width.map(|w| w - 1);
+                        if width.map(|w| w > 0).unwrap_or(true) && !read!() {
+                            // Reading a new character has failed, return after
+                            // actually marking this as matched
+                            eof = true;
+                            break;
+                        }
+                    }
+
+                    if data_stored {
+                        *ptr.unwrap() = 0;
+                        matched += 1;
+                    }
+                }
+                b'n' => {
+                    if !ignore {
+                        *ap.arg::<*mut c_int>() = count as c_int;
+                    }
+                }
+                _ => return Err(-1),
+            }
+
+            if eof {
+                return Ok(matched);
+            }
+
+            if width != Some(0) && ascii(c) != b'n' {
+                // It didn't hit the width, so an extra character was read and matched.
+                // But this character did not match so let's reuse it.
+                skip_read = true;
+            }
+        }
+    }
+    Ok(matched)
+}
+
+/// Runs the scanner and flattens its result into the C return value: the payload of
+/// `inner_scanf`'s `Ok` or `Err` is returned as-is.
+pub unsafe fn scanf(r: LookAheadReader, format: *const wchar_t, ap: va_list) -> c_int {
+    inner_scanf(r, format, ap).unwrap_or_else(|code| code)
+}
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -119,6 +119,7 @@ EXPECT_NAMES=\
 	wchar/mbsrtowcs \
 	wchar/printf-on-wchars \
 	wchar/putwchar \
+	wchar/wscanf \
 	wchar/ungetwc \
 	wchar/wprintf \
 	wchar/wcrtomb \

--- a/tests/expected/bins_static/wchar/wscanf.stderr
+++ b/tests/expected/bins_static/wchar/wscanf.stderr
--- a/tests/expected/bins_static/wchar/wscanf.stdout
+++ b/tests/expected/bins_static/wchar/wscanf.stdout
--- a/tests/wchar/wscanf.c
+++ b/tests/wchar/wscanf.c
+/* swscanf example */
+#include <wchar.h>
+#include <stdio.h>
+
+#include "test_helpers.h"
+
+struct params { /* destinations for the conversions exercised below; test() prints every field */
+    short sa;
+    int ia;
+    int ib;
+    int ic;
+    float fa;
+    double da;
+    int *ptr;
+    char c;
+    wchar_t wc;
+    char string1[20];
+    char string2[20];
+    char string3[20];
+    char string4[20];
+    wchar_t wstring1[20];
+    wchar_t wstring2[20];
+    wchar_t wstring3[20];
+    wchar_t wstring4[20];
+};
+
+
+/*
+ * Scans `input` with the format `fmt_in` through vswscanf, forwarding the variadic
+ * destination pointers, then prints the return value followed by every field of `p`.
+ */
+void test(wchar_t* fmt_in, wchar_t* input, struct params *p, ...) {
+    va_list args;
+    va_start(args, p);
+    /* vswscanf returns int, and the value is printed with %d below. */
+    int ret = vswscanf(input, fmt_in, args);
+    va_end(args);
+
+    /* `sa` is a short, so it is printed with %hd. */
+    wprintf(
+        L"%d, { sa: %hd, ia: %d, ib: %d, ic: %d, fa: %f, da: %lf, ptr: %p, char: %c, wide char: %lc, string1: %s, string2: %s, string3: %s, string4: %s, wstring1: %ls, wstring2: %ls, wstring3: %ls, wstring4: %ls }\n",
+        ret, p->sa, p->ia, p->ib, p->ic, p->fa, p->da, p->ptr, p->c, p->wc, p->string1, p->string2, p->string3, p->string4, p->wstring1, p->wstring2, p->wstring3, p->wstring4
+    );
+}
+
+/* Runs each swscanf/vswscanf scenario and prints the outcome for comparison with the expected output. */
+int main ()
+{
+    struct params p = { .c = 'a' };
+
+    test(L"%hd %d", L"12 345", &p, &p.sa, &p.ia);
+    test(L"%x %i %i", L"12 0x345 010", &p, &p.ia, &p.ib, &p.ic);
+    test(L"%f.%lf", L"0.1.0.2", &p, &p.fa, &p.da);
+    test(L"%p", L"0xABCDEF", &p, &p.ptr);
+    test(L"%s", L"Hello World", &p, &p.string1);
+    test(L"%3i", L"0xFF", &p, &p.ia);
+    test(L"%c%3c", L"hello", &p, &p.c, &p.string1);
+    test(L"%lc", L"β", &p, &p.wc);
+    test(L"%lc %f", L"π 3.14", &p, &p.wc, &p.fa);
+    test(L"test: %2i%n", L"test: 0xFF", &p, &p.ia, &p.ib);
+    test(L"hello world%%", L"hello world%", &p);
+    test(L"h%1[ae]ll%1[^a] wor%1[^\n]%[d]", L"hello world", &p, &p.string1, &p.string2, &p.string3, &p.string4);
+    test(L"h%1[ae]ll%1[^a] wor%1[^\n]%[d]", L"halle worfdddddd", &p, &p.string1, &p.string2, &p.string3, &p.string4);
+    test(L"%[^a]%[b]", L"testbbbb", &p, &p.string1, &p.string2);
+    test(L"%ls %ls", L"Привет мир", &p, &p.wstring1, &p.wstring2);
+    test(L"%ls %ls", L"こんにちは 世界", &p, &p.wstring1, &p.wstring2);
+    test(L"%ls %d %ls %d", L"αβγ 123 δεζ 456", &p, &p.wstring1, &p.ia, &p.wstring2, &p.ib);
+    test(L"%ls %s %ls %s", L"αβγ test1 δεζ test2", &p, &p.wstring1, &p.string1, &p.wstring2, &p.string2);
+    test(L"%ls %ls %ls %ls", L"z ß 水 🍌", &p, &p.wstring1, &p.wstring2, &p.wstring3, &p.wstring4);
+
+    // Scanf stolen from the url parsing in curl
+    // `%[` without an `l` modifier stores narrow characters, so the destinations are
+    // char buffers (they are also printed with %s below).
+    char protobuf[16];
+    char slashbuf[4];
+    char hostbuf[100];
+    char pathbuf[100];
+
+    // don't push NUL, make sure scanf does that
+    memset(protobuf, 97, sizeof(protobuf));
+    memset(slashbuf, 97, sizeof(slashbuf));
+    memset(hostbuf, 97, sizeof(hostbuf));
+    memset(pathbuf, 97, sizeof(pathbuf));
+
+    int ret = swscanf(
+        L"https://redox-os.org\0# extra garbage for nul test", L"%15[^\n/:]:%3[/]%[^\n/?#]%[^\n]",
+        protobuf, slashbuf, hostbuf, pathbuf
+    );
+    // Buffers that were not filled by a successful conversion are emptied before printing.
+    if (ret < 4) {
+        *pathbuf = 0;
+    }
+    if (ret < 3) {
+        *hostbuf = 0;
+    }
+    if (ret < 2) {
+        *slashbuf = 0;
+    }
+    if (ret < 1) {
+        *protobuf = 0;
+    }
+
+    wprintf(L"%d \"%s\" \"%s\" \"%s\" \"%s\"\n", ret, protobuf, slashbuf, hostbuf, pathbuf);
+
+    return 0;
+}