diff --git a/src/header/wchar/lookaheadreader.rs b/src/header/wchar/lookaheadreader.rs
new file mode 100644
index 0000000000000000000000000000000000000000..ddf36a63f5fe2c34f48d07d39e702e5862d8ca51
--- /dev/null
+++ b/src/header/wchar/lookaheadreader.rs
@@ -0,0 +1,53 @@
+use super::{fseek_locked, ftell_locked, FILE, SEEK_SET}; // NOTE(review): none of these four names is referenced in this file
+use crate::{
+    io::Read, // NOTE(review): not referenced in this file
+    platform::types::{off_t, wint_t}, // NOTE(review): off_t is not referenced in this file
+};
+struct LookAheadBuffer { // cursor over a 0-terminated sequence of wint_t reached through a raw pointer
+    buf: *const wint_t, // base pointer; every read is buf.offset(look_ahead)
+    pos: isize, // offset stored by the last commit(); written but never read in this file
+    look_ahead: isize, // offset of the next element look_ahead() will return
+}
+impl LookAheadBuffer {
+    fn look_ahead(&mut self) -> Result<Option<wint_t>, i32> { // next element, or Ok(None) at the terminator; no path returns Err
+        let wchar = unsafe { *self.buf.offset(self.look_ahead) }; // SAFETY(review): assumes buf is readable up to and including a 0 element - confirm at call sites
+        if wchar == 0 {
+            Ok(None) // terminator: the offset is not advanced, so repeated calls keep returning None
+        } else {
+            self.look_ahead += 1;
+            Ok(Some(wchar))
+        }
+    }
+
+    fn commit(&mut self) { // records everything read so far as consumed
+        self.pos = self.look_ahead;
+    }
+}
+
+impl From<*const wint_t> for LookAheadBuffer {
+    fn from(buff: *const wint_t) -> LookAheadBuffer { // starts with both offsets at element 0
+        LookAheadBuffer {
+            buf: buff,
+            pos: 0,
+            look_ahead: 0,
+        }
+    }
+}
+
+pub struct LookAheadReader(LookAheadBuffer); // public wrapper; the only backing store defined here is the in-memory buffer
+
+impl LookAheadReader {
+    pub fn lookahead1(&mut self) -> Result<Option<wint_t>, i32> { // delegates to LookAheadBuffer::look_ahead
+        self.0.look_ahead()
+    }
+
+    pub fn commit(&mut self) { // delegates to LookAheadBuffer::commit
+        self.0.commit()
+    }
+}
+
+impl From<*const wint_t> for LookAheadReader {
+    fn from(buff: *const wint_t) -> LookAheadReader {
+        LookAheadReader(buff.into())
+    }
+}
diff --git a/src/header/wchar/mod.rs b/src/header/wchar/mod.rs
index a2f8530a51ca94a4554b2793cadba38341c59550..0dea96faa84a4c4f4ec9afacfb8f542885514658 100644
--- a/src/header/wchar/mod.rs
+++ b/src/header/wchar/mod.rs
@@ -15,8 +15,10 @@ use crate::{
     platform::{self, types::*, ERRNO},
 };
 
+mod lookaheadreader;
 mod utf8;
 mod wprintf;
+mod wscanf;
 
 #[repr(C)]
 #[derive(Clone, Copy)]
@@ -277,9 +279,23 @@ pub unsafe extern "C" fn putwchar(wc: wchar_t) -> wint_t {
     fputwc(wc, &mut *stdout)
 }
 
-// #[no_mangle]
-pub extern "C" fn swscanf(s: *const wchar_t, format: *const wchar_t, ap: va_list) -> c_int {
-    unimplemented!();
+#[no_mangle]
+pub unsafe extern "C" fn vswscanf(
+    s: *const wchar_t,
+    format: *const wchar_t,
+    __valist: va_list,
+) -> c_int {
+    let reader = (s as *const wint_t).into(); // the input pointer is cast to *const wint_t and wrapped in a LookAheadReader (type fixed by wscanf::scanf)
+    wscanf::scanf(reader, format, __valist)
+}
+
+#[no_mangle]
+pub unsafe extern "C" fn swscanf(
+    s: *const wchar_t,
+    format: *const wchar_t,
+    mut __valist: ...
+) -> c_int {
+    vswscanf(s, format, __valist.as_va_list()) // variadic front end: all work is done by vswscanf
 }
 
 /// Push wide character `wc` back onto `stream` so it'll be read next
diff --git a/src/header/wchar/wscanf.rs b/src/header/wchar/wscanf.rs
new file mode 100644
index 0000000000000000000000000000000000000000..ff61874a5cfd3cd9f5bda61d5705d00a5b2b8619
--- /dev/null
+++ b/src/header/wchar/wscanf.rs
@@ -0,0 +1,518 @@
+use super::lookaheadreader::LookAheadReader;
+use crate::platform::types::*;
+use alloc::{string::String, vec::Vec};
+use core::ffi::VaList as va_list;
+
+#[derive(PartialEq, Eq)]
+enum IntKind { // length modifier of a conversion, as set by the modifier loop in inner_scanf
+    Byte, // "hh"
+    Short, // "h"
+    Int, // no modifier (initial value)
+    Long, // "l"
+    LongLong, // "ll", "q" or "L"
+    IntMax, // "j"
+    PtrDiff, // "t"
+    Size, // "z"
+}
+
+#[derive(PartialEq, Eq)]
+enum CharKind { // element type stored by %c and %s
+    Ascii, // stores c_char
+    Wide, // stores wchar_t; selected when 'l' precedes 'c' or 's'
+}
+
+/// Helper function for progressing a C string
+unsafe fn next_char(string: &mut *const wchar_t) -> Result<wint_t, c_int> {
+    let c = **string as wint_t;
+    *string = string.offset(1); // the pointer is advanced even when c is the 0 terminator
+    if c == 0 {
+        Err(-1) // format string ended
+    } else {
+        Ok(c)
+    }
+}
+
+macro_rules! wc_as_char { // wide char -> `char`; the `?` makes the enclosing function return Err(-1) when the conversion fails
+    ($c:ident) => {
+        char::try_from($c).map_err(|_| -1)?
+    };
+}
+
+unsafe fn inner_scanf(
+    mut r: LookAheadReader,
+    mut format: *const wchar_t,
+    mut ap: va_list,
+) -> Result<c_int, c_int> { // Ok and Err both carry the integer that scanf() hands back to the caller
+    let mut matched = 0; // number of assignments performed so far
+    let mut wchar = 0; // most recently read input character
+    let mut skip_read = false; // when true, maybe_read! does not read and the current `wchar` is reused
+    let mut count = 0; // number of successful reads; this is the value %n stores
+
+    macro_rules! read { // reads one character into `wchar`; evaluates to false when the reader returns Ok(None)
+        () => {{
+            match r.lookahead1() {
+                Ok(None) => false,
+                Ok(Some(b)) => {
+                    wchar = b;
+                    count += 1;
+                    true
+                }
+                Err(x) => return Err(x),
+            }
+        }};
+    }
+
+    macro_rules! maybe_read { // reads unless skip_read is set; returns from inner_scanf if the read hits end of input
+        () => { // this form also clears skip_read when no early return happens
+            maybe_read!(inner false);
+        };
+        (noreset) => { // this form leaves skip_read untouched
+            maybe_read!(inner);
+        };
+        (inner $($placeholder:expr)*) => {
+            if !skip_read && !read!() {
+                match matched {
+                    0 => return Ok(-1), // input ended before anything was assigned
+                    a => return Ok(a),
+                }
+            }
+            $(else {
+                // Hacky way of having this optional
+                skip_read = $placeholder;
+            })*
+        }
+    }
+
+    while *format != 0 {
+        let mut c = next_char(&mut format)?;
+
+        if c as u8 == b' ' { // NOTE(review): `as u8` keeps only the low 8 bits of c; the same cast is used for every comparison below
+            maybe_read!(noreset);
+
+            while (wc_as_char!(wchar)).is_whitespace() { // skip a run of input whitespace
+                if !read!() {
+                    return Ok(matched);
+                }
+            }
+
+            skip_read = true; // `wchar` now holds a non-whitespace character that the next directive must see
+        } else if c as u8 != b'%' { // literal character in the format: must equal the next input character
+            maybe_read!();
+            if c != wchar {
+                return Ok(matched);
+            }
+            r.commit();
+        } else {
+            c = next_char(&mut format)?;
+
+            let mut ignore = false; // set by '*': the conversion is parsed but nothing is stored
+            if c as u8 == b'*' {
+                ignore = true;
+                c = next_char(&mut format)?;
+            }
+
+            let mut width = String::new(); // decimal digits of the optional field width
+            while c as u8 >= b'0' && c as u8 <= b'9' {
+                width.push(wc_as_char!(c));
+                c = next_char(&mut format)?;
+            }
+            let mut width = if width.is_empty() { // None means "no width limit"
+                None
+            } else {
+                match width.parse::<usize>() {
+                    Ok(n) => Some(n),
+                    Err(_) => return Err(-1),
+                }
+            };
+
+            // When an EOF occurs, eof is set, stuff is marked matched
+            // as usual, and finally it is returned
+            let mut eof = false;
+
+            let mut kind = IntKind::Int;
+            let mut c_kind = CharKind::Ascii;
+            loop { // consume length modifiers; leaves `c` on the conversion specifier
+                match c as u8 {
+                    b'h' => {
+                        if kind == IntKind::Short || kind == IntKind::Byte {
+                            kind = IntKind::Byte;
+                        } else {
+                            kind = IntKind::Short;
+                        }
+                    }
+                    b'j' => kind = IntKind::IntMax,
+                    b'l' => {
+                        if kind == IntKind::Long || kind == IntKind::LongLong {
+                            kind = IntKind::LongLong;
+                        } else {
+                            kind = IntKind::Long;
+                        }
+                    }
+                    b'q' | b'L' => kind = IntKind::LongLong,
+                    b't' => kind = IntKind::PtrDiff,
+                    b'z' => kind = IntKind::Size,
+                    // If kind is Long, means we found a 'l' before finding 'c' or 's'. In this
+                    // case the format corresponds to a wide char/string
+                    b'c' | b's' if kind == IntKind::Long => {
+                        c_kind = CharKind::Wide;
+                        break;
+                    }
+                    _ => break,
+                }
+
+                c = next_char(&mut format)?;
+            }
+
+            if c as u8 != b'n' { // %n stores a count and must not consume input
+                maybe_read!(noreset);
+            }
+            match c as u8 {
+                b'%' => { // "%%": skip whitespace, then require a literal '%'
+                    while (wc_as_char!(wchar)).is_whitespace() {
+                        if !read!() {
+                            return Ok(matched);
+                        }
+                    }
+
+                    if wchar as u8 != b'%' {
+                        return Err(matched); // NOTE(review): other mismatch paths return Ok(matched); scanf() unwraps Ok and Err identically
+                    } else if !read!() {
+                        return Ok(matched);
+                    }
+                }
+
+                b'd' | b'i' | b'o' | b'u' | b'x' | b'X' | b'f' | b'e' | b'g' | b'E' | b'a'
+                | b'p' => {
+                    while (wc_as_char!(wchar)).is_whitespace() { // numeric conversions skip leading whitespace
+                        if !read!() {
+                            return Ok(matched);
+                        }
+                    }
+
+                    let pointer = c as u8 == b'p';
+                    // Pointers aren't automatic, but we do want to parse "0x"
+                    let auto = c as u8 == b'i' || pointer;
+                    let float = c as u8 == b'f'
+                        || c as u8 == b'e'
+                        || c as u8 == b'g'
+                        || c as u8 == b'E'
+                        || c as u8 == b'a';
+
+                    let mut radix = match c as u8 {
+                        b'o' => 8,
+                        b'x' | b'X' | b'p' => 16,
+                        _ => 10,
+                    };
+
+                    let mut n = String::new(); // accepted digits (and at most one '.'), parsed after the loop
+                    let mut dot = false;
+
+                    while width.map(|w| w > 0).unwrap_or(true) // loop while width remains and `wchar` is valid for the current radix
+                        && ((wchar as u8 >= b'0' && wchar as u8 <= b'7')
+                            || (radix >= 10 && (wchar as u8 >= b'8' && wchar as u8 <= b'9'))
+                            || (float && !dot && wchar as u8 == b'.')
+                            || (radix == 16
+                                && ((wchar as u8 >= b'a' && wchar as u8 <= b'f')
+                                    || (wchar as u8 >= b'A' && wchar as u8 <= b'F'))))
+                    {
+                        if auto // prefix detection: only for %i / %p, and only on the first character
+                            && n.is_empty()
+                            && wchar as u8 == b'0'
+                            && width.map(|w| w > 0).unwrap_or(true)
+                        {
+                            if !pointer {
+                                radix = 8; // a leading '0' selects octal unless an 'x' follows
+                            }
+                            width = width.map(|w| w - 1);
+                            if !read!() {
+                                return Ok(matched);
+                            }
+                            if width.map(|w| w > 0).unwrap_or(true)
+                                && (wchar as u8 == b'x' || wchar as u8 == b'X')
+                            {
+                                radix = 16;
+                                width = width.map(|w| w - 1);
+                                if width.map(|w| w > 0).unwrap_or(true) && !read!() {
+                                    return Ok(matched);
+                                }
+                            }
+                            continue; // the prefix characters are consumed but never pushed into `n`
+                        }
+                        if wchar as u8 == b'.' {
+                            // Don't allow another dot
+                            dot = true;
+                        }
+                        n.push(wc_as_char!(wchar));
+                        r.commit();
+                        width = width.map(|w| w - 1);
+                        if width.map(|w| w > 0).unwrap_or(true) && !read!() {
+                            break;
+                        }
+                    }
+
+                    macro_rules! parse_type { // parses `n` and, unless `ignore`, stores it through the next va_list pointer
+                        (noformat $type:ident) => {{ // str::parse path, used for the float types
+                            let n = if n.is_empty() {
+                                0 as $type // nothing collected: zero is stored
+                            } else {
+                                n.parse::<$type>().map_err(|_| 0)? // a parse failure makes inner_scanf return Err(0)
+                            };
+                            if !ignore {
+                                *ap.arg::<*mut $type>() = n;
+                                matched += 1;
+                            }
+                        }};
+                        (c_double) => {
+                            parse_type!(noformat c_double)
+                        };
+                        (c_float) => {
+                            parse_type!(noformat c_float)
+                        };
+                        ($type:ident) => {
+                            parse_type!($type, $type)
+                        };
+                        ($type:ident, $final:ty) => {{ // integer path: parse as $type in `radix`, store as $final
+                            let n = if n.is_empty() {
+                                0 as $type
+                            } else {
+                                $type::from_str_radix(&n, radix).map_err(|_| 0)?
+                            };
+                            if !ignore {
+                                *ap.arg::<*mut $final>() = n as $final;
+                                matched += 1;
+                            }
+                        }};
+                    }
+
+                    if float {
+                        if kind == IntKind::Long || kind == IntKind::LongLong { // an 'l'/'ll'/'q'/'L' modifier selects c_double
+                            parse_type!(c_double);
+                        } else {
+                            parse_type!(c_float);
+                        }
+                    } else if c as u8 == b'p' {
+                        parse_type!(size_t, *mut c_void); // parsed as size_t, stored as a pointer
+                    } else {
+                        let unsigned = c as u8 == b'o'
+                            || c as u8 == b'u'
+                            || c as u8 == b'x'
+                            || c as u8 == b'X';
+
+                        match kind {
+                            IntKind::Byte => {
+                                if unsigned {
+                                    parse_type!(c_uchar);
+                                } else {
+                                    parse_type!(c_char);
+                                }
+                            }
+                            IntKind::Short => {
+                                if unsigned {
+                                    parse_type!(c_ushort)
+                                } else {
+                                    parse_type!(c_short)
+                                }
+                            }
+                            IntKind::Int => {
+                                if unsigned {
+                                    parse_type!(c_uint)
+                                } else {
+                                    parse_type!(c_int)
+                                }
+                            }
+                            IntKind::Long => {
+                                if unsigned {
+                                    parse_type!(c_ulong)
+                                } else {
+                                    parse_type!(c_long)
+                                }
+                            }
+                            IntKind::LongLong => {
+                                if unsigned {
+                                    parse_type!(c_ulonglong)
+                                } else {
+                                    parse_type!(c_longlong)
+                                }
+                            }
+                            IntKind::IntMax => {
+                                if unsigned {
+                                    parse_type!(uintmax_t)
+                                } else {
+                                    parse_type!(intmax_t)
+                                }
+                            }
+                            IntKind::PtrDiff => parse_type!(ptrdiff_t), // no unsigned variant is selected for 't'
+                            IntKind::Size => {
+                                if unsigned {
+                                    parse_type!(size_t)
+                                } else {
+                                    parse_type!(ssize_t)
+                                }
+                            }
+                        }
+                    }
+                }
+
+                b's' => {
+                    macro_rules! parse_string_type { // %s: skip whitespace, copy non-whitespace characters as $type, then write a 0 terminator
+                        ($type:ident) => {
+                            while (wc_as_char!(wchar)).is_whitespace() {
+                                if !read!() {
+                                    return Ok(matched);
+                                }
+                            }
+
+                            let mut ptr: Option<*mut $type> =
+                                if ignore { None } else { Some(ap.arg()) };
+
+                            while width.map(|w| w > 0).unwrap_or(true)
+                                && !(wc_as_char!(wchar)).is_whitespace()
+                            {
+                                if let Some(ref mut ptr) = ptr {
+                                    **ptr = wchar as $type;
+                                    *ptr = ptr.offset(1);
+                                }
+                                width = width.map(|w| w - 1);
+                                if width.map(|w| w > 0).unwrap_or(true) && !read!() {
+                                    eof = true;
+                                    break;
+                                }
+                            }
+
+                            if let Some(ptr) = ptr { // only counted (and committed) when not suppressed by '*'
+                                *ptr = 0;
+                                matched += 1;
+                                r.commit();
+                            }
+                        };
+                    }
+
+                    if c_kind == CharKind::Ascii {
+                        parse_string_type!(c_char);
+                    } else {
+                        parse_string_type!(wchar_t);
+                    }
+                }
+
+                b'c' => {
+                    macro_rules! parse_char_type { // %c: no whitespace skipping and no terminator is written
+                        ($type:ident) => {
+                            let ptr: Option<*mut $type> =
+                                if ignore { None } else { Some(ap.arg()) };
+
+                            for i in 0..width.unwrap_or(1) { // stores up to `width` characters, one when no width is given
+                                if let Some(ptr) = ptr {
+                                    *ptr.add(i) = wchar as $type;
+                                }
+                                width = width.map(|w| w - 1);
+                                if width.map(|w| w > 0).unwrap_or(true) && !read!() {
+                                    eof = true;
+                                    break;
+                                }
+                            }
+
+                            if ptr.is_some() {
+                                matched += 1;
+                                r.commit();
+                            }
+                        };
+                    }
+
+                    if c_kind == CharKind::Ascii {
+                        parse_char_type!(c_char);
+                    } else {
+                        parse_char_type!(wchar_t);
+                    }
+                }
+
+                b'[' => {
+                    c = next_char(&mut format)?;
+
+                    let mut matches = Vec::new(); // every character listed in the [...] set, ranges expanded
+                    let invert = if c as u8 == b'^' {
+                        c = next_char(&mut format)?;
+                        true
+                    } else {
+                        false
+                    };
+
+                    let mut prev;
+                    loop { // the first character is pushed unconditionally, so a leading ']' is a set member
+                        matches.push(c);
+                        prev = c;
+                        c = next_char(&mut format)?;
+                        if c as u8 == b'-' {
+                            if prev as u8 == b']' {
+                                continue;
+                            }
+                            c = next_char(&mut format)?;
+                            if c as u8 == b']' { // a '-' directly before the closing ']' is a literal '-'
+                                matches.push('-' as wint_t);
+                                break;
+                            }
+                            prev += 1;
+                            while prev < c { // pushes the interior of the range; the end character is pushed by the next loop iteration
+                                matches.push(prev);
+                                prev += 1;
+                            }
+                        } else if c as u8 == b']' {
+                            break;
+                        }
+                    }
+
+                    let mut ptr: Option<*mut c_char> = if ignore { None } else { Some(ap.arg()) }; // NOTE(review): always stores c_char; c_kind is not consulted for %[
+
+                    // While we haven't used up all the width, and it matches
+                    let mut data_stored = false;
+                    while width.map(|w| w > 0).unwrap_or(true)
+                        && !invert == matches.contains(&wchar) // in the set, or not in it when the set was negated with '^'
+                    {
+                        if let Some(ref mut ptr) = ptr {
+                            **ptr = wchar as c_char;
+                            *ptr = ptr.offset(1);
+                            data_stored = true;
+                        }
+                        r.commit();
+                        // Decrease the width, and read a new character unless the width is 0
+                        width = width.map(|w| w - 1);
+                        if width.map(|w| w > 0).unwrap_or(true) && !read!() {
+                            // Reading a new character has failed, return after
+                            // actually marking this as matched
+                            eof = true;
+                            break;
+                        }
+                    }
+
+                    if data_stored { // terminator and match count only when at least one character was stored
+                        *ptr.unwrap() = 0;
+                        matched += 1;
+                    }
+                }
+                b'n' => {
+                    if !ignore {
+                        *ap.arg::<*mut c_int>() = count as c_int; // stores the read counter; `matched` is not incremented
+                    }
+                }
+                _ => return Err(-1), // any other conversion character is rejected
+            }
+
+            if eof {
+                return Ok(matched);
+            }
+
+            if width != Some(0) && c as u8 != b'n' {
+                // It didn't hit the width, so an extra character was read and matched.
+                // But this character did not match so let's reuse it.
+                skip_read = true;
+            }
+        }
+    }
+    Ok(matched)
+}
+
+pub unsafe fn scanf(r: LookAheadReader, format: *const wchar_t, ap: va_list) -> c_int { // flattens inner_scanf's Result: the carried integer is returned either way
+    match inner_scanf(r, format, ap) {
+        Ok(n) => n,
+        Err(n) => n,
+    }
+}
diff --git a/tests/Makefile b/tests/Makefile
index 5ebaf278bfad76b9d9ade8f4486bfb9ca3f5546e..74cc43bc3c2ef6d19c877d5fd1026660d6c20e5d 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -120,6 +120,7 @@ EXPECT_NAMES=\
 	wchar/mbsrtowcs \
 	wchar/printf-on-wchars \
 	wchar/putwchar \
+	wchar/wscanf \
 	wchar/ungetwc \
 	wchar/wprintf \
 	wchar/wcrtomb \
diff --git a/tests/expected/bins_static/wchar/wscanf.stderr b/tests/expected/bins_static/wchar/wscanf.stderr
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/expected/bins_static/wchar/wscanf.stdout b/tests/expected/bins_static/wchar/wscanf.stdout
new file mode 100644
index 0000000000000000000000000000000000000000..9fa48dd2538cb4a3f3ed9fee1e04a1a651a21dd4
Binary files /dev/null and b/tests/expected/bins_static/wchar/wscanf.stdout differ
diff --git a/tests/wchar/wscanf.c b/tests/wchar/wscanf.c
new file mode 100644
index 0000000000000000000000000000000000000000..c74c4ded866c707a1018c47f44157baf3a708782
--- /dev/null
+++ b/tests/wchar/wscanf.c
@@ -0,0 +1,96 @@
+/* swscanf example */
+#include <wchar.h>
+#include <stdio.h>
+
+#include "test_helpers.h"
+
+struct params { // one destination slot per conversion type exercised in main(); reused across test() calls
+    short sa;
+    int ia;
+    int ib;
+    int ic;
+    float fa;
+    double da;
+    int *ptr;
+    char c;
+    wchar_t wc;
+    char string1[20];
+    char string2[20];
+    char string3[20];
+    char string4[20];
+    wchar_t wstring1[20];
+    wchar_t wstring2[20];
+    wchar_t wstring3[20];
+    wchar_t wstring4[20];
+};
+
+
+void test(wchar_t* fmt_in, wchar_t* input, struct params *p, ...) { // scans `input` with `fmt_in` into the variadic pointers, then dumps the return value and all of *p
+    va_list args; // NOTE(review): va_list/va_start need <stdarg.h>, which is not included directly in this file
+    va_start(args, p);
+    wint_t ret = vswscanf(input, fmt_in, args); // NOTE(review): this patch declares vswscanf as returning c_int, not wint_t
+    va_end(args);
+
+    wprintf( // NOTE(review): `sa` is a short but is printed with %hhd in the format below
+        L"%d, { sa: %hhd, ia: %d, ib: %d, ic: %d, fa: %f, da: %lf, ptr: %p, char: %c, wide char: %lc, string1: %s, string2: %s, string3: %s, string4: %s, wstring1: %ls, wstring2: %ls, wstring3: %ls, wstring4: %ls }\n",
+        ret, p->sa, p->ia, p->ib, p->ic, p->fa, p->da, p->ptr, p->c, p->wc, p->string1, p->string2, p->string3, p->string4, p->wstring1, p->wstring2, p->wstring3, p->wstring4
+    );
+}
+
+int main ()
+{
+    struct params p = { .c = 'a' }; // all other members start zeroed; values persist from one test() call to the next
+
+    test(L"%hd %d", L"12 345", &p, &p.sa, &p.ia);
+    test(L"%x %i %i", L"12 0x345 010", &p, &p.ia, &p.ib, &p.ic);
+    test(L"%f.%lf", L"0.1.0.2", &p, &p.fa, &p.da);
+    test(L"%p", L"0xABCDEF", &p, &p.ptr);
+    test(L"%s", L"Hello World", &p, &p.string1);
+    test(L"%3i", L"0xFF", &p, &p.ia);
+    test(L"%c%3c", L"hello", &p, &p.c, &p.string1);
+    test(L"%lc", L"β", &p, &p.wc);
+    test(L"%lc %f", L"Ï€ 3.14", &p, &p.wc, &p.fa); // NOTE(review): the input literal looks mis-encoded - confirm against the upstream file
+    test(L"test: %2i%n", L"test: 0xFF", &p, &p.ia, &p.ib);
+    test(L"hello world%%", L"hello world%", &p);
+    test(L"h%1[ae]ll%1[^a] wor%1[^\n]%[d]", L"hello world", &p, &p.string1, &p.string2, &p.string3, &p.string4);
+    test(L"h%1[ae]ll%1[^a] wor%1[^\n]%[d]", L"halle worfdddddd", &p, &p.string1, &p.string2, &p.string3, &p.string4);
+    test(L"%[^a]%[b]", L"testbbbb", &p, &p.string1, &p.string2);
+    test(L"%ls %ls", L"Привет мир", &p, &p.wstring1, &p.wstring2);
+    test(L"%ls %ls", L"ã“ã‚“ã«ã¡ã¯ 世界", &p, &p.wstring1, &p.wstring2); // NOTE(review): the first word of the input looks mis-encoded - confirm against the upstream file
+    test(L"%ls %d %ls %d", L"αβγ 123 δεζ 456", &p, &p.wstring1, &p.ia, &p.wstring2, &p.ib);
+    test(L"%ls %s %ls %s", L"αβγ test1 δεζ test2", &p, &p.wstring1, &p.string1, &p.wstring2, &p.string2);
+    test(L"%ls %ls %ls %ls", L"z ß æ°´ ðŸŒ", &p, &p.wstring1, &p.wstring2, &p.wstring3, &p.wstring4); // NOTE(review): the last two input words look mis-encoded - confirm against the upstream file
+
+    // Scanf stolen from the url parsing in curl
+    wchar_t protobuf[16];
+    wchar_t slashbuf[4];
+    wchar_t hostbuf[100];
+    wchar_t pathbuf[100];
+
+    // don't push NUL, make sure scanf does that
+    memset(protobuf, 97, 16); // NOTE(review): the counts are bytes while the buffers are wchar_t arrays; memset needs <string.h>, not included directly here
+    memset(slashbuf, 97, 4);
+    memset(hostbuf, 97, 100);
+    memset(pathbuf, 97, 100);
+
+    int ret = swscanf(
+        L"https://redox-os.org\0# extra garbage for nul test", L"%15[^\n/:]:%3[/]%[^\n/?#]%[^\n]",
+        &protobuf, &slashbuf, &hostbuf, &pathbuf
+    );
+    if (ret < 4) { // blank out every buffer whose conversion did not succeed
+        *pathbuf = 0;
+    }
+    if (ret < 3) {
+        *hostbuf = 0;
+    }
+    if (ret < 2) {
+        *slashbuf = 0;
+    }
+    if (ret < 1) {
+        *protobuf = 0;
+    }
+
+    wprintf(L"%d \"%s\" \"%s\" \"%s\" \"%s\"\n", ret, &protobuf, &slashbuf, &hostbuf, &pathbuf); // NOTE(review): wchar_t buffers are printed with %s; the %[ conversions above store c_char elements
+
+    return 0;
+}