diff --git a/Cargo.lock b/Cargo.lock index f8ed1c3e8cd164d8bc1fb48e108c9b96d1c92052..6373598afd60e970334453ac84a0fc3259e02ebc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -148,6 +148,11 @@ name = "num-traits" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "posix-regex" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "proc-macro2" version = "0.2.3" @@ -227,6 +232,7 @@ dependencies = [ "compiler_builtins 0.1.0 (git+https://github.com/rust-lang-nursery/compiler-builtins.git)", "core_io 0.1.20180619", "lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "posix-regex 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "ralloc 1.0.0", "rand 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", "redox_syscall 0.1.40 (git+https://gitlab.redox-os.org/redox-os/syscall.git?branch=relibc)", @@ -464,6 +470,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" "checksum log 0.4.5 (registry+https://github.com/rust-lang/crates.io-index)" = "d4fcce5fa49cc693c312001daf1d13411c4a5283796bac1084299ea3e567113f" "checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1" +"checksum posix-regex 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "58b31ca4f5022c6c0a22206d63c177be2f418355db5a713db22bd901c6ac0db3" "checksum proc-macro2 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "cd07deb3c6d1d9ff827999c7f9b04cdfd66b1b17ae508e14fe47b620f2282ae0" "checksum quote 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6e920b65c65f10b2ae65c831a81a073a89edd28c7cce89475bff467ab4167a" "checksum rand 0.4.3 
(registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd" diff --git a/Cargo.toml b/Cargo.toml index 753a030e2581fbc99559f3d226ae513c74977d74..4670d53fcafe133a62c4c91ebafa4edefa2a7bb3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ cc = "1.0.17" cbitset = "0.1.0" core_io = { path = "core_io", features = ["collections"] } lazy_static = { version = "*", features = ["nightly", "spin_no_std"] } +posix-regex = { version = "0.1", features = ["no_std"] } rand = { version = "0.5.2", default-features = false } va_list = { path = "va_list", features = ["no_std"] } diff --git a/Makefile b/Makefile index f58e391e94ffb74f738cff795da289c655ebb6dd..a6da156f7e8f9818259af8688db08e14c81db490 100644 --- a/Makefile +++ b/Makefile @@ -53,6 +53,7 @@ libc: $(BUILD)/release/libc.a $(BUILD)/release/crt0.o $(BUILD)/release/crti.o $( libm: $(BUILD)/openlibm/libopenlibm.a sysroot: all + rm -rf $@ rm -rf $@.partial mkdir -p $@.partial make install DESTDIR=$@.partial diff --git a/include/sys/types.h b/include/sys/types.h index c4048845afc17f45a0ae99d907331fdaa56f4557..9dabc63ce5f95571aed2fdd38a0741ee67b7e3e2 100644 --- a/include/sys/types.h +++ b/include/sys/types.h @@ -20,6 +20,7 @@ typedef long clock_t; typedef int clockid_t; typedef void* timer_t; typedef unsigned long int blkcnt_t; +typedef size_t regoff_t; typedef unsigned char u_char, uchar; typedef unsigned short u_short, ushort; diff --git a/src/header/mod.rs b/src/header/mod.rs index 889483221c6e8814e3329586b57b5c91d117052e..ce0c6b798bcfe12ffb9dc89533f0ec6eb639e125 100644 --- a/src/header/mod.rs +++ b/src/header/mod.rs @@ -14,6 +14,7 @@ pub mod netdb; pub mod netinet_in; //pub mod pthread; pub mod pwd; +pub mod regex; pub mod semaphore; pub mod setjmp; pub mod sgtty; diff --git a/src/header/regex/cbindgen.toml b/src/header/regex/cbindgen.toml new file mode 100644 index 0000000000000000000000000000000000000000..e027cd276b3716c9e94b137f69a3923263a46726 --- 
/dev/null
+++ b/src/header/regex/cbindgen.toml
@@ -0,0 +1,7 @@
+sys_includes = ["sys/types.h"]
+include_guard = "_REGEX_H"
+language = "C"
+style = "Type"
+
+[enum]
+prefix_with_name = true
diff --git a/src/header/regex/mod.rs b/src/header/regex/mod.rs
new file mode 100644
index 0000000000000000000000000000000000000000..d82a1068e71e6f17fc7baaede27852e6a41fff04
--- /dev/null
+++ b/src/header/regex/mod.rs
@@ -0,0 +1,235 @@
+//! regex.h implementation, following http://pubs.opengroup.org/onlinepubs/7908799/xsh/regex.h.html
+
+use alloc::borrow::Cow;
+use alloc::boxed::Box;
+use alloc::vec::Vec;
+use core::{mem, slice, ptr};
+use header::string::strlen;
+use platform::types::*;
+use posix_regex::{PosixRegexBuilder, PosixRegex};
+use posix_regex::compile::{Error as CompileError, Token, Range};
+
+/// Compiled pattern handed to C callers.
+///
+/// `ptr`, `length` and `capacity` are the raw parts of the
+/// `Vec<Vec<(Token, Range)>>` built by `regcomp`; `regfree` reassembles the
+/// `Vec` from them in order to release it.
+#[repr(C)]
+pub struct regex_t {
+    // Can't be a normal Vec<T> because then the struct size won't be known
+    // from C.
+    ptr: *mut c_void,
+    length: size_t,
+    capacity: size_t,
+
+    cflags: c_int,
+    re_nsub: size_t
+}
+/// Offsets of one (sub)match as written by `regexec`. Slots that did not take
+/// part in the match have both fields set to `!0`.
+#[repr(C)]
+pub struct regmatch_t {
+    rm_so: regoff_t,
+    rm_eo: regoff_t
+}
+
+// Flags for `regcomp` (cflags) and `regexec` (eflags).
+pub const REG_EXTENDED: c_int = 1;
+pub const REG_ICASE: c_int = 2;
+pub const REG_NOSUB: c_int = 4;
+pub const REG_NEWLINE: c_int = 8;
+pub const REG_NOTBOL: c_int = 16;
+pub const REG_NOTEOL: c_int = 32;
+
+// Error codes returned by `regcomp`/`regexec` and described by `regerror`.
+pub const REG_NOMATCH: c_int = 1;
+pub const REG_BADPAT: c_int = 2;
+pub const REG_ECOLLATE: c_int = 3;
+pub const REG_ECTYPE: c_int = 4;
+pub const REG_EESCAPE: c_int = 5;
+pub const REG_ESUBREG: c_int = 6;
+pub const REG_EBRACK: c_int = 7;
+pub const REG_ENOSYS: c_int = 8;
+pub const REG_EPAREN: c_int = 9;
+pub const REG_EBRACE: c_int = 10;
+pub const REG_BADBR: c_int = 11;
+pub const REG_ERANGE: c_int = 12;
+pub const REG_ESPACE: c_int = 13;
+pub const REG_BADRPT: c_int = 14;
+
+/// Counts every `Token::Group` in `branches`, recursing into nested groups.
+fn count_groups(branches: &[Vec<(Token, Range)>]) -> usize {
+    let mut count = 0;
+    for branch in branches {
+        for (token, _) in branch {
+            if let Token::Group(ref inner) = token {
+                count += 1 + count_groups(inner);
+            }
+        }
+    }
+    count
+}
+
+/// Compiles the NUL-terminated pattern `pat` and stores the result in `*out`.
+///
+/// Returns 0 on success, otherwise one of the `REG_*` error codes.
+/// `REG_EXTENDED` is not implemented and yields `REG_ENOSYS`. A successfully
+/// compiled regex must be released with `regfree`.
+#[no_mangle]
+pub extern "C" fn regcomp(out: *mut regex_t, pat: *const c_char, cflags: c_int) -> c_int {
+    if cflags & REG_EXTENDED == REG_EXTENDED {
+        return REG_ENOSYS;
+    }
+
+    let pat = unsafe { slice::from_raw_parts(pat as *const u8, strlen(pat)) };
+    let res = PosixRegexBuilder::new(pat)
+        .with_default_classes()
+        .compile_tokens();
+
+    match res {
+        Ok(mut branches) => {
+            let re_nsub = count_groups(&branches);
+            let compiled = regex_t {
+                ptr: branches.as_mut_ptr() as *mut c_void,
+                length: branches.len(),
+                capacity: branches.capacity(),
+
+                cflags,
+                re_nsub,
+            };
+            // The buffer now belongs to `compiled`; `regfree` rebuilds and drops it.
+            mem::forget(branches);
+            unsafe { *out = compiled; }
+            0
+        },
+        Err(CompileError::EmptyRepetition)
+        | Err(CompileError::IntegerOverflow)
+        | Err(CompileError::IllegalRange) => REG_BADBR,
+        Err(CompileError::UnclosedRepetition) => REG_EBRACE,
+        Err(CompileError::LeadingRepetition) => REG_BADRPT,
+        Err(CompileError::UnknownCollation) => REG_ECOLLATE,
+        Err(CompileError::UnknownClass(_)) => REG_ECTYPE,
+        Err(_) => REG_BADPAT
+    }
+}
+/// Releases the pattern storage owned by `*regex`.
+///
+/// A null `regex`, or one whose storage was already released by `regfree`,
+/// is left untouched.
+#[no_mangle]
+pub unsafe extern "C" fn regfree(regex: *mut regex_t) {
+    if regex.is_null() || (*regex).ptr.is_null() {
+        return;
+    }
+    // Rebuild the Vec leaked by `regcomp`; dropping it frees the tokens.
+    drop(Vec::from_raw_parts(
+        (*regex).ptr as *mut Vec<(Token, Range)>,
+        (*regex).length,
+        (*regex).capacity
+    ));
+    // Clear the handle so that a second `regfree` cannot free it twice.
+    (*regex).ptr = ptr::null_mut();
+    (*regex).length = 0;
+    (*regex).capacity = 0;
+}
+/// Searches the NUL-terminated `input` for the first match of `regex`.
+///
+/// Returns 0 when a match is found and `REG_NOMATCH` otherwise. On a match,
+/// unless `REG_NOSUB` is set or `pmatch` is null, the first `nmatch` entries
+/// of `pmatch` are written; entries without a group are set to `!0`.
+#[no_mangle]
+pub extern "C" fn regexec(regex: *const regex_t, input: *const c_char,
+                          nmatch: size_t, pmatch: *mut regmatch_t, eflags: c_int) -> c_int {
+    if eflags & REG_EXTENDED == REG_EXTENDED {
+        return REG_ENOSYS;
+    }
+
+    let regex = unsafe { &(*regex) };
+
+    // Allow specifying a compiler argument to the executor and vice versa
+    // because why not?
+    let flags = regex.cflags | eflags;
+
+    let input = unsafe { slice::from_raw_parts(input as *const u8, strlen(input)) };
+
+    let branches = unsafe { slice::from_raw_parts(regex.ptr as *const Vec<(Token, Range)>, regex.length) };
+
+    let matches = PosixRegex::new(Cow::Borrowed(&branches))
+        .case_insensitive(flags & REG_ICASE == REG_ICASE)
+        .newline(flags & REG_NEWLINE == REG_NEWLINE)
+        .no_start(flags & REG_NOTBOL == REG_NOTBOL)
+        .no_end(flags & REG_NOTEOL == REG_NOTEOL)
+        .matches(input, Some(1));
+
+    // REG_NOSUB is normally passed to regcomp, so test the merged flags
+    // instead of `eflags` alone.
+    if !matches.is_empty()
+        && flags & REG_NOSUB != REG_NOSUB
+        && !pmatch.is_null()
+        && nmatch > 0 {
+        let first = &matches[0];
+
+        let len = first.len().min(nmatch as usize);
+        for i in 0..len {
+            let (start, end) = first[i];
+            unsafe {
+                *pmatch.offset(i as isize) = regmatch_t {
+                    rm_so: start,
+                    rm_eo: end
+                };
+            }
+        }
+        for i in len as isize..nmatch as isize {
+            unsafe {
+                *pmatch.offset(i) = regmatch_t {
+                    rm_so: !0,
+                    rm_eo: !0
+                };
+            }
+        }
+    }
+
+    if matches.is_empty() { REG_NOMATCH } else { 0 }
+}
+
+/// Copies the NUL-terminated description of `code` into `out`.
+///
+/// At most `max` bytes are written. A message that does not fit is truncated
+/// and still NUL-terminated. Nothing is written when `out` is null or `max`
+/// is not positive.
+///
+/// NOTE(review): POSIX declares `regerror` with a `size_t` buffer size and a
+/// `size_t` return value; the signature here is kept as-is for existing callers.
+#[no_mangle]
+pub extern "C" fn regerror(code: c_int, _regex: *const regex_t, out: *mut c_char, max: c_int) {
+    let string = match code {
+        0 => "No error\0",
+        REG_NOMATCH => "No match\0",
+        REG_BADPAT => "Invalid regexp\0",
+        REG_ECOLLATE => "Unknown collating element\0",
+        REG_ECTYPE => "Unknown character class name\0",
+        REG_EESCAPE => "Trailing backslash\0",
+        REG_ESUBREG => "Invalid back reference\0",
+        REG_EBRACK => "Missing ']'\0",
+        REG_ENOSYS => "Unsupported operation\0",
+        REG_EPAREN => "Missing ')'\0",
+        REG_EBRACE => "Missing '}'\0",
+        REG_BADBR => "Invalid contents of {}\0",
+        REG_ERANGE => "Invalid character range\0",
+        REG_ESPACE => "Out of memory\0",
+        REG_BADRPT => "Repetition not preceded by valid expression\0",
+        _ => "Unknown error\0"
+    };
+
+    if out.is_null() || max <= 0 {
+        return;
+    }
+
+    let len = string.len().min(max as usize);
+    unsafe {
+        ptr::copy_nonoverlapping(string.as_ptr(), out as *mut u8, len);
+        // `len >= 1` here; when the message was cut short this replaces the
+        // last copied byte so the buffer always ends in a NUL.
+        *out.offset(len as isize - 1) = 0;
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index
75b32e3c101835e46564efdf4329894c5982b757..4319f6d767c2ed5574d26072b873b8d5c9bc57a5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -21,6 +21,7 @@ extern crate cbitset;
 extern crate core_io;
 #[macro_use]
 extern crate lazy_static;
+extern crate posix_regex;
 extern crate rand;
 extern crate va_list;
 
diff --git a/src/platform/types.rs b/src/platform/types.rs
index 39afacf3558e0d9f5f43bcdf410788b1db914591..dcd5803c38c623ce89e3b3a06ec8ec69e63dae29 100644
--- a/src/platform/types.rs
+++ b/src/platform/types.rs
@@ -46,6 +46,7 @@ pub type wchar_t = i32;
 pub type wint_t = u32;
 pub type wctype_t = i64;
 
+pub type regoff_t = size_t;
 pub type off_t = c_long;
 pub type mode_t = c_int;
 pub type time_t = c_long;
diff --git a/tests/Makefile b/tests/Makefile
index 59e0834bc490a2096292ac1cfc651b21e13432f8..a48b45f7836e1f997fb4d442d562247ddbaa45f6 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -11,6 +11,7 @@ EXPECT_BINS=\
 	locale \
 	math \
 	netdb \
+	regex \
 	select \
 	setjmp \
 	signal \
diff --git a/tests/expected/regex.stderr b/tests/expected/regex.stderr
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/expected/regex.stdout b/tests/expected/regex.stdout
new file mode 100644
index 0000000000000000000000000000000000000000..07caf6fc01baa2d731ef3a8188dadd21eab81191
--- /dev/null
+++ b/tests/expected/regex.stdout
@@ -0,0 +1,3 @@
+Matching group: 25 - 36
+Matching group: 31 - 36
+Matching group: -1 - -1
diff --git a/tests/regex.c b/tests/regex.c
new file mode 100644
index 0000000000000000000000000000000000000000..432a8b546ed2fb0fa7f29473bb3ef077a998e28a
--- /dev/null
+++ b/tests/regex.c
@@ -0,0 +1,35 @@
+#include <regex.h>
+#include <stdio.h>
+
+int main() {
+    regex_t regex;
+    char error_buf[256];
+
+    int error = regcomp(&regex, "h.llo \\(w.rld\\)", REG_ICASE);
+    if (error) {
+        regerror(error, &regex, error_buf, 255);
+        error_buf[255] = 0;
+        printf("regcomp error: %d = %s\n", error, error_buf);
+        return -1;
+    }
+
+    regmatch_t matches[3] = { 0 };
+
+    error = regexec(&regex, "Hey, how are you? Hello? Hallo Wurld??", 3, matches, 0);
+
+    if (error) {
+        regerror(error, &regex, error_buf, 255);
+        error_buf[255] = 0;
+        printf("regexec error: %d = %s\n", error, error_buf);
+        regfree(&regex);
+        return -1;
+    }
+
+    // Release the regex only after the last call that is handed &regex.
+    regfree(&regex);
+
+    for (int group = 0; group < 3; group += 1) {
+        // regoff_t is not int; cast so the arguments match %d.
+        printf("Matching group: %d - %d\n", (int) matches[group].rm_so, (int) matches[group].rm_eo);
+    }
+}