diff --git a/pomsky-bin/src/test.rs b/pomsky-bin/src/test.rs index 8d53c24..6b16e9d 100644 --- a/pomsky-bin/src/test.rs +++ b/pomsky-bin/src/test.rs @@ -8,7 +8,7 @@ use pomsky::{ }; pub(crate) fn run_tests( - parsed: Expr<'_>, + parsed: Expr, input: &str, options: CompileOptions, errors: &mut Vec, @@ -53,7 +53,7 @@ pub(crate) fn run_tests( } } -fn check_test_match(regex: &Regex, test_case: TestCaseMatch<'_>, errors: &mut Vec) { +fn check_test_match(regex: &Regex, test_case: TestCaseMatch, errors: &mut Vec) { let result = regex.captures(test_case.literal.content.as_bytes()); match result { Ok(Some(captures)) => { @@ -67,9 +67,9 @@ fn check_test_match(regex: &Regex, test_case: TestCaseMatch<'_>, errors: &mut Ve } for capture in &test_case.captures { - let Some(got_capture) = (match capture.ident { + let Some(got_capture) = (match &capture.ident { CaptureIdent::Name(name) => captures.name(name), - CaptureIdent::Index(idx) => captures.get(idx as usize), + &CaptureIdent::Index(idx) => captures.get(idx as usize), }) else { errors.push(Diagnostic::test_failure( capture.ident_span, @@ -103,7 +103,7 @@ fn check_test_match(regex: &Regex, test_case: TestCaseMatch<'_>, errors: &mut Ve fn check_all_test_matches( regex: &Regex, - test_case: TestCaseMatchAll<'_>, + test_case: TestCaseMatchAll, errors: &mut Vec, ) { let captures_iter = regex @@ -144,9 +144,9 @@ fn check_all_test_matches( } for capture in &test_case.captures { - let Some(got_capture) = (match capture.ident { + let Some(got_capture) = (match &capture.ident { CaptureIdent::Name(name) => captures.name(name), - CaptureIdent::Index(idx) => captures.get(idx as usize), + &CaptureIdent::Index(idx) => captures.get(idx as usize), }) else { errors.push(Diagnostic::test_failure( capture.ident_span, @@ -173,7 +173,7 @@ fn check_all_test_matches( } } -fn check_test_reject(regex: &Regex, test_case: TestCaseReject<'_>, errors: &mut Vec) { +fn check_test_reject(regex: &Regex, test_case: TestCaseReject, errors: &mut Vec) { let 
result = regex.captures(test_case.literal.content.as_bytes()); match result { Ok(Some(captures)) => { diff --git a/pomsky-lib/Cargo.toml b/pomsky-lib/Cargo.toml index 39936cd..32f2682 100644 --- a/pomsky-lib/Cargo.toml +++ b/pomsky-lib/Cargo.toml @@ -19,6 +19,7 @@ exclude = ["tests/**", "fuzz/**", "afl-fuzz/**"] default = [] dbg = ["pomsky-syntax/dbg"] suggestions = ["pomsky-syntax/suggestions"] +arbitrary = ["dep:arbitrary", "pomsky-syntax/arbitrary"] [dependencies] pomsky-syntax = { version = "0.11.0", path = "../pomsky-syntax" } diff --git a/pomsky-lib/afl-fuzz/Cargo.lock b/pomsky-lib/afl-fuzz/Cargo.lock index edfc515..15e0ea6 100644 --- a/pomsky-lib/afl-fuzz/Cargo.lock +++ b/pomsky-lib/afl-fuzz/Cargo.lock @@ -276,6 +276,9 @@ dependencies = [ [[package]] name = "pomsky-syntax" version = "0.11.0" +dependencies = [ + "arbitrary", +] [[package]] name = "proc-macro2" diff --git a/pomsky-lib/afl-fuzz/Cargo.toml b/pomsky-lib/afl-fuzz/Cargo.toml index fdb6c9e..e24d633 100644 --- a/pomsky-lib/afl-fuzz/Cargo.toml +++ b/pomsky-lib/afl-fuzz/Cargo.toml @@ -12,7 +12,7 @@ afl = "0.14.3" arbitrary = "1.3.2" regex = "1" regex-test = { path = "../../regex-test" } -pomsky = { path = "..", features = ["arbitrary"] } +pomsky = { path = "..", features = ["dbg", "arbitrary"] } # Prevent this from interfering with workspaces [workspace] diff --git a/pomsky-lib/afl-fuzz/README.md b/pomsky-lib/afl-fuzz/README.md new file mode 100644 index 0000000..46eb335 --- /dev/null +++ b/pomsky-lib/afl-fuzz/README.md @@ -0,0 +1,33 @@ +# AFL fuzzer + +This fuzzer checks that the Pomsky compiler does not crash for any input, and produces valid regular expressions. + +The latter requirement is tested by compiling the regex with the respective regex engine. This requires the following programs to be installed: + +- deno (for JavaScript) +- javac +- python +- mcs (for .NET) + +## Usage + +It is recommended to use [just](https://github.com/casey/just). 
When fuzzing Pomsky for the first time, run + +```sh +just fuzz_init +just fuzz in +``` + +When you want to resume a previous fuzzing session, you can just + +```sh +just fuzz +``` + +## Analyze crashes + +When you find a crash, it may be recorded in `errors.txt`. If it's not in `errors.txt`, that likely means that there was an unexpected panic. To minimize it, run `just tmin <input>`, where `<input>` is the path to a file in the `out/default/crashes` folder. This command minimizes the input for the crash and creates a logfile at `log.txt` that should make it possible to identify the bug. + +## Report the bug + +Please report the bug [here](https://github.com/pomsky-lang/pomsky/issues). If you think it could be a security vulnerability, please disclose it directly by email: ludwig.stecher@gmx.de. diff --git a/pomsky-lib/afl-fuzz/ignored_errors.txt b/pomsky-lib/afl-fuzz/ignored_errors.txt index a00306a..f4bb537 100644 --- a/pomsky-lib/afl-fuzz/ignored_errors.txt +++ b/pomsky-lib/afl-fuzz/ignored_errors.txt @@ -1,5 +1,14 @@ Ruby|Oniguruma error: never ending recursion +# Ruby|Oniguruma error: too big number for repeat range + Rust|empty character classes are not allowed Rust|Compiled regex exceeds size limit -PCRE|error compiling pattern at offset \d+: lookbehind assertion is not fixed length -Py|look-behind requires fixed-width pattern \ No newline at end of file + +PCRE|branch too long in variable-length lookbehind assertion +PCRE|regular expression is too large +# repetitions can be at most 65535 (2^16 - 1) +PCRE|number too big in \{\} quantifier +# lookbehind must not match more than 65535 (2^16 - 1) code points +PCRE|lookbehind assertion is too long + +Java|Look-behind group does not have an obvious maximum length \ No newline at end of file diff --git a/pomsky-lib/afl-fuzz/justfile b/pomsky-lib/afl-fuzz/justfile index 2120ff0..a2f4209 100644 --- a/pomsky-lib/afl-fuzz/justfile +++ b/pomsky-lib/afl-fuzz/justfile @@ -1,3 +1,11 @@ +fuzz_init: + cargo install cargo-afl + cargo afl 
system-config + +fuzz in='-': + cargo afl build + cargo afl fuzz -i {{in}} -o out target/debug/afl-fuzz + tmin input: - rm log.txt - FUZZ_LOG=1 AFL_DEBUG=1 AFL_MAP_SIZE=100000 cargo afl tmin -i {{input}} -o out.txt -- ./target/debug/afl-fuzz \ No newline at end of file + rm -f log.txt + FUZZ_LOG=1 AFL_DEBUG=1 AFL_MAP_SIZE=100000 cargo afl tmin -i {{input}} -o out.txt -- ./target/debug/afl-fuzz diff --git a/pomsky-lib/afl-fuzz/src/main.rs b/pomsky-lib/afl-fuzz/src/main.rs index fd364a3..82d73ec 100644 --- a/pomsky-lib/afl-fuzz/src/main.rs +++ b/pomsky-lib/afl-fuzz/src/main.rs @@ -35,14 +35,13 @@ fn main() { afl::fuzz(true, |data| { let mut u = Unstructured::new(data); - let Ok((input, compile_options)) = Arbitrary::arbitrary(&mut u) else { return }; - let input = transform_input(input); + let Ok((compile_options, input)) = Arbitrary::arbitrary(&mut u) else { return }; - debug!(f, "\n{:?} -- {:?}\n", input, compile_options); + debug!(f, "\n\n{:#?}\n{:?} -- {:?}\n", input, input, compile_options); - let result = Expr::parse_and_compile(&input, compile_options); + let result = Expr::compile(&input, "", compile_options); - if let (Some(regex), _warnings, _tests) = result { + if let (Some(regex), _warnings) = result { debug!(f, " compiled;"); let features = compile_options.allowed_features; @@ -52,7 +51,7 @@ fn main() { // - don't validate raw regexes, which may be invalid if regex.len() < 2048 && !regex.is_empty() && features == { features }.regexes(false) { debug!(f, " check"); - check(®ex, &ignored_errors, compile_options.flavor, f, ef); + check(input, ®ex, &ignored_errors, compile_options.flavor, f, ef); } else { debug!(f, " SKIPPED (too long or `regex` feature enabled)"); } @@ -63,6 +62,7 @@ fn main() { } fn check( + expr: Expr, regex: &str, ignored_errors: &HashMap, flavor: RegexFlavor, @@ -89,7 +89,7 @@ fn check( } } - debug!(ef, "{flavor:?}|{regex:?}|{e}\n"); + debug!(ef, "\n{expr:?}\n{flavor:?}|{regex:?}|{e}\n"); debug!(f, " {regex:?} ({flavor:?}) 
failed:\n{e}"); panic!("Regex {regex:?} is invalid in the {flavor:?} flavor:\n{e}"); } @@ -101,6 +101,9 @@ fn parse_ignored_errors() -> HashMap { let ignored_errors = ignored_errors .lines() .filter_map(|line| { + if line.starts_with('#') || line.is_empty() { + return None; + } Some(match line.split_once('|') { Some(("JS" | "JavaScript", err)) => (RegexFlavor::JavaScript, err), Some(("Java", err)) => (RegexFlavor::Java, err), @@ -121,44 +124,3 @@ fn parse_ignored_errors() -> HashMap { ignored_errors.into_iter().map(|(k, v)| (k, RegexSet::new(v).unwrap())).collect() } - -fn transform_input(input: &str) -> String { - input.chars().fold(String::with_capacity(input.len()), |mut acc, c| match c { - // increase likelihood of generating these key words and important sequences by chance - 'à' => acc + " Codepoint ", - 'á' => acc + " Grapheme ", - 'â' => acc + " Start ", - 'ã' => acc + " End ", - 'ä' => acc + " lazy ", - 'å' => acc + " greedy ", - 'æ' => acc + " enable ", - 'ç' => acc + " disable ", - 'è' => acc + " unicode ", - 'é' => acc + " test {", - 'ê' => acc + " match ", - 'ë' => acc + " reject ", - 'ì' => acc + " in ", - 'í' => acc + " as ", - 'î' => acc + " if ", - 'ï' => acc + " else ", - 'ð' => acc + " regex ", - 'ñ' => acc + " recursion ", - 'ò' => acc + " range ", - 'ó' => acc + " base ", - 'ô' => acc + " let ", - 'õ' => acc + " U+1FEFF ", - 'ö' => acc + ":bla(", - 'ø' => acc + "::bla ", - 'ù' => acc + "<< ", - 'ú' => acc + ">> ", - 'û' => acc + "'test'", - 'ü' => acc + "atomic", - 'ý' => acc + " U+FEFF ", - // 'þ' => acc + "", - // 'ÿ' => acc + "", - _ => { - acc.push(c); - acc - } - }) -} diff --git a/pomsky-lib/src/capturing_groups.rs b/pomsky-lib/src/capturing_groups.rs index aecfad7..46163a9 100644 --- a/pomsky-lib/src/capturing_groups.rs +++ b/pomsky-lib/src/capturing_groups.rs @@ -41,7 +41,7 @@ impl RuleVisitor for CapturingGroupsCollector { } fn visit_group(&mut self, group: &exprs::Group) -> Result<(), CompileError> { - match group.kind { + match 
&group.kind { GroupKind::Capturing(Capture { name: Some(name) }) => { if self.variable_nesting > 0 { return Err(CompileErrorKind::CaptureInLet.at(group.span)); diff --git a/pomsky-lib/src/compile.rs b/pomsky-lib/src/compile.rs index 4de148e..1e4d7a9 100644 --- a/pomsky-lib/src/compile.rs +++ b/pomsky-lib/src/compile.rs @@ -8,10 +8,10 @@ use crate::{ regex::Regex, }; -pub(crate) type CompileResult<'i> = Result, CompileError>; +pub(crate) type CompileResult = Result; #[derive(Clone)] -pub(crate) struct CompileState<'c, 'i> { +pub(crate) struct CompileState<'i> { pub(crate) next_idx: u32, pub(crate) used_names_vec: Vec>, pub(crate) used_names: HashMap, @@ -19,16 +19,16 @@ pub(crate) struct CompileState<'c, 'i> { pub(crate) numbered_groups_count: u32, pub(crate) in_lookbehind: bool, - pub(crate) variables: Vec<(&'i str, &'c Rule<'i>)>, + pub(crate) variables: Vec<(&'i str, &'i Rule)>, pub(crate) current_vars: HashSet, pub(crate) diagnostics: Vec, } -impl<'c, 'i> CompileState<'c, 'i> { +impl<'i> CompileState<'i> { pub(crate) fn new( capt_groups: CapturingGroupsCollector, - variables: Vec<(&'i str, &'c Rule<'i>)>, + variables: Vec<(&'i str, &'i Rule)>, ) -> Self { let used_names = capt_groups.names; let groups_count = capt_groups.count_named + capt_groups.count_numbered; diff --git a/pomsky-lib/src/diagnose/compile_error.rs b/pomsky-lib/src/diagnose/compile_error.rs index 35689db..b649c45 100644 --- a/pomsky-lib/src/diagnose/compile_error.rs +++ b/pomsky-lib/src/diagnose/compile_error.rs @@ -88,6 +88,13 @@ pub(crate) enum CompileErrorKind { RubyLookaheadInLookbehind { was_word_boundary: bool, }, + UnsupportedInLookbehind { + flavor: RegexFlavor, + feature: Feature, + }, + LookbehindNotConstantLength { + flavor: RegexFlavor, + }, NestedTest, } @@ -186,6 +193,17 @@ impl core::fmt::Display for CompileErrorKind { CompileErrorKind::NestedTest => { write!(f, "Unit tests may only appear at the top level of the expression") } + CompileErrorKind::UnsupportedInLookbehind { flavor, 
feature } => { + write!(f, "Feature `{feature:?}` is not supported within lookbehinds in the {flavor:?} flavor") + } + CompileErrorKind::LookbehindNotConstantLength { flavor } => match flavor { + RegexFlavor::Pcre | RegexFlavor::Python => write!( + f, + "In the {flavor:?} flavor, lookbehinds must have a {} length", + if flavor == &RegexFlavor::Pcre { "bounded" } else { "constant" } + ), + _ => write!(f, "This kind of lookbehind is not supported in the {flavor:?} flavor"), + }, } } } diff --git a/pomsky-lib/src/diagnose/diagnostic_code.rs b/pomsky-lib/src/diagnose/diagnostic_code.rs index 32583a2..cd976d1 100644 --- a/pomsky-lib/src/diagnose/diagnostic_code.rs +++ b/pomsky-lib/src/diagnose/diagnostic_code.rs @@ -93,6 +93,8 @@ diagnostic_code! { IllegalNegation = 317, DotNetNumberedRefWithMixedGroups = 318, RubyLookaheadInLookbehind = 319, + UnsupportedInLookbehind = 320, + LookbehindNotConstantLength = 321, // Warning indicating something might not be supported PossiblyUnsupported = 400, @@ -222,6 +224,8 @@ impl<'a> From<&'a CompileErrorKind> for DiagnosticCode { C::JsWordBoundaryInUnicodeMode => Self::UnsupportedInUnicodeMode, C::DotNetNumberedRefWithMixedGroups => Self::DotNetNumberedRefWithMixedGroups, C::RubyLookaheadInLookbehind { .. } => Self::RubyLookaheadInLookbehind, + C::UnsupportedInLookbehind { .. } => Self::UnsupportedInLookbehind, + C::LookbehindNotConstantLength { .. } => Self::LookbehindNotConstantLength, C::NestedTest => Self::NestedTest, } } diff --git a/pomsky-lib/src/diagnose/diagnostic_kind.rs b/pomsky-lib/src/diagnose/diagnostic_kind.rs index 9f9ab8a..4c21c9d 100644 --- a/pomsky-lib/src/diagnose/diagnostic_kind.rs +++ b/pomsky-lib/src/diagnose/diagnostic_kind.rs @@ -50,7 +50,9 @@ impl From<&CompileErrorKind> for DiagnosticKind { | K::NestedTest | K::NegatedHorizVertSpace | K::DotNetNumberedRefWithMixedGroups - | K::RubyLookaheadInLookbehind { .. } => DiagnosticKind::Unsupported, + | K::RubyLookaheadInLookbehind { .. 
} + | K::UnsupportedInLookbehind { .. } + | K::LookbehindNotConstantLength { .. } => DiagnosticKind::Unsupported, K::RangeIsTooBig(_) => DiagnosticKind::Limits, } } diff --git a/pomsky-lib/src/exprs/alternation.rs b/pomsky-lib/src/exprs/alternation.rs index 181e621..0855c1b 100644 --- a/pomsky-lib/src/exprs/alternation.rs +++ b/pomsky-lib/src/exprs/alternation.rs @@ -9,12 +9,12 @@ use crate::{ use super::{Alternation, RuleExt}; -impl<'i> RuleExt<'i> for Alternation<'i> { +impl RuleExt for Alternation { fn compile<'c>( &'c self, options: CompileOptions, - state: &mut CompileState<'c, 'i>, - ) -> CompileResult<'i> { + state: &mut CompileState<'c>, + ) -> CompileResult { Ok(Regex::Alternation(RegexAlternation { parts: self .rules @@ -26,12 +26,12 @@ impl<'i> RuleExt<'i> for Alternation<'i> { } #[cfg_attr(feature = "dbg", derive(Debug))] -pub(crate) struct RegexAlternation<'i> { - pub(crate) parts: Vec>, +pub(crate) struct RegexAlternation { + pub(crate) parts: Vec, } -impl<'i> RegexAlternation<'i> { - pub(crate) fn new(parts: Vec>) -> Self { +impl RegexAlternation { + pub(crate) fn new(parts: Vec) -> Self { Self { parts } } @@ -40,6 +40,8 @@ impl<'i> RegexAlternation<'i> { rule.codegen(buf, flavor); buf.push('|'); } - let _ = buf.pop(); + if !self.parts.is_empty() { + let _ = buf.pop(); + } } } diff --git a/pomsky-lib/src/exprs/boundary.rs b/pomsky-lib/src/exprs/boundary.rs index 84b6db8..4bb7ef0 100644 --- a/pomsky-lib/src/exprs/boundary.rs +++ b/pomsky-lib/src/exprs/boundary.rs @@ -13,12 +13,8 @@ use crate::{ use super::RuleExt; -impl<'i> RuleExt<'i> for Boundary { - fn compile<'c>( - &'c self, - options: CompileOptions, - state: &mut CompileState<'c, 'i>, - ) -> CompileResult<'i> { +impl RuleExt for Boundary { + fn compile(&self, options: CompileOptions, state: &mut CompileState<'_>) -> CompileResult { use BoundaryKind::*; if options.flavor == RegexFlavor::JavaScript diff --git a/pomsky-lib/src/exprs/char_class.rs b/pomsky-lib/src/exprs/char_class.rs index 
825b42f..2b9cf66 100644 --- a/pomsky-lib/src/exprs/char_class.rs +++ b/pomsky-lib/src/exprs/char_class.rs @@ -94,16 +94,12 @@ use pomsky_syntax::{ use super::RuleExt; -impl<'i> RuleExt<'i> for CharClass { - fn compile( - &self, - options: CompileOptions, - _state: &mut CompileState<'_, 'i>, - ) -> CompileResult<'i> { +impl RuleExt for CharClass { + fn compile(&self, options: CompileOptions, _state: &mut CompileState<'_>) -> CompileResult { if self.inner.len() == 1 && !matches!(&&*self.inner, [GroupItem::Char('\r')]) { let first = self.inner.first().unwrap(); if let &GroupItem::Char(c) = first { - return Ok(Regex::Literal(c.to_string().into())); + return Ok(Regex::Literal(c.to_string())); } } diff --git a/pomsky-lib/src/exprs/codepoint.rs b/pomsky-lib/src/exprs/codepoint.rs index 181b780..d284259 100644 --- a/pomsky-lib/src/exprs/codepoint.rs +++ b/pomsky-lib/src/exprs/codepoint.rs @@ -15,7 +15,7 @@ use super::char_class::{RegexCharSet, RegexCharSetItem}; pub(crate) struct Codepoint {} impl Codepoint { - pub(crate) fn compile(&self, _options: CompileOptions) -> CompileResult<'static> { + pub(crate) fn compile(&self, _options: CompileOptions) -> CompileResult { Ok(Regex::CharSet(RegexCharSet::new(vec![ RegexCharSetItem::Shorthand(RegexShorthand::Space), RegexCharSetItem::Shorthand(RegexShorthand::NotSpace), diff --git a/pomsky-lib/src/exprs/dot.rs b/pomsky-lib/src/exprs/dot.rs index e77cf24..ae0673a 100644 --- a/pomsky-lib/src/exprs/dot.rs +++ b/pomsky-lib/src/exprs/dot.rs @@ -9,7 +9,7 @@ use crate::{compile::CompileResult, options::CompileOptions, regex::Regex}; pub(crate) struct Dot {} impl Dot { - pub(crate) fn compile(&self, _: CompileOptions) -> CompileResult<'static> { + pub(crate) fn compile(&self, _: CompileOptions) -> CompileResult { Ok(Regex::Dot) } } diff --git a/pomsky-lib/src/exprs/grapheme.rs b/pomsky-lib/src/exprs/grapheme.rs index 75e47a2..67d438e 100644 --- a/pomsky-lib/src/exprs/grapheme.rs +++ b/pomsky-lib/src/exprs/grapheme.rs @@ -17,7 +17,7 @@ 
use crate::{ pub(crate) struct Grapheme {} impl Grapheme { - pub(crate) fn compile(&self, options: CompileOptions) -> CompileResult<'static> { + pub(crate) fn compile(&self, options: CompileOptions) -> CompileResult { if matches!(options.flavor, RegexFlavor::Pcre | RegexFlavor::Java | RegexFlavor::Ruby) { Ok(Regex::Grapheme) } else { diff --git a/pomsky-lib/src/exprs/group.rs b/pomsky-lib/src/exprs/group.rs index e4a8a13..7c8d035 100644 --- a/pomsky-lib/src/exprs/group.rs +++ b/pomsky-lib/src/exprs/group.rs @@ -8,12 +8,12 @@ use crate::{ use super::RuleExt; -impl<'i> RuleExt<'i> for Group<'i> { +impl RuleExt for Group { fn compile<'c>( &'c self, options: CompileOptions, - state: &mut CompileState<'c, 'i>, - ) -> CompileResult<'i> { + state: &mut CompileState<'c>, + ) -> CompileResult { if let GroupKind::Capturing(_) = self.kind { state.next_idx += 1; } @@ -24,9 +24,9 @@ impl<'i> RuleExt<'i> for Group<'i> { .iter() .map(|part| part.compile(options, state)) .collect::>()?, - kind: match self.kind { + kind: match &self.kind { GroupKind::Capturing(Capture { name: Some(name) }) => { - RegexGroupKind::NamedCapture(name) + RegexGroupKind::NamedCapture(name.clone()) } GroupKind::Capturing(Capture { name: None }) => RegexGroupKind::Capture, GroupKind::Atomic => RegexGroupKind::Atomic, @@ -37,27 +37,27 @@ impl<'i> RuleExt<'i> for Group<'i> { } #[cfg_attr(feature = "dbg", derive(Debug))] -pub(crate) struct RegexGroup<'i> { - pub(crate) parts: Vec>, - pub(crate) kind: RegexGroupKind<'i>, +pub(crate) struct RegexGroup { + pub(crate) parts: Vec, + pub(crate) kind: RegexGroupKind, } #[cfg_attr(feature = "dbg", derive(Debug))] #[derive(PartialEq, Eq)] -pub(crate) enum RegexGroupKind<'i> { +pub(crate) enum RegexGroupKind { Capture, - NamedCapture(&'i str), + NamedCapture(String), Atomic, Normal, } -impl<'i> RegexGroup<'i> { - pub(crate) fn new(parts: Vec>, capture: RegexGroupKind<'i>) -> Self { +impl RegexGroup { + pub(crate) fn new(parts: Vec, capture: RegexGroupKind) -> Self { 
Self { parts, kind: capture } } pub(crate) fn codegen(&self, buf: &mut String, flavor: RegexFlavor) { - match self.kind { + match &self.kind { RegexGroupKind::NamedCapture(name) => { // https://www.regular-expressions.info/named.html match flavor { diff --git a/pomsky-lib/src/exprs/literal.rs b/pomsky-lib/src/exprs/literal.rs index 3a22f67..cdc54e4 100644 --- a/pomsky-lib/src/exprs/literal.rs +++ b/pomsky-lib/src/exprs/literal.rs @@ -8,8 +8,8 @@ use crate::{ use super::RuleExt; -impl<'i> RuleExt<'i> for Literal<'i> { - fn compile<'c>(&'c self, _: CompileOptions, _: &mut CompileState<'c, 'i>) -> CompileResult<'i> { +impl RuleExt for Literal { + fn compile(&self, _: CompileOptions, _: &mut CompileState<'_>) -> CompileResult { Ok(Regex::Literal(self.content.clone())) } } diff --git a/pomsky-lib/src/exprs/lookaround.rs b/pomsky-lib/src/exprs/lookaround.rs index 93f0f36..2197683 100644 --- a/pomsky-lib/src/exprs/lookaround.rs +++ b/pomsky-lib/src/exprs/lookaround.rs @@ -9,12 +9,12 @@ use crate::{ use super::RuleExt; -impl<'i> RuleExt<'i> for Lookaround<'i> { +impl RuleExt for Lookaround { fn compile<'c>( &'c self, options: CompileOptions, - state: &mut CompileState<'c, 'i>, - ) -> CompileResult<'i> { + state: &mut CompileState<'c>, + ) -> CompileResult { match options.flavor { RegexFlavor::Ruby if state.in_lookbehind => { if let LookaroundKind::Ahead | LookaroundKind::AheadNegative = self.kind { @@ -32,20 +32,44 @@ impl<'i> RuleExt<'i> for Lookaround<'i> { state.in_lookbehind = true; } - Ok(Regex::Lookaround(Box::new(RegexLookaround { - content: self.rule.compile(options, &mut state)?, - kind: self.kind, - }))) + let content = self.rule.compile(options, &mut state)?; + let lookaround = RegexLookaround::new(content, self.kind, options.flavor) + .map_err(|e| e.at(self.span))?; + + Ok(Regex::Lookaround(Box::new(lookaround))) } } #[cfg_attr(feature = "dbg", derive(Debug))] -pub(crate) struct RegexLookaround<'i> { - pub(crate) content: Regex<'i>, +pub(crate) struct 
RegexLookaround { + pub(crate) content: Regex, pub(crate) kind: LookaroundKind, } -impl<'i> RegexLookaround<'i> { +impl RegexLookaround { + pub(crate) fn new( + content: Regex, + kind: LookaroundKind, + flavor: RegexFlavor, + ) -> Result { + match flavor { + RegexFlavor::Python => { + if let LookaroundKind::Behind | LookaroundKind::BehindNegative = kind { + content.validate_in_lookbehind_py()?; + } + } + RegexFlavor::Pcre => { + if let LookaroundKind::Behind | LookaroundKind::BehindNegative = kind { + content.validate_in_lookbehind_pcre()?; + } + } + // TODO: Java, see + _ => {} + } + + Ok(RegexLookaround { content, kind }) + } + pub(crate) fn codegen(&self, buf: &mut String, flavor: RegexFlavor) { buf.push_str(match self.kind { LookaroundKind::Ahead => "(?=", diff --git a/pomsky-lib/src/exprs/mod.rs b/pomsky-lib/src/exprs/mod.rs index c3a2422..d0536fd 100644 --- a/pomsky-lib/src/exprs/mod.rs +++ b/pomsky-lib/src/exprs/mod.rs @@ -29,25 +29,26 @@ pub(crate) mod var; use pomsky_syntax::exprs::{test::Test, *}; use pomsky_syntax::Span; -pub(crate) trait RuleExt<'i> { +pub(crate) trait RuleExt { fn compile<'c>( &'c self, options: CompileOptions, - state: &mut CompileState<'c, 'i>, - ) -> CompileResult<'i>; + state: &mut CompileState<'c>, + ) -> CompileResult; } /// A parsed pomsky expression, which might contain more sub-expressions. #[derive(Clone)] #[cfg_attr(not(feature = "dbg"), derive(Debug))] -pub struct Expr<'i>(Rule<'i>); +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Expr(Rule); -impl<'i> Expr<'i> { +impl Expr { /// Parse a `Expr` without generating code. /// /// The parsed `Expr` can be displayed with `Debug` if the `dbg` feature is /// enabled. 
- pub fn parse(input: &'i str) -> (Option, impl Iterator + '_) { + pub fn parse(input: &str) -> (Option, impl Iterator + '_) { let (rule, diagnostics) = pomsky_syntax::parse(input, 256); (rule.map(Expr), diagnostics.into_iter().map(|d| Diagnostic::from_parser(&d, input))) } @@ -55,7 +56,7 @@ impl<'i> Expr<'i> { /// Compile a `Expr` that has been parsed, to a regex pub fn compile( &self, - input: &'i str, + input: &str, options: CompileOptions, ) -> (Option, Vec) { if let Err(e) = Validator::new(options).visit_rule(&self.0) { @@ -98,7 +99,7 @@ impl<'i> Expr<'i> { } /// Extracts top-level all unit tests from the Pomsky expression - pub fn extract_tests(self) -> Vec> { + pub fn extract_tests(self) -> Vec { let mut rule = self.0; let mut tests = Vec::new(); while let Rule::StmtExpr(expr) = rule { @@ -112,9 +113,9 @@ impl<'i> Expr<'i> { /// Parse a string to a `Expr` and compile it to a regex. pub fn parse_and_compile( - input: &'i str, + input: &str, options: CompileOptions, - ) -> (Option, Vec, Vec>) { + ) -> (Option, Vec, Vec) { match Self::parse(input) { (Some(parsed), warnings1) => match parsed.compile(input, options) { (Some(compiled), warnings2) => { @@ -138,8 +139,12 @@ impl<'i> Expr<'i> { } #[cfg(feature = "dbg")] -impl core::fmt::Debug for Expr<'_> { +impl core::fmt::Debug for Expr { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - core::fmt::Display::fmt(&self.0, f) + if f.alternate() { + core::fmt::Debug::fmt(&self.0, f) + } else { + core::fmt::Display::fmt(&self.0, f) + } } } diff --git a/pomsky-lib/src/exprs/range.rs b/pomsky-lib/src/exprs/range.rs index 05847ac..946fa03 100644 --- a/pomsky-lib/src/exprs/range.rs +++ b/pomsky-lib/src/exprs/range.rs @@ -1,4 +1,4 @@ -use std::{borrow::Cow, cmp::Ordering}; +use std::cmp::Ordering; use pomsky_syntax::exprs::{Range, RepetitionKind}; @@ -16,8 +16,8 @@ use super::{ RuleExt, }; -impl<'i> RuleExt<'i> for Range { - fn compile<'c>(&'c self, _: CompileOptions, _: &mut CompileState<'c, 'i>) -> 
CompileResult<'i> { +impl RuleExt for Range { + fn compile(&self, _: CompileOptions, _: &mut CompileState<'_>) -> CompileResult { Ok(range(&self.start, &self.end, true, self.radix).to_regex()) } } @@ -401,9 +401,9 @@ impl Rule { } } - fn to_regex(&self) -> Regex<'static> { + fn to_regex(&self) -> Regex { match self { - Rule::Empty => Regex::Literal(Cow::Borrowed("")), + Rule::Empty => Regex::Literal("".to_string()), Rule::Class(c) => c.to_regex(), Rule::Repeat(r) => r.to_regex(), Rule::Alt(a) => a.to_regex(), @@ -435,7 +435,7 @@ struct Repeat { } impl Repeat { - fn to_regex(&self) -> Regex<'static> { + fn to_regex(&self) -> Regex { Regex::Repetition(Box::new(RegexRepetition::new( self.rule.to_regex(), RepetitionKind::try_from((self.min as u32, Some(self.max as u32))).unwrap(), @@ -448,7 +448,7 @@ impl Repeat { struct Alt(Vec>); impl Alt { - fn to_regex(&self) -> Regex<'static> { + fn to_regex(&self) -> Regex { Regex::Alternation(RegexAlternation::new( self.0 .iter() @@ -464,7 +464,7 @@ impl Alt { } impl Class { - fn to_regex(self) -> Regex<'static> { + fn to_regex(self) -> Regex { let (a, b) = (self.start, self.end); Regex::CharSet(RegexCharSet::new(match (a, b, a == b) { diff --git a/pomsky-lib/src/exprs/recursion.rs b/pomsky-lib/src/exprs/recursion.rs index 7fbfab4..7048b6b 100644 --- a/pomsky-lib/src/exprs/recursion.rs +++ b/pomsky-lib/src/exprs/recursion.rs @@ -8,12 +8,8 @@ use crate::{ use super::RuleExt; -impl<'i> RuleExt<'i> for Recursion { - fn compile<'c>( - &'c self, - _options: CompileOptions, - _: &mut CompileState<'c, 'i>, - ) -> CompileResult<'i> { +impl RuleExt for Recursion { + fn compile(&self, _options: CompileOptions, _: &mut CompileState<'_>) -> CompileResult { Ok(Regex::Recursion) } } diff --git a/pomsky-lib/src/exprs/reference.rs b/pomsky-lib/src/exprs/reference.rs index 062f483..fa51dc1 100644 --- a/pomsky-lib/src/exprs/reference.rs +++ b/pomsky-lib/src/exprs/reference.rs @@ -24,9 +24,9 @@ impl From for Feature { } } -impl<'i> RuleExt<'i> for 
Reference<'i> { - fn compile(&self, options: CompileOptions, state: &mut CompileState) -> CompileResult<'i> { - let (direction, number) = match self.target { +impl RuleExt for Reference { + fn compile(&self, options: CompileOptions, state: &mut CompileState) -> CompileResult { + let (direction, number) = match &self.target { ReferenceTarget::Named(name) => match state.used_names.get(name) { Some(index) => { let direction = if index.absolute >= state.next_idx { @@ -43,7 +43,7 @@ impl<'i> RuleExt<'i> for Reference<'i> { } None => { return Err(CompileErrorKind::UnknownReferenceName { - found: name.into(), + found: name.clone().into(), #[cfg(feature = "suggestions")] similar: pomsky_syntax::find_suggestion( name, @@ -53,7 +53,7 @@ impl<'i> RuleExt<'i> for Reference<'i> { .at(self.span)); } }, - ReferenceTarget::Number(idx) => { + &ReferenceTarget::Number(idx) => { if idx == 0 { return Err(CompileErrorKind::UnknownReferenceNumber(0).at(self.span)); } @@ -77,7 +77,7 @@ impl<'i> RuleExt<'i> for Reference<'i> { (direction, idx) } - ReferenceTarget::Relative(offset) => { + &ReferenceTarget::Relative(offset) => { let direction = if offset >= 0 { ReferenceDirection::Forwards } else { diff --git a/pomsky-lib/src/exprs/regex.rs b/pomsky-lib/src/exprs/regex.rs index a1d71ba..0d9c98a 100644 --- a/pomsky-lib/src/exprs/regex.rs +++ b/pomsky-lib/src/exprs/regex.rs @@ -8,8 +8,8 @@ use crate::{ use super::RuleExt; -impl<'i> RuleExt<'i> for RegexLiteral<'i> { - fn compile<'c>(&'c self, _: CompileOptions, _: &mut CompileState<'c, 'i>) -> CompileResult<'i> { +impl RuleExt for RegexLiteral { + fn compile(&self, _: CompileOptions, _: &mut CompileState<'_>) -> CompileResult { Ok(Regex::Unescaped(self.content.clone())) } } diff --git a/pomsky-lib/src/exprs/repetition.rs b/pomsky-lib/src/exprs/repetition.rs index b93eb1e..eaaa771 100644 --- a/pomsky-lib/src/exprs/repetition.rs +++ b/pomsky-lib/src/exprs/repetition.rs @@ -1,5 +1,3 @@ -use std::borrow::Cow; - use 
pomsky_syntax::exprs::{Quantifier, Repetition, RepetitionKind}; use crate::{ @@ -11,12 +9,12 @@ use crate::{ use super::RuleExt; -impl<'i> RuleExt<'i> for Repetition<'i> { +impl RuleExt for Repetition { fn compile<'c>( &'c self, options: CompileOptions, - state: &mut CompileState<'c, 'i>, - ) -> CompileResult<'i> { + state: &mut CompileState<'c>, + ) -> CompileResult { let content = self.rule.compile(options, state)?; if options.flavor == RegexFlavor::Ruby && content.is_assertion() { @@ -34,8 +32,8 @@ impl<'i> RuleExt<'i> for Repetition<'i> { } #[cfg_attr(feature = "dbg", derive(Debug))] -pub(crate) struct RegexRepetition<'i> { - pub(crate) content: Regex<'i>, +pub(crate) struct RegexRepetition { + pub(crate) content: Regex, pub(crate) kind: RepetitionKind, pub(crate) quantifier: RegexQuantifier, } @@ -46,20 +44,18 @@ pub(crate) enum RegexQuantifier { Lazy, } -impl<'i> RegexRepetition<'i> { - pub(crate) fn new( - content: Regex<'i>, - kind: RepetitionKind, - quantifier: RegexQuantifier, - ) -> Self { +impl RegexRepetition { + pub(crate) fn new(content: Regex, kind: RepetitionKind, quantifier: RegexQuantifier) -> Self { Self { content, kind, quantifier } } pub(crate) fn codegen(&self, buf: &mut String, flavor: RegexFlavor) { use std::fmt::Write; - if let Regex::Literal(Cow::Borrowed("")) = self.content { - return; + if let Regex::Literal(l) = &self.content { + if l.is_empty() { + return; + } } if self.content.needs_parens_before_repetition(flavor) { diff --git a/pomsky-lib/src/exprs/rule.rs b/pomsky-lib/src/exprs/rule.rs index dfebe36..58946ed 100644 --- a/pomsky-lib/src/exprs/rule.rs +++ b/pomsky-lib/src/exprs/rule.rs @@ -10,12 +10,12 @@ use super::{ char_class::check_char_class_empty, codepoint::Codepoint, dot::Dot, grapheme::Grapheme, RuleExt, }; -impl<'i> RuleExt<'i> for Rule<'i> { +impl RuleExt for Rule { fn compile<'c>( &'c self, options: CompileOptions, - state: &mut CompileState<'c, 'i>, - ) -> CompileResult<'i> { + state: &mut CompileState<'c>, + ) -> 
CompileResult { match self { Rule::Literal(l) => l.compile(options, state), Rule::CharClass(c) => c.compile(options, state), diff --git a/pomsky-lib/src/exprs/stmt.rs b/pomsky-lib/src/exprs/stmt.rs index 52af49e..753986a 100644 --- a/pomsky-lib/src/exprs/stmt.rs +++ b/pomsky-lib/src/exprs/stmt.rs @@ -7,16 +7,16 @@ use crate::{ use super::RuleExt; -impl<'i> RuleExt<'i> for StmtExpr<'i> { +impl RuleExt for StmtExpr { fn compile<'c>( &'c self, options: CompileOptions, - state: &mut CompileState<'c, 'i>, - ) -> CompileResult<'i> { + state: &mut CompileState<'c>, + ) -> CompileResult { match &self.stmt { Stmt::Enable(..) | Stmt::Disable(..) => self.rule.compile(options, state), Stmt::Let(r#let) => { - state.variables.push((r#let.name, &r#let.rule)); + state.variables.push((&r#let.name, &r#let.rule)); let res = self.rule.compile(options, state)?; state.variables.pop(); Ok(res) diff --git a/pomsky-lib/src/exprs/var.rs b/pomsky-lib/src/exprs/var.rs index beed986..fa3d655 100644 --- a/pomsky-lib/src/exprs/var.rs +++ b/pomsky-lib/src/exprs/var.rs @@ -9,12 +9,12 @@ use crate::{ use super::RuleExt; -impl<'i> RuleExt<'i> for Variable<'i> { +impl RuleExt for Variable { fn compile<'c>( &'c self, options: CompileOptions, - state: &mut CompileState<'c, 'i>, - ) -> CompileResult<'i> { + state: &mut CompileState<'c>, + ) -> CompileResult { let rule = state .variables .iter() @@ -43,10 +43,10 @@ impl<'i> RuleExt<'i> for Variable<'i> { Err(CompileErrorKind::RecursiveVariable.at(self.span)) } else { Err(CompileErrorKind::UnknownVariable { - found: self.name.into(), + found: self.name.clone().into(), #[cfg(feature = "suggestions")] similar: pomsky_syntax::find_suggestion( - self.name, + &self.name, state.variables.iter().map(|&(var, _)| var), ), } diff --git a/pomsky-lib/src/features.rs b/pomsky-lib/src/features.rs index 4fa595f..6437a2f 100644 --- a/pomsky-lib/src/features.rs +++ b/pomsky-lib/src/features.rs @@ -17,6 +17,7 @@ use crate::diagnose::{CompileError, CompileErrorKind, 
UnsupportedError}; /// .variables(false); /// ``` #[derive(Copy, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct PomskyFeatures { bits: u16, } @@ -43,29 +44,6 @@ impl fmt::Debug for PomskyFeatures { } } -#[cfg(feature = "arbitrary")] -impl<'a> arbitrary::Arbitrary<'a> for PomskyFeatures { - fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result { - let mut feat = PomskyFeatures::default(); - feat.grapheme(bool::arbitrary(u)?); - feat.numbered_groups(bool::arbitrary(u)?); - feat.named_groups(bool::arbitrary(u)?); - feat.atomic_groups(bool::arbitrary(u)?); - feat.references(bool::arbitrary(u)?); - feat.lazy_mode(bool::arbitrary(u)?); - feat.ascii_mode(bool::arbitrary(u)?); - feat.ranges(bool::arbitrary(u)?); - feat.variables(bool::arbitrary(u)?); - feat.lookahead(bool::arbitrary(u)?); - feat.lookbehind(bool::arbitrary(u)?); - feat.boundaries(bool::arbitrary(u)?); - feat.regexes(bool::arbitrary(u)?); - feat.dot(bool::arbitrary(u)?); - feat.recursion(bool::arbitrary(u)?); - Ok(feat) - } -} - impl Default for PomskyFeatures { fn default() -> Self { Self { diff --git a/pomsky-lib/src/regex/mod.rs b/pomsky-lib/src/regex/mod.rs index cc9822b..ffd9056 100644 --- a/pomsky-lib/src/regex/mod.rs +++ b/pomsky-lib/src/regex/mod.rs @@ -1,13 +1,15 @@ -use std::borrow::{Borrow, Cow}; +use std::borrow::Borrow; use pomsky_syntax::{ - exprs::{BoundaryKind, Category, CodeBlock, LookaroundKind, OtherProperties, Script}, + exprs::{ + BoundaryKind, Category, CodeBlock, LookaroundKind, OtherProperties, RepetitionKind, Script, + }, Span, }; use crate::{ compile::CompileResult, - diagnose::{CompileErrorKind, IllegalNegationKind}, + diagnose::{CompileErrorKind, Feature, IllegalNegationKind}, exprs::{ alternation::RegexAlternation, boundary::boundary_kind_codegen, @@ -27,11 +29,11 @@ mod optimize; pub(super) use optimize::Count; #[cfg_attr(feature = "dbg", derive(Debug))] -pub(crate) enum Regex<'i> { +pub(crate) enum Regex { 
/// A literal string - Literal(Cow<'i, str>), + Literal(String), /// A regex string that is inserted verbatim into the output - Unescaped(Cow<'i, str>), + Unescaped(String), /// A literal char Char(char), /// A character class, delimited with square brackets @@ -41,25 +43,115 @@ pub(crate) enum Regex<'i> { /// The dot, matching anything except `\n` Dot, /// A group, i.e. a sequence of rules, possibly wrapped in parentheses. - Group(RegexGroup<'i>), + Group(RegexGroup), /// An alternation, i.e. a list of alternatives; at least one of them has to /// match. - Alternation(RegexAlternation<'i>), + Alternation(RegexAlternation), /// A repetition, i.e. a expression that must be repeated. The number of /// required repetitions is constrained by a lower and possibly an upper /// bound. - Repetition(Box>), + Repetition(Box), /// A boundary (start of string, end of string or word boundary). Boundary(BoundaryKind), /// A (positive or negative) lookahead or lookbehind. - Lookaround(Box>), + Lookaround(Box), /// A backreference or forward reference. Reference(RegexReference), /// Recursively matches the entire regex. Recursion, } -impl Default for Regex<'_> { +impl Regex { + pub(super) fn validate_in_lookbehind_py(&self) -> Result, CompileErrorKind> { + match self { + Regex::Literal(str) => Ok(Some(str.chars().count() as u32)), + Regex::Unescaped(_) => Ok(None), + Regex::Char(_) => Ok(Some(1)), + Regex::CharSet(_) => Ok(Some(1)), + Regex::Grapheme => Err(CompileErrorKind::UnsupportedInLookbehind { + flavor: RegexFlavor::Python, + feature: Feature::Grapheme, + }), + Regex::Dot => Ok(Some(1)), + Regex::Group(g) => g.parts.iter().try_fold(Some(0), |acc, part| { + Ok(match (acc, part.validate_in_lookbehind_py()?) 
{ + (Some(a), Some(b)) => Some(a + b), + _ => None, + }) + }), + Regex::Alternation(alt) => { + let mut count = None; + for part in &alt.parts { + let c = part.validate_in_lookbehind_py()?; + count = match (count, c) { + (Some(a), Some(b)) if a == b => Some(a), + (Some(_), Some(_)) => { + return Err(CompileErrorKind::LookbehindNotConstantLength { + flavor: RegexFlavor::Python, + }) + } + (Some(a), None) | (None, Some(a)) => Some(a), + _ => None, + }; + } + Ok(count) + } + Regex::Repetition(r) => { + if let RepetitionKind { lower_bound, upper_bound: Some(upper) } = r.kind { + if lower_bound == upper { + return Ok(Some(upper)); + } + } + Err(CompileErrorKind::LookbehindNotConstantLength { flavor: RegexFlavor::Python }) + } + Regex::Boundary(_) => Ok(Some(0)), + Regex::Lookaround(_) => Ok(Some(0)), + Regex::Reference(_) => Ok(None), // TODO: somehow get the length of the referenced group + Regex::Recursion => unreachable!("not supported in python"), + } + } + + pub(super) fn validate_in_lookbehind_pcre(&self) -> Result<(), CompileErrorKind> { + match self { + Regex::Literal(_) => Ok(()), + Regex::Unescaped(_) => Ok(()), + Regex::Char(_) => Ok(()), + Regex::CharSet(_) => Ok(()), + Regex::Grapheme => Err(CompileErrorKind::UnsupportedInLookbehind { + flavor: RegexFlavor::Pcre, + feature: Feature::Grapheme, + }), + Regex::Dot => Ok(()), + Regex::Group(g) => { + for part in &g.parts { + part.validate_in_lookbehind_pcre()?; + } + Ok(()) + } + Regex::Alternation(alt) => { + for part in &alt.parts { + part.validate_in_lookbehind_pcre()?; + } + Ok(()) + } + Regex::Repetition(r) => match r.kind.upper_bound { + Some(_) => Ok(()), + _ => { + Err(CompileErrorKind::LookbehindNotConstantLength { flavor: RegexFlavor::Pcre }) + } + }, + Regex::Boundary(_) => Ok(()), + Regex::Lookaround(_) => Ok(()), + Regex::Reference(_) => Ok(()), // TODO: somehow check the referenced group + Regex::Recursion => Err(CompileErrorKind::UnsupportedInLookbehind { + flavor: RegexFlavor::Pcre, + feature: 
Feature::Recursion, + }), + } + } +} + +impl Default for Regex { fn default() -> Self { Regex::Literal("".into()) } @@ -130,8 +222,8 @@ impl RegexProperty { } } -impl<'i> Regex<'i> { - pub(crate) fn negate(self, not_span: Span, flavor: RegexFlavor) -> CompileResult<'i> { +impl Regex { + pub(crate) fn negate(self, not_span: Span, flavor: RegexFlavor) -> CompileResult { match self { Regex::Literal(l) => { let mut iter = l.chars(); @@ -151,7 +243,7 @@ impl<'i> Regex<'i> { } Regex::Char(c) => { let items = vec![RegexCharSetItem::Char(c)]; - return Ok(Regex::CharSet(RegexCharSet::new(items).negate())); + Ok(Regex::CharSet(RegexCharSet::new(items).negate())) } Regex::CharSet(s) => Ok(Regex::CharSet(s.negate())), Regex::Boundary(b) => match b { diff --git a/pomsky-lib/src/regex/optimize.rs b/pomsky-lib/src/regex/optimize.rs index d6e83fc..dce9af9 100644 --- a/pomsky-lib/src/regex/optimize.rs +++ b/pomsky-lib/src/regex/optimize.rs @@ -25,7 +25,7 @@ impl Add for Count { } } -impl<'i> Regex<'i> { +impl Regex { pub(crate) fn optimize(&mut self) -> Count { match self { Regex::Literal(l) => { diff --git a/pomsky-lib/src/validation.rs b/pomsky-lib/src/validation.rs index b6a26a0..94b2457 100644 --- a/pomsky-lib/src/validation.rs +++ b/pomsky-lib/src/validation.rs @@ -41,7 +41,7 @@ impl RuleVisitor for Validator { } fn visit_group(&mut self, group: &exprs::Group) -> Result<(), CompileError> { - match group.kind { + match &group.kind { exprs::GroupKind::Atomic => { self.require(Feat::ATOMIC_GROUPS, group.span)?; diff --git a/pomsky-lib/tests/it/files.rs b/pomsky-lib/tests/it/files.rs index f7a4638..fadcf70 100644 --- a/pomsky-lib/tests/it/files.rs +++ b/pomsky-lib/tests/it/files.rs @@ -201,12 +201,12 @@ pub(crate) fn test_file( } } - struct DisplayMatchCaptures<'i>(&'i Vec>); + struct DisplayMatchCaptures<'i>(&'i Vec); impl std::fmt::Display for DisplayMatchCaptures<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let _ = write!(f, "{{ "); for capture in 
self.0.iter() { - let _ = match capture.ident { + let _ = match &capture.ident { CaptureIdent::Name(n) => write!(f, "{n}: "), CaptureIdent::Index(i) => write!(f, "{i}: "), }; diff --git a/pomsky-syntax/Cargo.toml b/pomsky-syntax/Cargo.toml index 477fae5..69a166d 100644 --- a/pomsky-syntax/Cargo.toml +++ b/pomsky-syntax/Cargo.toml @@ -18,6 +18,7 @@ categories = ["text-processing", "parser-implementations"] default = [] dbg = [] suggestions = ["dep:strsim"] +arbitrary = ["dep:arbitrary"] [dependencies] strsim = { version = "0.10.0", optional = true } diff --git a/pomsky-syntax/build.rs b/pomsky-syntax/build.rs index 2ae711f..91657b0 100644 --- a/pomsky-syntax/build.rs +++ b/pomsky-syntax/build.rs @@ -126,6 +126,7 @@ fn generate_enum( ) -> String { format!( r#"#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] #[allow(non_camel_case_types)] #[allow(clippy::enum_variant_names)] #[repr({size})] diff --git a/pomsky-syntax/src/exprs/alternation.rs b/pomsky-syntax/src/exprs/alternation.rs index 7ac942f..9f575dd 100644 --- a/pomsky-syntax/src/exprs/alternation.rs +++ b/pomsky-syntax/src/exprs/alternation.rs @@ -1,8 +1,6 @@ //! Implements [alternation](https://www.regular-expressions.info/alternation.html): //! `('alt1' | 'alt2' | 'alt3')`. -use std::borrow::Cow; - use crate::Span; use super::{Literal, Rule}; @@ -15,13 +13,14 @@ use super::{Literal, Rule}; /// removed when compiling to a regex if they aren't required. In other words, /// `'a' | ('b' 'c')` compiles to `a|bc`. 
#[derive(Debug, Clone)] -pub struct Alternation<'i> { - pub rules: Vec>, +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Alternation { + pub rules: Vec, pub(crate) span: Span, } -impl<'i> Alternation<'i> { - pub(crate) fn new_expr(rules: Vec>) -> Rule<'i> { +impl Alternation { + pub(crate) fn new_expr(rules: Vec) -> Rule { rules .into_iter() .reduce(|a, b| match (a, b) { @@ -40,7 +39,7 @@ impl<'i> Alternation<'i> { Rule::Alternation(Alternation { rules: vec![a, b], span }) } }) - .unwrap_or_else(|| Rule::Literal(Literal::new(Cow::Borrowed(""), Span::default()))) + .unwrap_or_else(|| Rule::Literal(Literal::new("".to_string(), Span::default()))) } #[cfg(feature = "dbg")] diff --git a/pomsky-syntax/src/exprs/arbitrary.rs b/pomsky-syntax/src/exprs/arbitrary.rs new file mode 100644 index 0000000..6d74047 --- /dev/null +++ b/pomsky-syntax/src/exprs/arbitrary.rs @@ -0,0 +1,57 @@ +use arbitrary::{Arbitrary, Unstructured}; + +pub(crate) struct Ident(pub(crate) String); + +impl Ident { + pub(crate) fn create(u: &mut Unstructured<'_>) -> Result { + Ok(Ident::arbitrary(u)?.0) + } +} + +impl Arbitrary<'_> for Ident { + fn arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result { + let options = [ + "foo", "bar", "baz", "quux", "blabla", "hello", "world", "match", "regular", "based", + "a", "b", "c", "d", "e", "f", "g", + ]; + let idx = u.int_in_range(0..=options.len() as u8 - 1)?; + let name = options[idx as usize]; + Ok(Ident(name.to_string())) + } + + fn size_hint(_depth: usize) -> (usize, Option) { + (1, Some(1)) + } +} + +#[allow(unused)] +pub(crate) struct Digits(pub(crate) Box<[u8]>); + +impl Digits { + pub(crate) fn create( + u: &mut Unstructured<'_>, + radix: u8, + ) -> Result, arbitrary::Error> { + let len = u.arbitrary_len::()?.min(10); + let mut digits = Vec::with_capacity(len); + for _ in 0..len { + digits.push(u.int_in_range(0..=radix - 1)?); + } + Ok(digits.into_boxed_slice()) + } +} + +impl Arbitrary<'_> for Digits { + fn 
arbitrary(u: &mut Unstructured<'_>) -> arbitrary::Result { + let len = u.arbitrary_len::()?.min(10); + let mut digits = Vec::with_capacity(len); + for _ in 0..len { + digits.push(u.int_in_range(0..=9)?); + } + Ok(Digits(digits.into_boxed_slice())) + } + + fn size_hint(_depth: usize) -> (usize, Option) { + (0, Some(10)) + } +} diff --git a/pomsky-syntax/src/exprs/boundary.rs b/pomsky-syntax/src/exprs/boundary.rs index 1b56b7f..969c9aa 100644 --- a/pomsky-syntax/src/exprs/boundary.rs +++ b/pomsky-syntax/src/exprs/boundary.rs @@ -11,6 +11,7 @@ use crate::Span; /// All boundaries use a variation of the `%` sigil, so they are easy to /// remember. #[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Boundary { pub kind: BoundaryKind, pub unicode_aware: bool, @@ -40,6 +41,7 @@ impl Boundary { } #[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum BoundaryKind { /// `Start`, the start of the string (or start of line in single-line mode) Start, diff --git a/pomsky-syntax/src/exprs/char_class/char_group.rs b/pomsky-syntax/src/exprs/char_class/char_group.rs index b69e4bd..14fb69a 100644 --- a/pomsky-syntax/src/exprs/char_class/char_group.rs +++ b/pomsky-syntax/src/exprs/char_class/char_group.rs @@ -125,7 +125,37 @@ impl GroupItem { } } +#[cfg(feature = "arbitrary")] +impl arbitrary::Arbitrary<'_> for GroupItem { + fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=2)? { + 0 => GroupItem::Char(u.arbitrary()?), + 1 => { + let first = u.arbitrary()?; + let last = u.arbitrary()?; + if first >= last { + return Err(arbitrary::Error::IncorrectFormat); + } + GroupItem::Range { first, last } + } + _ => GroupItem::Named { name: GroupName::arbitrary(u)?, negative: bool::arbitrary(u)? 
}, + }) + } + + fn size_hint(depth: usize) -> (usize, Option) { + arbitrary::size_hint::and( + u8::size_hint(depth), + arbitrary::size_hint::or_all(&[ + char::size_hint(depth), + arbitrary::size_hint::and(char::size_hint(depth), char::size_hint(depth)), + arbitrary::size_hint::and(GroupName::size_hint(depth), bool::size_hint(depth)), + ]), + ) + } +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum GroupName { Word, Digit, diff --git a/pomsky-syntax/src/exprs/char_class/mod.rs b/pomsky-syntax/src/exprs/char_class/mod.rs index 808dfb7..d882cfe 100644 --- a/pomsky-syntax/src/exprs/char_class/mod.rs +++ b/pomsky-syntax/src/exprs/char_class/mod.rs @@ -113,3 +113,21 @@ impl CharClass { buf.push(']'); } } + +#[cfg(feature = "arbitrary")] +impl arbitrary::Arbitrary<'_> for CharClass { + fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result { + let len = u.arbitrary_len::()?.max(1); + let mut inner = Vec::with_capacity(len); + for _ in 0..len { + inner.push(u.arbitrary()?); + } + + Ok(CharClass { inner, span: Span::arbitrary(u)?, unicode_aware: bool::arbitrary(u)? }) + } + + fn size_hint(depth: usize) -> (usize, Option) { + let (_, Some(group_item_size)) = GroupItem::size_hint(depth) else { panic!() }; + (group_item_size + 1, Some(group_item_size * 10 + 1)) + } +} diff --git a/pomsky-syntax/src/exprs/group.rs b/pomsky-syntax/src/exprs/group.rs index ac5833e..83577d8 100644 --- a/pomsky-syntax/src/exprs/group.rs +++ b/pomsky-syntax/src/exprs/group.rs @@ -8,14 +8,15 @@ use super::Rule; /// If it is capturing, it must be wrapped in parentheses, and can have a name. /// If it is non-capturing, the parentheses can be omitted in same cases. 
#[derive(Debug, Clone)] -pub struct Group<'i> { - pub parts: Vec>, - pub kind: GroupKind<'i>, +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Group { + pub parts: Vec, + pub kind: GroupKind, pub span: Span, } -impl<'i> Group<'i> { - pub fn new(parts: Vec>, kind: GroupKind<'i>, span: Span) -> Self { +impl Group { + pub fn new(parts: Vec, kind: GroupKind, span: Span) -> Self { Group { parts, kind, span } } @@ -24,10 +25,10 @@ impl<'i> Group<'i> { let use_parens = matches!(self.kind, GroupKind::Capturing(_) | GroupKind::Atomic) || needs_parens; - match self.kind { + match &self.kind { GroupKind::Capturing(capture) => { buf.push(':'); - if let Some(name) = capture.name { + if let Some(name) = &capture.name { buf.push_str(name); } } @@ -69,10 +70,11 @@ impl<'i> Group<'i> { } } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum GroupKind<'i> { +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum GroupKind { /// A (possibly named) capturing group e.g. `:foo` - Capturing(Capture<'i>), + Capturing(Capture), /// An atomic group Atomic, /// A normal group with a set of parentheses @@ -81,19 +83,34 @@ pub enum GroupKind<'i> { Implicit, } -impl GroupKind<'_> { +impl GroupKind { pub fn is_normal(&self) -> bool { matches!(self, GroupKind::Normal) } } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct Capture<'i> { - pub name: Option<&'i str>, +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Capture { + pub name: Option, } -impl<'i> Capture<'i> { - pub fn new(name: Option<&'i str>) -> Self { - Capture { name } +impl Capture { + pub fn new(name: Option<&str>) -> Self { + Capture { name: name.map(str::to_string) } + } +} + +#[cfg(feature = "arbitrary")] +impl arbitrary::Arbitrary<'_> for Capture { + fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result { + if u.arbitrary()? { + Ok(Capture { name: Some(super::arbitrary::Ident::create(u)?) 
}) + } else { + Ok(Capture { name: None }) + } + } + + fn size_hint(_depth: usize) -> (usize, Option) { + (1, None) } } diff --git a/pomsky-syntax/src/exprs/literal.rs b/pomsky-syntax/src/exprs/literal.rs index e80bfcb..62e82aa 100644 --- a/pomsky-syntax/src/exprs/literal.rs +++ b/pomsky-syntax/src/exprs/literal.rs @@ -1,15 +1,14 @@ -use std::borrow::Cow; - use crate::Span; #[derive(Clone, PartialEq, Eq)] -pub struct Literal<'i> { - pub content: Cow<'i, str>, +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Literal { + pub content: String, pub span: Span, } -impl<'i> Literal<'i> { - pub fn new(content: Cow<'i, str>, span: Span) -> Self { +impl Literal { + pub fn new(content: String, span: Span) -> Self { Literal { content, span } } @@ -19,7 +18,7 @@ impl<'i> Literal<'i> { } } -impl std::fmt::Debug for Literal<'_> { +impl std::fmt::Debug for Literal { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:?} at {}", self.content, self.span) } diff --git a/pomsky-syntax/src/exprs/lookaround.rs b/pomsky-syntax/src/exprs/lookaround.rs index b198640..f77afeb 100644 --- a/pomsky-syntax/src/exprs/lookaround.rs +++ b/pomsky-syntax/src/exprs/lookaround.rs @@ -3,13 +3,15 @@ use crate::Span; use super::Rule; #[derive(Debug, Clone)] -pub struct Lookaround<'i> { +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Lookaround { pub kind: LookaroundKind, - pub rule: Rule<'i>, + pub rule: Rule, pub span: Span, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum LookaroundKind { Ahead, Behind, @@ -17,8 +19,8 @@ pub enum LookaroundKind { BehindNegative, } -impl<'i> Lookaround<'i> { - pub(crate) fn new(rule: Rule<'i>, kind: LookaroundKind, span: Span) -> Self { +impl Lookaround { + pub(crate) fn new(rule: Rule, kind: LookaroundKind, span: Span) -> Self { Lookaround { kind, rule, span } } diff --git a/pomsky-syntax/src/exprs/mod.rs 
b/pomsky-syntax/src/exprs/mod.rs index cbb992a..935ecb2 100644 --- a/pomsky-syntax/src/exprs/mod.rs +++ b/pomsky-syntax/src/exprs/mod.rs @@ -17,6 +17,9 @@ pub(crate) mod stmt; pub mod test; pub(crate) mod var; +#[cfg(feature = "arbitrary")] +pub(crate) mod arbitrary; + pub use self::{ alternation::Alternation, boundary::{Boundary, BoundaryKind}, diff --git a/pomsky-syntax/src/exprs/negation.rs b/pomsky-syntax/src/exprs/negation.rs index c63beff..fb8ae82 100644 --- a/pomsky-syntax/src/exprs/negation.rs +++ b/pomsky-syntax/src/exprs/negation.rs @@ -3,12 +3,13 @@ use crate::Span; use super::Rule; #[derive(Debug, Clone)] -pub struct Negation<'i> { - pub rule: Rule<'i>, +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Negation { + pub rule: Rule, pub not_span: Span, } -impl<'i> Negation<'i> { +impl Negation { #[cfg(feature = "dbg")] pub(super) fn pretty_print(&self, buf: &mut crate::PrettyPrinter, needs_parens: bool) { buf.push('!'); diff --git a/pomsky-syntax/src/exprs/range.rs b/pomsky-syntax/src/exprs/range.rs index 14df583..2efa5ee 100644 --- a/pomsky-syntax/src/exprs/range.rs +++ b/pomsky-syntax/src/exprs/range.rs @@ -34,3 +34,20 @@ impl Range { } } } + +#[cfg(feature = "arbitrary")] +impl arbitrary::Arbitrary<'_> for Range { + fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result { + let radix = u.int_in_range(2..=36)?; + let start = super::arbitrary::Digits::create(u, radix)?; + let end = super::arbitrary::Digits::create(u, radix)?; + if start.len() > end.len() || (start.len() == end.len() && start > end) { + return Err(arbitrary::Error::IncorrectFormat); + } + Ok(Range { start, end, radix, span: Span::arbitrary(u)? 
}) + } + + fn size_hint(_depth: usize) -> (usize, Option) { + (1, None) + } +} diff --git a/pomsky-syntax/src/exprs/recursion.rs b/pomsky-syntax/src/exprs/recursion.rs index 9253978..2676a4d 100644 --- a/pomsky-syntax/src/exprs/recursion.rs +++ b/pomsky-syntax/src/exprs/recursion.rs @@ -1,6 +1,7 @@ use crate::Span; #[derive(Debug, Clone)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Recursion { pub span: Span, } diff --git a/pomsky-syntax/src/exprs/reference.rs b/pomsky-syntax/src/exprs/reference.rs index 64d92ba..9711626 100644 --- a/pomsky-syntax/src/exprs/reference.rs +++ b/pomsky-syntax/src/exprs/reference.rs @@ -5,34 +5,50 @@ use crate::Span; -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct Reference<'i> { - pub target: ReferenceTarget<'i>, +#[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Reference { + pub target: ReferenceTarget, pub span: Span, } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum ReferenceTarget<'i> { - Named(&'i str), +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ReferenceTarget { + Named(String), Number(u32), Relative(i32), } -impl<'i> Reference<'i> { - pub(crate) fn new(target: ReferenceTarget<'i>, span: Span) -> Self { +impl Reference { + pub(crate) fn new(target: ReferenceTarget, span: Span) -> Self { Reference { target, span } } #[cfg(feature = "dbg")] pub(super) fn pretty_print(&self, buf: &mut crate::PrettyPrinter) { buf.push_str("::"); - match self.target { + match &self.target { ReferenceTarget::Named(n) => buf.write(n), ReferenceTarget::Number(i) => buf.write_fmt(i), - ReferenceTarget::Relative(o) => { + &ReferenceTarget::Relative(o) => { buf.push(if o < 0 { '-' } else { '+' }); buf.write_fmt(o); } } } } + +#[cfg(feature = "arbitrary")] +impl arbitrary::Arbitrary<'_> for ReferenceTarget { + fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result { + Ok(match u.int_in_range(0u8..=2)? 
{ + 0 => ReferenceTarget::Named(super::arbitrary::Ident::create(u)?), + 1 => ReferenceTarget::Number(u.int_in_range(0u8..=15)? as u32), + _ => ReferenceTarget::Relative(u.int_in_range(-15i8..=15)? as i32), + }) + } + + fn size_hint(depth: usize) -> (usize, Option) { + arbitrary::size_hint::and(super::arbitrary::Ident::size_hint(depth), (3, Some(3))) + } +} diff --git a/pomsky-syntax/src/exprs/regex.rs b/pomsky-syntax/src/exprs/regex.rs index 0286d5d..8dc9398 100644 --- a/pomsky-syntax/src/exprs/regex.rs +++ b/pomsky-syntax/src/exprs/regex.rs @@ -1,15 +1,14 @@ -use std::borrow::Cow; - use crate::Span; #[derive(Debug, Clone, PartialEq, Eq)] -pub struct Regex<'i> { - pub content: Cow<'i, str>, +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Regex { + pub content: String, pub span: Span, } -impl<'i> Regex<'i> { - pub(crate) fn new(content: Cow<'i, str>, span: Span) -> Self { +impl Regex { + pub(crate) fn new(content: String, span: Span) -> Self { Regex { content, span } } diff --git a/pomsky-syntax/src/exprs/repetition.rs b/pomsky-syntax/src/exprs/repetition.rs index 6cb5d28..d7a0689 100644 --- a/pomsky-syntax/src/exprs/repetition.rs +++ b/pomsky-syntax/src/exprs/repetition.rs @@ -3,16 +3,17 @@ use crate::{error::RepetitionError, Span}; use super::Rule; #[derive(Debug, Clone)] -pub struct Repetition<'i> { - pub rule: Rule<'i>, +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Repetition { + pub rule: Rule, pub kind: RepetitionKind, pub quantifier: Quantifier, pub span: Span, } -impl<'i> Repetition<'i> { +impl Repetition { pub(crate) fn new( - rule: Rule<'i>, + rule: Rule, kind: RepetitionKind, quantifier: Quantifier, span: Span, @@ -54,6 +55,7 @@ impl<'i> Repetition<'i> { } #[derive(Debug, Clone, PartialEq, Eq, Copy)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum Quantifier { Greedy, Lazy, @@ -77,6 +79,19 @@ pub struct RepetitionKind { pub upper_bound: Option, } +#[cfg(feature = 
"arbitrary")] +impl arbitrary::Arbitrary<'_> for RepetitionKind { + fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result { + let lower = u.int_in_range(0u8..=40)?; + if u.arbitrary()? { + let upper = u.int_in_range(lower..=lower + 40)?; + Ok(RepetitionKind { lower_bound: lower as u32, upper_bound: Some(upper as u32) }) + } else { + Ok(RepetitionKind { lower_bound: lower as u32, upper_bound: None }) + } + } +} + impl RepetitionKind { pub(crate) fn zero_inf() -> Self { RepetitionKind { lower_bound: 0, upper_bound: None } diff --git a/pomsky-syntax/src/exprs/rule.rs b/pomsky-syntax/src/exprs/rule.rs index 4d2926a..84565f3 100644 --- a/pomsky-syntax/src/exprs/rule.rs +++ b/pomsky-syntax/src/exprs/rule.rs @@ -7,36 +7,37 @@ use super::{ /// A parsed pomsky expression, which might contain more sub-expressions. #[derive(Debug, Clone)] -pub enum Rule<'i> { +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum Rule { /// A string literal - Literal(Literal<'i>), + Literal(Literal), /// A character class CharClass(CharClass), /// A group, i.e. a sequence of rules, possibly wrapped in parentheses. - Group(Group<'i>), + Group(Group), /// An alternation, i.e. a list of alternatives; at least one of them has to /// match. - Alternation(Alternation<'i>), + Alternation(Alternation), /// A repetition, i.e. a expression that must be repeated. The number of /// required repetitions is constrained by a lower and possibly an upper /// bound. - Repetition(Box>), + Repetition(Box), /// A boundary (start of string, end of string or word boundary). Boundary(Boundary), /// A (positive or negative) lookahead or lookbehind. - Lookaround(Box>), + Lookaround(Box), /// An variable that has been declared before. - Variable(Variable<'i>), + Variable(Variable), /// A backreference or forward reference. 
- Reference(Reference<'i>), + Reference(Reference), /// A range of integers Range(Range), /// An expression preceded by a modifier such as `enable lazy;` - StmtExpr(Box>), + StmtExpr(Box), /// Negated expression - Negation(Box>), + Negation(Box), /// A regex string, which is not escaped - Regex(Regex<'i>), + Regex(Regex), /// A regex string, which is not escaped Recursion(Recursion), @@ -48,7 +49,7 @@ pub enum Rule<'i> { Dot, } -impl<'i> Rule<'i> { +impl Rule { /// Returns the span of this rule pub fn span(&self) -> Span { match self { @@ -95,7 +96,7 @@ impl<'i> Rule<'i> { } #[cfg(feature = "dbg")] -impl core::fmt::Display for Rule<'_> { +impl core::fmt::Display for Rule { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let mut buf = crate::PrettyPrinter::new(); self.pretty_print(&mut buf, false); diff --git a/pomsky-syntax/src/exprs/stmt.rs b/pomsky-syntax/src/exprs/stmt.rs index 547cb15..5cda73d 100644 --- a/pomsky-syntax/src/exprs/stmt.rs +++ b/pomsky-syntax/src/exprs/stmt.rs @@ -3,21 +3,24 @@ use crate::Span; use super::{test::Test, Rule}; #[derive(Debug, Clone)] -pub struct StmtExpr<'i> { - pub stmt: Stmt<'i>, - pub rule: Rule<'i>, +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct StmtExpr { + pub stmt: Stmt, + pub rule: Rule, pub span: Span, } #[derive(Debug, Clone)] -pub enum Stmt<'i> { +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum Stmt { Enable(BooleanSetting, Span), Disable(BooleanSetting, Span), - Let(Let<'i>), - Test(Test<'i>), + Let(Let), + Test(Test), } #[derive(Debug, Clone, PartialEq, Eq)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum BooleanSetting { Lazy, Unicode, @@ -34,24 +37,41 @@ impl BooleanSetting { } #[derive(Debug, Clone)] -pub struct Let<'i> { - pub name: &'i str, - pub rule: Rule<'i>, +pub struct Let { + pub name: String, + pub rule: Rule, pub name_span: Span, } -impl<'i> Let<'i> { - pub fn new(name: &'i str, rule: Rule<'i>, 
name_span: Span) -> Self { - Self { name, rule, name_span } +impl Let { + pub fn new(name: &str, rule: Rule, name_span: Span) -> Self { + Self { name: name.to_string(), rule, name_span } } - pub fn name(&self) -> &'i str { - self.name + pub fn name(&self) -> &str { + &self.name } } -impl<'i> StmtExpr<'i> { - pub fn new(stmt: Stmt<'i>, rule: Rule<'i>, span: Span) -> Self { +#[cfg(feature = "arbitrary")] +impl arbitrary::Arbitrary<'_> for Let { + fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result { + let name = super::arbitrary::Ident::create(u)?; + Ok(Let { name, rule: Rule::arbitrary(u)?, name_span: Span::arbitrary(u)? }) + } + + fn size_hint(depth: usize) -> (usize, Option) { + arbitrary::size_hint::recursion_guard(depth, |depth| { + arbitrary::size_hint::and( + super::arbitrary::Ident::size_hint(depth), + Rule::size_hint(depth), + ) + }) + } +} + +impl StmtExpr { + pub fn new(stmt: Stmt, rule: Rule, span: Span) -> Self { Self { stmt, rule, span } } @@ -70,7 +90,7 @@ impl<'i> StmtExpr<'i> { } Stmt::Let(r#let) => { buf.push_str("let "); - buf.write(r#let.name); + buf.write(&r#let.name); buf.push_str(" = "); r#let.rule.pretty_print(buf, true); buf.write(";\n"); diff --git a/pomsky-syntax/src/exprs/test.rs b/pomsky-syntax/src/exprs/test.rs index 0b8af7a..cdd378a 100644 --- a/pomsky-syntax/src/exprs/test.rs +++ b/pomsky-syntax/src/exprs/test.rs @@ -3,51 +3,58 @@ use crate::Span; use super::Literal; #[derive(Debug, Clone)] -pub struct Test<'i> { - pub cases: Vec>, +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct Test { + pub cases: Vec, pub span: Span, } #[derive(Debug, Clone)] -pub enum TestCase<'i> { - Match(TestCaseMatch<'i>), - MatchAll(TestCaseMatchAll<'i>), - Reject(TestCaseReject<'i>), +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum TestCase { + Match(TestCaseMatch), + MatchAll(TestCaseMatchAll), + Reject(TestCaseReject), } #[derive(Debug, Clone)] -pub struct TestCaseMatch<'i> { - pub 
literal: Literal<'i>, - pub captures: Vec>, +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct TestCaseMatch { + pub literal: Literal, + pub captures: Vec, pub span: Span, } #[derive(Debug, Clone)] -pub struct TestCaseMatchAll<'i> { - pub literal: Literal<'i>, - pub matches: Vec>, +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct TestCaseMatchAll { + pub literal: Literal, + pub matches: Vec, } #[derive(Debug, Clone)] -pub struct TestCaseReject<'i> { - pub literal: Literal<'i>, +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct TestCaseReject { + pub literal: Literal, pub as_substring: bool, } #[derive(Debug, Clone)] -pub struct TestCapture<'i> { - pub ident: CaptureIdent<'i>, +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub struct TestCapture { + pub ident: CaptureIdent, pub ident_span: Span, - pub literal: Literal<'i>, + pub literal: Literal, } -#[derive(Debug, Clone, Copy)] -pub enum CaptureIdent<'i> { - Name(&'i str), +#[derive(Debug, Clone)] +#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] +pub enum CaptureIdent { + Name(String), Index(u16), } -impl<'i> TestCase<'i> { +impl TestCase { #[cfg(feature = "dbg")] pub(super) fn pretty_print(&self, buf: &mut crate::PrettyPrinter) { match self { @@ -87,7 +94,7 @@ impl<'i> TestCase<'i> { } } -impl<'i> TestCaseMatch<'i> { +impl TestCaseMatch { #[cfg(feature = "dbg")] pub(super) fn pretty_print(&self, buf: &mut crate::PrettyPrinter) { self.literal.pretty_print(buf); @@ -97,7 +104,7 @@ impl<'i> TestCaseMatch<'i> { let len = self.captures.len(); for (i, capture) in self.captures.iter().enumerate() { - match capture.ident { + match &capture.ident { CaptureIdent::Name(name) => buf.push_str(name), CaptureIdent::Index(idx) => buf.write_fmt(idx), } diff --git a/pomsky-syntax/src/exprs/var.rs b/pomsky-syntax/src/exprs/var.rs index f7f145c..d510568 100644 --- a/pomsky-syntax/src/exprs/var.rs +++ 
b/pomsky-syntax/src/exprs/var.rs @@ -1,18 +1,30 @@ use crate::Span; #[derive(Debug, Clone, PartialEq, Eq)] -pub struct Variable<'i> { - pub name: &'i str, +pub struct Variable { + pub name: String, pub span: Span, } -impl<'i> Variable<'i> { - pub(crate) fn new(name: &'i str, span: Span) -> Self { - Variable { name, span } +impl Variable { + pub(crate) fn new(name: &str, span: Span) -> Self { + Variable { name: name.to_string(), span } } #[cfg(feature = "dbg")] pub(super) fn pretty_print(&self, buf: &mut crate::PrettyPrinter) { - buf.write(self.name); + buf.write(&self.name); + } +} + +#[cfg(feature = "arbitrary")] +impl arbitrary::Arbitrary<'_> for Variable { + fn arbitrary(u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result { + let name = super::arbitrary::Ident::create(u)?; + Ok(Variable { name, span: Span::arbitrary(u)? }) + } + + fn size_hint(depth: usize) -> (usize, Option) { + super::arbitrary::Ident::size_hint(depth) } } diff --git a/pomsky-syntax/src/parse/parser.rs b/pomsky-syntax/src/parse/parser.rs index 726fec9..f796ab2 100644 --- a/pomsky-syntax/src/parse/parser.rs +++ b/pomsky-syntax/src/parse/parser.rs @@ -13,7 +13,7 @@ use crate::{ /// expression. Note that **pomsky will overflow the stack** when parsing an /// expression with too much nesting, so the `recursion` argument should be low /// enough to prevent that. The recommended default is 256. 
-pub fn parse(source: &str, recursion: u32) -> (Option>, Vec) { +pub fn parse(source: &str, recursion: u32) -> (Option, Vec) { let tokens = tokenize(source); let mut errors = Vec::new(); diff --git a/pomsky-syntax/src/parse/parser_impl.rs b/pomsky-syntax/src/parse/parser_impl.rs index ed6f80e..d45d8e8 100644 --- a/pomsky-syntax/src/parse/parser_impl.rs +++ b/pomsky-syntax/src/parse/parser_impl.rs @@ -18,7 +18,7 @@ type PResult = Result; const MAX_REPETITION: u32 = 65_535; impl<'i> Parser<'i> { - pub(super) fn parse_modified(&mut self) -> PResult> { + pub(super) fn parse_modified(&mut self) -> PResult { let mut stmts = Vec::new(); let was_lazy = self.is_lazy; @@ -72,7 +72,7 @@ impl<'i> Parser<'i> { Ok(rule) } - fn parse_mode_modifier(&mut self) -> PResult, Span)>> { + fn parse_mode_modifier(&mut self) -> PResult> { let mode = if self.consume_reserved("enable") { true } else if self.consume_reserved("disable") { @@ -99,7 +99,7 @@ impl<'i> Parser<'i> { Ok(Some((stmt, span))) } - fn parse_let(&mut self) -> PResult, Span)>> { + fn parse_let(&mut self) -> PResult> { if self.consume_reserved("let") { let span_start = self.last_span(); let name_span = self.span(); @@ -127,7 +127,7 @@ impl<'i> Parser<'i> { } } - fn parse_test(&mut self) -> PResult, Span)>> { + fn parse_test(&mut self) -> PResult> { if self.consume_reserved("test") { let span_start = self.last_span(); self.expect(Token::OpenBrace)?; @@ -147,7 +147,7 @@ impl<'i> Parser<'i> { } } - fn parse_test_cases(&mut self) -> PResult>> { + fn parse_test_cases(&mut self) -> PResult> { if self.consume_contextual_keyword("match") { let mut matches = Vec::new(); let mut literal = None; @@ -192,7 +192,7 @@ impl<'i> Parser<'i> { } } - fn parse_test_match(&mut self) -> PResult> { + fn parse_test_match(&mut self) -> PResult { let Some(literal) = self.parse_literal()? 
else { return Err(PEK::ExpectedToken(Token::String).at(self.span())); }; @@ -222,11 +222,11 @@ impl<'i> Parser<'i> { Ok(TestCaseMatch { literal, captures, span: span_start.join(span_end) }) } - fn parse_test_capture(&mut self) -> PResult>> { + fn parse_test_capture(&mut self) -> PResult> { let ident = if let Some(n) = self.consume_number(u16::MAX)? { CaptureIdent::Index(n) } else if let Some(name) = self.consume_as(Token::Identifier) { - CaptureIdent::Name(name) + CaptureIdent::Name(name.to_string()) } else { return Ok(None); }; @@ -239,7 +239,7 @@ impl<'i> Parser<'i> { Ok(Some(TestCapture { ident, ident_span, literal })) } - fn parse_or(&mut self) -> PResult> { + fn parse_or(&mut self) -> PResult { let mut span = self.span(); let leading_pipe = self.consume(Token::Pipe); @@ -268,7 +268,7 @@ impl<'i> Parser<'i> { } } - fn parse_sequence(&mut self) -> PResult>> { + fn parse_sequence(&mut self) -> PResult> { let mut fixes = Vec::new(); while let Some(fix) = self.parse_fixes()? { fixes.push(fix); @@ -287,7 +287,7 @@ impl<'i> Parser<'i> { }) } - fn parse_fixes(&mut self) -> PResult>> { + fn parse_fixes(&mut self) -> PResult> { let mut nots_span = self.span(); let mut nots = 0usize; while self.consume(Token::Not) { @@ -315,7 +315,7 @@ impl<'i> Parser<'i> { Ok(Some(rule)) } - fn parse_lookaround(&mut self) -> PResult>> { + fn parse_lookaround(&mut self) -> PResult> { let kind = if self.consume(Token::LookAhead) { LookaroundKind::Ahead } else if self.consume(Token::LookBehind) { @@ -410,7 +410,7 @@ impl<'i> Parser<'i> { } } - fn parse_atom(&mut self) -> PResult>> { + fn parse_atom(&mut self) -> PResult> { Ok(self .parse_group()? .try_or_else(|| self.parse_string())? @@ -426,7 +426,7 @@ impl<'i> Parser<'i> { } /// Parses a (possibly capturing) group, e.g. `(E E | E)` or `:name(E)`. 
- fn parse_group(&mut self) -> PResult>> { + fn parse_group(&mut self) -> PResult> { let (kind, start_span) = self.parse_group_kind()?; if !kind.is_normal() { self.expect(Token::OpenParen)?; @@ -448,7 +448,7 @@ impl<'i> Parser<'i> { } /// Parses `:name` or just `:`. Returns the span of the colon with the name. - fn parse_group_kind(&mut self) -> PResult<(GroupKind<'i>, Span)> { + fn parse_group_kind(&mut self) -> PResult<(GroupKind, Span)> { if self.consume_reserved("atomic") { let span = self.last_span(); Ok((GroupKind::Atomic, span)) @@ -479,15 +479,15 @@ impl<'i> Parser<'i> { } /// Parses a string literal. - fn parse_string(&mut self) -> PResult>> { + fn parse_string(&mut self) -> PResult> { Ok(self.parse_literal()?.map(Rule::Literal)) } - fn parse_literal(&mut self) -> PResult>> { + fn parse_literal(&mut self) -> PResult> { if let Some(s) = self.consume_as(Token::String) { let span = self.last_span(); let content = helper::parse_quoted_text(s).map_err(|k| k.at(span))?; - Ok(Some(Literal::new(content, span))) + Ok(Some(Literal::new(content.to_string(), span))) } else { Ok(None) } @@ -498,7 +498,7 @@ impl<'i> Parser<'i> { /// /// This function does _not_ parse exclamation marks in front of a char /// class, because negation is handled separately. - fn parse_char_set(&mut self) -> PResult>> { + fn parse_char_set(&mut self) -> PResult> { if self.consume(Token::OpenBracket) { let start_span = self.last_span(); @@ -654,7 +654,7 @@ impl<'i> Parser<'i> { } } - fn parse_code_point_rule(&mut self) -> PResult>> { + fn parse_code_point_rule(&mut self) -> PResult> { if let Some((c, span)) = self.parse_code_point()? { Ok(Some(Rule::CharClass(CharClass::new( vec![GroupItem::Char(c)], @@ -692,7 +692,7 @@ impl<'i> Parser<'i> { /// This function does _not_ parse negated negated word boundaries (`!%`), /// since negation is handled elsewhere. It also does _not_ parse the /// `Start` and `End` global variables. 
- fn parse_boundary(&mut self) -> Option> { + fn parse_boundary(&mut self) -> Option { let span = self.span(); let kind = if self.consume(Token::Caret) { BoundaryKind::Start @@ -712,7 +712,7 @@ impl<'i> Parser<'i> { /// Parses a reference. Supported syntaxes are `::name`, `::3`, `::+3` and /// `::-3`. - fn parse_reference(&mut self) -> PResult>> { + fn parse_reference(&mut self) -> PResult> { if self.consume(Token::DoubleColon) { let start_span = self.last_span(); @@ -730,7 +730,7 @@ impl<'i> Parser<'i> { let name = self .expect_as(Token::Identifier) .map_err(|p| PEK::Expected("number or group name").at(p.span))?; - ReferenceTarget::Named(name) + ReferenceTarget::Named(name.to_string()) }; let span = start_span.join(self.last_span()); @@ -740,7 +740,7 @@ impl<'i> Parser<'i> { } } - fn parse_range(&mut self) -> PResult>> { + fn parse_range(&mut self) -> PResult> { if self.consume_reserved("range") { let span_start = self.last_span(); @@ -797,7 +797,7 @@ impl<'i> Parser<'i> { } /// Parses an unescaped regex expression (`regex "[test]"`) - fn parse_regex(&mut self) -> PResult>> { + fn parse_regex(&mut self) -> PResult> { if self.consume_reserved("regex") { let span_start = self.last_span(); let lit = self.expect_as(Token::String)?; @@ -806,14 +806,14 @@ impl<'i> Parser<'i> { let content = helper::parse_quoted_text(lit).map_err(|k| k.at(span_end))?; let span = span_start.join(span_end); - Ok(Some(Rule::Regex(Regex::new(content, span)))) + Ok(Some(Rule::Regex(Regex::new(content.to_string(), span)))) } else { Ok(None) } } /// Parses a variable (usage site). 
- fn parse_variable(&mut self) -> PResult>> { + fn parse_variable(&mut self) -> PResult> { if let Some(ident) = self.consume_as(Token::Identifier) { let span1 = self.last_span(); let rule = Rule::Variable(Variable::new(ident, span1)); @@ -827,7 +827,7 @@ impl<'i> Parser<'i> { } /// Parses the dot - fn parse_dot(&mut self) -> Option> { + fn parse_dot(&mut self) -> Option { if self.consume(Token::Dot) { Some(Rule::Dot) } else { @@ -836,7 +836,7 @@ impl<'i> Parser<'i> { } /// Parses the `recursion` keyword - fn parse_recursion(&mut self) -> Option> { + fn parse_recursion(&mut self) -> Option { if self.consume_reserved("recursion") { Some(Rule::Recursion(Recursion { span: self.last_span() })) } else { diff --git a/pomsky-syntax/src/span.rs b/pomsky-syntax/src/span.rs index 9ff9753..419946c 100644 --- a/pomsky-syntax/src/span.rs +++ b/pomsky-syntax/src/span.rs @@ -93,3 +93,14 @@ impl Debug for Span { write!(f, "Span({}..{})", self.start, self.end) } } + +#[cfg(feature = "arbitrary")] +impl arbitrary::Arbitrary<'_> for Span { + fn arbitrary(_u: &mut arbitrary::Unstructured<'_>) -> arbitrary::Result { + Ok(Span { start: 0, end: 0 }) + } + + fn size_hint(_depth: usize) -> (usize, Option) { + (0, Some(0)) + } +}