From 43eff4397f58cc23c7a9d1435af99e5892e6f671 Mon Sep 17 00:00:00 2001 From: Luke Sandberg Date: Mon, 12 May 2025 14:51:25 -0700 Subject: [PATCH] globset: support nested alternates For example, `**/{node_modules/**/*/{ts,js},crates/**/*.{rs,toml}`. I originally didn't add this I think for implementation simplicity, but it turns out that it really isn't much work to do. There might have also been some odd behavior in the regex engine for dealing with empty alternates, but that has all been long fixed. Closes #3048, Closes #3112 --- CHANGELOG.md | 2 ++ crates/globset/src/glob.rs | 71 +++++++++++++++++++++++--------------- crates/globset/src/lib.rs | 7 ++-- 3 files changed, 51 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 605e9ef5..5aff37e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,8 @@ Feature enhancements: When using multithreading, schedule files to search in order given on CLI. * [FEATURE #2943](https://github.com/BurntSushi/ripgrep/issues/2943): Add `aarch64` release artifacts for Windows. +* [FEATURE #3048](https://github.com/BurntSushi/ripgrep/pull/3048): + Globs in ripgrep (and the `globset` crate) now support nested alternates. 14.1.1 (2024-09-08) diff --git a/crates/globset/src/glob.rs b/crates/globset/src/glob.rs index 8cc7645f..378f3f4a 100644 --- a/crates/globset/src/glob.rs +++ b/crates/globset/src/glob.rs @@ -583,25 +583,26 @@ impl<'a> GlobBuilder<'a> { pub fn build(&self) -> Result { let mut p = Parser { glob: &self.glob, - stack: vec![Tokens::default()], + alternates_stack: Vec::new(), + branches: vec![Tokens::default()], chars: self.glob.chars().peekable(), prev: None, cur: None, opts: &self.opts, }; p.parse()?; - if p.stack.is_empty() { - Err(Error { - glob: Some(self.glob.to_string()), - kind: ErrorKind::UnopenedAlternates, - }) - } else if p.stack.len() > 1 { + if p.branches.is_empty() { + // OK because of how the the branches/alternate_stack are managed. + // If we end up here, then there *must* be a bug in the parser + // somewhere. + unreachable!() + } else if p.branches.len() > 1 { Err(Error { glob: Some(self.glob.to_string()), kind: ErrorKind::UnclosedAlternates, }) } else { - let tokens = p.stack.pop().unwrap(); + let tokens = p.branches.pop().unwrap(); Ok(Glob { glob: self.glob.to_string(), re: tokens.to_regex_with(&self.opts), @@ -776,7 +777,11 @@ fn bytes_to_escaped_literal(bs: &[u8]) -> String { struct Parser<'a> { glob: &'a str, - stack: Vec, + // Marks the index in `stack` where the alternation started. + alternates_stack: Vec, + // The set of active alternation branches being parsed. + // Tokens are added to the end of the last one. + branches: Vec, chars: std::iter::Peekable>, prev: Option, cur: Option, @@ -805,36 +810,37 @@ impl<'a> Parser<'a> { } fn push_alternate(&mut self) -> Result<(), Error> { - if self.stack.len() > 1 { - return Err(self.error(ErrorKind::NestedAlternates)); - } - Ok(self.stack.push(Tokens::default())) + self.alternates_stack.push(self.branches.len()); + self.branches.push(Tokens::default()); + Ok(()) } fn pop_alternate(&mut self) -> Result<(), Error> { - let mut alts = vec![]; - while self.stack.len() >= 2 { - alts.push(self.stack.pop().unwrap()); - } - self.push_token(Token::Alternates(alts)) + let Some(start) = self.alternates_stack.pop() else { + return Err(self.error(ErrorKind::UnopenedAlternates)); + }; + assert!(start <= self.branches.len()); + let alts = Token::Alternates(self.branches.drain(start..).collect()); + self.push_token(alts)?; + Ok(()) } fn push_token(&mut self, tok: Token) -> Result<(), Error> { - if let Some(ref mut pat) = self.stack.last_mut() { + if let Some(ref mut pat) = self.branches.last_mut() { return Ok(pat.push(tok)); } Err(self.error(ErrorKind::UnopenedAlternates)) } fn pop_token(&mut self) -> Result { - if let Some(ref mut pat) = self.stack.last_mut() { + if let Some(ref mut pat) = self.branches.last_mut() { return Ok(pat.pop().unwrap()); } Err(self.error(ErrorKind::UnopenedAlternates)) } fn have_tokens(&self) -> Result { - match self.stack.last() { + match self.branches.last() { None => Err(self.error(ErrorKind::UnopenedAlternates)), Some(ref pat) => Ok(!pat.is_empty()), } @@ -843,11 +849,11 @@ impl<'a> Parser<'a> { fn parse_comma(&mut self) -> Result<(), Error> { // If we aren't inside a group alternation, then don't // treat commas specially. Otherwise, we need to start - // a new alternate. - if self.stack.len() <= 1 { + // a new alternate branch. + if self.alternates_stack.is_empty() { self.push_token(Token::Literal(',')) } else { - Ok(self.stack.push(Tokens::default())) + Ok(self.branches.push(Tokens::default())) } } @@ -884,7 +890,7 @@ impl<'a> Parser<'a> { } if !prev.map(is_separator).unwrap_or(false) { - if self.stack.len() <= 1 + if self.branches.len() <= 1 || (prev != Some(',') && prev != Some('{')) { self.push_token(Token::ZeroOrMore)?; @@ -897,7 +903,7 @@ impl<'a> Parser<'a> { assert!(self.bump().is_none()); true } - Some(',') | Some('}') if self.stack.len() >= 2 => true, + Some(',') | Some('}') if self.branches.len() >= 2 => true, Some(c) if is_separator(c) => { assert!(self.bump().map(is_separator).unwrap_or(false)); false @@ -1225,6 +1231,10 @@ mod tests { syntaxerr!(err_unclosed4, "[!]", ErrorKind::UnclosedClass); syntaxerr!(err_range1, "[z-a]", ErrorKind::InvalidRange('z', 'a')); syntaxerr!(err_range2, "[z--]", ErrorKind::InvalidRange('z', '-')); + syntaxerr!(err_alt1, "{a,b", ErrorKind::UnclosedAlternates); + syntaxerr!(err_alt2, "{a,{b,c}", ErrorKind::UnclosedAlternates); + syntaxerr!(err_alt3, "a,b}", ErrorKind::UnopenedAlternates); + syntaxerr!(err_alt4, "{a,b}}", ErrorKind::UnopenedAlternates); const CASEI: Options = Options { casei: Some(true), litsep: None, bsesc: None, ealtre: None }; @@ -1245,6 +1255,8 @@ mod tests { ealtre: Some(true), }; + toregex!(re_empty, "", "^$"); + toregex!(re_casei, "a", "(?i)^a$", &CASEI); toregex!(re_slash1, "?", r"^[^/]$", SLASHLIT); @@ -1284,7 +1296,9 @@ mod tests { toregex!(re32, "/a**", r"^/a.*.*$"); toregex!(re33, "/**a", r"^/.*.*a$"); toregex!(re34, "/a**b", r"^/a.*.*b$"); - toregex!(re35, "{a,b}", r"^(?:b|a)$"); + toregex!(re35, "{a,b}", r"^(?:a|b)$"); + toregex!(re36, "{a,{b,c}}", r"^(?:a|(?:b|c))$"); + toregex!(re37, "{{a,b},{c,d}}", r"^(?:(?:a|b)|(?:c|d))$"); matches!(match1, "a", "a"); matches!(match2, "a*b", "a_b"); @@ -1372,6 +1386,9 @@ mod tests { matches!(matchalt14, "foo{,.txt}", "foo.txt"); nmatches!(matchalt15, "foo{,.txt}", "foo"); matches!(matchalt16, "foo{,.txt}", "foo", EALTRE); + matches!(matchalt17, "{a,b{c,d}}", "bc"); + matches!(matchalt18, "{a,b{c,d}}", "bd"); + matches!(matchalt19, "{a,b{c,d}}", "a"); matches!(matchslash1, "abc/def", "abc/def", SLASHLIT); #[cfg(unix)] diff --git a/crates/globset/src/lib.rs b/crates/globset/src/lib.rs index 623d7f9d..4f0c729b 100644 --- a/crates/globset/src/lib.rs +++ b/crates/globset/src/lib.rs @@ -182,8 +182,11 @@ pub enum ErrorKind { UnopenedAlternates, /// Occurs when a `{` is found without a matching `}`. UnclosedAlternates, - /// Occurs when an alternating group is nested inside another alternating - /// group, e.g., `{{a,b},{c,d}}`. + /// **DEPRECATED**. + /// + /// This error used to occur when an alternating group was nested inside + /// another alternating group, e.g., `{{a,b},{c,d}}`. However, this is now + /// supported and as such this error cannot occur. NestedAlternates, /// Occurs when an unescaped '\' is found at the end of a glob. DanglingEscape,