grep: upgrade to regex-syntax 0.5

This update brings with it many bug fixes:

  * Better error messages are printed overall, including explicit
    call-outs for unsupported features such as backreferences and
    look-around.
  * Regexes like `\s*{` no longer emit incomprehensible errors.
  * Unicode escape sequences, such as `\u{..}`, are now supported.
    (See the sketch after this list.)
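
As a quick illustration, here is a minimal sketch (not part of this
change) that drives regex-syntax 0.5's `ParserBuilder` directly, which
is the same parser the `grep` crate now uses. The `regex_syntax`
dependency and the example patterns are assumptions made just for this
sketch:

    extern crate regex_syntax; // assumed to be regex-syntax 0.5

    use regex_syntax::ParserBuilder;

    fn main() {
        // `\u{..}` escapes now parse into the expected Unicode literal.
        let hir = ParserBuilder::new().build().parse(r"\u{00E9}").unwrap();
        println!("parsed: {}", hir);

        // Look-around and backreferences remain unsupported, but the
        // parser now says so explicitly instead of failing obscurely.
        if let Err(err) = ParserBuilder::new().build().parse(r"(?<=foo)bar") {
            println!("{}", err);
        }
    }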

For the most part, this upgrade was done in a straightforward way. We
resisted the urge to refactor the `grep` crate, in anticipation of it
being rewritten anyway.

Note that we removed the `--fixed-strings` suggestion that was printed
whenever a regex syntax error occurred. In practice, I found that it
produced a lot of false positives, and I believe it is much less
necessary now that regex parse errors are far more readable.
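
To see what the new parse errors look like, here is a throwaway sketch
(not part of this change) that feeds the same invalid pattern the old
test used, `foo(`, to regex-syntax 0.5 directly; the exact wording and
formatting of the message come from that crate:

    extern crate regex_syntax; // assumed to be regex-syntax 0.5

    use regex_syntax::ParserBuilder;

    fn main() {
        // An unclosed group such as `foo(` now yields an error that points
        // at the offending part of the pattern, which is why the extra
        // `--fixed-strings` hint carries less weight than it used to.
        if let Err(err) = ParserBuilder::new().build().parse("foo(") {
            eprintln!("{}", err);
        }
    }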

Closes #268, Closes #395, Closes #702, Closes #853
Author: Andrew Gallant
Date:   2018-03-13 20:38:50 -04:00
Parent: c2e97cd858
Commit: cd08707c7c

9 changed files with 152 additions and 159 deletions

CHANGELOG.md

@@ -14,11 +14,16 @@ CPUs (such as AVX2) for additional optimizations.
   per line. Previously, the behavior of ripgrep was to report the total number
   of matching lines. (Note that this behavior diverges from the behavior of
   GNU grep.)
+* Octal syntax is no longer supported. ripgrep previously accepted expressions
+  like `\1` as syntax for matching `U+0001`, but ripgrep will now report an
+  error instead.

 Feature enhancements:

 * [FEATURE #411](https://github.com/BurntSushi/ripgrep/issues/411):
   Add a `--stats` flag, which emits aggregate statistics after search results.
+* [FEATURE #702](https://github.com/BurntSushi/ripgrep/issues/702):
+  Support `\u{..}` Unicode escape sequences.
 * [FEATURE #812](https://github.com/BurntSushi/ripgrep/issues/812):
   Add `-b/--byte-offset` flag that reports byte offset of each matching line.
 * [FEATURE #814](https://github.com/BurntSushi/ripgrep/issues/814):
@@ -29,12 +34,19 @@ Bug fixes:
 * [BUG #135](https://github.com/BurntSushi/ripgrep/issues/135):
   Release portable binaries that conditionally use SSSE3, AVX2, etc., at
   runtime.
+* [BUG #268](https://github.com/BurntSushi/ripgrep/issues/268):
+  Print descriptive error message when trying to use look-around or
+  backreferences.
+* [BUG #395](https://github.com/BurntSushi/ripgrep/issues/395):
+  Show comprehensible error messages for regexes like `\s*{`.
 * [BUG #526](https://github.com/BurntSushi/ripgrep/issues/526):
   Support backslash escapes in globs.
 * [BUG #832](https://github.com/BurntSushi/ripgrep/issues/832):
   Clarify usage instructions for `-f/--file` flag.
 * [BUG #852](https://github.com/BurntSushi/ripgrep/issues/852):
   Be robust with respect to `ENOMEM` errors returned by `mmap`.
+* [BUG #853](https://github.com/BurntSushi/ripgrep/issues/853):
+  Upgrade `grep` crate to `regex-syntax 0.5.0`.

 0.8.1 (2018-02-20)

Cargo.lock (generated)

@@ -109,7 +109,7 @@ dependencies = [
  "log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "regex 0.2.9 (registry+https://github.com/rust-lang/crates.io-index)",
- "regex-syntax 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex-syntax 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

 [[package]]
@@ -212,19 +212,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
  "aho-corasick 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
  "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
- "regex-syntax 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex-syntax 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
  "thread_local 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
  "utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

 [[package]]
 name = "regex-syntax"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-
-[[package]]
-name = "regex-syntax"
-version = "0.5.2"
+version = "0.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
  "ucd-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -401,8 +396,7 @@ dependencies = [
 "checksum redox_syscall 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = "0d92eecebad22b767915e4d529f89f28ee96dbbf5a4810d2b844373f136417fd"
 "checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76"
 "checksum regex 0.2.9 (registry+https://github.com/rust-lang/crates.io-index)" = "bde64a9b799f85750f6469fd658cff5fce8d910a7d510858a1f9d15ca9f023bf"
-"checksum regex-syntax 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "8e931c58b93d86f080c734bfd2bce7dd0079ae2331235818133c8be7f422e20e"
-"checksum regex-syntax 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d8337992c33b62cf49462a1ce4d0eedbb021f48de017e40ec307cca445df0ca9"
+"checksum regex-syntax 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "b2550876c31dc914696a6c2e01cbce8afba79a93c8ae979d2fe051c0230b3756"
 "checksum same-file 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "cfb6eded0b06a0b512c8ddbcf04089138c9b4362c2f696f3c3d76039d68f3637"
 "checksum simd 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3dd0805c7363ab51a829a1511ad24b6ed0349feaa756c4bc2f977f9f496e6673"
 "checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550"

grep/Cargo.toml

@@ -16,4 +16,4 @@ license = "Unlicense/MIT"
 log = "0.4"
 memchr = "2"
 regex = "0.2.9"
-regex-syntax = "0.4.0"
+regex-syntax = "0.5.3"

grep/src/literals.rs

@@ -10,10 +10,8 @@ principled.
 use std::cmp;

 use regex::bytes::RegexBuilder;
-use syntax::{
-    Expr, Literals, Lit,
-    ByteClass, ByteRange, CharClass, ClassRange, Repeater,
-};
+use syntax::hir::{self, Hir, HirKind};
+use syntax::hir::literal::{Literal, Literals};

 #[derive(Clone, Debug)]
 pub struct LiteralSets {
@@ -23,12 +21,12 @@ pub struct LiteralSets {
 }

 impl LiteralSets {
-    pub fn create(expr: &Expr) -> Self {
+    pub fn create(expr: &Hir) -> Self {
         let mut required = Literals::empty();
         union_required(expr, &mut required);
         LiteralSets {
-            prefixes: expr.prefixes(),
-            suffixes: expr.suffixes(),
+            prefixes: Literals::prefixes(expr),
+            suffixes: Literals::suffixes(expr),
             required: required,
         }
     }
@@ -93,60 +91,52 @@ impl LiteralSets {
     }
 }

-fn union_required(expr: &Expr, lits: &mut Literals) {
-    use syntax::Expr::*;
-    match *expr {
-        Literal { ref chars, casei: false } => {
-            let s: String = chars.iter().cloned().collect();
-            lits.cross_add(s.as_bytes());
-        }
-        Literal { ref chars, casei: true } => {
-            for &c in chars {
-                let cls = CharClass::new(vec![
-                    ClassRange { start: c, end: c },
-                ]).case_fold();
-                if !lits.add_char_class(&cls) {
-                    lits.cut();
-                    return;
-                }
-            }
-        }
-        LiteralBytes { ref bytes, casei: false } => {
-            lits.cross_add(bytes);
-        }
-        LiteralBytes { ref bytes, casei: true } => {
-            for &b in bytes {
-                let cls = ByteClass::new(vec![
-                    ByteRange { start: b, end: b },
-                ]).case_fold();
-                if !lits.add_byte_class(&cls) {
-                    lits.cut();
-                    return;
-                }
-            }
-        }
-        Class(_) => {
-            lits.cut();
-        }
-        ClassBytes(_) => {
-            lits.cut();
-        }
-        Group { ref e, .. } => {
-            union_required(&**e, lits);
-        }
-        Repeat { r: Repeater::ZeroOrOne, .. } => lits.cut(),
-        Repeat { r: Repeater::ZeroOrMore, .. } => lits.cut(),
-        Repeat { ref e, r: Repeater::OneOrMore, .. } => {
-            union_required(&**e, lits);
-            lits.cut();
-        }
-        Repeat { ref e, r: Repeater::Range { min, max }, greedy } => {
-            repeat_range_literals(
-                &**e, min, max, greedy, lits, union_required);
-        }
-        Concat(ref es) if es.is_empty() => {}
-        Concat(ref es) if es.len() == 1 => union_required(&es[0], lits),
-        Concat(ref es) => {
+fn union_required(expr: &Hir, lits: &mut Literals) {
+    match *expr.kind() {
+        HirKind::Literal(hir::Literal::Unicode(c)) => {
+            let mut buf = [0u8; 4];
+            lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
+        }
+        HirKind::Literal(hir::Literal::Byte(b)) => {
+            lits.cross_add(&[b]);
+        }
+        HirKind::Class(hir::Class::Unicode(ref cls)) => {
+            if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) {
+                lits.cut();
+            }
+        }
+        HirKind::Class(hir::Class::Bytes(ref cls)) => {
+            if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) {
+                lits.cut();
+            }
+        }
+        HirKind::Group(hir::Group { ref hir, .. }) => {
+            union_required(&**hir, lits);
+        }
+        HirKind::Repetition(ref x) => {
+            match x.kind {
+                hir::RepetitionKind::ZeroOrOne => lits.cut(),
+                hir::RepetitionKind::ZeroOrMore => lits.cut(),
+                hir::RepetitionKind::OneOrMore => {
+                    union_required(&x.hir, lits);
+                    lits.cut();
+                }
+                hir::RepetitionKind::Range(ref rng) => {
+                    let (min, max) = match *rng {
+                        hir::RepetitionRange::Exactly(m) => (m, Some(m)),
+                        hir::RepetitionRange::AtLeast(m) => (m, None),
+                        hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
+                    };
+                    repeat_range_literals(
+                        &x.hir, min, max, x.greedy, lits, union_required);
+                }
+            }
+        }
+        HirKind::Concat(ref es) if es.is_empty() => {}
+        HirKind::Concat(ref es) if es.len() == 1 => {
+            union_required(&es[0], lits)
+        }
+        HirKind::Concat(ref es) => {
             for e in es {
                 let mut lits2 = lits.to_empty();
                 union_required(e, &mut lits2);
@@ -157,7 +147,6 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
             if lits2.contains_empty() {
                 lits.cut();
             }
-            // if !lits.union(lits2) {
             if !lits.cross_product(&lits2) {
                 // If this expression couldn't yield any literal that
                 // could be extended, then we need to quit. Since we're
@@ -167,15 +156,15 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
                 }
             }
         }
-        Alternate(ref es) => {
+        HirKind::Alternation(ref es) => {
             alternate_literals(es, lits, union_required);
         }
         _ => lits.cut(),
     }
 }

-fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
-    e: &Expr,
+fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
+    e: &Hir,
     min: u32,
     max: Option<u32>,
     _greedy: bool,
@@ -204,8 +193,8 @@ fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
     }
 }

-fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
-    es: &[Expr],
+fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
+    es: &[Hir],
     lits: &mut Literals,
     mut f: F,
 ) {
@@ -234,11 +223,21 @@ fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
     }
     lits.cut();
     if !lcs.is_empty() {
-        lits.add(Lit::empty());
-        lits.add(Lit::new(lcs.to_vec()));
+        lits.add(Literal::empty());
+        lits.add(Literal::new(lcs.to_vec()));
     }
 }

+/// Return the number of characters in the given class.
+fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
+    cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
+}
+
+/// Return the number of bytes in the given class.
+fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
+    cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
+}
+
 /// Converts an arbitrary sequence of bytes to a literal suitable for building
 /// a regular expression.
 fn bytes_to_regex(bs: &[u8]) -> String {

grep/src/nonl.rs

@@ -1,4 +1,4 @@
-use syntax::Expr;
+use syntax::hir::{self, Hir, HirKind};

 use {Error, Result};

@@ -9,59 +9,66 @@ use {Error, Result};
 ///
 /// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this
 /// function panics.
-pub fn remove(expr: Expr, byte: u8) -> Result<Expr> {
-    // TODO(burntsushi): There is a bug in this routine where only `\n` is
-    // handled correctly. Namely, `AnyChar` and `AnyByte` need to be translated
-    // to proper character classes instead of the special `AnyCharNoNL` and
-    // `AnyByteNoNL` classes.
-    use syntax::Expr::*;
+pub fn remove(expr: Hir, byte: u8) -> Result<Hir> {
     assert!(byte <= 0x7F);
     let chr = byte as char;
     assert!(chr.len_utf8() == 1);
-    Ok(match expr {
-        Literal { chars, casei } => {
-            if chars.iter().position(|&c| c == chr).is_some() {
-                return Err(Error::LiteralNotAllowed(chr));
-            }
-            Literal { chars: chars, casei: casei }
-        }
-        LiteralBytes { bytes, casei } => {
-            if bytes.iter().position(|&b| b == byte).is_some() {
-                return Err(Error::LiteralNotAllowed(chr));
-            }
-            LiteralBytes { bytes: bytes, casei: casei }
-        }
-        AnyChar => AnyCharNoNL,
-        AnyByte => AnyByteNoNL,
-        Class(mut cls) => {
-            cls.remove(chr);
-            Class(cls)
-        }
-        ClassBytes(mut cls) => {
-            cls.remove(byte);
-            ClassBytes(cls)
-        }
-        Group { e, i, name } => {
-            Group {
-                e: Box::new(remove(*e, byte)?),
-                i: i,
-                name: name,
-            }
-        }
-        Repeat { e, r, greedy } => {
-            Repeat {
-                e: Box::new(remove(*e, byte)?),
-                r: r,
-                greedy: greedy,
-            }
-        }
-        Concat(exprs) => {
-            Concat(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?)
-        }
-        Alternate(exprs) => {
-            Alternate(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?)
-        }
-        e => e,
+    Ok(match expr.into_kind() {
+        HirKind::Empty => Hir::empty(),
+        HirKind::Literal(hir::Literal::Unicode(c)) => {
+            if c == chr {
+                return Err(Error::LiteralNotAllowed(chr));
+            }
+            Hir::literal(hir::Literal::Unicode(c))
+        }
+        HirKind::Literal(hir::Literal::Byte(b)) => {
+            if b as char == chr {
+                return Err(Error::LiteralNotAllowed(chr));
+            }
+            Hir::literal(hir::Literal::Byte(b))
+        }
+        HirKind::Class(hir::Class::Unicode(mut cls)) => {
+            let remove = hir::ClassUnicode::new(Some(
+                hir::ClassUnicodeRange::new(chr, chr),
+            ));
+            cls.difference(&remove);
+            if cls.iter().next().is_none() {
+                return Err(Error::LiteralNotAllowed(chr));
+            }
+            Hir::class(hir::Class::Unicode(cls))
+        }
+        HirKind::Class(hir::Class::Bytes(mut cls)) => {
+            let remove = hir::ClassBytes::new(Some(
+                hir::ClassBytesRange::new(byte, byte),
+            ));
+            cls.difference(&remove);
+            if cls.iter().next().is_none() {
+                return Err(Error::LiteralNotAllowed(chr));
+            }
+            Hir::class(hir::Class::Bytes(cls))
+        }
+        HirKind::Anchor(x) => Hir::anchor(x),
+        HirKind::WordBoundary(x) => Hir::word_boundary(x),
+        HirKind::Repetition(mut x) => {
+            x.hir = Box::new(remove(*x.hir, byte)?);
+            Hir::repetition(x)
+        }
+        HirKind::Group(mut x) => {
+            x.hir = Box::new(remove(*x.hir, byte)?);
+            Hir::group(x)
+        }
+        HirKind::Concat(xs) => {
+            let xs = xs.into_iter()
+                .map(|e| remove(e, byte))
+                .collect::<Result<Vec<Hir>>>()?;
+            Hir::concat(xs)
+        }
+        HirKind::Alternation(xs) => {
+            let xs = xs.into_iter()
+                .map(|e| remove(e, byte))
+                .collect::<Result<Vec<Hir>>>()?;
+            Hir::alternation(xs)
+        }
     })
 }

grep/src/lib.rs

@@ -1,10 +1,10 @@
 use memchr::{memchr, memrchr};
 use regex::bytes::{Regex, RegexBuilder};
-use syntax;

 use literals::LiteralSets;
 use nonl;
-use syntax::Expr;
+use syntax::ParserBuilder;
+use syntax::hir::Hir;
 use word_boundary::strip_unicode_word_boundaries;
 use Result;
@@ -166,7 +166,7 @@ impl GrepBuilder {
     /// Creates a new regex from the given expression with the current
     /// configuration.
-    fn regex(&self, expr: &Expr) -> Result<Regex> {
+    fn regex(&self, expr: &Hir) -> Result<Regex> {
         let mut builder = RegexBuilder::new(&expr.to_string());
         builder.unicode(true);
         self.regex_build(builder)
@@ -184,15 +184,16 @@ impl GrepBuilder {
     /// Parses the underlying pattern and ensures the pattern can never match
     /// the line terminator.
-    fn parse(&self) -> Result<syntax::Expr> {
-        let expr =
-            syntax::ExprBuilder::new()
-            .allow_bytes(true)
-            .unicode(true)
+    fn parse(&self) -> Result<Hir> {
+        let expr = ParserBuilder::new()
+            .allow_invalid_utf8(true)
             .case_insensitive(self.is_case_insensitive()?)
+            .multi_line(true)
+            .build()
             .parse(&self.pattern)?;
+        debug!("original regex HIR pattern:\n{}", expr);
         let expr = nonl::remove(expr, self.opts.line_terminator)?;
-        debug!("regex ast:\n{:#?}", expr);
+        debug!("transformed regex HIR pattern:\n{}", expr);
         Ok(expr)
     }

grep/src/word_boundary.rs

@@ -1,4 +1,4 @@
-use syntax::Expr;
+use syntax::hir::{self, Hir, HirKind};

 /// Strips Unicode word boundaries from the given expression.
 ///
@@ -8,7 +8,7 @@ use syntax::Expr;
 /// false negatives.
 ///
 /// If no word boundaries could be stripped, then None is returned.
-pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
+pub fn strip_unicode_word_boundaries(expr: &Hir) -> Option<Hir> {
     // The real reason we do this is because Unicode word boundaries are the
     // one thing that Rust's regex DFA engine can't handle. When it sees a
     // Unicode word boundary among non-ASCII text, it falls back to one of the
@@ -16,23 +16,24 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
     // a regex to find candidate matches without a Unicode word boundary. We'll
     // only then use the full (and slower) regex to confirm a candidate as a
     // match or not during search.
-    use syntax::Expr::*;
-
-    match *expr {
-        Concat(ref es) if !es.is_empty() => {
+    //
+    // It looks like we only check the outer edges for `\b`? I guess this is
+    // an attempt to optimize for the `-w/--word-regexp` flag? ---AG
+    match *expr.kind() {
+        HirKind::Concat(ref es) if !es.is_empty() => {
             let first = is_unicode_word_boundary(&es[0]);
             let last = is_unicode_word_boundary(es.last().unwrap());
             // Be careful not to strip word boundaries if there are no other
             // expressions to match.
             match (first, last) {
                 (true, false) if es.len() > 1 => {
-                    Some(Concat(es[1..].to_vec()))
+                    Some(Hir::concat(es[1..].to_vec()))
                 }
                 (false, true) if es.len() > 1 => {
-                    Some(Concat(es[..es.len() - 1].to_vec()))
+                    Some(Hir::concat(es[..es.len() - 1].to_vec()))
                 }
                 (true, true) if es.len() > 2 => {
-                    Some(Concat(es[1..es.len() - 1].to_vec()))
+                    Some(Hir::concat(es[1..es.len() - 1].to_vec()))
                 }
                 _ => None,
             }
@@ -42,13 +43,11 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
 }

 /// Returns true if the given expression is a Unicode word boundary.
-fn is_unicode_word_boundary(expr: &Expr) -> bool {
-    use syntax::Expr::*;
-
-    match *expr {
-        WordBoundary => true,
-        NotWordBoundary => true,
-        Group { ref e, .. } => is_unicode_word_boundary(e),
+fn is_unicode_word_boundary(expr: &Hir) -> bool {
+    match *expr.kind() {
+        HirKind::WordBoundary(hir::WordBoundary::Unicode) => true,
+        HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => true,
+        HirKind::Group(ref x) => is_unicode_word_boundary(&x.hir),
         _ => false,
     }
 }

src/args.rs

@@ -9,7 +9,7 @@ use std::sync::atomic::{AtomicBool, Ordering};
 use clap;
 use encoding_rs::Encoding;
-use grep::{Grep, GrepBuilder, Error as GrepError};
+use grep::{Grep, GrepBuilder};
 use log;
 use num_cpus;
 use regex;
@@ -882,16 +882,7 @@ impl<'a> ArgMatches<'a> {
         if let Some(limit) = self.regex_size_limit()? {
             gb = gb.size_limit(limit);
         }
-        gb.build().map_err(|err| {
-            match err {
-                GrepError::Regex(err) => {
-                    let s = format!("{}\n(Hint: Try the --fixed-strings flag \
-                                     to search for a literal string.)", err.to_string());
-                    From::from(s)
-                },
-                err => From::from(err)
-            }
-        })
+        Ok(gb.build()?)
     }

     /// Builds the set of glob overrides from the command line flags.

tests/tests.rs

@@ -1687,16 +1687,6 @@ sherlock!(feature_419_zero_as_shortcut_for_null, "Sherlock", ".",
     assert_eq!(lines, "sherlock\x002\n");
 });

-// See: https://github.com/BurntSushi/ripgrep/issues/709
-clean!(suggest_fixed_strings_for_invalid_regex, "foo(", ".",
-|wd: WorkDir, mut cmd: Command| {
-    wd.assert_non_empty_stderr(&mut cmd);
-
-    let output = cmd.output().unwrap();
-    let err = String::from_utf8_lossy(&output.stderr);
-    assert_eq!(err.contains("--fixed-strings"), true);
-});
-
 #[test]
 fn compressed_gzip() {
     if !cmd_exists("gzip") {