regex: some small cleanup in 'strip.rs'

We also utilize bstr's methods to get rid of some helpers we had written by hand.
2025-05-19 09:40:22 -07:00 · 2023-06-16 15:01:30 -04:00 · 2023-06-16 15:01:30 -04:00 · d9bd261be8
commit d9bd261be8
parent 9d62eb997a
5 changed files with 48 additions and 61 deletions
--- a/crates/regex/Cargo.toml
+++ b/crates/regex/Cargo.toml
@ -11,7 +11,7 @@ repository = "https://github.com/BurntSushi/ripgrep/tree/master/crates/regex"
 readme = "README.md"
 keywords = ["regex", "grep", "search", "pattern", "line"]
 license = "Unlicense OR MIT"
-edition = "2018"
+edition = "2021"
 [dependencies]
 aho-corasick = "1.0.2"
--- a/crates/regex/src/error.rs
+++ b/crates/regex/src/error.rs
@ -1,8 +1,3 @@
 use std::error;
 use std::fmt;
 use crate::util;
 /// An error that can occur in this crate.
 ///
 /// Generally, this error corresponds to problems building a regular
@ -32,7 +27,7 @@ impl Error {
        }
    }
-    pub(crate) fn generic<E: error::Error>(err: E) -> Error {
+    pub(crate) fn generic<E: std::error::Error>(err: E) -> Error {
        Error { kind: ErrorKind::Regex(err.to_string()) }
    }
@ -68,18 +63,23 @@ pub enum ErrorKind {
    InvalidLineTerminator(u8),
 }
-impl error::Error for Error {}
+impl std::error::Error for Error {}
 impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        use bstr::ByteSlice;
 impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self.kind {
            ErrorKind::Regex(ref s) => write!(f, "{}", s),
            ErrorKind::NotAllowed(ref lit) => {
-                write!(f, "the literal '{:?}' is not allowed in a regex", lit)
+                write!(f, "the literal {:?} is not allowed in a regex", lit)
            }
            ErrorKind::InvalidLineTerminator(byte) => {
-                let x = util::show_bytes(&[byte]);
+                write!(
-                write!(f, "line terminators must be ASCII, but '{}' is not", x)
+                    f,
                    "line terminators must be ASCII, but {} is not",
                    [byte].as_bstr()
                )
            }
        }
    }
--- a/crates/regex/src/lib.rs
+++ b/crates/regex/src/lib.rs
@ -14,5 +14,4 @@ mod matcher;
 mod multi;
 mod non_matching;
 mod strip;
 mod util;
 mod word;
--- a/crates/regex/src/strip.rs
+++ b/crates/regex/src/strip.rs
@ -1,5 +1,7 @@
-use grep_matcher::LineTerminator;
+use {
-use regex_syntax::hir::{self, Hir, HirKind};
+    grep_matcher::LineTerminator,
    regex_syntax::hir::{self, Hir, HirKind},
 };
 use crate::error::{Error, ErrorKind};
@ -15,7 +17,26 @@ use crate::error::{Error, ErrorKind};
 ///
 /// If the given line terminator is not ASCII, then this function returns an
 /// error.
-pub fn strip_from_match(
+///
 /// Note that as of regex 1.9, this routine could theoretically be implemented
 /// without returning an error. Namely, for example, we could turn
 /// `foo\nbar` into `foo[a&&b]bar`. That is, replace line terminators with a
 /// sub-expression that can never match anything. Thus, ripgrep would accept
 /// such regexes and just silently not match anything. Regex versions prior to 1.8
 /// don't support such constructs. I ended up deciding to leave the existing
 /// behavior of returning an error instead. For example:
 ///
 /// ```text
 /// $ echo -n 'foo\nbar\n' | rg 'foo\nbar'
 /// the literal "\n" is not allowed in a regex
 ///
 /// Consider enabling multiline mode with the --multiline flag (or -U for short).
 /// When multiline mode is enabled, new line characters can be matched.
 /// ```
 ///
 /// This looks like a good error message to me, and even suggests a flag that
 /// the user can use instead.
 pub(crate) fn strip_from_match(
    expr: Hir,
    line_term: LineTerminator,
 ) -> Result<Hir, Error> {
@ -23,23 +44,20 @@ pub fn strip_from_match(
        let expr1 = strip_from_match_ascii(expr, b'\r')?;
        strip_from_match_ascii(expr1, b'\n')
    } else {
-        let b = line_term.as_byte();
+        strip_from_match_ascii(expr, line_term.as_byte())
        if b > 0x7F {
            return Err(Error::new(ErrorKind::InvalidLineTerminator(b)));
        }
        strip_from_match_ascii(expr, b)
    }
 }
-/// The implementation of strip_from_match. The given byte must be ASCII. This
+/// The implementation of strip_from_match. The given byte must be ASCII.
-/// function panics otherwise.
+/// This function returns an error otherwise. It also returns an error if
 /// it couldn't remove the given byte from the regex without leaving an empty
 /// character class in its place.
 fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result<Hir, Error> {
-    assert!(byte <= 0x7F);
+    if !byte.is_ascii() {
-    let chr = byte as char;
+        return Err(Error::new(ErrorKind::InvalidLineTerminator(byte)));
-    assert_eq!(chr.len_utf8(), 1);
+    }
-
+    let ch = char::from(byte);
-    let invalid = || Err(Error::new(ErrorKind::NotAllowed(chr.to_string())));
+    let invalid = || Err(Error::new(ErrorKind::NotAllowed(ch.to_string())));
    Ok(match expr.into_kind() {
        HirKind::Empty => Hir::empty(),
        HirKind::Literal(hir::Literal(lit)) => {
@ -50,7 +68,7 @@ fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result<Hir, Error> {
        }
        HirKind::Class(hir::Class::Unicode(mut cls)) => {
            let remove = hir::ClassUnicode::new(Some(
-                hir::ClassUnicodeRange::new(chr, chr),
+                hir::ClassUnicodeRange::new(ch, ch),
            ));
            cls.difference(&remove);
            if cls.ranges().is_empty() {
--- a/crates/regex/src/util.rs
+++ b/crates/regex/src/util.rs
@ -1,30 +0,0 @@
 /// Renders an arbitrary byte sequence as a literal that can be embedded in a
 /// regular expression pattern.
 ///
 /// ASCII bytes that are not regex meta characters are emitted verbatim. Every
 /// other byte (meta characters and anything above `0x7F`) is emitted as a
 /// two-digit lowercase `\xNN` escape.
 #[allow(dead_code)]
 pub fn bytes_to_regex(bs: &[u8]) -> String {
    use regex_syntax::is_meta_character;
    use std::fmt::Write;

    let mut pattern = String::with_capacity(bs.len());
    for &byte in bs {
        let ch = char::from(byte);
        if byte.is_ascii() && !is_meta_character(ch) {
            pattern.push(ch);
        } else {
            // Formatting into a `String` cannot fail, so the unwrap is safe.
            write!(pattern, r"\x{:02x}", byte).unwrap();
        }
    }
    pattern
 }
 /// Converts arbitrary bytes to a human-readable string.
 ///
 /// Each byte is rendered via `std::ascii::escape_default`: printable ASCII is
 /// kept as-is, while other bytes become escapes such as `\n`, `\\` or `\xff`.
 pub fn show_bytes(bs: &[u8]) -> String {
    use std::ascii::escape_default;

    // `escape_default` only yields ASCII bytes, so each one converts directly
    // to a `char`. That avoids a temporary `Vec` per input byte and the UTF-8
    // validation (with its `unwrap`) that the buffer-based approach needs.
    let mut nice = String::with_capacity(bs.len());
    nice.extend(bs.iter().flat_map(|&b| escape_default(b)).map(char::from));
    nice
 }