deps: initial migration steps to regex 1.9

This leaves the grep-regex crate in tatters. Pretty much the entire thing needs to be re-worked. The upshot is that it should result in some big simplifications. I hope. The idea here is to drop down and actually use regex-automata 0.3 instead of the regex crate itself.
2025-08-05 14:42:07 -07:00 · 2023-06-11 21:25:23 -04:00
parent a7f1276021
commit 1035f6b1ff
15 changed files with 606 additions and 558 deletions
--- a/crates/regex/src/non_matching.rs
+++ b/crates/regex/src/non_matching.rs
@@ -1,6 +1,10 @@
-use grep_matcher::ByteSet;
-use regex_syntax::hir::{self, Hir, HirKind};
-use regex_syntax::utf8::Utf8Sequences;
+use {
+    grep_matcher::ByteSet,
+    regex_syntax::{
+        hir::{self, Hir, HirKind, Look},
+        utf8::Utf8Sequences,
+    },
+};

 /// Return a confirmed set of non-matching bytes from the given expression.
 pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
@@ -13,18 +17,28 @@ pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
 /// the given expression.
 fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
    match *expr.kind() {
-        HirKind::Empty | HirKind::WordBoundary(_) => {}
-        HirKind::Anchor(_) => {
+        HirKind::Empty
+        // | HirKind::Look(Look::Start | Look::End)
+        | HirKind::Look(Look::WordAscii | Look::WordAsciiNegate)
+        | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {}
+        HirKind::Look(Look::Start | Look::End) => {
+            // FIXME: This is wrong (these anchors never match '\n'), but not
+            // doing this leads to incorrect results because of how anchored
+            // searches are implemented in the 'grep-searcher' crate.
            set.remove(b'\n');
        }
-        HirKind::Literal(hir::Literal::Unicode(c)) => {
-            for &b in c.encode_utf8(&mut [0; 4]).as_bytes() {
+        HirKind::Look(Look::StartLF | Look::EndLF) => {
+            set.remove(b'\n');
+        }
+        HirKind::Look(Look::StartCRLF | Look::EndCRLF) => {
+            set.remove(b'\r');
+            set.remove(b'\n');
+        }
+        HirKind::Literal(hir::Literal(ref lit)) => {
+            for &b in lit.iter() {
                set.remove(b);
            }
        }
-        HirKind::Literal(hir::Literal::Byte(b)) => {
-            set.remove(b);
-        }
        HirKind::Class(hir::Class::Unicode(ref cls)) => {
            for range in cls.iter() {
                // This is presumably faster than encoding every codepoint
@@ -42,10 +56,10 @@ fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
            }
        }
        HirKind::Repetition(ref x) => {
-            remove_matching_bytes(&x.hir, set);
+            remove_matching_bytes(&x.sub, set);
        }
-        HirKind::Group(ref x) => {
-            remove_matching_bytes(&x.hir, set);
+        HirKind::Capture(ref x) => {
+            remove_matching_bytes(&x.sub, set);
        }
        HirKind::Concat(ref xs) => {
            for x in xs {
@@ -62,17 +76,13 @@ fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {

 #[cfg(test)]
 mod tests {
-    use grep_matcher::ByteSet;
-    use regex_syntax::ParserBuilder;
+    use {grep_matcher::ByteSet, regex_syntax::ParserBuilder};

    use super::non_matching_bytes;

    fn extract(pattern: &str) -> ByteSet {
-        let expr = ParserBuilder::new()
-            .allow_invalid_utf8(true)
-            .build()
-            .parse(pattern)
-            .unwrap();
+        let expr =
+            ParserBuilder::new().utf8(false).build().parse(pattern).unwrap();
        non_matching_bytes(&expr)
    }

@@ -131,9 +141,13 @@ mod tests {

    #[test]
    fn anchor() {
+        // FIXME: I think the first four tests below should report every
+        // byte as non-matching, i.e., the full byte set.
        assert_eq!(sparse(&extract(r"^")), sparse_except(&[b'\n']));
        assert_eq!(sparse(&extract(r"$")), sparse_except(&[b'\n']));
        assert_eq!(sparse(&extract(r"\A")), sparse_except(&[b'\n']));
        assert_eq!(sparse(&extract(r"\z")), sparse_except(&[b'\n']));
+        assert_eq!(sparse(&extract(r"(?m)^")), sparse_except(&[b'\n']));
+        assert_eq!(sparse(&extract(r"(?m)$")), sparse_except(&[b'\n']));
    }
 }