Fix required literal handling and add debug prints.

In particular, if we had an inner literal and were doing a case-insensitive search, then the literals were dropped because we previously allowed only a single inner literal to have an effect. Now we allow alternations of inner literals, but still don't quite take full advantage of them.
2025-05-19 09:40:22 -07:00 · 2016-09-06 19:33:03 -04:00 · 2016-09-06 19:33:03 -04:00 · fd3e5069b6
commit fd3e5069b6
parent 0891b4a3c0
4 changed files with 67 additions and 8 deletions
--- a/grep/Cargo.toml
+++ b/grep/Cargo.toml
@ -14,6 +14,7 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"]
 license = "Unlicense/MIT"
 [dependencies]
 log = "0.3"
 memchr = "0.1"
 memmap = "0.2"
 regex = "0.1.75"
--- a/grep/src/lib.rs
+++ b/grep/src/lib.rs
@ -4,6 +4,8 @@
 A fast line oriented regex searcher.
 */
 #[macro_use]
 extern crate log;
 extern crate memchr;
 extern crate regex;
 extern crate regex_syntax as syntax;
--- a/grep/src/literals.rs
+++ b/grep/src/literals.rs
@ -1,13 +1,22 @@
 /*!
 The literals module is responsible for extracting *inner* literals out of the
 AST of a regular expression. Normally this is the job of the regex engine
 itself, but the regex engine doesn't look for inner literals. Since we're doing
 line based searching, we can use them, so we need to do it ourselves.
 Note that this implementation is incredibly suspicious. We need something more
 principled.
 */
 use std::cmp;
 use std::iter;
 use regex::bytes::Regex;
 use syntax::{
    Expr, Literals, Lit,
-    Repeater,
+    ByteClass, ByteRange, CharClass, ClassRange, Repeater,
 };
-#[derive(Debug)]
+#[derive(Clone, Debug)]
 pub struct LiteralSets {
    prefixes: Literals,
    suffixes: Literals,
@ -27,6 +36,7 @@ impl LiteralSets {
    pub fn to_regex(&self) -> Option<Regex> {
        if self.prefixes.all_complete() && !self.prefixes.is_empty() {
            debug!("literal prefixes detected: {:?}", self.prefixes);
            // When this is true, the regex engine will do a literal scan.
            return None;
        }
@ -56,13 +66,27 @@ impl LiteralSets {
        if suf_lcs.len() > lit.len() {
            lit = suf_lcs;
        }
-        if req.len() > lit.len() {
+        if req_lits.len() == 1 && req.len() > lit.len() {
            lit = req;
        }
-        if lit.is_empty() {
+
        // Special case: if we detected an alternation of inner required
        // literals and its longest literal is bigger than the longest
        // prefix/suffix, then choose the alternation. In practice, this
        // helps with case insensitive matching, which can generate lots of
        // inner required literals.
        let any_empty = req_lits.iter().any(|lit| lit.is_empty());
        if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
            debug!("required literals found: {:?}", req_lits);
            let alts: Vec<String> =
                req_lits.into_iter().map(|x| bytes_to_regex(x)).collect();
            // Literals always compile.
            Some(Regex::new(&alts.join("|")).unwrap())
        } else if lit.is_empty() {
            None
        } else {
            // Literals always compile.
            debug!("required literal found: {:?}", show(lit));
            Some(Regex::new(&bytes_to_regex(lit)).unwrap())
        }
    }
@ -75,14 +99,30 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
            let s: String = chars.iter().cloned().collect();
            lits.cross_add(s.as_bytes());
        }
-        Literal { casei: true, .. } => {
+        Literal { ref chars, casei: true } => {
-            lits.cut();
+            for &c in chars {
                let cls = CharClass::new(vec![
                    ClassRange { start: c, end: c },
                ]).case_fold();
                if !lits.add_char_class(&cls) {
                    lits.cut();
                    return;
                }
            }
        }
        LiteralBytes { ref bytes, casei: false } => {
            lits.cross_add(bytes);
        }
-        LiteralBytes { casei: true, .. } => {
+        LiteralBytes { ref bytes, casei: true } => {
-            lits.cut();
+            for &b in bytes {
                let cls = ByteClass::new(vec![
                    ByteRange { start: b, end: b },
                ]).case_fold();
                if !lits.add_byte_class(&cls) {
                    lits.cut();
                    return;
                }
            }
        }
        Class(_) => {
            lits.cut();
@ -205,3 +245,18 @@ fn bytes_to_regex(bs: &[u8]) -> String {
    }
    s
 }
 /// Renders a byte slice as a human-readable string for debug output.
 ///
 /// Every byte is run through `std::ascii::escape_default`, and the escaped
 /// bytes are appended to the result in order. An empty slice yields an
 /// empty string.
 fn show(bs: &[u8]) -> String {
    // NOTE (AG): it is an open question why this isn't also what gets fed
    // to the regex; it probably doesn't matter much.
    use std::ascii::escape_default;

    bs.iter()
        // `escape_default` emits only ASCII bytes, so each one maps directly
        // onto a `char` without any UTF-8 validation step.
        .flat_map(|&byte| escape_default(byte))
        .map(|escaped| escaped as char)
        .collect()
 }
--- a/grep/src/search.rs
+++ b/grep/src/search.rs
@ -152,6 +152,7 @@ impl GrepBuilder {
                 .unicode(true)
                 .case_insensitive(self.opts.case_insensitive)
                 .parse(&self.pattern));
        debug!("regex ast:\n{:#?}", expr);
        Ok(try!(nonl::remove(expr, self.opts.line_terminator)))
    }
 }