diff --git a/CHANGELOG.md b/CHANGELOG.md index 49b7a692..34caeaa2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ Performance improvements: Improve inner literal detection to cover more cases more effectively. e.g., ` +Sherlock Holmes +` now has ` Sherlock Holmes ` extracted instead of ` `. +* PERF: + Improve literal detection when the `-w/--word-regexp` flag is used. Feature enhancements: diff --git a/grep-regex/src/literal.rs b/grep-regex/src/literal.rs index cc1e8965..e4c8e65c 100644 --- a/grep-regex/src/literal.rs +++ b/grep-regex/src/literal.rs @@ -110,7 +110,63 @@ impl LiteralSets { // We're matching raw bytes, so disable Unicode mode. Some(format!("(?-u:{})", alts.join("|"))) } else if lit.is_empty() { - None + // If we're here, then we have no LCP. No LCS. And no detected + // inner required literals. In theory this shouldn't happen, but + // the inner literal detector isn't as nice as we hope and doesn't + // actually support returning a set of alternating required + // literals. (Instead, it only returns a set where EVERY literal + // in it is required. It cannot currently express "either P or Q + // is required.") + // + // In this case, it is possible that we still have meaningful + // prefixes or suffixes to use. So we look for the set of literals + // with the highest minimum length and use that to build our "fast" + // regex. + // + // This manifests in fairly common scenarios. e.g., + // + // rg -w 'foo|bar|baz|quux' + // + // Normally, without the `-w`, the regex engine itself would + // detect the prefix correctly. Unfortunately, the `-w` option + // turns the regex into something like this: + // + // rg '(^|\W)(foo|bar|baz|quux)($|\W)' + // + // Which will defeat all prefix and suffix literal optimizations. + // (Not in theory---it could be better. But the current + // implementation isn't good enough.) ... So we make up for it + // here. 
+ let p_min_len = self.prefixes.min_len(); + let s_min_len = self.suffixes.min_len(); + let lits = match (p_min_len, s_min_len) { + (None, None) => return None, + (Some(_), None) => { + debug!("prefix literals found"); + self.prefixes.literals() + } + (None, Some(_)) => { + debug!("suffix literals found"); + self.suffixes.literals() + } + (Some(p), Some(s)) => { + if p >= s { + debug!("prefix literals found"); + self.prefixes.literals() + } else { + debug!("suffix literals found"); + self.suffixes.literals() + } + } + }; + + debug!("prefix/suffix literals found: {:?}", lits); + let alts: Vec<String> = lits + .into_iter() + .map(|x| util::bytes_to_regex(x)) + .collect(); + // We're matching raw bytes, so disable Unicode mode. + Some(format!("(?-u:{})", alts.join("|"))) } else { debug!("required literal found: {:?}", util::show_bytes(lit)); Some(format!("(?-u:{})", util::bytes_to_regex(&lit))) diff --git a/grep-regex/src/matcher.rs b/grep-regex/src/matcher.rs index 61af0518..42504656 100644 --- a/grep-regex/src/matcher.rs +++ b/grep-regex/src/matcher.rs @@ -49,7 +49,7 @@ impl RegexMatcherBuilder { let fast_line_regex = chir.fast_line_regex()?; let non_matching_bytes = chir.non_matching_bytes(); if let Some(ref re) = fast_line_regex { - trace!("extracted fast line regex: {:?}", re); + debug!("extracted fast line regex: {:?}", re); } let matcher = RegexMatcherImpl::new(&chir)?;