Files
ripgrep/grep/src/word_boundary.rs
Andrew Gallant 2a2b1506d4 Fix a performance bug where using -w could result in very bad performance.
The specific issue is that -w causes the regex to be wrapped in Unicode
word boundaries. Regrettably, Unicode word boundaries are the one thing
our regex engine can't handle well in the presence of non-ASCII text. We
work around its slowness by stripping word boundaries in some
circumstances, and using the resulting expression as a way to produce match
candidates that are then verified by the full original regex.

This doesn't fix all cases, but it should fix all cases where -w is used.
2016-09-21 19:12:07 -04:00

55 lines
2.1 KiB
Rust

use syntax::Expr;
/// Strips Unicode word boundaries from the given expression.
///
/// The key invariant this maintains is that the expression returned will match
/// *at least* every where the expression given will match. Namely, a match of
/// the returned expression can report false positives but it will never report
/// false negatives.
///
/// If no word boundaries could be stripped, then None is returned.
pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
// The real reason we do this is because Unicode word boundaries are the
// one thing that Rust's regex DFA engine can't handle. When it sees a
// Unicode word boundary among non-ASCII text, it falls back to one of the
// slower engines. We work around this limitation by attempting to use
// a regex to find candidate matches without a Unicode word boundary. We'll
// only then use the full (and slower) regex to confirm a candidate as a
// match or not during search.
use syntax::Expr::*;
match *expr {
Concat(ref es) if !es.is_empty() => {
let first = is_unicode_word_boundary(&es[0]);
let last = is_unicode_word_boundary(es.last().unwrap());
// Be careful not to strip word boundaries if there are no other
// expressions to match.
match (first, last) {
(true, false) if es.len() > 1 => {
Some(Concat(es[1..].to_vec()))
}
(false, true) if es.len() > 1 => {
Some(Concat(es[..es.len() - 1].to_vec()))
}
(true, true) if es.len() > 2 => {
Some(Concat(es[1..es.len() - 1].to_vec()))
}
_ => None,
}
}
_ => None,
}
}
/// Returns true if the given expression is a Unicode word boundary.
fn is_unicode_word_boundary(expr: &Expr) -> bool {
use syntax::Expr::*;
match *expr {
WordBoundary => true,
NotWordBoundary => true,
Group { ref e, .. } => is_unicode_word_boundary(e),
_ => false,
}
}