mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-08-08 16:12:04 -07:00
Fix a performance bug where using -w could result in very bad performance.
The specific issue is that -w causes the regex to be wrapped in Unicode word boundaries. Regrettably, Unicode word boundaries are the one thing our regex engine can't handle well in the presence of non-ASCII text. We work around its slowness by stripping word boundaries in some circumstances, and using the resulting expression as a way to produce match candidates that are then verified by the full original regex. This doesn't fix all cases, but it should fix all cases where -w is used.
This commit is contained in:
54
grep/src/word_boundary.rs
Normal file
54
grep/src/word_boundary.rs
Normal file
@@ -0,0 +1,54 @@
|
||||
use syntax::Expr;
|
||||
|
||||
/// Strips Unicode word boundaries from the given expression.
|
||||
///
|
||||
/// The key invariant this maintains is that the expression returned will match
|
||||
/// *at least* every where the expression given will match. Namely, a match of
|
||||
/// the returned expression can report false positives but it will never report
|
||||
/// false negatives.
|
||||
///
|
||||
/// If no word boundaries could be stripped, then None is returned.
|
||||
pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
|
||||
// The real reason we do this is because Unicode word boundaries are the
|
||||
// one thing that Rust's regex DFA engine can't handle. When it sees a
|
||||
// Unicode word boundary among non-ASCII text, it falls back to one of the
|
||||
// slower engines. We work around this limitation by attempting to use
|
||||
// a regex to find candidate matches without a Unicode word boundary. We'll
|
||||
// only then use the full (and slower) regex to confirm a candidate as a
|
||||
// match or not during search.
|
||||
use syntax::Expr::*;
|
||||
|
||||
match *expr {
|
||||
Concat(ref es) if !es.is_empty() => {
|
||||
let first = is_unicode_word_boundary(&es[0]);
|
||||
let last = is_unicode_word_boundary(es.last().unwrap());
|
||||
// Be careful not to strip word boundaries if there are no other
|
||||
// expressions to match.
|
||||
match (first, last) {
|
||||
(true, false) if es.len() > 1 => {
|
||||
Some(Concat(es[1..].to_vec()))
|
||||
}
|
||||
(false, true) if es.len() > 1 => {
|
||||
Some(Concat(es[..es.len() - 1].to_vec()))
|
||||
}
|
||||
(true, true) if es.len() > 2 => {
|
||||
Some(Concat(es[1..es.len() - 1].to_vec()))
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the given expression is a Unicode word boundary.
|
||||
fn is_unicode_word_boundary(expr: &Expr) -> bool {
|
||||
use syntax::Expr::*;
|
||||
|
||||
match *expr {
|
||||
WordBoundary => true,
|
||||
NotWordBoundary => true,
|
||||
Group { ref e, .. } => is_unicode_word_boundary(e),
|
||||
_ => false,
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user