mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-05-19 01:30:21 -07:00
regex: fix -F aho-corasick optimization
It turns out that when the -F flag was used, if any of the patterns contained a regex meta character (such as `.`), then we winded up escaping the pattern first before handing it off to Aho-Corasick, which treats all patterns literally. We continue to apply band-aides here and just avoid Aho-Corasick if there is an escape in any of the literal patterns. This is unfortunate, but making this work better requires more refactoring, and the right solution is to get this optimization pushed down into the regex engine. Fixes #1334
This commit is contained in:
parent
bc37c32717
commit
adb9332f52
@ -71,10 +71,31 @@ impl RegexMatcherBuilder {
|
||||
&self,
|
||||
literals: &[B],
|
||||
) -> Result<RegexMatcher, Error> {
|
||||
let slices: Vec<_> = literals.iter().map(|s| s.as_ref()).collect();
|
||||
if !self.config.can_plain_aho_corasick() || literals.len() < 40 {
|
||||
let mut has_escape = false;
|
||||
let mut slices = vec![];
|
||||
for lit in literals {
|
||||
slices.push(lit.as_ref());
|
||||
has_escape = has_escape || lit.as_ref().contains('\\');
|
||||
}
|
||||
// Even when we have a fixed set of literals, we might still want to
|
||||
// use the regex engine. Specifically, if any string has an escape
|
||||
// in it, then we probably can't feed it to Aho-Corasick without
|
||||
// removing the escape. Additionally, if there are any particular
|
||||
// special match semantics we need to honor, that Aho-Corasick isn't
|
||||
// enough. Finally, the regex engine can do really well with a small
|
||||
// number of literals (at time of writing, this is changing soon), so
|
||||
// we use it when there's a small set.
|
||||
//
|
||||
// Yes, this is one giant hack. Ideally, this entirely separate literal
|
||||
// matcher that uses Aho-Corasick would be pushed down into the regex
|
||||
// engine.
|
||||
if has_escape
|
||||
|| !self.config.can_plain_aho_corasick()
|
||||
|| literals.len() < 40
|
||||
{
|
||||
return self.build(&slices.join("|"));
|
||||
}
|
||||
|
||||
let matcher = MultiLiteralMatcher::new(&slices)?;
|
||||
let imp = RegexMatcherImpl::MultiLiteral(matcher);
|
||||
Ok(RegexMatcher {
|
||||
|
@ -716,3 +716,13 @@ rgtest!(r1259_drop_last_byte_nonl, |dir: Dir, mut cmd: TestCommand| {
|
||||
cmd = dir.command();
|
||||
eqnice!("fz\n", cmd.arg("-f").arg("patterns-nl").arg("test").stdout());
|
||||
});
|
||||
|
||||
// See: https://github.com/BurntSushi/ripgrep/issues/1334
|
||||
rgtest!(r1334_crazy_literals, |dir: Dir, mut cmd: TestCommand| {
|
||||
dir.create("patterns", &"1.208.0.0/12\n".repeat(40));
|
||||
dir.create("corpus", "1.208.0.0/12\n");
|
||||
eqnice!(
|
||||
"1.208.0.0/12\n",
|
||||
cmd.arg("-Ff").arg("patterns").arg("corpus").stdout()
|
||||
);
|
||||
});
|
||||
|
Loading…
x
Reference in New Issue
Block a user