regex: fix -F aho-corasick optimization

It turns out that when the -F flag was used, if any of the patterns
contained a regex meta character (such as `.`), then we winded up
escaping the pattern first before handing it off to Aho-Corasick, which
treats all patterns literally.

We continue to apply band-aides here and just avoid Aho-Corasick if
there is an escape in any of the literal patterns. This is unfortunate,
but making this work better requires more refactoring, and the right
solution is to get this optimization pushed down into the regex engine.

Fixes #1334
This commit is contained in:
Andrew Gallant 2019-08-01 16:58:12 -04:00
parent bc37c32717
commit adb9332f52
No known key found for this signature in database
GPG Key ID: B2E3A4923F8B0D44
2 changed files with 33 additions and 2 deletions

View File

@ -71,10 +71,31 @@ impl RegexMatcherBuilder {
&self, &self,
literals: &[B], literals: &[B],
) -> Result<RegexMatcher, Error> { ) -> Result<RegexMatcher, Error> {
let slices: Vec<_> = literals.iter().map(|s| s.as_ref()).collect(); let mut has_escape = false;
if !self.config.can_plain_aho_corasick() || literals.len() < 40 { let mut slices = vec![];
for lit in literals {
slices.push(lit.as_ref());
has_escape = has_escape || lit.as_ref().contains('\\');
}
// Even when we have a fixed set of literals, we might still want to
// use the regex engine. Specifically, if any string has an escape
// in it, then we probably can't feed it to Aho-Corasick without
// removing the escape. Additionally, if there are any particular
// special match semantics we need to honor, that Aho-Corasick isn't
// enough. Finally, the regex engine can do really well with a small
// number of literals (at time of writing, this is changing soon), so
// we use it when there's a small set.
//
// Yes, this is one giant hack. Ideally, this entirely separate literal
// matcher that uses Aho-Corasick would be pushed down into the regex
// engine.
if has_escape
|| !self.config.can_plain_aho_corasick()
|| literals.len() < 40
{
return self.build(&slices.join("|")); return self.build(&slices.join("|"));
} }
let matcher = MultiLiteralMatcher::new(&slices)?; let matcher = MultiLiteralMatcher::new(&slices)?;
let imp = RegexMatcherImpl::MultiLiteral(matcher); let imp = RegexMatcherImpl::MultiLiteral(matcher);
Ok(RegexMatcher { Ok(RegexMatcher {

View File

@ -716,3 +716,13 @@ rgtest!(r1259_drop_last_byte_nonl, |dir: Dir, mut cmd: TestCommand| {
cmd = dir.command(); cmd = dir.command();
eqnice!("fz\n", cmd.arg("-f").arg("patterns-nl").arg("test").stdout()); eqnice!("fz\n", cmd.arg("-f").arg("patterns-nl").arg("test").stdout());
}); });
// See: https://github.com/BurntSushi/ripgrep/issues/1334
rgtest!(r1334_crazy_literals, |dir: Dir, mut cmd: TestCommand| {
dir.create("patterns", &"1.208.0.0/12\n".repeat(40));
dir.create("corpus", "1.208.0.0/12\n");
eqnice!(
"1.208.0.0/12\n",
cmd.arg("-Ff").arg("patterns").arg("corpus").stdout()
);
});