regex: refactor matcher construction

This does a little bit of refactoring so that we can pass both a
ConfiguredHIR and a Regex to the inner literal extraction routine.

One downside of this approach is that a regex object hangs on to a
ConfiguredHIR. But the extra memory usage is probably negligible. A
benefit though is that converting the HIR to its concrete syntax is now
lazy and only happens when logging is enabled.
This commit is contained in:
Andrew Gallant 2023-06-20 13:05:57 -04:00
parent 04dde9a4eb
commit 8ac66a9e04
4 changed files with 111 additions and 76 deletions

View File

@ -8,8 +8,8 @@ use {
}; };
use crate::{ use crate::{
ast::AstAnalysis, error::Error, literal::LiteralSets, ast::AstAnalysis, error::Error, non_matching::non_matching_bytes,
non_matching::non_matching_bytes, strip::strip_from_match, strip::strip_from_match,
}; };
/// Config represents the configuration of a regex matcher in this crate. /// Config represents the configuration of a regex matcher in this crate.
@ -228,6 +228,11 @@ impl ConfiguredHIR {
&self.config &self.config
} }
/// Return a reference to the underyling HIR.
pub(crate) fn hir(&self) -> &Hir {
&self.hir
}
/// Convert this HIR to a regex that can be used for matching. /// Convert this HIR to a regex that can be used for matching.
pub(crate) fn to_regex(&self) -> Result<Regex, Error> { pub(crate) fn to_regex(&self) -> Result<Regex, Error> {
let meta = Regex::config() let meta = Regex::config()
@ -240,8 +245,8 @@ impl ConfiguredHIR {
// Same deal here. The default limit for full DFAs is VERY small, // Same deal here. The default limit for full DFAs is VERY small,
// but with ripgrep we can afford to spend a bit more time on // but with ripgrep we can afford to spend a bit more time on
// building them I think. // building them I think.
.dfa_size_limit(Some(10 * (1 << 20))) .dfa_size_limit(Some(1 * (1 << 20)))
.dfa_state_limit(Some(10_000)) .dfa_state_limit(Some(1_000))
.hybrid_cache_capacity(self.config.dfa_size_limit); .hybrid_cache_capacity(self.config.dfa_size_limit);
Regex::builder() Regex::builder()
.configure(meta) .configure(meta)
@ -249,31 +254,6 @@ impl ConfiguredHIR {
.map_err(Error::regex) .map_err(Error::regex)
} }
/// Convert this HIR to its concrete syntax.
pub(crate) fn to_pattern(&self) -> String {
self.hir.to_string()
}
/// Attempt to extract a "fast" regex that can be used for quickly finding
/// candidates lines for a match.
///
/// If no line terminator was configured, then this always returns
/// `Ok(None)`. If a line terminator is configured, then this may return a
/// regex.
pub(crate) fn to_fast_line_regex(&self) -> Result<Option<Regex>, Error> {
if self.config.line_terminator.is_none() {
return Ok(None);
}
match LiteralSets::new(&self.hir).one_regex(self.config.word) {
None => Ok(None),
Some(pattern) => {
let config = self.config.clone();
let chir = ConfiguredHIR::new(config, &[pattern])?;
Ok(Some(chir.to_regex()?))
}
}
}
/// Compute the set of non-matching bytes for this HIR expression. /// Compute the set of non-matching bytes for this HIR expression.
pub(crate) fn non_matching_bytes(&self) -> ByteSet { pub(crate) fn non_matching_bytes(&self) -> ByteSet {
non_matching_bytes(&self.hir) non_matching_bytes(&self.hir)

View File

@ -1,32 +1,42 @@
use regex_syntax::hir::Hir; #![allow(warnings)]
// BREADCRUMBS: use {
// regex_automata::meta::Regex,
// The way we deal with line terminators in the regex is clunky, but probably regex_syntax::hir::{
// the least bad option for now unfortunately. literal::{self, Seq},
// Hir,
// The `non_matching_bytes` routine currently hardcodes line terminators for },
// anchors. But it's not really clear it should even care about line terminators };
// anyway, since anchors aren't actually part of a match. If we fix that
// though, that currently reveals a different bug elsewhere: '(?-m:^)' isn't use crate::config::ConfiguredHIR;
// implemented correctly in multi-line search, because it defers to the fast
// line-by-line strategy, which ends up being wrong. I think the way forward
// there is to:
//
// 1) Adding something in the grep-matcher interface that exposes a way to
// query for \A and \z specifically. If they're in the pattern, then we can
// decide how to handle them.
//
// 2) Perhaps provide a way to "translate \A/\z to ^/$" for cases when
// mulit-line search is not enabled.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct LiteralSets {} pub(crate) struct InnerLiterals {
seq: Seq,
}
impl LiteralSets { impl InnerLiterals {
/// Create a set of literals from the given HIR expression. /// Create a set of inner literals from the given HIR expression.
pub fn new(_: &Hir) -> LiteralSets { ///
LiteralSets {} /// If no line terminator was configured, then this always declines to
/// extract literals because the inner literal optimization may not be
/// valid.
///
/// Note that this requires the actual regex that will be used for a search
/// because it will query some state about the compiled regex. That state
/// may influence inner literal extraction.
pub(crate) fn new(chir: &ConfiguredHIR, re: &Regex) -> InnerLiterals {
let seq = Seq::infinite();
if chir.config().line_terminator.is_none() || re.is_accelerated() {
return InnerLiterals::none();
}
InnerLiterals { seq }
}
/// Returns a infinite set of inner literals, such that it can never
/// produce a matcher.
pub(crate) fn none() -> InnerLiterals {
InnerLiterals { seq: Seq::infinite() }
} }
/// If it is deemed advantageuous to do so (via various suspicious /// If it is deemed advantageuous to do so (via various suspicious
@ -35,7 +45,10 @@ impl LiteralSets {
/// generated these literal sets. The idea here is that the pattern /// generated these literal sets. The idea here is that the pattern
/// returned by this method is much cheaper to search for. i.e., It is /// returned by this method is much cheaper to search for. i.e., It is
/// usually a single literal or an alternation of literals. /// usually a single literal or an alternation of literals.
pub fn one_regex(&self, _word: bool) -> Option<String> { pub(crate) fn one_regex(&self) -> Option<String> {
if !self.seq.is_finite() {
return None;
}
None None
} }
} }

View File

@ -1,3 +1,5 @@
use std::sync::Arc;
use { use {
grep_matcher::{ grep_matcher::{
ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher, ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher,
@ -12,6 +14,7 @@ use {
use crate::{ use crate::{
config::{Config, ConfiguredHIR}, config::{Config, ConfiguredHIR},
error::Error, error::Error,
literal::InnerLiterals,
word::WordMatcher, word::WordMatcher,
}; };
@ -59,8 +62,26 @@ impl RegexMatcherBuilder {
patterns: &[P], patterns: &[P],
) -> Result<RegexMatcher, Error> { ) -> Result<RegexMatcher, Error> {
let chir = self.config.build_many(patterns)?; let chir = self.config.build_many(patterns)?;
let fast_line_regex = chir.to_fast_line_regex()?; let matcher = RegexMatcherImpl::new(chir)?;
let (chir, re) = (matcher.chir(), matcher.regex());
log::trace!("final regex: {:?}", chir.hir().to_string());
let non_matching_bytes = chir.non_matching_bytes(); let non_matching_bytes = chir.non_matching_bytes();
// If we can pick out some literals from the regex, then we might be
// able to build a faster regex that quickly identifies candidate
// matching lines. The regex engine will do what it can on its own, but
// we can specifically do a little more when a line terminator is set.
// For example, for a regex like `\w+foo\w+`, we can look for `foo`,
// and when a match is found, look for the line containing `foo` and
// then run the original regex on only that line. (In this case, the
// regex engine is likely to handle this case for us since it's so
// simple, but the idea applies.)
let fast_line_regex = match InnerLiterals::new(chir, re).one_regex() {
None => None,
Some(pattern) => {
Some(chir.config().build_many(&[pattern])?.to_regex()?)
}
};
if let Some(ref re) = fast_line_regex { if let Some(ref re) = fast_line_regex {
log::debug!("extracted fast line regex: {:?}", re); log::debug!("extracted fast line regex: {:?}", re);
} }
@ -69,8 +90,6 @@ impl RegexMatcherBuilder {
// support it. // support it.
let mut config = self.config.clone(); let mut config = self.config.clone();
config.line_terminator = chir.line_terminator(); config.line_terminator = chir.line_terminator();
let matcher = RegexMatcherImpl::new(chir)?;
log::trace!("final regex: {:?}", matcher.regex());
Ok(RegexMatcher { Ok(RegexMatcher {
config, config,
matcher, matcher,
@ -412,10 +431,18 @@ impl RegexMatcherImpl {
} }
/// Return the underlying regex object used. /// Return the underlying regex object used.
fn regex(&self) -> String { fn regex(&self) -> &Regex {
match *self { match *self {
RegexMatcherImpl::Word(ref x) => x.pattern().to_string(), RegexMatcherImpl::Word(ref x) => x.regex(),
RegexMatcherImpl::Standard(ref x) => x.pattern.clone(), RegexMatcherImpl::Standard(ref x) => &x.regex,
}
}
/// Return the underlying HIR of the regex used for searching.
fn chir(&self) -> &ConfiguredHIR {
match *self {
RegexMatcherImpl::Word(ref x) => x.chir(),
RegexMatcherImpl::Standard(ref x) => &x.chir,
} }
} }
} }
@ -666,15 +693,19 @@ struct StandardMatcher {
/// The regular expression compiled from the pattern provided by the /// The regular expression compiled from the pattern provided by the
/// caller. /// caller.
regex: Regex, regex: Regex,
/// The underlying pattern string for the regex. /// The HIR that produced this regex.
pattern: String, ///
/// We put this in an `Arc` because by the time it gets here, it won't
/// change. And because cloning and dropping an `Hir` is somewhat expensive
/// due to its deep recursive representation.
chir: Arc<ConfiguredHIR>,
} }
impl StandardMatcher { impl StandardMatcher {
fn new(chir: ConfiguredHIR) -> Result<StandardMatcher, Error> { fn new(chir: ConfiguredHIR) -> Result<StandardMatcher, Error> {
let chir = Arc::new(chir);
let regex = chir.to_regex()?; let regex = chir.to_regex()?;
let pattern = chir.to_pattern(); Ok(StandardMatcher { regex, chir })
Ok(StandardMatcher { regex, pattern })
} }
} }

View File

@ -22,8 +22,13 @@ type PoolFn =
pub(crate) struct WordMatcher { pub(crate) struct WordMatcher {
/// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`. /// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
regex: Regex, regex: Regex,
/// The pattern string corresponding to the above regex. /// The HIR that produced the regex above. We don't keep the HIR for the
pattern: String, /// `original` regex.
///
/// We put this in an `Arc` because by the time it gets here, it won't
/// change. And because cloning and dropping an `Hir` is somewhat expensive
/// due to its deep recursive representation.
chir: Arc<ConfiguredHIR>,
/// The original regex supplied by the user, which we use in a fast path /// The original regex supplied by the user, which we use in a fast path
/// to try and detect matches before deferring to slower engines. /// to try and detect matches before deferring to slower engines.
original: Regex, original: Regex,
@ -45,7 +50,7 @@ impl Clone for WordMatcher {
let re = self.regex.clone(); let re = self.regex.clone();
WordMatcher { WordMatcher {
regex: self.regex.clone(), regex: self.regex.clone(),
pattern: self.pattern.clone(), chir: Arc::clone(&self.chir),
original: self.original.clone(), original: self.original.clone(),
names: self.names.clone(), names: self.names.clone(),
caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))), caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))),
@ -61,9 +66,8 @@ impl WordMatcher {
/// internally. /// internally.
pub(crate) fn new(chir: ConfiguredHIR) -> Result<WordMatcher, Error> { pub(crate) fn new(chir: ConfiguredHIR) -> Result<WordMatcher, Error> {
let original = chir.clone().into_anchored().to_regex()?; let original = chir.clone().into_anchored().to_regex()?;
let word_chir = chir.into_word()?; let chir = Arc::new(chir.into_word()?);
let regex = word_chir.to_regex()?; let regex = chir.to_regex()?;
let pattern = word_chir.to_pattern();
let caps = Arc::new(Pool::new({ let caps = Arc::new(Pool::new({
let regex = regex.clone(); let regex = regex.clone();
Box::new(move || regex.create_captures()) as PoolFn Box::new(move || regex.create_captures()) as PoolFn
@ -76,13 +80,20 @@ impl WordMatcher {
names.insert(name.to_string(), i.checked_sub(1).unwrap()); names.insert(name.to_string(), i.checked_sub(1).unwrap());
} }
} }
Ok(WordMatcher { regex, pattern, original, names, caps }) Ok(WordMatcher { regex, chir, original, names, caps })
} }
/// Return the underlying pattern string for the regex used by this /// Return the underlying regex used to match at word boundaries.
/// matcher. ///
pub(crate) fn pattern(&self) -> &str { /// The original regex is in the capture group at index 1.
&self.pattern pub(crate) fn regex(&self) -> &Regex {
&self.regex
}
/// Return the underlying HIR for the regex used to match at word
/// boundaries.
pub(crate) fn chir(&self) -> &ConfiguredHIR {
&self.chir
} }
/// Attempt to do a fast confirmation of a word match that covers a subset /// Attempt to do a fast confirmation of a word match that covers a subset