This commit is contained in:
Andrew Gallant
2023-10-09 18:23:36 -04:00
parent f7ff34fdf9
commit 9626f16757
14 changed files with 113 additions and 808 deletions

View File

@@ -3,7 +3,7 @@ use {
regex_automata::meta::Regex,
regex_syntax::{
ast,
hir::{self, Hir, HirKind},
hir::{self, Hir},
},
};
@@ -296,35 +296,6 @@ impl ConfiguredHIR {
}
}
/// Turns this configured HIR into one that only matches when both sides of
/// the match correspond to a word boundary.
///
/// Note that the HIR returned is like turning `pat` into
/// `(?m:^|\W)(pat)(?m:$|\W)`. That is, the true match is at capture group
/// `1` and not `0`.
pub(crate) fn into_word(self) -> Result<ConfiguredHIR, Error> {
// In theory building the HIR for \W should never fail, but there are
// likely some pathological cases (particularly with respect to certain
// values of limits) where it could in theory fail.
let non_word = {
let mut config = self.config.clone();
config.fixed_strings = false;
ConfiguredHIR::new(config, &[r"\W"])?
};
let line_anchor_start = Hir::look(self.line_anchor_start());
let line_anchor_end = Hir::look(self.line_anchor_end());
let hir = Hir::concat(vec![
Hir::alternation(vec![line_anchor_start, non_word.hir.clone()]),
Hir::capture(hir::Capture {
index: 1,
name: None,
sub: Box::new(renumber_capture_indices(self.hir)?),
}),
Hir::alternation(vec![non_word.hir, line_anchor_end]),
]);
Ok(ConfiguredHIR { config: self.config, hir })
}
/// Turns this configured HIR into an equivalent one, but where it must
/// match at the start and end of a line.
pub(crate) fn into_whole_line(self) -> ConfiguredHIR {
@@ -336,12 +307,20 @@ impl ConfiguredHIR {
}
/// Turns this configured HIR into an equivalent one, but where it must
/// match at the start and end of the haystack.
pub(crate) fn into_anchored(self) -> ConfiguredHIR {
/// match at word boundaries.
pub(crate) fn into_word(self) -> ConfiguredHIR {
let hir = Hir::concat(vec![
Hir::look(hir::Look::Start),
Hir::look(if self.config.unicode {
hir::Look::WordStartHalfUnicode
} else {
hir::Look::WordStartHalfAscii
}),
self.hir,
Hir::look(hir::Look::End),
Hir::look(if self.config.unicode {
hir::Look::WordEndHalfUnicode
} else {
hir::Look::WordEndHalfAscii
}),
]);
ConfiguredHIR { config: self.config, hir }
}
@@ -365,50 +344,6 @@ impl ConfiguredHIR {
}
}
/// This increments the index of every capture group in the given hir by 1. If
/// any increment results in an overflow, then an error is returned.
fn renumber_capture_indices(hir: Hir) -> Result<Hir, Error> {
Ok(match hir.into_kind() {
HirKind::Empty => Hir::empty(),
HirKind::Literal(hir::Literal(lit)) => Hir::literal(lit),
HirKind::Class(cls) => Hir::class(cls),
HirKind::Look(x) => Hir::look(x),
HirKind::Repetition(mut x) => {
x.sub = Box::new(renumber_capture_indices(*x.sub)?);
Hir::repetition(x)
}
HirKind::Capture(mut cap) => {
cap.index = match cap.index.checked_add(1) {
Some(index) => index,
None => {
// This error message kind of sucks, but it's probably
// impossible for it to happen. The only way a capture
// index can overflow addition is if the regex is huge
// (or something else has gone horribly wrong).
let msg = "could not renumber capture index, too big";
return Err(Error::any(msg));
}
};
cap.sub = Box::new(renumber_capture_indices(*cap.sub)?);
Hir::capture(cap)
}
HirKind::Concat(subs) => {
let subs = subs
.into_iter()
.map(|sub| renumber_capture_indices(sub))
.collect::<Result<Vec<Hir>, Error>>()?;
Hir::concat(subs)
}
HirKind::Alternation(subs) => {
let subs = subs
.into_iter()
.map(|sub| renumber_capture_indices(sub))
.collect::<Result<Vec<Hir>, Error>>()?;
Hir::alternation(subs)
}
})
}
/// Returns true if the given literal string contains any byte from the line
/// terminator given.
fn has_line_terminator(lineterm: LineTerminator, literal: &str) -> bool {