mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-05-19 09:40:22 -07:00
regex: small cleanups
Just some small polishing. We also get rid of thread_local in favor of using regex-automata, mostly just in the name of reducing dependencies. (We should eventually be able to drop thread_local completely.)
This commit is contained in:
parent
a6dbff502f
commit
a775b493fd
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -202,7 +202,6 @@ dependencies = [
|
|||||||
"regex",
|
"regex",
|
||||||
"regex-automata 0.3.0",
|
"regex-automata 0.3.0",
|
||||||
"regex-syntax",
|
"regex-syntax",
|
||||||
"thread_local",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -21,4 +21,3 @@ log = "0.4.5"
|
|||||||
regex = "1.8.3"
|
regex = "1.8.3"
|
||||||
regex-automata = { version = "0.3.0" }
|
regex-automata = { version = "0.3.0" }
|
||||||
regex-syntax = "0.7.2"
|
regex-syntax = "0.7.2"
|
||||||
thread_local = "1.1.7"
|
|
||||||
|
@ -21,21 +21,21 @@ use crate::{
|
|||||||
/// configuration which generated it, and provides transformation on that HIR
|
/// configuration which generated it, and provides transformation on that HIR
|
||||||
/// such that the configuration is preserved.
|
/// such that the configuration is preserved.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct Config {
|
pub(crate) struct Config {
|
||||||
pub case_insensitive: bool,
|
pub(crate) case_insensitive: bool,
|
||||||
pub case_smart: bool,
|
pub(crate) case_smart: bool,
|
||||||
pub multi_line: bool,
|
pub(crate) multi_line: bool,
|
||||||
pub dot_matches_new_line: bool,
|
pub(crate) dot_matches_new_line: bool,
|
||||||
pub swap_greed: bool,
|
pub(crate) swap_greed: bool,
|
||||||
pub ignore_whitespace: bool,
|
pub(crate) ignore_whitespace: bool,
|
||||||
pub unicode: bool,
|
pub(crate) unicode: bool,
|
||||||
pub octal: bool,
|
pub(crate) octal: bool,
|
||||||
pub size_limit: usize,
|
pub(crate) size_limit: usize,
|
||||||
pub dfa_size_limit: usize,
|
pub(crate) dfa_size_limit: usize,
|
||||||
pub nest_limit: u32,
|
pub(crate) nest_limit: u32,
|
||||||
pub line_terminator: Option<LineTerminator>,
|
pub(crate) line_terminator: Option<LineTerminator>,
|
||||||
pub crlf: bool,
|
pub(crate) crlf: bool,
|
||||||
pub word: bool,
|
pub(crate) word: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for Config {
|
impl Default for Config {
|
||||||
@ -67,7 +67,7 @@ impl Config {
|
|||||||
///
|
///
|
||||||
/// If there was a problem parsing the given expression then an error
|
/// If there was a problem parsing the given expression then an error
|
||||||
/// is returned.
|
/// is returned.
|
||||||
pub fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
|
pub(crate) fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
|
||||||
let ast = self.ast(pattern)?;
|
let ast = self.ast(pattern)?;
|
||||||
let analysis = self.analysis(&ast)?;
|
let analysis = self.analysis(&ast)?;
|
||||||
let expr = hir::translate::TranslatorBuilder::new()
|
let expr = hir::translate::TranslatorBuilder::new()
|
||||||
@ -112,7 +112,7 @@ impl Config {
|
|||||||
/// Note that it is OK to return true even when settings like `multi_line`
|
/// Note that it is OK to return true even when settings like `multi_line`
|
||||||
/// are enabled, since if multi-line can impact the match semantics of a
|
/// are enabled, since if multi-line can impact the match semantics of a
|
||||||
/// regex, then it is by definition not a simple alternation of literals.
|
/// regex, then it is by definition not a simple alternation of literals.
|
||||||
pub fn can_plain_aho_corasick(&self) -> bool {
|
pub(crate) fn can_plain_aho_corasick(&self) -> bool {
|
||||||
!self.word && !self.case_insensitive && !self.case_smart
|
!self.word && !self.case_insensitive && !self.case_smart
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -149,7 +149,7 @@ impl Config {
|
|||||||
/// size limits set on the configured HIR will be propagated out to any
|
/// size limits set on the configured HIR will be propagated out to any
|
||||||
/// subsequently constructed HIR or regular expression.
|
/// subsequently constructed HIR or regular expression.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct ConfiguredHIR {
|
pub(crate) struct ConfiguredHIR {
|
||||||
original: String,
|
original: String,
|
||||||
config: Config,
|
config: Config,
|
||||||
analysis: AstAnalysis,
|
analysis: AstAnalysis,
|
||||||
@ -158,12 +158,12 @@ pub struct ConfiguredHIR {
|
|||||||
|
|
||||||
impl ConfiguredHIR {
|
impl ConfiguredHIR {
|
||||||
/// Return the configuration for this HIR expression.
|
/// Return the configuration for this HIR expression.
|
||||||
pub fn config(&self) -> &Config {
|
pub(crate) fn config(&self) -> &Config {
|
||||||
&self.config
|
&self.config
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compute the set of non-matching bytes for this HIR expression.
|
/// Compute the set of non-matching bytes for this HIR expression.
|
||||||
pub fn non_matching_bytes(&self) -> ByteSet {
|
pub(crate) fn non_matching_bytes(&self) -> ByteSet {
|
||||||
non_matching_bytes(&self.expr)
|
non_matching_bytes(&self.expr)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -184,7 +184,7 @@ impl ConfiguredHIR {
|
|||||||
/// without a line terminator, the fast search path can't be executed.
|
/// without a line terminator, the fast search path can't be executed.
|
||||||
///
|
///
|
||||||
/// See: <https://github.com/BurntSushi/ripgrep/issues/2260>
|
/// See: <https://github.com/BurntSushi/ripgrep/issues/2260>
|
||||||
pub fn line_terminator(&self) -> Option<LineTerminator> {
|
pub(crate) fn line_terminator(&self) -> Option<LineTerminator> {
|
||||||
if self.is_any_anchored() {
|
if self.is_any_anchored() {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
@ -198,19 +198,19 @@ impl ConfiguredHIR {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Builds a regular expression from this HIR expression.
|
/// Builds a regular expression from this HIR expression.
|
||||||
pub fn regex(&self) -> Result<Regex, Error> {
|
pub(crate) fn regex(&self) -> Result<Regex, Error> {
|
||||||
self.pattern_to_regex(&self.pattern())
|
self.pattern_to_regex(&self.pattern())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the pattern string by converting this HIR to its concrete
|
/// Returns the pattern string by converting this HIR to its concrete
|
||||||
/// syntax.
|
/// syntax.
|
||||||
pub fn pattern(&self) -> String {
|
pub(crate) fn pattern(&self) -> String {
|
||||||
self.expr.to_string()
|
self.expr.to_string()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// If this HIR corresponds to an alternation of literals with no
|
/// If this HIR corresponds to an alternation of literals with no
|
||||||
/// capturing groups, then this returns those literals.
|
/// capturing groups, then this returns those literals.
|
||||||
pub fn alternation_literals(&self) -> Option<Vec<Vec<u8>>> {
|
pub(crate) fn alternation_literals(&self) -> Option<Vec<Vec<u8>>> {
|
||||||
if !self.config.can_plain_aho_corasick() {
|
if !self.config.can_plain_aho_corasick() {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
@ -223,7 +223,7 @@ impl ConfiguredHIR {
|
|||||||
///
|
///
|
||||||
/// For example, this can be used to wrap a user provided regular
|
/// For example, this can be used to wrap a user provided regular
|
||||||
/// expression with additional semantics. e.g., See the `WordMatcher`.
|
/// expression with additional semantics. e.g., See the `WordMatcher`.
|
||||||
pub fn with_pattern<F: FnMut(&str) -> String>(
|
pub(crate) fn with_pattern<F: FnMut(&str) -> String>(
|
||||||
&self,
|
&self,
|
||||||
mut f: F,
|
mut f: F,
|
||||||
) -> Result<ConfiguredHIR, Error> {
|
) -> Result<ConfiguredHIR, Error> {
|
||||||
@ -246,7 +246,7 @@ impl ConfiguredHIR {
|
|||||||
/// match. This only works when the line terminator is set because the line
|
/// match. This only works when the line terminator is set because the line
|
||||||
/// terminator setting guarantees that the regex itself can never match
|
/// terminator setting guarantees that the regex itself can never match
|
||||||
/// through the line terminator byte.
|
/// through the line terminator byte.
|
||||||
pub fn fast_line_regex(&self) -> Result<Option<Regex>, Error> {
|
pub(crate) fn fast_line_regex(&self) -> Result<Option<Regex>, Error> {
|
||||||
if self.config.line_terminator.is_none() {
|
if self.config.line_terminator.is_none() {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
@ -7,7 +7,7 @@ use {
|
|||||||
};
|
};
|
||||||
|
|
||||||
/// Return a confirmed set of non-matching bytes from the given expression.
|
/// Return a confirmed set of non-matching bytes from the given expression.
|
||||||
pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
|
pub(crate) fn non_matching_bytes(expr: &Hir) -> ByteSet {
|
||||||
let mut set = ByteSet::full();
|
let mut set = ByteSet::full();
|
||||||
remove_matching_bytes(expr, &mut set);
|
remove_matching_bytes(expr, &mut set);
|
||||||
set
|
set
|
||||||
|
@ -1,18 +1,25 @@
|
|||||||
use std::{cell::RefCell, collections::HashMap, sync::Arc};
|
use std::{
|
||||||
|
collections::HashMap,
|
||||||
|
panic::{RefUnwindSafe, UnwindSafe},
|
||||||
|
sync::Arc,
|
||||||
|
};
|
||||||
|
|
||||||
use {
|
use {
|
||||||
grep_matcher::{Match, Matcher, NoError},
|
grep_matcher::{Match, Matcher, NoError},
|
||||||
regex_automata::{
|
regex_automata::{
|
||||||
meta::Regex, util::captures::Captures, Input, PatternID,
|
meta::Regex, util::captures::Captures, util::pool::Pool, Input,
|
||||||
|
PatternID,
|
||||||
},
|
},
|
||||||
thread_local::ThreadLocal,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures};
|
use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures};
|
||||||
|
|
||||||
|
type PoolFn =
|
||||||
|
Box<dyn Fn() -> Captures + Send + Sync + UnwindSafe + RefUnwindSafe>;
|
||||||
|
|
||||||
/// A matcher for implementing "word match" semantics.
|
/// A matcher for implementing "word match" semantics.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct WordMatcher {
|
pub(crate) struct WordMatcher {
|
||||||
/// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
|
/// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
|
||||||
regex: Regex,
|
regex: Regex,
|
||||||
/// The pattern string corresponding to the above regex.
|
/// The pattern string corresponding to the above regex.
|
||||||
@ -22,21 +29,26 @@ pub struct WordMatcher {
|
|||||||
original: Regex,
|
original: Regex,
|
||||||
/// A map from capture group name to capture group index.
|
/// A map from capture group name to capture group index.
|
||||||
names: HashMap<String, usize>,
|
names: HashMap<String, usize>,
|
||||||
/// A reusable buffer for finding the match offset of the inner group.
|
/// A thread-safe pool of reusable buffers for finding the match offset of
|
||||||
caps: Arc<ThreadLocal<RefCell<Captures>>>,
|
/// the inner group.
|
||||||
|
caps: Arc<Pool<Captures, PoolFn>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Clone for WordMatcher {
|
impl Clone for WordMatcher {
|
||||||
fn clone(&self) -> WordMatcher {
|
fn clone(&self) -> WordMatcher {
|
||||||
// We implement Clone manually so that we get a fresh ThreadLocal such
|
// We implement Clone manually so that we get a fresh Pool such that it
|
||||||
// that it can set its own thread owner. This permits each thread
|
// can set its own thread owner. This permits each thread usings `caps`
|
||||||
// usings `caps` to hit the fast path.
|
// to hit the fast path.
|
||||||
|
//
|
||||||
|
// Note that cloning a regex is "cheap" since it uses reference
|
||||||
|
// counting internally.
|
||||||
|
let re = self.regex.clone();
|
||||||
WordMatcher {
|
WordMatcher {
|
||||||
regex: self.regex.clone(),
|
regex: self.regex.clone(),
|
||||||
pattern: self.pattern.clone(),
|
pattern: self.pattern.clone(),
|
||||||
original: self.original.clone(),
|
original: self.original.clone(),
|
||||||
names: self.names.clone(),
|
names: self.names.clone(),
|
||||||
caps: Arc::new(ThreadLocal::new()),
|
caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -47,7 +59,7 @@ impl WordMatcher {
|
|||||||
///
|
///
|
||||||
/// The given options are used to construct the regular expression
|
/// The given options are used to construct the regular expression
|
||||||
/// internally.
|
/// internally.
|
||||||
pub fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {
|
pub(crate) fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {
|
||||||
let original =
|
let original =
|
||||||
expr.with_pattern(|pat| format!("^(?:{})$", pat))?.regex()?;
|
expr.with_pattern(|pat| format!("^(?:{})$", pat))?.regex()?;
|
||||||
let word_expr = expr.with_pattern(|pat| {
|
let word_expr = expr.with_pattern(|pat| {
|
||||||
@ -57,7 +69,10 @@ impl WordMatcher {
|
|||||||
})?;
|
})?;
|
||||||
let regex = word_expr.regex()?;
|
let regex = word_expr.regex()?;
|
||||||
let pattern = word_expr.pattern();
|
let pattern = word_expr.pattern();
|
||||||
let caps = Arc::new(ThreadLocal::new());
|
let caps = Arc::new(Pool::new({
|
||||||
|
let regex = regex.clone();
|
||||||
|
Box::new(move || regex.create_captures()) as PoolFn
|
||||||
|
}));
|
||||||
|
|
||||||
let mut names = HashMap::new();
|
let mut names = HashMap::new();
|
||||||
let it = regex.group_info().pattern_names(PatternID::ZERO);
|
let it = regex.group_info().pattern_names(PatternID::ZERO);
|
||||||
@ -71,7 +86,7 @@ impl WordMatcher {
|
|||||||
|
|
||||||
/// Return the underlying pattern string for the regex used by this
|
/// Return the underlying pattern string for the regex used by this
|
||||||
/// matcher.
|
/// matcher.
|
||||||
pub fn pattern(&self) -> &str {
|
pub(crate) fn pattern(&self) -> &str {
|
||||||
&self.pattern
|
&self.pattern
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -85,12 +100,11 @@ impl WordMatcher {
|
|||||||
haystack: &[u8],
|
haystack: &[u8],
|
||||||
at: usize,
|
at: usize,
|
||||||
) -> Result<Option<Match>, ()> {
|
) -> Result<Option<Match>, ()> {
|
||||||
// This is a bit hairy. The whole point here is to avoid running an
|
// This is a bit hairy. The whole point here is to avoid running a
|
||||||
// NFA simulation in the regex engine. Remember, our word regex looks
|
// slower regex engine to extract capture groups. Remember, our word
|
||||||
// like this:
|
// regex looks like this:
|
||||||
//
|
//
|
||||||
// (^|\W)(<original regex>)($|\W)
|
// (^|\W)(<original regex>)($|\W)
|
||||||
// where ^ and $ have multiline mode DISABLED
|
|
||||||
//
|
//
|
||||||
// What we want are the match offsets of <original regex>. So in the
|
// What we want are the match offsets of <original regex>. So in the
|
||||||
// easy/common case, the original regex will be sandwiched between
|
// easy/common case, the original regex will be sandwiched between
|
||||||
@ -152,18 +166,17 @@ impl Matcher for WordMatcher {
|
|||||||
//
|
//
|
||||||
// OK, well, it turns out that it is worth it! But it is quite tricky.
|
// OK, well, it turns out that it is worth it! But it is quite tricky.
|
||||||
// See `fast_find` for details. Effectively, this lets us skip running
|
// See `fast_find` for details. Effectively, this lets us skip running
|
||||||
// the NFA simulation in the regex engine in the vast majority of
|
// a slower regex engine to extract capture groups in the vast majority
|
||||||
// cases. However, the NFA simulation is required for full correctness.
|
// of cases. However, the slower engine is I believe required for full
|
||||||
|
// correctness.
|
||||||
match self.fast_find(haystack, at) {
|
match self.fast_find(haystack, at) {
|
||||||
Ok(Some(m)) => return Ok(Some(m)),
|
Ok(Some(m)) => return Ok(Some(m)),
|
||||||
Ok(None) => return Ok(None),
|
Ok(None) => return Ok(None),
|
||||||
Err(()) => {}
|
Err(()) => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
let cell =
|
|
||||||
self.caps.get_or(|| RefCell::new(self.regex.create_captures()));
|
|
||||||
let input = Input::new(haystack).span(at..haystack.len());
|
let input = Input::new(haystack).span(at..haystack.len());
|
||||||
let mut caps = cell.borrow_mut();
|
let mut caps = self.caps.get();
|
||||||
self.regex.search_captures(&input, &mut caps);
|
self.regex.search_captures(&input, &mut caps);
|
||||||
Ok(caps.get_group(1).map(|sp| Match::new(sp.start, sp.end)))
|
Ok(caps.get_group(1).map(|sp| Match::new(sp.start, sp.end)))
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user