diff --git a/Cargo.lock b/Cargo.lock index b86f49d6..3f4bc49b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -202,7 +202,6 @@ dependencies = [ "regex", "regex-automata 0.3.0", "regex-syntax", - "thread_local", ] [[package]] diff --git a/crates/regex/Cargo.toml b/crates/regex/Cargo.toml index f564b0e6..13d2e5e1 100644 --- a/crates/regex/Cargo.toml +++ b/crates/regex/Cargo.toml @@ -21,4 +21,3 @@ log = "0.4.5" regex = "1.8.3" regex-automata = { version = "0.3.0" } regex-syntax = "0.7.2" -thread_local = "1.1.7" diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs index 8d785b6d..a2aa3243 100644 --- a/crates/regex/src/config.rs +++ b/crates/regex/src/config.rs @@ -21,21 +21,21 @@ use crate::{ /// configuration which generated it, and provides transformation on that HIR /// such that the configuration is preserved. #[derive(Clone, Debug)] -pub struct Config { - pub case_insensitive: bool, - pub case_smart: bool, - pub multi_line: bool, - pub dot_matches_new_line: bool, - pub swap_greed: bool, - pub ignore_whitespace: bool, - pub unicode: bool, - pub octal: bool, - pub size_limit: usize, - pub dfa_size_limit: usize, - pub nest_limit: u32, - pub line_terminator: Option, - pub crlf: bool, - pub word: bool, +pub(crate) struct Config { + pub(crate) case_insensitive: bool, + pub(crate) case_smart: bool, + pub(crate) multi_line: bool, + pub(crate) dot_matches_new_line: bool, + pub(crate) swap_greed: bool, + pub(crate) ignore_whitespace: bool, + pub(crate) unicode: bool, + pub(crate) octal: bool, + pub(crate) size_limit: usize, + pub(crate) dfa_size_limit: usize, + pub(crate) nest_limit: u32, + pub(crate) line_terminator: Option, + pub(crate) crlf: bool, + pub(crate) word: bool, } impl Default for Config { @@ -67,7 +67,7 @@ impl Config { /// /// If there was a problem parsing the given expression then an error /// is returned. - pub fn hir(&self, pattern: &str) -> Result { + pub(crate) fn hir(&self, pattern: &str) -> Result { let ast = self.ast(pattern)?; let analysis = self.analysis(&ast)?; let expr = hir::translate::TranslatorBuilder::new() @@ -112,7 +112,7 @@ impl Config { /// Note that it is OK to return true even when settings like `multi_line` /// are enabled, since if multi-line can impact the match semantics of a /// regex, then it is by definition not a simple alternation of literals. - pub fn can_plain_aho_corasick(&self) -> bool { + pub(crate) fn can_plain_aho_corasick(&self) -> bool { !self.word && !self.case_insensitive && !self.case_smart } @@ -149,7 +149,7 @@ impl Config { /// size limits set on the configured HIR will be propagated out to any /// subsequently constructed HIR or regular expression. #[derive(Clone, Debug)] -pub struct ConfiguredHIR { +pub(crate) struct ConfiguredHIR { original: String, config: Config, analysis: AstAnalysis, @@ -158,12 +158,12 @@ pub struct ConfiguredHIR { impl ConfiguredHIR { /// Return the configuration for this HIR expression. - pub fn config(&self) -> &Config { + pub(crate) fn config(&self) -> &Config { &self.config } /// Compute the set of non-matching bytes for this HIR expression. - pub fn non_matching_bytes(&self) -> ByteSet { + pub(crate) fn non_matching_bytes(&self) -> ByteSet { non_matching_bytes(&self.expr) } @@ -184,7 +184,7 @@ impl ConfiguredHIR { /// without a line terminator, the fast search path can't be executed. /// /// See: - pub fn line_terminator(&self) -> Option { + pub(crate) fn line_terminator(&self) -> Option { if self.is_any_anchored() { None } else { @@ -198,19 +198,19 @@ impl ConfiguredHIR { } /// Builds a regular expression from this HIR expression. - pub fn regex(&self) -> Result { + pub(crate) fn regex(&self) -> Result { self.pattern_to_regex(&self.pattern()) } /// Returns the pattern string by converting this HIR to its concrete /// syntax. - pub fn pattern(&self) -> String { + pub(crate) fn pattern(&self) -> String { self.expr.to_string() } /// If this HIR corresponds to an alternation of literals with no /// capturing groups, then this returns those literals. - pub fn alternation_literals(&self) -> Option>> { + pub(crate) fn alternation_literals(&self) -> Option>> { if !self.config.can_plain_aho_corasick() { return None; } @@ -223,7 +223,7 @@ impl ConfiguredHIR { /// /// For example, this can be used to wrap a user provided regular /// expression with additional semantics. e.g., See the `WordMatcher`. - pub fn with_pattern String>( + pub(crate) fn with_pattern String>( &self, mut f: F, ) -> Result { @@ -246,7 +246,7 @@ impl ConfiguredHIR { /// match. This only works when the line terminator is set because the line /// terminator setting guarantees that the regex itself can never match /// through the line terminator byte. - pub fn fast_line_regex(&self) -> Result, Error> { + pub(crate) fn fast_line_regex(&self) -> Result, Error> { if self.config.line_terminator.is_none() { return Ok(None); } diff --git a/crates/regex/src/non_matching.rs b/crates/regex/src/non_matching.rs index d19119cc..7fde6c46 100644 --- a/crates/regex/src/non_matching.rs +++ b/crates/regex/src/non_matching.rs @@ -7,7 +7,7 @@ use { }; /// Return a confirmed set of non-matching bytes from the given expression. -pub fn non_matching_bytes(expr: &Hir) -> ByteSet { +pub(crate) fn non_matching_bytes(expr: &Hir) -> ByteSet { let mut set = ByteSet::full(); remove_matching_bytes(expr, &mut set); set diff --git a/crates/regex/src/word.rs b/crates/regex/src/word.rs index 85f33b86..5aa603b0 100644 --- a/crates/regex/src/word.rs +++ b/crates/regex/src/word.rs @@ -1,18 +1,25 @@ -use std::{cell::RefCell, collections::HashMap, sync::Arc}; +use std::{ + collections::HashMap, + panic::{RefUnwindSafe, UnwindSafe}, + sync::Arc, +}; use { grep_matcher::{Match, Matcher, NoError}, regex_automata::{ - meta::Regex, util::captures::Captures, Input, PatternID, + meta::Regex, util::captures::Captures, util::pool::Pool, Input, + PatternID, }, - thread_local::ThreadLocal, }; use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures}; +type PoolFn = + Box Captures + Send + Sync + UnwindSafe + RefUnwindSafe>; + /// A matcher for implementing "word match" semantics. #[derive(Debug)] -pub struct WordMatcher { +pub(crate) struct WordMatcher { /// The regex which is roughly `(?:^|\W)()(?:$|\W)`. regex: Regex, /// The pattern string corresponding to the above regex. @@ -22,21 +29,26 @@ pub struct WordMatcher { original: Regex, /// A map from capture group name to capture group index. names: HashMap, - /// A reusable buffer for finding the match offset of the inner group. - caps: Arc>>, + /// A thread-safe pool of reusable buffers for finding the match offset of + /// the inner group. + caps: Arc>, } impl Clone for WordMatcher { fn clone(&self) -> WordMatcher { - // We implement Clone manually so that we get a fresh ThreadLocal such - // that it can set its own thread owner. This permits each thread - // usings `caps` to hit the fast path. + // We implement Clone manually so that we get a fresh Pool such that it + // can set its own thread owner. This permits each thread usings `caps` + // to hit the fast path. + // + // Note that cloning a regex is "cheap" since it uses reference + // counting internally. + let re = self.regex.clone(); WordMatcher { regex: self.regex.clone(), pattern: self.pattern.clone(), original: self.original.clone(), names: self.names.clone(), - caps: Arc::new(ThreadLocal::new()), + caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))), } } } @@ -47,7 +59,7 @@ impl WordMatcher { /// /// The given options are used to construct the regular expression /// internally. - pub fn new(expr: &ConfiguredHIR) -> Result { + pub(crate) fn new(expr: &ConfiguredHIR) -> Result { let original = expr.with_pattern(|pat| format!("^(?:{})$", pat))?.regex()?; let word_expr = expr.with_pattern(|pat| { @@ -57,7 +69,10 @@ impl WordMatcher { })?; let regex = word_expr.regex()?; let pattern = word_expr.pattern(); - let caps = Arc::new(ThreadLocal::new()); + let caps = Arc::new(Pool::new({ + let regex = regex.clone(); + Box::new(move || regex.create_captures()) as PoolFn + })); let mut names = HashMap::new(); let it = regex.group_info().pattern_names(PatternID::ZERO); @@ -71,7 +86,7 @@ impl WordMatcher { /// Return the underlying pattern string for the regex used by this /// matcher. - pub fn pattern(&self) -> &str { + pub(crate) fn pattern(&self) -> &str { &self.pattern } @@ -85,12 +100,11 @@ impl WordMatcher { haystack: &[u8], at: usize, ) -> Result, ()> { - // This is a bit hairy. The whole point here is to avoid running an - // NFA simulation in the regex engine. Remember, our word regex looks - // like this: + // This is a bit hairy. The whole point here is to avoid running a + // slower regex engine to extract capture groups. Remember, our word + // regex looks like this: // // (^|\W)()($|\W) - // where ^ and $ have multiline mode DISABLED // // What we want are the match offsets of . So in the // easy/common case, the original regex will be sandwiched between @@ -152,18 +166,17 @@ impl Matcher for WordMatcher { // // OK, well, it turns out that it is worth it! But it is quite tricky. // See `fast_find` for details. Effectively, this lets us skip running - // the NFA simulation in the regex engine in the vast majority of - // cases. However, the NFA simulation is required for full correctness. + // a slower regex engine to extract capture groups in the vast majority + // of cases. However, the slower engine is I believe required for full + // correctness. match self.fast_find(haystack, at) { Ok(Some(m)) => return Ok(Some(m)), Ok(None) => return Ok(None), Err(()) => {} } - let cell = - self.caps.get_or(|| RefCell::new(self.regex.create_captures())); let input = Input::new(haystack).span(at..haystack.len()); - let mut caps = cell.borrow_mut(); + let mut caps = self.caps.get(); self.regex.search_captures(&input, &mut caps); Ok(caps.get_group(1).map(|sp| Match::new(sp.start, sp.end))) }