From a775b493fd0d8dc23bdf43bbffe6d4c9f449e2ee Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sat, 17 Jun 2023 21:47:37 -0400
Subject: [PATCH] regex: small cleanups

Just some small polishing. We also get rid of thread_local in favor of
using regex-automata, mostly just in the name of reducing dependencies.
(We should eventually be able to drop thread_local completely.)
---
 Cargo.lock                       |  1 -
 crates/regex/Cargo.toml          |  1 -
 crates/regex/src/config.rs       | 52 ++++++++++++++---------------
 crates/regex/src/non_matching.rs |  2 +-
 crates/regex/src/word.rs         | 57 ++++++++++++++++++++------------
 5 files changed, 62 insertions(+), 51 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b86f49d6..3f4bc49b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -202,7 +202,6 @@ dependencies = [
  "regex",
  "regex-automata 0.3.0",
  "regex-syntax",
- "thread_local",
 ]
 
 [[package]]
diff --git a/crates/regex/Cargo.toml b/crates/regex/Cargo.toml
index f564b0e6..13d2e5e1 100644
--- a/crates/regex/Cargo.toml
+++ b/crates/regex/Cargo.toml
@@ -21,4 +21,3 @@ log = "0.4.5"
 regex = "1.8.3"
 regex-automata = { version = "0.3.0" }
 regex-syntax = "0.7.2"
-thread_local = "1.1.7"
diff --git a/crates/regex/src/config.rs b/crates/regex/src/config.rs
index 8d785b6d..a2aa3243 100644
--- a/crates/regex/src/config.rs
+++ b/crates/regex/src/config.rs
@@ -21,21 +21,21 @@ use crate::{
 /// configuration which generated it, and provides transformation on that HIR
 /// such that the configuration is preserved.
 #[derive(Clone, Debug)]
-pub struct Config {
-    pub case_insensitive: bool,
-    pub case_smart: bool,
-    pub multi_line: bool,
-    pub dot_matches_new_line: bool,
-    pub swap_greed: bool,
-    pub ignore_whitespace: bool,
-    pub unicode: bool,
-    pub octal: bool,
-    pub size_limit: usize,
-    pub dfa_size_limit: usize,
-    pub nest_limit: u32,
-    pub line_terminator: Option<LineTerminator>,
-    pub crlf: bool,
-    pub word: bool,
+pub(crate) struct Config {
+    pub(crate) case_insensitive: bool,
+    pub(crate) case_smart: bool,
+    pub(crate) multi_line: bool,
+    pub(crate) dot_matches_new_line: bool,
+    pub(crate) swap_greed: bool,
+    pub(crate) ignore_whitespace: bool,
+    pub(crate) unicode: bool,
+    pub(crate) octal: bool,
+    pub(crate) size_limit: usize,
+    pub(crate) dfa_size_limit: usize,
+    pub(crate) nest_limit: u32,
+    pub(crate) line_terminator: Option<LineTerminator>,
+    pub(crate) crlf: bool,
+    pub(crate) word: bool,
 }
 
 impl Default for Config {
@@ -67,7 +67,7 @@ impl Config {
     ///
     /// If there was a problem parsing the given expression then an error
     /// is returned.
-    pub fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
+    pub(crate) fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
         let ast = self.ast(pattern)?;
         let analysis = self.analysis(&ast)?;
         let expr = hir::translate::TranslatorBuilder::new()
@@ -112,7 +112,7 @@ impl Config {
     /// Note that it is OK to return true even when settings like `multi_line`
     /// are enabled, since if multi-line can impact the match semantics of a
     /// regex, then it is by definition not a simple alternation of literals.
-    pub fn can_plain_aho_corasick(&self) -> bool {
+    pub(crate) fn can_plain_aho_corasick(&self) -> bool {
         !self.word && !self.case_insensitive && !self.case_smart
     }
 
@@ -149,7 +149,7 @@ impl Config {
 /// size limits set on the configured HIR will be propagated out to any
 /// subsequently constructed HIR or regular expression.
 #[derive(Clone, Debug)]
-pub struct ConfiguredHIR {
+pub(crate) struct ConfiguredHIR {
     original: String,
     config: Config,
     analysis: AstAnalysis,
@@ -158,12 +158,12 @@ pub struct ConfiguredHIR {
 
 impl ConfiguredHIR {
     /// Return the configuration for this HIR expression.
-    pub fn config(&self) -> &Config {
+    pub(crate) fn config(&self) -> &Config {
         &self.config
     }
 
     /// Compute the set of non-matching bytes for this HIR expression.
-    pub fn non_matching_bytes(&self) -> ByteSet {
+    pub(crate) fn non_matching_bytes(&self) -> ByteSet {
         non_matching_bytes(&self.expr)
     }
 
@@ -184,7 +184,7 @@ impl ConfiguredHIR {
     /// without a line terminator, the fast search path can't be executed.
     ///
     /// See: <https://github.com/BurntSushi/ripgrep/issues/2260>
-    pub fn line_terminator(&self) -> Option<LineTerminator> {
+    pub(crate) fn line_terminator(&self) -> Option<LineTerminator> {
         if self.is_any_anchored() {
             None
         } else {
@@ -198,19 +198,19 @@ impl ConfiguredHIR {
     }
 
     /// Builds a regular expression from this HIR expression.
-    pub fn regex(&self) -> Result<Regex, Error> {
+    pub(crate) fn regex(&self) -> Result<Regex, Error> {
         self.pattern_to_regex(&self.pattern())
     }
 
     /// Returns the pattern string by converting this HIR to its concrete
     /// syntax.
-    pub fn pattern(&self) -> String {
+    pub(crate) fn pattern(&self) -> String {
         self.expr.to_string()
     }
 
     /// If this HIR corresponds to an alternation of literals with no
     /// capturing groups, then this returns those literals.
-    pub fn alternation_literals(&self) -> Option<Vec<Vec<u8>>> {
+    pub(crate) fn alternation_literals(&self) -> Option<Vec<Vec<u8>>> {
         if !self.config.can_plain_aho_corasick() {
             return None;
         }
@@ -223,7 +223,7 @@ impl ConfiguredHIR {
     ///
     /// For example, this can be used to wrap a user provided regular
     /// expression with additional semantics. e.g., See the `WordMatcher`.
-    pub fn with_pattern<F: FnMut(&str) -> String>(
+    pub(crate) fn with_pattern<F: FnMut(&str) -> String>(
         &self,
         mut f: F,
     ) -> Result<ConfiguredHIR, Error> {
@@ -246,7 +246,7 @@ impl ConfiguredHIR {
     /// match. This only works when the line terminator is set because the line
     /// terminator setting guarantees that the regex itself can never match
     /// through the line terminator byte.
-    pub fn fast_line_regex(&self) -> Result<Option<Regex>, Error> {
+    pub(crate) fn fast_line_regex(&self) -> Result<Option<Regex>, Error> {
         if self.config.line_terminator.is_none() {
             return Ok(None);
         }
diff --git a/crates/regex/src/non_matching.rs b/crates/regex/src/non_matching.rs
index d19119cc..7fde6c46 100644
--- a/crates/regex/src/non_matching.rs
+++ b/crates/regex/src/non_matching.rs
@@ -7,7 +7,7 @@ use {
 };
 
 /// Return a confirmed set of non-matching bytes from the given expression.
-pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
+pub(crate) fn non_matching_bytes(expr: &Hir) -> ByteSet {
     let mut set = ByteSet::full();
     remove_matching_bytes(expr, &mut set);
     set
diff --git a/crates/regex/src/word.rs b/crates/regex/src/word.rs
index 85f33b86..5aa603b0 100644
--- a/crates/regex/src/word.rs
+++ b/crates/regex/src/word.rs
@@ -1,18 +1,25 @@
-use std::{cell::RefCell, collections::HashMap, sync::Arc};
+use std::{
+    collections::HashMap,
+    panic::{RefUnwindSafe, UnwindSafe},
+    sync::Arc,
+};
 
 use {
     grep_matcher::{Match, Matcher, NoError},
     regex_automata::{
-        meta::Regex, util::captures::Captures, Input, PatternID,
+        meta::Regex, util::captures::Captures, util::pool::Pool, Input,
+        PatternID,
     },
-    thread_local::ThreadLocal,
 };
 
 use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures};
 
+type PoolFn =
+    Box<dyn Fn() -> Captures + Send + Sync + UnwindSafe + RefUnwindSafe>;
+
 /// A matcher for implementing "word match" semantics.
 #[derive(Debug)]
-pub struct WordMatcher {
+pub(crate) struct WordMatcher {
     /// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
     regex: Regex,
     /// The pattern string corresponding to the above regex.
@@ -22,21 +29,26 @@ pub struct WordMatcher {
     original: Regex,
     /// A map from capture group name to capture group index.
     names: HashMap<String, usize>,
-    /// A reusable buffer for finding the match offset of the inner group.
-    caps: Arc<ThreadLocal<RefCell<Captures>>>,
+    /// A thread-safe pool of reusable buffers for finding the match offset of
+    /// the inner group.
+    caps: Arc<Pool<Captures, PoolFn>>,
 }
 
 impl Clone for WordMatcher {
     fn clone(&self) -> WordMatcher {
-        // We implement Clone manually so that we get a fresh ThreadLocal such
-        // that it can set its own thread owner. This permits each thread
-        // usings `caps` to hit the fast path.
+        // We implement Clone manually so that we get a fresh Pool such that it
+        // can set its own thread owner. This permits each thread usings `caps`
+        // to hit the fast path.
+        //
+        // Note that cloning a regex is "cheap" since it uses reference
+        // counting internally.
+        let re = self.regex.clone();
         WordMatcher {
             regex: self.regex.clone(),
             pattern: self.pattern.clone(),
             original: self.original.clone(),
             names: self.names.clone(),
-            caps: Arc::new(ThreadLocal::new()),
+            caps: Arc::new(Pool::new(Box::new(move || re.create_captures()))),
         }
     }
 }
@@ -47,7 +59,7 @@ impl WordMatcher {
     ///
     /// The given options are used to construct the regular expression
     /// internally.
-    pub fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {
+    pub(crate) fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {
         let original =
             expr.with_pattern(|pat| format!("^(?:{})$", pat))?.regex()?;
         let word_expr = expr.with_pattern(|pat| {
@@ -57,7 +69,10 @@ impl WordMatcher {
         })?;
         let regex = word_expr.regex()?;
         let pattern = word_expr.pattern();
-        let caps = Arc::new(ThreadLocal::new());
+        let caps = Arc::new(Pool::new({
+            let regex = regex.clone();
+            Box::new(move || regex.create_captures()) as PoolFn
+        }));
 
         let mut names = HashMap::new();
         let it = regex.group_info().pattern_names(PatternID::ZERO);
@@ -71,7 +86,7 @@ impl WordMatcher {
 
     /// Return the underlying pattern string for the regex used by this
     /// matcher.
-    pub fn pattern(&self) -> &str {
+    pub(crate) fn pattern(&self) -> &str {
         &self.pattern
     }
 
@@ -85,12 +100,11 @@ impl WordMatcher {
         haystack: &[u8],
         at: usize,
     ) -> Result<Option<Match>, ()> {
-        // This is a bit hairy. The whole point here is to avoid running an
-        // NFA simulation in the regex engine. Remember, our word regex looks
-        // like this:
+        // This is a bit hairy. The whole point here is to avoid running a
+        // slower regex engine to extract capture groups. Remember, our word
+        // regex looks like this:
         //
         //     (^|\W)(<original regex>)($|\W)
-        //     where ^ and $ have multiline mode DISABLED
         //
         // What we want are the match offsets of <original regex>. So in the
         // easy/common case, the original regex will be sandwiched between
@@ -152,18 +166,17 @@ impl Matcher for WordMatcher {
         //
         // OK, well, it turns out that it is worth it! But it is quite tricky.
         // See `fast_find` for details. Effectively, this lets us skip running
-        // the NFA simulation in the regex engine in the vast majority of
-        // cases. However, the NFA simulation is required for full correctness.
+        // a slower regex engine to extract capture groups in the vast majority
+        // of cases. However, the slower engine is I believe required for full
+        // correctness.
         match self.fast_find(haystack, at) {
             Ok(Some(m)) => return Ok(Some(m)),
             Ok(None) => return Ok(None),
             Err(()) => {}
         }
 
-        let cell =
-            self.caps.get_or(|| RefCell::new(self.regex.create_captures()));
         let input = Input::new(haystack).span(at..haystack.len());
-        let mut caps = cell.borrow_mut();
+        let mut caps = self.caps.get();
         self.regex.search_captures(&input, &mut caps);
         Ok(caps.get_group(1).map(|sp| Match::new(sp.start, sp.end)))
     }