mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-05-19 09:40:22 -07:00
regex: migrate grep-regex to regex-automata
We just do a "basic" dumb migration. We don't try to improve anything here.
This commit is contained in:
parent
1035f6b1ff
commit
e028ea3792
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -200,6 +200,7 @@ dependencies = [
|
|||||||
"grep-matcher",
|
"grep-matcher",
|
||||||
"log",
|
"log",
|
||||||
"regex",
|
"regex",
|
||||||
|
"regex-automata 0.3.0",
|
||||||
"regex-syntax",
|
"regex-syntax",
|
||||||
"thread_local",
|
"thread_local",
|
||||||
]
|
]
|
||||||
|
@ -498,13 +498,23 @@ impl GlobSetBuilder {
|
|||||||
/// Constructing candidates has a very small cost associated with it, so
|
/// Constructing candidates has a very small cost associated with it, so
|
||||||
/// callers may find it beneficial to amortize that cost when matching a single
|
/// callers may find it beneficial to amortize that cost when matching a single
|
||||||
/// path against multiple globs or sets of globs.
|
/// path against multiple globs or sets of globs.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone)]
|
||||||
pub struct Candidate<'a> {
|
pub struct Candidate<'a> {
|
||||||
path: Cow<'a, [u8]>,
|
path: Cow<'a, [u8]>,
|
||||||
basename: Cow<'a, [u8]>,
|
basename: Cow<'a, [u8]>,
|
||||||
ext: Cow<'a, [u8]>,
|
ext: Cow<'a, [u8]>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl<'a> std::fmt::Debug for Candidate<'a> {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||||
|
f.debug_struct("Candidate")
|
||||||
|
.field("path", &self.path.as_bstr())
|
||||||
|
.field("basename", &self.basename.as_bstr())
|
||||||
|
.field("ext", &self.ext.as_bstr())
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<'a> Candidate<'a> {
|
impl<'a> Candidate<'a> {
|
||||||
/// Create a new candidate for matching from the given path.
|
/// Create a new candidate for matching from the given path.
|
||||||
pub fn new<P: AsRef<Path> + ?Sized>(path: &'a P) -> Candidate<'a> {
|
pub fn new<P: AsRef<Path> + ?Sized>(path: &'a P) -> Candidate<'a> {
|
||||||
|
@ -22,8 +22,8 @@ bench = false
|
|||||||
globset = { version = "0.4.10", path = "../globset" }
|
globset = { version = "0.4.10", path = "../globset" }
|
||||||
lazy_static = "1.1"
|
lazy_static = "1.1"
|
||||||
log = "0.4.5"
|
log = "0.4.5"
|
||||||
memchr = "2.1"
|
memchr = "2.5"
|
||||||
regex = "1.1"
|
regex = "1.8.3"
|
||||||
same-file = "1.0.4"
|
same-file = "1.0.4"
|
||||||
thread_local = "1"
|
thread_local = "1"
|
||||||
walkdir = "2.2.7"
|
walkdir = "2.2.7"
|
||||||
|
@ -19,5 +19,6 @@ bstr = "1.5.0"
|
|||||||
grep-matcher = { version = "0.1.6", path = "../matcher" }
|
grep-matcher = { version = "0.1.6", path = "../matcher" }
|
||||||
log = "0.4.5"
|
log = "0.4.5"
|
||||||
regex = "1.8.3"
|
regex = "1.8.3"
|
||||||
|
regex-automata = { version = "0.3.0" }
|
||||||
regex-syntax = "0.7.2"
|
regex-syntax = "0.7.2"
|
||||||
thread_local = "1.1.7"
|
thread_local = "1.1.7"
|
||||||
|
@ -1,15 +1,15 @@
|
|||||||
use grep_matcher::{ByteSet, LineTerminator};
|
use {
|
||||||
use regex::bytes::{Regex, RegexBuilder};
|
grep_matcher::{ByteSet, LineTerminator},
|
||||||
use regex_syntax::ast::{self, Ast};
|
regex_automata::meta::Regex,
|
||||||
use regex_syntax::hir::{self, Hir};
|
regex_syntax::ast::{self, Ast},
|
||||||
|
regex_syntax::hir::{self, Hir},
|
||||||
|
};
|
||||||
|
|
||||||
use crate::ast::AstAnalysis;
|
use crate::{
|
||||||
use crate::crlf::crlfify;
|
ast::AstAnalysis, crlf::crlfify, error::Error, literal::LiteralSets,
|
||||||
use crate::error::Error;
|
multi::alternation_literals, non_matching::non_matching_bytes,
|
||||||
use crate::literal::LiteralSets;
|
strip::strip_from_match,
|
||||||
use crate::multi::alternation_literals;
|
};
|
||||||
use crate::non_matching::non_matching_bytes;
|
|
||||||
use crate::strip::strip_from_match;
|
|
||||||
|
|
||||||
/// Config represents the configuration of a regex matcher in this crate.
|
/// Config represents the configuration of a regex matcher in this crate.
|
||||||
/// The configuration is itself a rough combination of the knobs found in
|
/// The configuration is itself a rough combination of the knobs found in
|
||||||
@ -79,7 +79,7 @@ impl Config {
|
|||||||
.unicode(self.unicode)
|
.unicode(self.unicode)
|
||||||
.build()
|
.build()
|
||||||
.translate(pattern, &ast)
|
.translate(pattern, &ast)
|
||||||
.map_err(Error::regex)?;
|
.map_err(Error::generic)?;
|
||||||
let expr = match self.line_terminator {
|
let expr = match self.line_terminator {
|
||||||
None => expr,
|
None => expr,
|
||||||
Some(line_term) => strip_from_match(expr, line_term)?,
|
Some(line_term) => strip_from_match(expr, line_term)?,
|
||||||
@ -133,7 +133,7 @@ impl Config {
|
|||||||
.ignore_whitespace(self.ignore_whitespace)
|
.ignore_whitespace(self.ignore_whitespace)
|
||||||
.build()
|
.build()
|
||||||
.parse(pattern)
|
.parse(pattern)
|
||||||
.map_err(Error::regex)
|
.map_err(Error::generic)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -212,7 +212,13 @@ impl ConfiguredHIR {
|
|||||||
|
|
||||||
/// Builds a regular expression from this HIR expression.
|
/// Builds a regular expression from this HIR expression.
|
||||||
pub fn regex(&self) -> Result<Regex, Error> {
|
pub fn regex(&self) -> Result<Regex, Error> {
|
||||||
self.pattern_to_regex(&self.expr.to_string())
|
self.pattern_to_regex(&self.pattern())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the pattern string by converting this HIR to its concrete
|
||||||
|
/// syntax.
|
||||||
|
pub fn pattern(&self) -> String {
|
||||||
|
self.expr.to_string()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// If this HIR corresponds to an alternation of literals with no
|
/// If this HIR corresponds to an alternation of literals with no
|
||||||
@ -234,7 +240,7 @@ impl ConfiguredHIR {
|
|||||||
&self,
|
&self,
|
||||||
mut f: F,
|
mut f: F,
|
||||||
) -> Result<ConfiguredHIR, Error> {
|
) -> Result<ConfiguredHIR, Error> {
|
||||||
self.pattern_to_hir(&f(&self.expr.to_string()))
|
self.pattern_to_hir(&f(&self.pattern()))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// If the current configuration has a line terminator set and if useful
|
/// If the current configuration has a line terminator set and if useful
|
||||||
@ -286,15 +292,21 @@ impl ConfiguredHIR {
|
|||||||
// intention of the original pattern. For example, the Unicode flag
|
// intention of the original pattern. For example, the Unicode flag
|
||||||
// will impact how the WordMatcher functions, namely, whether its
|
// will impact how the WordMatcher functions, namely, whether its
|
||||||
// word boundaries are Unicode aware or not.
|
// word boundaries are Unicode aware or not.
|
||||||
RegexBuilder::new(&pattern)
|
let syntax = regex_automata::util::syntax::Config::new()
|
||||||
|
.utf8(false)
|
||||||
.nest_limit(self.config.nest_limit)
|
.nest_limit(self.config.nest_limit)
|
||||||
.octal(self.config.octal)
|
.octal(self.config.octal)
|
||||||
.multi_line(self.config.multi_line)
|
.multi_line(self.config.multi_line)
|
||||||
.dot_matches_new_line(self.config.dot_matches_new_line)
|
.dot_matches_new_line(self.config.dot_matches_new_line)
|
||||||
.unicode(self.config.unicode)
|
.unicode(self.config.unicode);
|
||||||
.size_limit(self.config.size_limit)
|
let meta = Regex::config()
|
||||||
.dfa_size_limit(self.config.dfa_size_limit)
|
.utf8_empty(false)
|
||||||
.build()
|
.nfa_size_limit(Some(self.config.size_limit))
|
||||||
|
.hybrid_cache_capacity(self.config.dfa_size_limit);
|
||||||
|
Regex::builder()
|
||||||
|
.syntax(syntax)
|
||||||
|
.configure(meta)
|
||||||
|
.build(pattern)
|
||||||
.map_err(Error::regex)
|
.map_err(Error::regex)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -303,7 +315,7 @@ impl ConfiguredHIR {
|
|||||||
fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
|
fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
|
||||||
// See `pattern_to_regex` comment for explanation of why we only set
|
// See `pattern_to_regex` comment for explanation of why we only set
|
||||||
// a subset of knobs here. e.g., `swap_greed` is explicitly left out.
|
// a subset of knobs here. e.g., `swap_greed` is explicitly left out.
|
||||||
let expr = ::regex_syntax::ParserBuilder::new()
|
let expr = regex_syntax::ParserBuilder::new()
|
||||||
.nest_limit(self.config.nest_limit)
|
.nest_limit(self.config.nest_limit)
|
||||||
.octal(self.config.octal)
|
.octal(self.config.octal)
|
||||||
.utf8(false)
|
.utf8(false)
|
||||||
@ -312,7 +324,7 @@ impl ConfiguredHIR {
|
|||||||
.unicode(self.config.unicode)
|
.unicode(self.config.unicode)
|
||||||
.build()
|
.build()
|
||||||
.parse(pattern)
|
.parse(pattern)
|
||||||
.map_err(Error::regex)?;
|
.map_err(Error::generic)?;
|
||||||
Ok(ConfiguredHIR {
|
Ok(ConfiguredHIR {
|
||||||
original: self.original.clone(),
|
original: self.original.clone(),
|
||||||
config: self.config.clone(),
|
config: self.config.clone(),
|
||||||
@ -320,4 +332,21 @@ impl ConfiguredHIR {
|
|||||||
expr,
|
expr,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
fn syntax_config(&self) -> regex_automata::util::syntax::Config {
|
||||||
|
regex_automata::util::syntax::Config::new()
|
||||||
|
.nest_limit(self.config.nest_limit)
|
||||||
|
.octal(self.config.octal)
|
||||||
|
.multi_line(self.config.multi_line)
|
||||||
|
.dot_matches_new_line(self.config.dot_matches_new_line)
|
||||||
|
.unicode(self.config.unicode)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn meta_config(&self) -> regex_automata::meta::Config {
|
||||||
|
Regex::config()
|
||||||
|
.nfa_size_limit(Some(self.config.size_limit))
|
||||||
|
.hybrid_cache_capacity(self.config.dfa_size_limit)
|
||||||
|
}
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
@ -1,18 +1,20 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
use grep_matcher::{Match, Matcher, NoError};
|
use {
|
||||||
use regex::bytes::Regex;
|
grep_matcher::{Match, Matcher, NoError},
|
||||||
use regex_syntax::hir::{self, Hir, HirKind};
|
regex_automata::{meta::Regex, Input, PatternID},
|
||||||
|
regex_syntax::hir::{self, Hir, HirKind},
|
||||||
|
};
|
||||||
|
|
||||||
use crate::config::ConfiguredHIR;
|
use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures};
|
||||||
use crate::error::Error;
|
|
||||||
use crate::matcher::RegexCaptures;
|
|
||||||
|
|
||||||
/// A matcher that strips a trailing `\r` from the end of regex matches.
|
/// A matcher that strips a trailing `\r` from the end of regex matches.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct CRLFMatcher {
|
pub struct CRLFMatcher {
|
||||||
/// The regex.
|
/// The regex.
|
||||||
regex: Regex,
|
regex: Regex,
|
||||||
|
/// The pattern string corresponding to the regex above.
|
||||||
|
pattern: String,
|
||||||
/// A map from capture group name to capture group index.
|
/// A map from capture group name to capture group index.
|
||||||
names: HashMap<String, usize>,
|
names: HashMap<String, usize>,
|
||||||
}
|
}
|
||||||
@ -26,18 +28,21 @@ impl CRLFMatcher {
|
|||||||
assert!(expr.needs_crlf_stripped());
|
assert!(expr.needs_crlf_stripped());
|
||||||
|
|
||||||
let regex = expr.regex()?;
|
let regex = expr.regex()?;
|
||||||
|
let pattern = expr.pattern();
|
||||||
let mut names = HashMap::new();
|
let mut names = HashMap::new();
|
||||||
for (i, optional_name) in regex.capture_names().enumerate() {
|
let it = regex.group_info().pattern_names(PatternID::ZERO);
|
||||||
|
for (i, optional_name) in it.enumerate() {
|
||||||
if let Some(name) = optional_name {
|
if let Some(name) = optional_name {
|
||||||
names.insert(name.to_string(), i.checked_sub(1).unwrap());
|
names.insert(name.to_string(), i.checked_sub(1).unwrap());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(CRLFMatcher { regex, names })
|
Ok(CRLFMatcher { regex, pattern, names })
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the underlying regex used by this matcher.
|
/// Return the underlying pattern string for the regex used by this
|
||||||
pub fn regex(&self) -> &Regex {
|
/// matcher.
|
||||||
&self.regex
|
pub fn pattern(&self) -> &str {
|
||||||
|
&self.pattern
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -50,7 +55,8 @@ impl Matcher for CRLFMatcher {
|
|||||||
haystack: &[u8],
|
haystack: &[u8],
|
||||||
at: usize,
|
at: usize,
|
||||||
) -> Result<Option<Match>, NoError> {
|
) -> Result<Option<Match>, NoError> {
|
||||||
let m = match self.regex.find_at(haystack, at) {
|
let input = Input::new(haystack).span(at..haystack.len());
|
||||||
|
let m = match self.regex.find(input) {
|
||||||
None => return Ok(None),
|
None => return Ok(None),
|
||||||
Some(m) => Match::new(m.start(), m.end()),
|
Some(m) => Match::new(m.start(), m.end()),
|
||||||
};
|
};
|
||||||
@ -58,7 +64,7 @@ impl Matcher for CRLFMatcher {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
||||||
Ok(RegexCaptures::new(self.regex.capture_locations()))
|
Ok(RegexCaptures::new(self.regex.create_captures()))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn capture_count(&self) -> usize {
|
fn capture_count(&self) -> usize {
|
||||||
@ -76,15 +82,15 @@ impl Matcher for CRLFMatcher {
|
|||||||
caps: &mut RegexCaptures,
|
caps: &mut RegexCaptures,
|
||||||
) -> Result<bool, NoError> {
|
) -> Result<bool, NoError> {
|
||||||
caps.strip_crlf(false);
|
caps.strip_crlf(false);
|
||||||
let r =
|
let input = Input::new(haystack).span(at..haystack.len());
|
||||||
self.regex.captures_read_at(caps.locations_mut(), haystack, at);
|
self.regex.search_captures(&input, caps.locations_mut());
|
||||||
if !r.is_some() {
|
if !caps.locations().is_match() {
|
||||||
return Ok(false);
|
return Ok(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the end of our match includes a `\r`, then strip it from all
|
// If the end of our match includes a `\r`, then strip it from all
|
||||||
// capture groups ending at the same location.
|
// capture groups ending at the same location.
|
||||||
let end = caps.locations().get(0).unwrap().1;
|
let end = caps.locations().get_match().unwrap().end();
|
||||||
if end > 0 && haystack.get(end - 1) == Some(&b'\r') {
|
if end > 0 && haystack.get(end - 1) == Some(&b'\r') {
|
||||||
caps.strip_crlf(true);
|
caps.strip_crlf(true);
|
||||||
}
|
}
|
||||||
|
@ -18,7 +18,21 @@ impl Error {
|
|||||||
Error { kind }
|
Error { kind }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn regex<E: error::Error>(err: E) -> Error {
|
pub(crate) fn regex(err: regex_automata::meta::BuildError) -> Error {
|
||||||
|
// Error { kind: ErrorKind::Regex(err.to_string()) }
|
||||||
|
if let Some(size_limit) = err.size_limit() {
|
||||||
|
let kind = ErrorKind::Regex(format!(
|
||||||
|
"compiled regex exceeds size limit of {size_limit}",
|
||||||
|
));
|
||||||
|
Error { kind }
|
||||||
|
} else if let Some(ref err) = err.syntax_error() {
|
||||||
|
Error::generic(err)
|
||||||
|
} else {
|
||||||
|
Error::generic(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn generic<E: error::Error>(err: E) -> Error {
|
||||||
Error { kind: ErrorKind::Regex(err.to_string()) }
|
Error { kind: ErrorKind::Regex(err.to_string()) }
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -30,6 +44,7 @@ impl Error {
|
|||||||
|
|
||||||
/// The kind of an error that can occur.
|
/// The kind of an error that can occur.
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
|
#[non_exhaustive]
|
||||||
pub enum ErrorKind {
|
pub enum ErrorKind {
|
||||||
/// An error that occurred as a result of parsing a regular expression.
|
/// An error that occurred as a result of parsing a regular expression.
|
||||||
/// This can be a syntax error or an error that results from attempting to
|
/// This can be a syntax error or an error that results from attempting to
|
||||||
@ -51,25 +66,9 @@ pub enum ErrorKind {
|
|||||||
///
|
///
|
||||||
/// The invalid byte is included in this error.
|
/// The invalid byte is included in this error.
|
||||||
InvalidLineTerminator(u8),
|
InvalidLineTerminator(u8),
|
||||||
/// Hints that destructuring should not be exhaustive.
|
|
||||||
///
|
|
||||||
/// This enum may grow additional variants, so this makes sure clients
|
|
||||||
/// don't count on exhaustive matching. (Otherwise, adding a new variant
|
|
||||||
/// could break existing code.)
|
|
||||||
#[doc(hidden)]
|
|
||||||
__Nonexhaustive,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl error::Error for Error {
|
impl error::Error for Error {}
|
||||||
fn description(&self) -> &str {
|
|
||||||
match self.kind {
|
|
||||||
ErrorKind::Regex(_) => "regex error",
|
|
||||||
ErrorKind::NotAllowed(_) => "literal not allowed",
|
|
||||||
ErrorKind::InvalidLineTerminator(_) => "invalid line terminator",
|
|
||||||
ErrorKind::__Nonexhaustive => unreachable!(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl fmt::Display for Error {
|
impl fmt::Display for Error {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
@ -82,7 +81,6 @@ impl fmt::Display for Error {
|
|||||||
let x = util::show_bytes(&[byte]);
|
let x = util::show_bytes(&[byte]);
|
||||||
write!(f, "line terminators must be ASCII, but '{}' is not", x)
|
write!(f, "line terminators must be ASCII, but '{}' is not", x)
|
||||||
}
|
}
|
||||||
ErrorKind::__Nonexhaustive => unreachable!(),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,25 @@
|
|||||||
use regex_syntax::hir::Hir;
|
use regex_syntax::hir::Hir;
|
||||||
|
|
||||||
|
// BREADCRUMBS:
|
||||||
|
//
|
||||||
|
// The way we deal with line terminators in the regex is clunky, but probably
|
||||||
|
// the least bad option for now unfortunately.
|
||||||
|
//
|
||||||
|
// The `non_matching_bytes` routine currently hardcodes line terminators for
|
||||||
|
// anchors. But it's not really clear it should even care about line terminators
|
||||||
|
// anyway, since anchors aren't actually part of a match. If we fix that
|
||||||
|
// though, that currently reveals a different bug elsewhere: '(?-m:^)' isn't
|
||||||
|
// implemented correctly in multi-line search, because it defers to the fast
|
||||||
|
// line-by-line strategy, which ends up being wrong. I think the way forward
|
||||||
|
// there is to:
|
||||||
|
//
|
||||||
|
// 1) Adding something in the grep-matcher interface that exposes a way to
|
||||||
|
// query for \A and \z specifically. If they're in the pattern, then we can
|
||||||
|
// decide how to handle them.
|
||||||
|
//
|
||||||
|
// 2) Perhaps provide a way to "translate \A/\z to ^/$" for cases when
|
||||||
|
// multi-line search is not enabled.
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct LiteralSets {}
|
pub struct LiteralSets {}
|
||||||
|
|
||||||
|
@ -1,15 +1,21 @@
|
|||||||
use std::collections::HashMap;
|
use {
|
||||||
|
grep_matcher::{
|
||||||
use grep_matcher::{
|
ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher,
|
||||||
ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher, NoError,
|
NoError,
|
||||||
|
},
|
||||||
|
regex_automata::{
|
||||||
|
meta::Regex, util::captures::Captures as AutomataCaptures, Input,
|
||||||
|
PatternID,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
use regex::bytes::{CaptureLocations, Regex};
|
|
||||||
|
|
||||||
use crate::config::{Config, ConfiguredHIR};
|
use crate::{
|
||||||
use crate::crlf::CRLFMatcher;
|
config::{Config, ConfiguredHIR},
|
||||||
use crate::error::Error;
|
crlf::CRLFMatcher,
|
||||||
use crate::multi::MultiLiteralMatcher;
|
error::Error,
|
||||||
use crate::word::WordMatcher;
|
multi::MultiLiteralMatcher,
|
||||||
|
word::WordMatcher,
|
||||||
|
};
|
||||||
|
|
||||||
/// A builder for constructing a `Matcher` using regular expressions.
|
/// A builder for constructing a `Matcher` using regular expressions.
|
||||||
///
|
///
|
||||||
@ -73,6 +79,33 @@ impl RegexMatcherBuilder {
|
|||||||
&self,
|
&self,
|
||||||
literals: &[B],
|
literals: &[B],
|
||||||
) -> Result<RegexMatcher, Error> {
|
) -> Result<RegexMatcher, Error> {
|
||||||
|
// BREADCRUMBS: Ideally we would remove this method and just let the
|
||||||
|
// underlying regex engine handle this case. But... this is tricky.
|
||||||
|
// Part of the problem is that ripgrep escapes all patterns by the
|
||||||
|
// time the regex engine is constructed, which is necessary for PCRE2
|
||||||
|
// for example. So that logic would need to change so that we don't
|
||||||
|
// escape things first.
|
||||||
|
//
|
||||||
|
// If we adjusted that, then I think we could just build an HIR value
|
||||||
|
// directly from the literals, thus skipping the parser altogether.
|
||||||
|
//
|
||||||
|
// But that still requires using and keeping this method. But we could
|
||||||
|
// at least get rid of the MultiLiteral matcher since the regex engine
|
||||||
|
// should now handle that case.
|
||||||
|
//
|
||||||
|
// Getting rid of this method is trickier, unless we make multi-pattern
|
||||||
|
// support a first class concept. But I don't think I want to go down
|
||||||
|
// that path? That implies we still need to accept a single pattern
|
||||||
|
// everywhere, which in turn means ripgrep would be forced to join
|
||||||
|
// the literals together using | and escape meta characters. By that
|
||||||
|
// point, we've lost. So I do think we still need this special method.
|
||||||
|
// But we can at least simplify the implementation.
|
||||||
|
//
|
||||||
|
// I still wonder if "fast parse" is still a good idea though.
|
||||||
|
// Basically, reject all nesting except for single-depth alternation.
|
||||||
|
// And reject character classes and all options. Just basically
|
||||||
|
// support `foo|bar|..|quux`. Maybe skip this for now I think.
|
||||||
|
|
||||||
let mut has_escape = false;
|
let mut has_escape = false;
|
||||||
let mut slices = vec![];
|
let mut slices = vec![];
|
||||||
for lit in literals {
|
for lit in literals {
|
||||||
@ -430,10 +463,10 @@ impl RegexMatcherImpl {
|
|||||||
/// Return the underlying regex object used.
|
/// Return the underlying regex object used.
|
||||||
fn regex(&self) -> String {
|
fn regex(&self) -> String {
|
||||||
match *self {
|
match *self {
|
||||||
RegexMatcherImpl::Word(ref x) => x.regex().to_string(),
|
RegexMatcherImpl::Word(ref x) => x.pattern().to_string(),
|
||||||
RegexMatcherImpl::CRLF(ref x) => x.regex().to_string(),
|
RegexMatcherImpl::CRLF(ref x) => x.pattern().to_string(),
|
||||||
RegexMatcherImpl::MultiLiteral(_) => "<N/A>".to_string(),
|
RegexMatcherImpl::MultiLiteral(_) => "<N/A>".to_string(),
|
||||||
RegexMatcherImpl::Standard(ref x) => x.regex.to_string(),
|
RegexMatcherImpl::Standard(ref x) => x.pattern.clone(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -706,7 +739,10 @@ impl Matcher for RegexMatcher {
|
|||||||
) -> Result<Option<LineMatchKind>, NoError> {
|
) -> Result<Option<LineMatchKind>, NoError> {
|
||||||
Ok(match self.fast_line_regex {
|
Ok(match self.fast_line_regex {
|
||||||
Some(ref regex) => {
|
Some(ref regex) => {
|
||||||
regex.shortest_match(haystack).map(LineMatchKind::Candidate)
|
let input = Input::new(haystack);
|
||||||
|
regex
|
||||||
|
.search_half(&input)
|
||||||
|
.map(|hm| LineMatchKind::Candidate(hm.offset()))
|
||||||
}
|
}
|
||||||
None => {
|
None => {
|
||||||
self.shortest_match(haystack)?.map(LineMatchKind::Confirmed)
|
self.shortest_match(haystack)?.map(LineMatchKind::Confirmed)
|
||||||
@ -721,20 +757,15 @@ struct StandardMatcher {
|
|||||||
/// The regular expression compiled from the pattern provided by the
|
/// The regular expression compiled from the pattern provided by the
|
||||||
/// caller.
|
/// caller.
|
||||||
regex: Regex,
|
regex: Regex,
|
||||||
/// A map from capture group name to its corresponding index.
|
/// The underlying pattern string for the regex.
|
||||||
names: HashMap<String, usize>,
|
pattern: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl StandardMatcher {
|
impl StandardMatcher {
|
||||||
fn new(expr: &ConfiguredHIR) -> Result<StandardMatcher, Error> {
|
fn new(expr: &ConfiguredHIR) -> Result<StandardMatcher, Error> {
|
||||||
let regex = expr.regex()?;
|
let regex = expr.regex()?;
|
||||||
let mut names = HashMap::new();
|
let pattern = expr.pattern();
|
||||||
for (i, optional_name) in regex.capture_names().enumerate() {
|
Ok(StandardMatcher { regex, pattern })
|
||||||
if let Some(name) = optional_name {
|
|
||||||
names.insert(name.to_string(), i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(StandardMatcher { regex, names })
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -747,14 +778,12 @@ impl Matcher for StandardMatcher {
|
|||||||
haystack: &[u8],
|
haystack: &[u8],
|
||||||
at: usize,
|
at: usize,
|
||||||
) -> Result<Option<Match>, NoError> {
|
) -> Result<Option<Match>, NoError> {
|
||||||
Ok(self
|
let input = Input::new(haystack).span(at..haystack.len());
|
||||||
.regex
|
Ok(self.regex.find(input).map(|m| Match::new(m.start(), m.end())))
|
||||||
.find_at(haystack, at)
|
|
||||||
.map(|m| Match::new(m.start(), m.end())))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
||||||
Ok(RegexCaptures::new(self.regex.capture_locations()))
|
Ok(RegexCaptures::new(self.regex.create_captures()))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn capture_count(&self) -> usize {
|
fn capture_count(&self) -> usize {
|
||||||
@ -762,7 +791,7 @@ impl Matcher for StandardMatcher {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn capture_index(&self, name: &str) -> Option<usize> {
|
fn capture_index(&self, name: &str) -> Option<usize> {
|
||||||
self.names.get(name).map(|i| *i)
|
self.regex.group_info().to_index(PatternID::ZERO, name)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn try_find_iter<F, E>(
|
fn try_find_iter<F, E>(
|
||||||
@ -789,10 +818,10 @@ impl Matcher for StandardMatcher {
|
|||||||
at: usize,
|
at: usize,
|
||||||
caps: &mut RegexCaptures,
|
caps: &mut RegexCaptures,
|
||||||
) -> Result<bool, NoError> {
|
) -> Result<bool, NoError> {
|
||||||
Ok(self
|
let input = Input::new(haystack).span(at..haystack.len());
|
||||||
.regex
|
let caps = caps.locations_mut();
|
||||||
.captures_read_at(&mut caps.locations_mut(), haystack, at)
|
self.regex.search_captures(&input, caps);
|
||||||
.is_some())
|
Ok(caps.is_match())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn shortest_match_at(
|
fn shortest_match_at(
|
||||||
@ -800,7 +829,8 @@ impl Matcher for StandardMatcher {
|
|||||||
haystack: &[u8],
|
haystack: &[u8],
|
||||||
at: usize,
|
at: usize,
|
||||||
) -> Result<Option<usize>, NoError> {
|
) -> Result<Option<usize>, NoError> {
|
||||||
Ok(self.regex.shortest_match_at(haystack, at))
|
let input = Input::new(haystack).span(at..haystack.len());
|
||||||
|
Ok(self.regex.search_half(&input).map(|hm| hm.offset()))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -829,7 +859,7 @@ enum RegexCapturesImp {
|
|||||||
},
|
},
|
||||||
Regex {
|
Regex {
|
||||||
/// Where the locations are stored.
|
/// Where the locations are stored.
|
||||||
locs: CaptureLocations,
|
locs: AutomataCaptures,
|
||||||
/// These captures behave as if the capturing groups begin at the given
|
/// These captures behave as if the capturing groups begin at the given
|
||||||
/// offset. When set to `0`, this has no effect and capture groups are
|
/// offset. When set to `0`, this has no effect and capture groups are
|
||||||
/// indexed like normal.
|
/// indexed like normal.
|
||||||
@ -852,7 +882,7 @@ impl Captures for RegexCaptures {
|
|||||||
match self.0 {
|
match self.0 {
|
||||||
RegexCapturesImp::AhoCorasick { .. } => 1,
|
RegexCapturesImp::AhoCorasick { .. } => 1,
|
||||||
RegexCapturesImp::Regex { ref locs, offset, .. } => {
|
RegexCapturesImp::Regex { ref locs, offset, .. } => {
|
||||||
locs.len().checked_sub(offset).unwrap()
|
locs.group_info().all_group_len().checked_sub(offset).unwrap()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -869,20 +899,25 @@ impl Captures for RegexCaptures {
|
|||||||
RegexCapturesImp::Regex { ref locs, offset, strip_crlf } => {
|
RegexCapturesImp::Regex { ref locs, offset, strip_crlf } => {
|
||||||
if !strip_crlf {
|
if !strip_crlf {
|
||||||
let actual = i.checked_add(offset).unwrap();
|
let actual = i.checked_add(offset).unwrap();
|
||||||
return locs.pos(actual).map(|(s, e)| Match::new(s, e));
|
return locs
|
||||||
|
.get_group(actual)
|
||||||
|
.map(|sp| Match::new(sp.start, sp.end));
|
||||||
}
|
}
|
||||||
|
|
||||||
// currently don't support capture offsetting with CRLF
|
// currently don't support capture offsetting with CRLF
|
||||||
// stripping
|
// stripping
|
||||||
assert_eq!(offset, 0);
|
assert_eq!(offset, 0);
|
||||||
let m = match locs.pos(i).map(|(s, e)| Match::new(s, e)) {
|
let m = match locs
|
||||||
|
.get_group(i)
|
||||||
|
.map(|sp| Match::new(sp.start, sp.end))
|
||||||
|
{
|
||||||
None => return None,
|
None => return None,
|
||||||
Some(m) => m,
|
Some(m) => m,
|
||||||
};
|
};
|
||||||
// If the end position of this match corresponds to the end
|
// If the end position of this match corresponds to the end
|
||||||
// position of the overall match, then we apply our CRLF
|
// position of the overall match, then we apply our CRLF
|
||||||
// stripping. Otherwise, we cannot assume stripping is correct.
|
// stripping. Otherwise, we cannot assume stripping is correct.
|
||||||
if i == 0 || m.end() == locs.pos(0).unwrap().1 {
|
if i == 0 || m.end() == locs.get_group(0).unwrap().end {
|
||||||
Some(m.with_end(m.end() - 1))
|
Some(m.with_end(m.end() - 1))
|
||||||
} else {
|
} else {
|
||||||
Some(m)
|
Some(m)
|
||||||
@ -897,12 +932,12 @@ impl RegexCaptures {
|
|||||||
RegexCaptures(RegexCapturesImp::AhoCorasick { mat: None })
|
RegexCaptures(RegexCapturesImp::AhoCorasick { mat: None })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures {
|
pub(crate) fn new(locs: AutomataCaptures) -> RegexCaptures {
|
||||||
RegexCaptures::with_offset(locs, 0)
|
RegexCaptures::with_offset(locs, 0)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn with_offset(
|
pub(crate) fn with_offset(
|
||||||
locs: CaptureLocations,
|
locs: AutomataCaptures,
|
||||||
offset: usize,
|
offset: usize,
|
||||||
) -> RegexCaptures {
|
) -> RegexCaptures {
|
||||||
RegexCaptures(RegexCapturesImp::Regex {
|
RegexCaptures(RegexCapturesImp::Regex {
|
||||||
@ -912,7 +947,7 @@ impl RegexCaptures {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn locations(&self) -> &CaptureLocations {
|
pub(crate) fn locations(&self) -> &AutomataCaptures {
|
||||||
match self.0 {
|
match self.0 {
|
||||||
RegexCapturesImp::AhoCorasick { .. } => {
|
RegexCapturesImp::AhoCorasick { .. } => {
|
||||||
panic!("getting locations for simple captures is invalid")
|
panic!("getting locations for simple captures is invalid")
|
||||||
@ -921,7 +956,7 @@ impl RegexCaptures {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn locations_mut(&mut self) -> &mut CaptureLocations {
|
pub(crate) fn locations_mut(&mut self) -> &mut AutomataCaptures {
|
||||||
match self.0 {
|
match self.0 {
|
||||||
RegexCapturesImp::AhoCorasick { .. } => {
|
RegexCapturesImp::AhoCorasick { .. } => {
|
||||||
panic!("getting locations for simple captures is invalid")
|
panic!("getting locations for simple captures is invalid")
|
||||||
|
@ -26,7 +26,7 @@ impl MultiLiteralMatcher {
|
|||||||
let ac = AhoCorasick::builder()
|
let ac = AhoCorasick::builder()
|
||||||
.match_kind(MatchKind::LeftmostFirst)
|
.match_kind(MatchKind::LeftmostFirst)
|
||||||
.build(literals)
|
.build(literals)
|
||||||
.map_err(Error::regex)?;
|
.map_err(Error::generic)?;
|
||||||
Ok(MultiLiteralMatcher { ac })
|
Ok(MultiLiteralMatcher { ac })
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,7 +18,6 @@ pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
|
|||||||
fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
|
fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
|
||||||
match *expr.kind() {
|
match *expr.kind() {
|
||||||
HirKind::Empty
|
HirKind::Empty
|
||||||
// | HirKind::Look(Look::Start | Look::End)
|
|
||||||
| HirKind::Look(Look::WordAscii | Look::WordAsciiNegate)
|
| HirKind::Look(Look::WordAscii | Look::WordAsciiNegate)
|
||||||
| HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {}
|
| HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate) => {}
|
||||||
HirKind::Look(Look::Start | Look::End) => {
|
HirKind::Look(Look::Start | Look::End) => {
|
||||||
|
@ -1,27 +1,29 @@
|
|||||||
use std::cell::RefCell;
|
use std::{cell::RefCell, collections::HashMap, sync::Arc};
|
||||||
use std::collections::HashMap;
|
|
||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
use grep_matcher::{Match, Matcher, NoError};
|
use {
|
||||||
use regex::bytes::{CaptureLocations, Regex};
|
grep_matcher::{Match, Matcher, NoError},
|
||||||
use thread_local::ThreadLocal;
|
regex_automata::{
|
||||||
|
meta::Regex, util::captures::Captures, Input, PatternID,
|
||||||
|
},
|
||||||
|
thread_local::ThreadLocal,
|
||||||
|
};
|
||||||
|
|
||||||
use crate::config::ConfiguredHIR;
|
use crate::{config::ConfiguredHIR, error::Error, matcher::RegexCaptures};
|
||||||
use crate::error::Error;
|
|
||||||
use crate::matcher::RegexCaptures;
|
|
||||||
|
|
||||||
/// A matcher for implementing "word match" semantics.
|
/// A matcher for implementing "word match" semantics.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct WordMatcher {
|
pub struct WordMatcher {
|
||||||
/// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
|
/// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
|
||||||
regex: Regex,
|
regex: Regex,
|
||||||
|
/// The pattern string corresponding to the above regex.
|
||||||
|
pattern: String,
|
||||||
/// The original regex supplied by the user, which we use in a fast path
|
/// The original regex supplied by the user, which we use in a fast path
|
||||||
/// to try and detect matches before deferring to slower engines.
|
/// to try and detect matches before deferring to slower engines.
|
||||||
original: Regex,
|
original: Regex,
|
||||||
/// A map from capture group name to capture group index.
|
/// A map from capture group name to capture group index.
|
||||||
names: HashMap<String, usize>,
|
names: HashMap<String, usize>,
|
||||||
/// A reusable buffer for finding the match location of the inner group.
|
/// A reusable buffer for finding the match location of the inner group.
|
||||||
locs: Arc<ThreadLocal<RefCell<CaptureLocations>>>,
|
locs: Arc<ThreadLocal<RefCell<Captures>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Clone for WordMatcher {
|
impl Clone for WordMatcher {
|
||||||
@ -31,6 +33,7 @@ impl Clone for WordMatcher {
|
|||||||
// usings `locs` to hit the fast path.
|
// usings `locs` to hit the fast path.
|
||||||
WordMatcher {
|
WordMatcher {
|
||||||
regex: self.regex.clone(),
|
regex: self.regex.clone(),
|
||||||
|
pattern: self.pattern.clone(),
|
||||||
original: self.original.clone(),
|
original: self.original.clone(),
|
||||||
names: self.names.clone(),
|
names: self.names.clone(),
|
||||||
locs: Arc::new(ThreadLocal::new()),
|
locs: Arc::new(ThreadLocal::new()),
|
||||||
@ -53,20 +56,23 @@ impl WordMatcher {
|
|||||||
pat
|
pat
|
||||||
})?;
|
})?;
|
||||||
let regex = word_expr.regex()?;
|
let regex = word_expr.regex()?;
|
||||||
|
let pattern = word_expr.pattern();
|
||||||
let locs = Arc::new(ThreadLocal::new());
|
let locs = Arc::new(ThreadLocal::new());
|
||||||
|
|
||||||
let mut names = HashMap::new();
|
let mut names = HashMap::new();
|
||||||
for (i, optional_name) in regex.capture_names().enumerate() {
|
let it = regex.group_info().pattern_names(PatternID::ZERO);
|
||||||
|
for (i, optional_name) in it.enumerate() {
|
||||||
if let Some(name) = optional_name {
|
if let Some(name) = optional_name {
|
||||||
names.insert(name.to_string(), i.checked_sub(1).unwrap());
|
names.insert(name.to_string(), i.checked_sub(1).unwrap());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(WordMatcher { regex, original, names, locs })
|
Ok(WordMatcher { regex, pattern, original, names, locs })
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the underlying regex used by this matcher.
|
/// Return the underlying pattern string for the regex used by this
|
||||||
pub fn regex(&self) -> &Regex {
|
/// matcher.
|
||||||
&self.regex
|
pub fn pattern(&self) -> &str {
|
||||||
|
&self.pattern
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Attempt to do a fast confirmation of a word match that covers a subset
|
/// Attempt to do a fast confirmation of a word match that covers a subset
|
||||||
@ -102,7 +108,8 @@ impl WordMatcher {
|
|||||||
// The reason why we cannot handle the ^/$ cases here is because we
|
// The reason why we cannot handle the ^/$ cases here is because we
|
||||||
// can't assume anything about the original pattern. (Try commenting
|
// can't assume anything about the original pattern. (Try commenting
|
||||||
// out the checks for ^/$ below and run the tests to see examples.)
|
// out the checks for ^/$ below and run the tests to see examples.)
|
||||||
let mut cand = match self.regex.find_at(haystack, at) {
|
let input = Input::new(haystack).span(at..haystack.len());
|
||||||
|
let mut cand = match self.regex.find(input) {
|
||||||
None => return Ok(None),
|
None => return Ok(None),
|
||||||
Some(m) => Match::new(m.start(), m.end()),
|
Some(m) => Match::new(m.start(), m.end()),
|
||||||
};
|
};
|
||||||
@ -154,14 +161,15 @@ impl Matcher for WordMatcher {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let cell =
|
let cell =
|
||||||
self.locs.get_or(|| RefCell::new(self.regex.capture_locations()));
|
self.locs.get_or(|| RefCell::new(self.regex.create_captures()));
|
||||||
|
let input = Input::new(haystack).span(at..haystack.len());
|
||||||
let mut caps = cell.borrow_mut();
|
let mut caps = cell.borrow_mut();
|
||||||
self.regex.captures_read_at(&mut caps, haystack, at);
|
self.regex.search_captures(&input, &mut caps);
|
||||||
Ok(caps.get(1).map(|m| Match::new(m.0, m.1)))
|
Ok(caps.get_group(1).map(|sp| Match::new(sp.start, sp.end)))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
||||||
Ok(RegexCaptures::with_offset(self.regex.capture_locations(), 1))
|
Ok(RegexCaptures::with_offset(self.regex.create_captures(), 1))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn capture_count(&self) -> usize {
|
fn capture_count(&self) -> usize {
|
||||||
@ -178,9 +186,10 @@ impl Matcher for WordMatcher {
|
|||||||
at: usize,
|
at: usize,
|
||||||
caps: &mut RegexCaptures,
|
caps: &mut RegexCaptures,
|
||||||
) -> Result<bool, NoError> {
|
) -> Result<bool, NoError> {
|
||||||
let r =
|
let input = Input::new(haystack).span(at..haystack.len());
|
||||||
self.regex.captures_read_at(caps.locations_mut(), haystack, at);
|
let caps = caps.locations_mut();
|
||||||
Ok(r.is_some())
|
self.regex.search_captures(&input, caps);
|
||||||
|
Ok(caps.is_match())
|
||||||
}
|
}
|
||||||
|
|
||||||
// We specifically do not implement other methods like find_iter or
|
// We specifically do not implement other methods like find_iter or
|
||||||
|
Loading…
x
Reference in New Issue
Block a user