mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-08-03 05:32:04 -07:00
BREAKING: regex: finally remove CRLF hack
Now that Rust's regex crate finally supports a CRLF mode, we can remove this giant hack in ripgrep to enable it. (And assuredly did not work in all cases.) The way this works in the regex engine is actually subtly different than what ripgrep previously did. Namely, --crlf would previously treat either \r\n or \n as a line terminator. But now it treats \r\n, \n and \r as line terminators. In effect, it is implemented by treating \r and \n as line terminators, but ^ and $ will never match at a position between a \r and a \n. So basically this means that $ will end up matching in more cases than it might be intended too, but I don't expect this to be a big problem in practice. Note that passing --crlf to ripgrep and enabling CRLF mode in the regex via the `R` inline flag (e.g., `(?R:$)`) are subtly different. The `R` flag just controls the regex engine, but --crlf instructs all of ripgrep to use \r\n as a line terminator. There are likely some inconsistencies or corner cases that are wrong as a result of this cognitive dissonance, but we choose to leave well enough alone for now. Fixing this for real will probably require re-thinking how line terminators are handled in ripgrep. For example, one "problem" with how they're handled now is that ripgrep will re-insert its own line terminators when printing output instead of copying the input. This is maybe not so great and perhaps unexpected. (ripgrep probably can't get away with not inserting any line terminators. Users probably expect files that don't end with a line terminator whose last line matches to have a line terminator inserted.)
This commit is contained in:
@@ -11,7 +11,6 @@ use {
|
||||
|
||||
use crate::{
|
||||
config::{Config, ConfiguredHIR},
|
||||
crlf::CRLFMatcher,
|
||||
error::Error,
|
||||
multi::MultiLiteralMatcher,
|
||||
word::WordMatcher,
|
||||
@@ -428,11 +427,6 @@ enum RegexMatcherImpl {
|
||||
Standard(StandardMatcher),
|
||||
/// A matcher for an alternation of plain literals.
|
||||
MultiLiteral(MultiLiteralMatcher),
|
||||
/// A matcher that strips `\r` from the end of matches.
|
||||
///
|
||||
/// This is only used when the CRLF hack is enabled and the regex is line
|
||||
/// anchored at the end.
|
||||
CRLF(CRLFMatcher),
|
||||
/// A matcher that only matches at word boundaries. This transforms the
|
||||
/// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`.
|
||||
/// Because of this, the WordMatcher provides its own implementation of
|
||||
@@ -447,8 +441,6 @@ impl RegexMatcherImpl {
|
||||
fn new(expr: &ConfiguredHIR) -> Result<RegexMatcherImpl, Error> {
|
||||
if expr.config().word {
|
||||
Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?))
|
||||
} else if expr.needs_crlf_stripped() {
|
||||
Ok(RegexMatcherImpl::CRLF(CRLFMatcher::new(expr)?))
|
||||
} else {
|
||||
if let Some(lits) = expr.alternation_literals() {
|
||||
if lits.len() >= 40 {
|
||||
@@ -464,7 +456,6 @@ impl RegexMatcherImpl {
|
||||
fn regex(&self) -> String {
|
||||
match *self {
|
||||
RegexMatcherImpl::Word(ref x) => x.pattern().to_string(),
|
||||
RegexMatcherImpl::CRLF(ref x) => x.pattern().to_string(),
|
||||
RegexMatcherImpl::MultiLiteral(_) => "<N/A>".to_string(),
|
||||
RegexMatcherImpl::Standard(ref x) => x.pattern.clone(),
|
||||
}
|
||||
@@ -487,7 +478,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.find_at(haystack, at),
|
||||
MultiLiteral(ref m) => m.find_at(haystack, at),
|
||||
CRLF(ref m) => m.find_at(haystack, at),
|
||||
Word(ref m) => m.find_at(haystack, at),
|
||||
}
|
||||
}
|
||||
@@ -497,7 +487,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.new_captures(),
|
||||
MultiLiteral(ref m) => m.new_captures(),
|
||||
CRLF(ref m) => m.new_captures(),
|
||||
Word(ref m) => m.new_captures(),
|
||||
}
|
||||
}
|
||||
@@ -507,7 +496,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.capture_count(),
|
||||
MultiLiteral(ref m) => m.capture_count(),
|
||||
CRLF(ref m) => m.capture_count(),
|
||||
Word(ref m) => m.capture_count(),
|
||||
}
|
||||
}
|
||||
@@ -517,7 +505,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.capture_index(name),
|
||||
MultiLiteral(ref m) => m.capture_index(name),
|
||||
CRLF(ref m) => m.capture_index(name),
|
||||
Word(ref m) => m.capture_index(name),
|
||||
}
|
||||
}
|
||||
@@ -527,7 +514,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.find(haystack),
|
||||
MultiLiteral(ref m) => m.find(haystack),
|
||||
CRLF(ref m) => m.find(haystack),
|
||||
Word(ref m) => m.find(haystack),
|
||||
}
|
||||
}
|
||||
@@ -540,7 +526,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.find_iter(haystack, matched),
|
||||
MultiLiteral(ref m) => m.find_iter(haystack, matched),
|
||||
CRLF(ref m) => m.find_iter(haystack, matched),
|
||||
Word(ref m) => m.find_iter(haystack, matched),
|
||||
}
|
||||
}
|
||||
@@ -557,7 +542,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.try_find_iter(haystack, matched),
|
||||
MultiLiteral(ref m) => m.try_find_iter(haystack, matched),
|
||||
CRLF(ref m) => m.try_find_iter(haystack, matched),
|
||||
Word(ref m) => m.try_find_iter(haystack, matched),
|
||||
}
|
||||
}
|
||||
@@ -571,7 +555,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.captures(haystack, caps),
|
||||
MultiLiteral(ref m) => m.captures(haystack, caps),
|
||||
CRLF(ref m) => m.captures(haystack, caps),
|
||||
Word(ref m) => m.captures(haystack, caps),
|
||||
}
|
||||
}
|
||||
@@ -589,7 +572,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.captures_iter(haystack, caps, matched),
|
||||
MultiLiteral(ref m) => m.captures_iter(haystack, caps, matched),
|
||||
CRLF(ref m) => m.captures_iter(haystack, caps, matched),
|
||||
Word(ref m) => m.captures_iter(haystack, caps, matched),
|
||||
}
|
||||
}
|
||||
@@ -609,7 +591,6 @@ impl Matcher for RegexMatcher {
|
||||
MultiLiteral(ref m) => {
|
||||
m.try_captures_iter(haystack, caps, matched)
|
||||
}
|
||||
CRLF(ref m) => m.try_captures_iter(haystack, caps, matched),
|
||||
Word(ref m) => m.try_captures_iter(haystack, caps, matched),
|
||||
}
|
||||
}
|
||||
@@ -624,7 +605,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.captures_at(haystack, at, caps),
|
||||
MultiLiteral(ref m) => m.captures_at(haystack, at, caps),
|
||||
CRLF(ref m) => m.captures_at(haystack, at, caps),
|
||||
Word(ref m) => m.captures_at(haystack, at, caps),
|
||||
}
|
||||
}
|
||||
@@ -642,7 +622,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.replace(haystack, dst, append),
|
||||
MultiLiteral(ref m) => m.replace(haystack, dst, append),
|
||||
CRLF(ref m) => m.replace(haystack, dst, append),
|
||||
Word(ref m) => m.replace(haystack, dst, append),
|
||||
}
|
||||
}
|
||||
@@ -665,9 +644,6 @@ impl Matcher for RegexMatcher {
|
||||
MultiLiteral(ref m) => {
|
||||
m.replace_with_captures(haystack, caps, dst, append)
|
||||
}
|
||||
CRLF(ref m) => {
|
||||
m.replace_with_captures(haystack, caps, dst, append)
|
||||
}
|
||||
Word(ref m) => {
|
||||
m.replace_with_captures(haystack, caps, dst, append)
|
||||
}
|
||||
@@ -679,7 +655,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.is_match(haystack),
|
||||
MultiLiteral(ref m) => m.is_match(haystack),
|
||||
CRLF(ref m) => m.is_match(haystack),
|
||||
Word(ref m) => m.is_match(haystack),
|
||||
}
|
||||
}
|
||||
@@ -693,7 +668,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.is_match_at(haystack, at),
|
||||
MultiLiteral(ref m) => m.is_match_at(haystack, at),
|
||||
CRLF(ref m) => m.is_match_at(haystack, at),
|
||||
Word(ref m) => m.is_match_at(haystack, at),
|
||||
}
|
||||
}
|
||||
@@ -706,7 +680,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.shortest_match(haystack),
|
||||
MultiLiteral(ref m) => m.shortest_match(haystack),
|
||||
CRLF(ref m) => m.shortest_match(haystack),
|
||||
Word(ref m) => m.shortest_match(haystack),
|
||||
}
|
||||
}
|
||||
@@ -720,7 +693,6 @@ impl Matcher for RegexMatcher {
|
||||
match self.matcher {
|
||||
Standard(ref m) => m.shortest_match_at(haystack, at),
|
||||
MultiLiteral(ref m) => m.shortest_match_at(haystack, at),
|
||||
CRLF(ref m) => m.shortest_match_at(haystack, at),
|
||||
Word(ref m) => m.shortest_match_at(haystack, at),
|
||||
}
|
||||
}
|
||||
@@ -871,9 +843,6 @@ enum RegexCapturesImp {
|
||||
/// the matcher and the capturing groups must behave as if `(re)` is
|
||||
/// the `0`th capture group.
|
||||
offset: usize,
|
||||
/// When enable, the end of a match has `\r` stripped from it, if one
|
||||
/// exists.
|
||||
strip_crlf: bool,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -896,32 +865,9 @@ impl Captures for RegexCaptures {
|
||||
None
|
||||
}
|
||||
}
|
||||
RegexCapturesImp::Regex { ref locs, offset, strip_crlf } => {
|
||||
if !strip_crlf {
|
||||
let actual = i.checked_add(offset).unwrap();
|
||||
return locs
|
||||
.get_group(actual)
|
||||
.map(|sp| Match::new(sp.start, sp.end));
|
||||
}
|
||||
|
||||
// currently don't support capture offsetting with CRLF
|
||||
// stripping
|
||||
assert_eq!(offset, 0);
|
||||
let m = match locs
|
||||
.get_group(i)
|
||||
.map(|sp| Match::new(sp.start, sp.end))
|
||||
{
|
||||
None => return None,
|
||||
Some(m) => m,
|
||||
};
|
||||
// If the end position of this match corresponds to the end
|
||||
// position of the overall match, then we apply our CRLF
|
||||
// stripping. Otherwise, we cannot assume stripping is correct.
|
||||
if i == 0 || m.end() == locs.get_group(0).unwrap().end {
|
||||
Some(m.with_end(m.end() - 1))
|
||||
} else {
|
||||
Some(m)
|
||||
}
|
||||
RegexCapturesImp::Regex { ref locs, offset } => {
|
||||
let actual = i.checked_add(offset).unwrap();
|
||||
locs.get_group(actual).map(|sp| Match::new(sp.start, sp.end))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -940,20 +886,7 @@ impl RegexCaptures {
|
||||
locs: AutomataCaptures,
|
||||
offset: usize,
|
||||
) -> RegexCaptures {
|
||||
RegexCaptures(RegexCapturesImp::Regex {
|
||||
locs,
|
||||
offset,
|
||||
strip_crlf: false,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn locations(&self) -> &AutomataCaptures {
|
||||
match self.0 {
|
||||
RegexCapturesImp::AhoCorasick { .. } => {
|
||||
panic!("getting locations for simple captures is invalid")
|
||||
}
|
||||
RegexCapturesImp::Regex { ref locs, .. } => locs,
|
||||
}
|
||||
RegexCaptures(RegexCapturesImp::Regex { locs, offset })
|
||||
}
|
||||
|
||||
pub(crate) fn locations_mut(&mut self) -> &mut AutomataCaptures {
|
||||
@@ -965,17 +898,6 @@ impl RegexCaptures {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn strip_crlf(&mut self, yes: bool) {
|
||||
match self.0 {
|
||||
RegexCapturesImp::AhoCorasick { .. } => {
|
||||
panic!("setting strip_crlf for simple captures is invalid")
|
||||
}
|
||||
RegexCapturesImp::Regex { ref mut strip_crlf, .. } => {
|
||||
*strip_crlf = yes;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn set_simple(&mut self, one: Option<Match>) {
|
||||
match self.0 {
|
||||
RegexCapturesImp::AhoCorasick { ref mut mat } => {
|
||||
|
Reference in New Issue
Block a user