diff --git a/CHANGELOG.md b/CHANGELOG.md index 900162bf..6ba9bd56 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,13 @@ discussion on this. Previously, every line in a match was duplicated, even when it spanned multiple lines. There are no changes to vimgrep output when multi-line mode is disabled. +**In multi-line mode, --count is now equivalent to --count-matches.** + +This appears to match how `pcre2grep` implements `--count`. Previously, ripgrep +would produce outright incorrect counts. Another alternative would be to simply +count the number of lines---even if it's more than the number of matches---but +that seems highly unintuitive. + Security fixes: * [CVE-2021-3013](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-3013): @@ -64,6 +71,8 @@ Bug fixes: Document cygwin path translation behavior in the FAQ. * [BUG #1311](https://github.com/BurntSushi/ripgrep/issues/1311): Fix multi-line bug where a search & replace for `\n` didn't work as expected. +* [BUG #1412](https://github.com/BurntSushi/ripgrep/issues/1412): + Fix multi-line bug with searches using look-around past matching lines. * [BUG #1642](https://github.com/BurntSushi/ripgrep/issues/1642): Fixes a bug where using `-m` and `-A` printed more matches than the limit. * [BUG #1703](https://github.com/BurntSushi/ripgrep/issues/1703): diff --git a/crates/core/app.rs b/crates/core/app.rs index 5058c0ea..96801194 100644 --- a/crates/core/app.rs +++ b/crates/core/app.rs @@ -1057,11 +1057,13 @@ fn flag_count(args: &mut Vec) { This flag suppresses normal output and shows the number of lines that match the given patterns for each file searched. Each file containing a match has its path and count printed on each line. Note that this reports the number of lines -that match and not the total number of matches. +that match and not the total number of matches, unless -U/--multiline is +enabled. In multiline mode, --count is equivalent to --count-matches. 
If only one file is given to ripgrep, then only the count is printed if there is a match. The --with-filename flag can be used to force printing the file -path in this case. +path in this case. If you need a count to be printed regardless of whether +there is a match, then use --include-zero. This overrides the --count-matches flag. Note that when --count is combined with --only-matching, then ripgrep behaves as if --count-matches was given. diff --git a/crates/matcher/src/lib.rs b/crates/matcher/src/lib.rs index 2bcd0c12..4859de39 100644 --- a/crates/matcher/src/lib.rs +++ b/crates/matcher/src/lib.rs @@ -618,12 +618,31 @@ pub trait Matcher { fn find_iter( &self, haystack: &[u8], + matched: F, + ) -> Result<(), Self::Error> + where + F: FnMut(Match) -> bool, + { + self.find_iter_at(haystack, 0, matched) + } + + /// Executes the given function over successive non-overlapping matches + /// in `haystack`. If no match exists, then the given function is never + /// called. If the function returns `false`, then iteration stops. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `at == 0`. + fn find_iter_at( + &self, + haystack: &[u8], + at: usize, mut matched: F, ) -> Result<(), Self::Error> where F: FnMut(Match) -> bool, { - self.try_find_iter(haystack, |m| Ok(matched(m))) + self.try_find_iter_at(haystack, at, |m| Ok(matched(m))) .map(|r: Result<(), ()>| r.unwrap()) } @@ -637,12 +656,35 @@ pub trait Matcher { fn try_find_iter( &self, haystack: &[u8], + matched: F, + ) -> Result, Self::Error> + where + F: FnMut(Match) -> Result, + { + self.try_find_iter_at(haystack, 0, matched) + } + + /// Executes the given function over successive non-overlapping matches + /// in `haystack`. If no match exists, then the given function is never + /// called. If the function returns `false`, then iteration stops. 
+ /// Similarly, if the function returns an error then iteration stops and + /// the error is yielded. If an error occurs while executing the search, + /// then it is converted to + /// `E`. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `at == 0`. + fn try_find_iter_at( + &self, + haystack: &[u8], + at: usize, mut matched: F, ) -> Result, Self::Error> where F: FnMut(Match) -> Result, { - let mut last_end = 0; + let mut last_end = at; let mut last_match = None; loop { @@ -696,12 +738,33 @@ pub trait Matcher { &self, haystack: &[u8], caps: &mut Self::Captures, + matched: F, + ) -> Result<(), Self::Error> + where + F: FnMut(&Self::Captures) -> bool, + { + self.captures_iter_at(haystack, 0, caps, matched) + } + + /// Executes the given function over successive non-overlapping matches + /// in `haystack` with capture groups extracted from each match. If no + /// match exists, then the given function is never called. If the function + /// returns `false`, then iteration stops. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `at == 0`. 
+ fn captures_iter_at( + &self, + haystack: &[u8], + at: usize, + caps: &mut Self::Captures, mut matched: F, ) -> Result<(), Self::Error> where F: FnMut(&Self::Captures) -> bool, { - self.try_captures_iter(haystack, caps, |caps| Ok(matched(caps))) + self.try_captures_iter_at(haystack, at, caps, |caps| Ok(matched(caps))) .map(|r: Result<(), ()>| r.unwrap()) } @@ -716,12 +779,36 @@ pub trait Matcher { &self, haystack: &[u8], caps: &mut Self::Captures, + matched: F, + ) -> Result, Self::Error> + where + F: FnMut(&Self::Captures) -> Result, + { + self.try_captures_iter_at(haystack, 0, caps, matched) + } + + /// Executes the given function over successive non-overlapping matches + /// in `haystack` with capture groups extracted from each match. If no + /// match exists, then the given function is never called. If the function + /// returns `false`, then iteration stops. Similarly, if the function + /// returns an error then iteration stops and the error is yielded. If + /// an error occurs while executing the search, then it is converted to + /// `E`. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `at == 0`. + fn try_captures_iter_at( + &self, + haystack: &[u8], + at: usize, + caps: &mut Self::Captures, mut matched: F, ) -> Result, Self::Error> where F: FnMut(&Self::Captures) -> Result, { - let mut last_end = 0; + let mut last_end = at; let mut last_match = None; loop { @@ -819,13 +906,35 @@ pub trait Matcher { haystack: &[u8], caps: &mut Self::Captures, dst: &mut Vec, + append: F, + ) -> Result<(), Self::Error> + where + F: FnMut(&Self::Captures, &mut Vec) -> bool, + { + self.replace_with_captures_at(haystack, 0, caps, dst, append) + } + + /// Replaces every match in the given haystack with the result of calling + /// `append` with the matching capture groups. + /// + /// If the given `append` function returns `false`, then replacement stops. 
+ /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `at == 0`. + fn replace_with_captures_at( + &self, + haystack: &[u8], + at: usize, + caps: &mut Self::Captures, + dst: &mut Vec, mut append: F, ) -> Result<(), Self::Error> where F: FnMut(&Self::Captures, &mut Vec) -> bool, { - let mut last_match = 0; - self.captures_iter(haystack, caps, |caps| { + let mut last_match = at; + self.captures_iter_at(haystack, at, caps, |caps| { let m = caps.get(0).unwrap(); dst.extend(&haystack[last_match..m.start]); last_match = m.end; @@ -1039,6 +1148,18 @@ impl<'a, M: Matcher> Matcher for &'a M { (*self).find_iter(haystack, matched) } + fn find_iter_at( + &self, + haystack: &[u8], + at: usize, + matched: F, + ) -> Result<(), Self::Error> + where + F: FnMut(Match) -> bool, + { + (*self).find_iter_at(haystack, at, matched) + } + fn try_find_iter( &self, haystack: &[u8], @@ -1050,6 +1171,18 @@ impl<'a, M: Matcher> Matcher for &'a M { (*self).try_find_iter(haystack, matched) } + fn try_find_iter_at( + &self, + haystack: &[u8], + at: usize, + matched: F, + ) -> Result, Self::Error> + where + F: FnMut(Match) -> Result, + { + (*self).try_find_iter_at(haystack, at, matched) + } + fn captures( &self, haystack: &[u8], @@ -1070,6 +1203,19 @@ impl<'a, M: Matcher> Matcher for &'a M { (*self).captures_iter(haystack, caps, matched) } + fn captures_iter_at( + &self, + haystack: &[u8], + at: usize, + caps: &mut Self::Captures, + matched: F, + ) -> Result<(), Self::Error> + where + F: FnMut(&Self::Captures) -> bool, + { + (*self).captures_iter_at(haystack, at, caps, matched) + } + fn try_captures_iter( &self, haystack: &[u8], @@ -1082,6 +1228,19 @@ impl<'a, M: Matcher> Matcher for &'a M { (*self).try_captures_iter(haystack, caps, matched) } + fn try_captures_iter_at( + &self, + haystack: &[u8], + at: usize, + caps: &mut Self::Captures, + matched: F, + ) -> Result, Self::Error> + 
where + F: FnMut(&Self::Captures) -> Result, + { + (*self).try_captures_iter_at(haystack, at, caps, matched) + } + fn replace( &self, haystack: &[u8], @@ -1107,6 +1266,20 @@ impl<'a, M: Matcher> Matcher for &'a M { (*self).replace_with_captures(haystack, caps, dst, append) } + fn replace_with_captures_at( + &self, + haystack: &[u8], + at: usize, + caps: &mut Self::Captures, + dst: &mut Vec, + append: F, + ) -> Result<(), Self::Error> + where + F: FnMut(&Self::Captures, &mut Vec) -> bool, + { + (*self).replace_with_captures_at(haystack, at, caps, dst, append) + } + fn is_match(&self, haystack: &[u8]) -> Result { (*self).is_match(haystack) } diff --git a/crates/printer/src/json.rs b/crates/printer/src/json.rs index b1a6fa2e..8500e6a1 100644 --- a/crates/printer/src/json.rs +++ b/crates/printer/src/json.rs @@ -4,14 +4,14 @@ use std::time::Instant; use grep_matcher::{Match, Matcher}; use grep_searcher::{ - Searcher, Sink, SinkContext, SinkContextKind, SinkError, SinkFinish, - SinkMatch, + Searcher, Sink, SinkContext, SinkContextKind, SinkFinish, SinkMatch, }; use serde_json as json; use counter::CounterWriter; use jsont; use stats::Stats; +use util::find_iter_at_in_context; /// The configuration for the JSON printer. /// @@ -603,7 +603,12 @@ impl<'p, 's, M: Matcher, W: io::Write> JSONSink<'p, 's, M, W> { /// Execute the matcher over the given bytes and record the match /// locations if the current configuration demands match granularity. - fn record_matches(&mut self, bytes: &[u8]) -> io::Result<()> { + fn record_matches( + &mut self, + searcher: &Searcher, + bytes: &[u8], + range: std::ops::Range, + ) -> io::Result<()> { self.json.matches.clear(); // If printing requires knowing the location of each individual match, // then compute and stored those right now for use later. While this @@ -612,12 +617,17 @@ impl<'p, 's, M: Matcher, W: io::Write> JSONSink<'p, 's, M, W> { // the extent that it's easy to ensure that we never do more than // one search to find the matches. 
let matches = &mut self.json.matches; - self.matcher - .find_iter(bytes, |m| { - matches.push(m); + find_iter_at_in_context( + searcher, + &self.matcher, + bytes, + range.clone(), + |m| { + let (s, e) = (m.start() - range.start, m.end() - range.start); + matches.push(Match::new(s, e)); true - }) - .map_err(io::Error::error_message)?; + }, + )?; // Don't report empty matches appearing at the end of the bytes. if !matches.is_empty() && matches.last().unwrap().is_empty() @@ -691,7 +701,11 @@ impl<'p, 's, M: Matcher, W: io::Write> Sink for JSONSink<'p, 's, M, W> { self.after_context_remaining = searcher.after_context() as u64; } - self.record_matches(mat.bytes())?; + self.record_matches( + searcher, + mat.buffer(), + mat.bytes_range_in_buffer(), + )?; self.stats.add_matches(self.json.matches.len() as u64); self.stats.add_matched_lines(mat.lines().count() as u64); @@ -720,7 +734,7 @@ impl<'p, 's, M: Matcher, W: io::Write> Sink for JSONSink<'p, 's, M, W> { self.after_context_remaining.saturating_sub(1); } let submatches = if searcher.invert_match() { - self.record_matches(ctx.bytes())?; + self.record_matches(searcher, ctx.bytes(), 0..ctx.bytes().len())?; SubMatches::new(ctx.bytes(), &self.json.matches) } else { SubMatches::empty() diff --git a/crates/printer/src/lib.rs b/crates/printer/src/lib.rs index fa321dcd..abb1087f 100644 --- a/crates/printer/src/lib.rs +++ b/crates/printer/src/lib.rs @@ -92,6 +92,17 @@ pub use stats::Stats; pub use summary::{Summary, SummaryBuilder, SummaryKind, SummarySink}; pub use util::PrinterPath; +// The maximum number of bytes to execute a search to account for look-ahead. +// +// This is an unfortunate kludge since PCRE2 doesn't provide a way to search +// a substring of some input while accounting for look-ahead. In theory, we +// could refactor the various 'grep' interfaces to account for it, but it would +// be a large change. 
So for now, we just let PCRE2 go looking a bit for a +// match without searching the entire rest of the contents. +// +// Note that this kludge is only active in multi-line mode. +const MAX_LOOK_AHEAD: usize = 128; + #[macro_use] mod macros; diff --git a/crates/printer/src/standard.rs b/crates/printer/src/standard.rs index e7fe7c38..0c853f1b 100644 --- a/crates/printer/src/standard.rs +++ b/crates/printer/src/standard.rs @@ -8,15 +8,17 @@ use std::time::Instant; use bstr::ByteSlice; use grep_matcher::{Match, Matcher}; use grep_searcher::{ - LineStep, Searcher, Sink, SinkContext, SinkContextKind, SinkError, - SinkFinish, SinkMatch, + LineStep, Searcher, Sink, SinkContext, SinkContextKind, SinkFinish, + SinkMatch, }; use termcolor::{ColorSpec, NoColor, WriteColor}; use color::ColorSpecs; use counter::CounterWriter; use stats::Stats; -use util::{trim_ascii_prefix, PrinterPath, Replacer, Sunk}; +use util::{ + find_iter_at_in_context, trim_ascii_prefix, PrinterPath, Replacer, Sunk, +}; /// The configuration for the standard printer. /// @@ -682,7 +684,12 @@ impl<'p, 's, M: Matcher, W: WriteColor> StandardSink<'p, 's, M, W> { /// Execute the matcher over the given bytes and record the match /// locations if the current configuration demands match granularity. - fn record_matches(&mut self, bytes: &[u8]) -> io::Result<()> { + fn record_matches( + &mut self, + searcher: &Searcher, + bytes: &[u8], + range: std::ops::Range, + ) -> io::Result<()> { self.standard.matches.clear(); if !self.needs_match_granularity { return Ok(()); @@ -695,16 +702,21 @@ impl<'p, 's, M: Matcher, W: WriteColor> StandardSink<'p, 's, M, W> { // one search to find the matches (well, for replacements, we do one // additional search to perform the actual replacement). 
let matches = &mut self.standard.matches; - self.matcher - .find_iter(bytes, |m| { - matches.push(m); + find_iter_at_in_context( + searcher, + &self.matcher, + bytes, + range.clone(), + |m| { + let (s, e) = (m.start() - range.start, m.end() - range.start); + matches.push(Match::new(s, e)); true - }) - .map_err(io::Error::error_message)?; + }, + )?; // Don't report empty matches appearing at the end of the bytes. if !matches.is_empty() && matches.last().unwrap().is_empty() - && matches.last().unwrap().start() >= bytes.len() + && matches.last().unwrap().start() >= range.end { matches.pop().unwrap(); } @@ -715,14 +727,25 @@ impl<'p, 's, M: Matcher, W: WriteColor> StandardSink<'p, 's, M, W> { /// replacement, lazily allocating memory if necessary. /// /// To access the result of a replacement, use `replacer.replacement()`. - fn replace(&mut self, bytes: &[u8]) -> io::Result<()> { + fn replace( + &mut self, + searcher: &Searcher, + bytes: &[u8], + range: std::ops::Range, + ) -> io::Result<()> { self.replacer.clear(); if self.standard.config.replacement.is_some() { let replacement = (*self.standard.config.replacement) .as_ref() .map(|r| &*r) .unwrap(); - self.replacer.replace_all(&self.matcher, bytes, replacement)?; + self.replacer.replace_all( + searcher, + &self.matcher, + bytes, + range, + replacement, + )?; } Ok(()) } @@ -777,8 +800,12 @@ impl<'p, 's, M: Matcher, W: WriteColor> Sink for StandardSink<'p, 's, M, W> { self.after_context_remaining = searcher.after_context() as u64; } - self.record_matches(mat.bytes())?; - self.replace(mat.bytes())?; + self.record_matches( + searcher, + mat.buffer(), + mat.bytes_range_in_buffer(), + )?; + self.replace(searcher, mat.buffer(), mat.bytes_range_in_buffer())?; if let Some(ref mut stats) = self.stats { stats.add_matches(self.standard.matches.len() as u64); @@ -807,8 +834,8 @@ impl<'p, 's, M: Matcher, W: WriteColor> Sink for StandardSink<'p, 's, M, W> { self.after_context_remaining.saturating_sub(1); } if searcher.invert_match() 
{ - self.record_matches(ctx.bytes())?; - self.replace(ctx.bytes())?; + self.record_matches(searcher, ctx.bytes(), 0..ctx.bytes().len())?; + self.replace(searcher, ctx.bytes(), 0..ctx.bytes().len())?; } if searcher.binary_detection().convert_byte().is_some() { if self.binary_byte_offset.is_some() { diff --git a/crates/printer/src/summary.rs b/crates/printer/src/summary.rs index 5ce087ef..e062662d 100644 --- a/crates/printer/src/summary.rs +++ b/crates/printer/src/summary.rs @@ -11,7 +11,7 @@ use termcolor::{ColorSpec, NoColor, WriteColor}; use color::ColorSpecs; use counter::CounterWriter; use stats::Stats; -use util::PrinterPath; +use util::{find_iter_at_in_context, PrinterPath}; /// The configuration for the summary printer. /// @@ -504,6 +504,17 @@ impl<'p, 's, M: Matcher, W: WriteColor> SummarySink<'p, 's, M, W> { self.stats.as_ref() } + /// Returns true if and only if the searcher may report matches over + /// multiple lines. + /// + /// Note that this doesn't just return whether the searcher is in multi + /// line mode, but also checks if the matcher can match over multiple lines. + /// If it can't, then we don't need multi line handling, even if the + /// searcher has multi line mode enabled. + fn multi_line(&self, searcher: &Searcher) -> bool { + searcher.multi_line_with_matcher(&self.matcher) + } + /// Returns true if this printer should quit. 
/// /// This implements the logic for handling quitting after seeing a certain @@ -579,32 +590,39 @@ impl<'p, 's, M: Matcher, W: WriteColor> Sink for SummarySink<'p, 's, M, W> { fn matched( &mut self, - _searcher: &Searcher, + searcher: &Searcher, mat: &SinkMatch, ) -> Result { - self.match_count += 1; - if let Some(ref mut stats) = self.stats { - let mut match_count = 0; - self.matcher - .find_iter(mat.bytes(), |_| { - match_count += 1; + let is_multi_line = self.multi_line(searcher); + let sink_match_count = if self.stats.is_none() && !is_multi_line { + 1 + } else { + // This gives us as many bytes as the searcher can offer. This + // isn't guaranteed to hold the necessary context to get match + // detection correct (because of look-around), but it does in + // practice. + let buf = mat.buffer(); + let range = mat.bytes_range_in_buffer(); + let mut count = 0; + find_iter_at_in_context( + searcher, + &self.matcher, + buf, + range, + |_| { + count += 1; true - }) - .map_err(io::Error::error_message)?; - if match_count == 0 { - // It is possible for the match count to be zero when - // look-around is used. Since `SinkMatch` won't necessarily - // contain the look-around in its match span, the search here - // could fail to find anything. - // - // It seems likely that setting match_count=1 here is probably - // wrong in some cases, but I don't think we can do any - // better. (Because this printer cannot assume that subsequent - // contents have been loaded into memory, so we have no way of - // increasing the search span here.) 
- match_count = 1; - } - stats.add_matches(match_count); + }, + )?; + count + }; + if is_multi_line { + self.match_count += sink_match_count; + } else { + self.match_count += 1; + } + if let Some(ref mut stats) = self.stats { + stats.add_matches(sink_match_count); stats.add_matched_lines(mat.lines().count() as u64); } else if self.summary.config.kind.quit_early() { return Ok(false); diff --git a/crates/printer/src/util.rs b/crates/printer/src/util.rs index 3948d970..37e56529 100644 --- a/crates/printer/src/util.rs +++ b/crates/printer/src/util.rs @@ -7,11 +7,13 @@ use std::time; use bstr::{ByteSlice, ByteVec}; use grep_matcher::{Captures, LineTerminator, Match, Matcher}; use grep_searcher::{ - LineIter, SinkContext, SinkContextKind, SinkError, SinkMatch, + LineIter, Searcher, SinkContext, SinkContextKind, SinkError, SinkMatch, }; #[cfg(feature = "serde1")] use serde::{Serialize, Serializer}; +use MAX_LOOK_AHEAD; + /// A type for handling replacements while amortizing allocation. pub struct Replacer { space: Option>, @@ -52,10 +54,22 @@ impl Replacer { /// This can fail if the underlying matcher reports an error. pub fn replace_all<'a>( &'a mut self, + searcher: &Searcher, matcher: &M, - subject: &[u8], + mut subject: &[u8], + range: std::ops::Range, replacement: &[u8], ) -> io::Result<()> { + // See the giant comment in 'find_iter_at_in_context' below for why we + // do this dance. 
+ let is_multi_line = searcher.multi_line_with_matcher(&matcher); + if is_multi_line { + if subject[range.end..].len() >= MAX_LOOK_AHEAD { + subject = &subject[..range.end + MAX_LOOK_AHEAD]; + } + } else { + subject = &subject[..range.end]; + } { let &mut Space { ref mut dst, ref mut caps, ref mut matches } = self.allocate(matcher)?; @@ -63,18 +77,24 @@ impl Replacer { matches.clear(); matcher - .replace_with_captures(subject, caps, dst, |caps, dst| { - let start = dst.len(); - caps.interpolate( - |name| matcher.capture_index(name), - subject, - replacement, - dst, - ); - let end = dst.len(); - matches.push(Match::new(start, end)); - true - }) + .replace_with_captures_at( + subject, + range.start, + caps, + dst, + |caps, dst| { + let start = dst.len(); + caps.interpolate( + |name| matcher.capture_index(name), + subject, + replacement, + dst, + ); + let end = dst.len(); + matches.push(Match::new(start, end)); + true + }, + ) .map_err(io::Error::error_message)?; } Ok(()) @@ -357,3 +377,55 @@ pub fn trim_ascii_prefix( .count(); range.with_start(range.start() + count) } + +pub fn find_iter_at_in_context( + searcher: &Searcher, + matcher: M, + mut bytes: &[u8], + range: std::ops::Range, + mut matched: F, +) -> io::Result<()> +where + M: Matcher, + F: FnMut(Match) -> bool, +{ + // This strange dance is to account for the possibility of look-ahead in + // the regex. The problem here is that mat.bytes() doesn't include the + // lines beyond the match boundaries in multi-line mode, which means that + // when we try to rediscover the full set of matches here, the regex may no + // longer match if it required some look-ahead beyond the matching lines. + // + // PCRE2 (and the grep-matcher interfaces) has no way of specifying an end + // bound of the search. So we kludge it and let the regex engine search the + // rest of the buffer... But to avoid things getting too crazy, we cap the + // buffer. + // + // If it weren't for multi-line mode, then none of this would be needed. 
+ // Alternatively, if we refactored the grep interfaces to pass along the + // full set of matches (if available) from the searcher, then that might + // also help here. But that winds up paying an upfront unavoidable cost for + // the case where matches don't need to be counted. So then you'd have to + // introduce a way to pass along matches conditionally, only when needed. + // Yikes. + // + // Maybe the bigger picture thing here is that the searcher should be + // responsible for finding matches when necessary, and the printer + // shouldn't be involved in this business in the first place. Sigh. Live + // and learn. Abstraction boundaries are hard. + let is_multi_line = searcher.multi_line_with_matcher(&matcher); + if is_multi_line { + if bytes[range.end..].len() >= MAX_LOOK_AHEAD { + bytes = &bytes[..range.end + MAX_LOOK_AHEAD]; + } + } else { + bytes = &bytes[..range.end]; + } + matcher + .find_iter_at(bytes, range.start, |m| { + if m.start() >= range.end { + return false; + } + matched(m) + }) + .map_err(io::Error::error_message) +} diff --git a/crates/regex/src/word.rs b/crates/regex/src/word.rs index 8f744516..1a75ba48 100644 --- a/crates/regex/src/word.rs +++ b/crates/regex/src/word.rs @@ -48,7 +48,7 @@ impl WordMatcher { let original = expr.with_pattern(|pat| format!("^(?:{})$", pat))?.regex()?; let word_expr = expr.with_pattern(|pat| { - let pat = format!(r"(?:(?-m:^)|\W)({})(?:(?-m:$)|\W)", pat); + let pat = format!(r"(?:(?m:^)|\W)({})(?:\W|(?m:$))", pat); debug!("word regex: {:?}", pat); pat })?; @@ -237,6 +237,8 @@ mod tests { assert_eq!(Some((2, 5)), find(r"!?foo!?", "a!foo!a")); assert_eq!(Some((2, 7)), find(r"!?foo!?", "##!foo!\n")); + assert_eq!(Some((3, 8)), find(r"!?foo!?", "##\n!foo!##")); + assert_eq!(Some((3, 8)), find(r"!?foo!?", "##\n!foo!\n##")); assert_eq!(Some((3, 7)), find(r"f?oo!?", "##\nfoo!##")); assert_eq!(Some((2, 5)), find(r"(?-u)foo[^a]*", "#!foo☃aaa")); } diff --git a/crates/searcher/src/searcher/core.rs 
b/crates/searcher/src/searcher/core.rs index fe4254ea..21c29303 100644 --- a/crates/searcher/src/searcher/core.rs +++ b/crates/searcher/src/searcher/core.rs @@ -441,6 +441,8 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { bytes: linebuf, absolute_byte_offset: offset, line_number: self.line_number, + buffer: buf, + bytes_range_in_buffer: range.start()..range.end(), }, )?; if !keepgoing { diff --git a/crates/searcher/src/sink.rs b/crates/searcher/src/sink.rs index 750aefbe..f82ffda7 100644 --- a/crates/searcher/src/sink.rs +++ b/crates/searcher/src/sink.rs @@ -365,6 +365,8 @@ pub struct SinkMatch<'b> { pub(crate) bytes: &'b [u8], pub(crate) absolute_byte_offset: u64, pub(crate) line_number: Option, + pub(crate) buffer: &'b [u8], + pub(crate) bytes_range_in_buffer: std::ops::Range, } impl<'b> SinkMatch<'b> { @@ -405,6 +407,18 @@ impl<'b> SinkMatch<'b> { pub fn line_number(&self) -> Option { self.line_number } + + /// Returns the searcher buffer that the bytes of this match were taken from. + #[inline] + pub fn buffer(&self) -> &'b [u8] { + self.buffer + } + + /// Returns the range within `buffer()` that holds the bytes of this match. + #[inline] + pub fn bytes_range_in_buffer(&self) -> std::ops::Range { + self.bytes_range_in_buffer.clone() + } } /// The type of context reported by a searcher. diff --git a/tests/json.rs b/tests/json.rs index 97d8e719..ff0b5aae 100644 --- a/tests/json.rs +++ b/tests/json.rs @@ -339,3 +339,22 @@ rgtest!(r1095_crlf_empty_match, |dir: Dir, mut cmd: TestCommand| { assert_eq!(m.submatches[0].m, Data::text("\n")); assert_eq!(m.submatches[1].m, Data::text("\n")); }); + +// See: https://github.com/BurntSushi/ripgrep/issues/1412 +rgtest!(r1412_look_behind_match_missing, |dir: Dir, mut cmd: TestCommand| { + // Only PCRE2 supports look-around. 
+ if !dir.is_pcre2() { + return; + } + + dir.create("test", "foo\nbar\n"); + + let msgs = json_decode( + &cmd.arg("-U").arg("--json").arg(r"(?<=foo\n)bar").stdout(), + ); + assert_eq!(msgs.len(), 4); + + let m = msgs[1].unwrap_match(); + assert_eq!(m.lines, Data::text("bar\n")); + assert_eq!(m.submatches.len(), 1); +}); diff --git a/tests/regression.rs b/tests/regression.rs index 94e62969..d110c99c 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -806,6 +806,19 @@ rgtest!(r1389_bad_symlinks_no_biscuit, |dir: Dir, mut cmd: TestCommand| { eqnice!("mylink/file.txt:test\n", stdout); }); +// printf "foo\nbar\n" | rg -PU '(?<=foo\n)bar' -r quux +// See: https://github.com/BurntSushi/ripgrep/issues/1412 +rgtest!(r1412_look_behind_no_replacement, |dir: Dir, mut cmd: TestCommand| { + // Only PCRE2 supports look-around. + if !dir.is_pcre2() { + return; + } + + dir.create("test", "foo\nbar\n"); + cmd.args(&["-nU", "-rquux", r"(?<=foo\n)bar", "test"]); + eqnice!("2:quux\n", cmd.stdout()); +}); + // See: https://github.com/BurntSushi/ripgrep/pull/1446 rgtest!( r1446_respect_excludes_in_worktree,