repo: move all source code in crates directory

The top-level listing was just getting a bit too long for my taste. So
put all of the code in one directory and shrink the large top-level mess
to a small top-level mess.

NOTE: This commit only contains renames. The subsequent commit will
actually make ripgrep build again. We do it this way with the naive hope
that this will make it easier for git history to track the renames.
Sigh.
This commit is contained in:
Andrew Gallant
2020-02-17 18:19:19 -05:00
parent 0bc4f0447b
commit fdd8510fdd
113 changed files with 0 additions and 0 deletions

131
crates/searcher/src/lib.rs Normal file
View File

@@ -0,0 +1,131 @@
/*!
This crate provides an implementation of line oriented search, with optional
support for multi-line search.
# Brief overview
The principal type in this crate is a
[`Searcher`](struct.Searcher.html),
which can be configured and built by a
[`SearcherBuilder`](struct.SearcherBuilder.html).
A `Searcher` is responsible for reading bytes from a source (e.g., a file),
executing a search of those bytes using a `Matcher` (e.g., a regex) and then
reporting the results of that search to a
[`Sink`](trait.Sink.html)
(e.g., stdout). The `Searcher` itself is principally responsible for managing
the consumption of bytes from a source and applying a `Matcher` over those
bytes in an efficient way. The `Searcher` is also responsible for inverting
a search, counting lines, reporting contextual lines, detecting binary data
and even deciding whether or not to use memory maps.
A `Matcher` (which is defined in the
[`grep-matcher`](https://crates.io/crates/grep-matcher)
crate) is a trait for describing the lowest levels of pattern search in a
generic way. The interface itself is very similar to the interface of a regular
expression. For example, the
[`grep-regex`](https://crates.io/crates/grep-regex)
crate provides an implementation of the `Matcher` trait using Rust's
[`regex`](https://crates.io/crates/regex)
crate.
Finally, a `Sink` describes how callers receive search results produced by a
`Searcher`. This includes routines that are called at the beginning and end of
a search, in addition to routines that are called when matching or contextual
lines are found by the `Searcher`. Implementations of `Sink` can be trivially
simple, or extraordinarily complex, such as the
`Standard` printer found in the
[`grep-printer`](https://crates.io/crates/grep-printer)
crate, which effectively implements grep-like output.
This crate also provides convenience `Sink` implementations in the
[`sinks`](sinks/index.html)
sub-module for easy searching with closures.
# Example
This example shows how to execute the searcher and read the search results
using the
[`UTF8`](sinks/struct.UTF8.html)
implementation of `Sink`.
```
extern crate grep_matcher;
extern crate grep_regex;
extern crate grep_searcher;
use std::error::Error;
use grep_matcher::Matcher;
use grep_regex::RegexMatcher;
use grep_searcher::Searcher;
use grep_searcher::sinks::UTF8;
const SHERLOCK: &'static [u8] = b"\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.
";
# fn main() { example().unwrap() }
fn example() -> Result<(), Box<Error>> {
let matcher = RegexMatcher::new(r"Doctor \w+")?;
let mut matches: Vec<(u64, String)> = vec![];
Searcher::new().search_slice(&matcher, SHERLOCK, UTF8(|lnum, line| {
// We are guaranteed to find a match, so the unwrap is OK.
let mymatch = matcher.find(line.as_bytes())?.unwrap();
matches.push((lnum, line[mymatch].to_string()));
Ok(true)
}))?;
assert_eq!(matches.len(), 2);
assert_eq!(
matches[0],
(1, "Doctor Watsons".to_string())
);
assert_eq!(
matches[1],
(5, "Doctor Watson".to_string())
);
Ok(())
}
```
See also `examples/search-stdin.rs` from the root of this crate's directory
to see a similar example that accepts a pattern on the command line and
searches stdin.
*/
#![deny(missing_docs)]
extern crate bstr;
extern crate bytecount;
extern crate encoding_rs;
extern crate encoding_rs_io;
extern crate grep_matcher;
#[macro_use]
extern crate log;
extern crate memmap;
#[cfg(test)]
extern crate regex;
pub use lines::{LineIter, LineStep};
pub use searcher::{
BinaryDetection, ConfigError, Encoding, MmapChoice, Searcher,
SearcherBuilder,
};
pub use sink::sinks;
pub use sink::{
Sink, SinkContext, SinkContextKind, SinkError, SinkFinish, SinkMatch,
};
#[macro_use]
mod macros;
mod line_buffer;
mod lines;
mod searcher;
mod sink;
#[cfg(test)]
mod testutil;

View File

@@ -0,0 +1,950 @@
use std::cmp;
use std::io;
use bstr::ByteSlice;
/// The capacity given to a line buffer when the caller does not pick one.
pub(crate) const DEFAULT_BUFFER_CAPACITY: usize = 8 * 1024; // 8 KB
/// The policy for growing the line buffer when a line does not fit.
///
/// Data is searched incrementally through a fixed size buffer. When a line
/// (or the lines of a context window, when context is enabled) is too large
/// for that buffer, this policy decides how much *additional* memory, beyond
/// the size of the buffer itself, may be allocated to hold it.
///
/// The default policy is to allocate eagerly, without any limit.
#[derive(Clone, Copy, Debug)]
pub enum BufferAllocation {
    /// Keep growing the buffer until at least the next line fits into
    /// memory, or until all available memory is exhausted.
    ///
    /// This is the default.
    Eager,
    /// Permit at most the given amount of additional memory. When a line
    /// requires more than that, reading stops and an error is returned.
    Error(usize),
}

impl Default for BufferAllocation {
    fn default() -> Self {
        BufferAllocation::Eager
    }
}
/// Build the I/O error that is reported once a configured allocation limit
/// has been hit.
///
/// `limit` is only used to render the error message.
pub fn alloc_error(limit: usize) -> io::Error {
    io::Error::new(
        io::ErrorKind::Other,
        format!("configured allocation limit ({}) exceeded", limit),
    )
}
/// How the line buffer reacts to binary data.
///
/// Binary detection _heuristically_ decides whether a chunk of data is
/// binary and then acts on that decision. It exists because binary data is
/// frequently data that one does not want to search with textual patterns.
/// That is far from universally true, however, which is why binary
/// detection is disabled by default.
#[derive(Clone, Copy, Debug)]
pub enum BinaryDetection {
    /// Binary detection is turned off. The line buffer may report data that
    /// contains arbitrary bytes.
    None,
    /// Everything read by the line buffer is searched for the given byte.
    /// Once it is found, the data is treated as binary and the line buffer
    /// behaves as if it had reached EOF. Callers are guaranteed to never
    /// observe this byte.
    Quit(u8),
    /// Everything read by the line buffer is searched for the given byte.
    /// Each occurrence is replaced by the line terminator. Callers are
    /// guaranteed to never observe this byte.
    Convert(u8),
}

impl Default for BinaryDetection {
    fn default() -> Self {
        BinaryDetection::None
    }
}

impl BinaryDetection {
    /// Returns true if and only if this strategy requires the line buffer
    /// to stop reading data as soon as binary data has been observed.
    fn is_quit(&self) -> bool {
        if let BinaryDetection::Quit(_) = *self {
            true
        } else {
            false
        }
    }
}
/// Settings of a line buffer. A builder collects them and a copy is stored
/// in every line buffer it creates.
#[derive(Clone, Copy, Debug)]
struct Config {
    /// How many bytes the buffer initially holds, which is also the number
    /// of bytes a read attempts to fetch at a time.
    capacity: usize,
    /// The byte that terminates a line.
    lineterm: u8,
    /// What to do when a line does not fit into the buffer.
    buffer_alloc: BufferAllocation,
    /// The binary detection strategy, which names the byte whose presence
    /// marks content as binary.
    binary: BinaryDetection,
}

impl Default for Config {
    fn default() -> Self {
        Self {
            capacity: DEFAULT_BUFFER_CAPACITY,
            lineterm: b'\n',
            buffer_alloc: BufferAllocation::default(),
            binary: BinaryDetection::default(),
        }
    }
}
/// Collects settings and creates line buffers from them.
#[derive(Clone, Debug, Default)]
pub struct LineBufferBuilder {
    config: Config,
}

impl LineBufferBuilder {
    /// Start a new builder that uses the default configuration.
    pub fn new() -> Self {
        Self { config: Config::default() }
    }

    /// Create a line buffer from the settings collected so far.
    ///
    /// The builder is left untouched and can be used again.
    pub fn build(&self) -> LineBuffer {
        let config = self.config;
        LineBuffer {
            config,
            buf: vec![0; config.capacity],
            pos: 0,
            last_lineterm: 0,
            end: 0,
            absolute_byte_offset: 0,
            binary_byte_offset: None,
        }
    }

    /// Set the default capacity of a buffer.
    ///
    /// Generally speaking, the capacity is both the amount of data that is
    /// kept in memory and the size of the reads issued to the underlying
    /// reader.
    ///
    /// The default is a reasonable choice and should probably be left alone
    /// unless there is a specific reason to change it.
    pub fn capacity(&mut self, capacity: usize) -> &mut Self {
        self.config.capacity = capacity;
        self
    }

    /// Set the line terminator of the buffer.
    ///
    /// The line terminator determines how a buffer is rolled forward. When
    /// data is read from the underlying reader, the end of that data most
    /// likely falls in the middle of a line. Since this is a line buffer,
    /// callers must not see that incomplete trailing data, and the line
    /// terminator is what the buffer uses to figure out where it starts.
    ///
    /// By default, this is set to `b'\n'`.
    pub fn line_terminator(&mut self, lineterm: u8) -> &mut Self {
        self.config.lineterm = lineterm;
        self
    }

    /// Set how much additional memory may be allocated for long lines.
    ///
    /// Line oriented search fundamentally requires that each line fits into
    /// memory, and this setting bounds how large such a line may be. The
    /// default is `BufferAllocation::Eager`, which lets a line buffer
    /// allocate as much as it needs to fit a line, bounded only by the
    /// memory that is available.
    ///
    /// This setting only covers memory allocated *in addition* to the
    /// capacity of the buffer. A value of `0` is therefore sensible: it
    /// guarantees that a line buffer never allocates anything beyond its
    /// initial capacity.
    pub fn buffer_alloc(&mut self, behavior: BufferAllocation) -> &mut Self {
        self.config.buffer_alloc = behavior;
        self
    }

    /// Choose the binary detection strategy. Depending on the strategy, the
    /// line buffer either reports EOF early or cleans the data it yields.
    ///
    /// Binary detection is disabled by default, and it should generally be
    /// regarded as an imperfect heuristic.
    pub fn binary_detection(
        &mut self,
        detection: BinaryDetection,
    ) -> &mut Self {
        self.config.binary = detection;
        self
    }
}
/// A line buffer reader efficiently reads a line oriented buffer from an
/// arbitrary reader.
///
/// The reader borrows a `LineBuffer` for its entire lifetime and delegates
/// every operation to it; it only adds the underlying reader that `fill`
/// pulls data from.
#[derive(Debug)]
pub struct LineBufferReader<'b, R> {
/// The source that new data is read from.
rdr: R,
/// The borrowed buffer that holds the data that has been read.
line_buffer: &'b mut LineBuffer,
}
impl<'b, R: io::Read> LineBufferReader<'b, R> {
/// Create a new buffered reader that reads from `rdr` and uses the given
/// `line_buffer` as an intermediate buffer.
///
/// The line buffer is cleared first, so any positions and offsets left over
/// from a previous reader are discarded.
///
/// This does not change the binary detection behavior of the given line
/// buffer.
pub fn new(
rdr: R,
line_buffer: &'b mut LineBuffer,
) -> LineBufferReader<'b, R> {
line_buffer.clear();
LineBufferReader { rdr, line_buffer }
}
/// The absolute byte offset which corresponds to the starting offsets
/// of the data returned by `buffer` relative to the beginning of the
/// underlying reader's contents. As such, this offset does not generally
/// correspond to an offset in memory. It is typically used for reporting
/// purposes. It can also be used for counting the number of bytes that
/// have been searched.
pub fn absolute_byte_offset(&self) -> u64 {
self.line_buffer.absolute_byte_offset()
}
/// If binary data was detected, then this returns the absolute byte offset
/// at which binary data was initially found.
pub fn binary_byte_offset(&self) -> Option<u64> {
self.line_buffer.binary_byte_offset()
}
/// Fill the contents of this buffer by discarding the part of the buffer
/// that has been consumed. The free space created by discarding the
/// consumed part of the buffer is then filled with new data from the
/// reader.
///
/// If EOF is reached, then `false` is returned. Otherwise, `true` is
/// returned. (Note that if this line buffer's binary detection is set to
/// `Quit`, then the presence of binary data will cause this buffer to
/// behave as if it had seen EOF at the first occurrence of binary data.)
///
/// This forwards any errors returned by the underlying reader, and will
/// also return an error if the buffer must be expanded past its allocation
/// limit, as governed by the buffer allocation strategy.
pub fn fill(&mut self) -> Result<bool, io::Error> {
self.line_buffer.fill(&mut self.rdr)
}
/// Return the contents of this buffer.
pub fn buffer(&self) -> &[u8] {
self.line_buffer.buffer()
}
/// Return the buffer as a BStr, used for convenient equality checking
/// in tests only.
#[cfg(test)]
fn bstr(&self) -> &::bstr::BStr {
self.buffer().as_bstr()
}
/// Consume the number of bytes provided. This must be less than or equal
/// to the number of bytes returned by `buffer`.
///
/// # Panics
///
/// Panics if `amt` is greater than the length of `buffer`.
pub fn consume(&mut self, amt: usize) {
self.line_buffer.consume(amt);
}
/// Consumes the remainder of the buffer. Subsequent calls to `buffer` are
/// guaranteed to return an empty slice until the buffer is refilled.
///
/// This is a convenience function for `consume(buffer.len())`.
#[cfg(test)]
fn consume_all(&mut self) {
self.line_buffer.consume_all();
}
}
/// A line buffer manages a (typically fixed) buffer for holding lines.
///
/// Callers should create line buffers sparingly and reuse them when possible.
/// Line buffers cannot be used directly, but instead must be used via the
/// `LineBufferReader`.
///
/// The three positions below are indices into `buf` and are kept ordered as
/// `pos <= last_lineterm <= end <= buf.len()`.
#[derive(Clone, Debug)]
pub struct LineBuffer {
/// The configuration of this buffer.
config: Config,
/// The primary buffer with which to hold data.
buf: Vec<u8>,
/// The current position of this buffer. This is always a valid sliceable
/// index into `buf`, and its maximum value is the length of `buf`.
pos: usize,
/// The end position of searchable content in this buffer. This is either
/// set to just after the final line terminator in the buffer, or to just
/// after the end of the last byte emitted by the reader when the reader
/// has been exhausted.
last_lineterm: usize,
/// The end position of the buffer. This is always greater than or equal to
/// last_lineterm. The bytes between last_lineterm and end, if any, always
/// correspond to a partial line.
end: usize,
/// The absolute byte offset corresponding to `pos`. This is most typically
/// not a valid index into addressable memory, but rather, an offset that
/// is relative to all data that passes through a line buffer (since
/// construction or since the last time `clear` was called).
///
/// When the line buffer reaches EOF, this is set to the position just
/// after the last byte read from the underlying reader. That is, it
/// becomes the total count of bytes that have been read.
absolute_byte_offset: u64,
/// If binary data was found, this records the absolute byte offset at
/// which it was first detected.
binary_byte_offset: Option<u64>,
}
impl LineBuffer {
/// Set the binary detection method used on this line buffer.
///
/// This permits dynamically changing the binary detection strategy on
/// an existing line buffer without needing to create a new one.
///
/// Only data read by later calls to `fill` is examined with the new
/// setting; bytes that are already in the buffer are not scanned again.
pub fn set_binary_detection(&mut self, binary: BinaryDetection) {
self.config.binary = binary;
}
/// Reset this buffer, such that it can be used with a new reader.
///
/// Only the positions and offsets are reset. The allocated memory and the
/// configuration are left as they are.
fn clear(&mut self) {
self.pos = 0;
self.last_lineterm = 0;
self.end = 0;
self.absolute_byte_offset = 0;
self.binary_byte_offset = None;
}
/// The absolute byte offset which corresponds to the starting offsets
/// of the data returned by `buffer` relative to the beginning of the
/// reader's contents. As such, this offset does not generally correspond
/// to an offset in memory. It is typically used for reporting purposes,
/// particularly in error messages.
///
/// This is reset to `0` when `clear` is called.
fn absolute_byte_offset(&self) -> u64 {
self.absolute_byte_offset
}
/// If binary data was detected, then this returns the absolute byte offset
/// at which binary data was initially found.
fn binary_byte_offset(&self) -> Option<u64> {
self.binary_byte_offset
}
/// Return the contents of this buffer.
///
/// This is the region between `pos` and `last_lineterm`, so a trailing
/// partial line is never part of it.
fn buffer(&self) -> &[u8] {
&self.buf[self.pos..self.last_lineterm]
}
/// Return the contents of the free space beyond the end of the buffer as
/// a mutable slice.
fn free_buffer(&mut self) -> &mut [u8] {
&mut self.buf[self.end..]
}
/// Consume the number of bytes provided. This must be less than or equal
/// to the number of bytes returned by `buffer`.
///
/// # Panics
///
/// Panics if `amt` is greater than the length of `buffer`.
fn consume(&mut self, amt: usize) {
assert!(amt <= self.buffer().len());
self.pos += amt;
self.absolute_byte_offset += amt as u64;
}
/// Consumes the remainder of the buffer. Subsequent calls to `buffer` are
/// guaranteed to return an empty slice until the buffer is refilled.
///
/// This is a convenience function for `consume(buffer.len())`.
#[cfg(test)]
fn consume_all(&mut self) {
let amt = self.buffer().len();
self.consume(amt);
}
/// Fill the contents of this buffer by discarding the part of the buffer
/// that has been consumed. The free space created by discarding the
/// consumed part of the buffer is then filled with new data from the given
/// reader.
///
/// Callers should provide the same reader to this line buffer in
/// subsequent calls to fill. A different reader can only be used
/// immediately following a call to `clear`.
///
/// If EOF is reached, then `false` is returned. Otherwise, `true` is
/// returned. (Note that if this line buffer's binary detection is set to
/// `Quit`, then the presence of binary data will cause this buffer to
/// behave as if it had seen EOF.)
///
/// This forwards any errors returned by `rdr`, and will also return an
/// error if the buffer must be expanded past its allocation limit, as
/// governed by the buffer allocation strategy.
fn fill<R: io::Read>(&mut self, mut rdr: R) -> Result<bool, io::Error> {
// If the binary detection heuristic tells us to quit once binary data
// has been observed, then we no longer read new data and reach EOF
// once the current buffer has been consumed.
if self.config.binary.is_quit() && self.binary_byte_offset.is_some() {
return Ok(!self.buffer().is_empty());
}
// Move whatever has not been consumed yet (including a trailing partial
// line) to the front, so that new data is appended right after it.
self.roll();
assert_eq!(self.pos, 0);
loop {
// Guarantees at least one free byte to read into. This is also where
// an exceeded allocation limit is reported.
self.ensure_capacity()?;
let readlen = rdr.read(self.free_buffer().as_bytes_mut())?;
if readlen == 0 {
// We're only done reading for good once the caller has
// consumed everything.
//
// No more data will arrive, so a trailing partial line (if any)
// becomes part of the searchable content.
self.last_lineterm = self.end;
return Ok(!self.buffer().is_empty());
}
// Get a mutable view into the bytes we've just read. These are
// the bytes that we do binary detection on, and also the bytes we
// search to find the last line terminator. We need a mutable slice
// in the case of binary conversion.
let oldend = self.end;
self.end += readlen;
let newbytes = &mut self.buf[oldend..self.end];
// Binary detection.
match self.config.binary {
BinaryDetection::None => {} // nothing to do
BinaryDetection::Quit(byte) => {
if let Some(i) = newbytes.find_byte(byte) {
// Cut the buffer off right before the binary byte, so that
// neither it nor anything after it is handed to the caller.
self.end = oldend + i;
self.last_lineterm = self.end;
// `pos` is `0` after rolling, so `end` is the distance of the
// binary byte from `absolute_byte_offset`.
self.binary_byte_offset =
Some(self.absolute_byte_offset + self.end as u64);
// If the first byte in our buffer is a binary byte,
// then our buffer is empty and we should report as
// such to the caller.
return Ok(self.pos < self.end);
}
}
BinaryDetection::Convert(byte) => {
if let Some(i) =
replace_bytes(newbytes, byte, self.config.lineterm)
{
// Record only the first binary offset.
if self.binary_byte_offset.is_none() {
self.binary_byte_offset = Some(
self.absolute_byte_offset
+ (oldend + i) as u64,
);
}
}
}
}
// Update our `last_lineterm` positions if we read one.
if let Some(i) = newbytes.rfind_byte(self.config.lineterm) {
self.last_lineterm = oldend + i + 1;
return Ok(true);
}
// At this point, if we couldn't find a line terminator, then we
// don't have a complete line. Therefore, we try to read more!
}
}
/// Roll the unconsumed parts of the buffer to the front.
///
/// This operation is idempotent.
///
/// After rolling, `last_lineterm` and `end` point to the same location,
/// and `pos` is always set to `0`.
fn roll(&mut self) {
// Nothing is left to keep, so resetting the positions is enough and no
// bytes need to be copied.
if self.pos == self.end {
self.pos = 0;
self.last_lineterm = 0;
self.end = 0;
return;
}
// Everything from `pos` up to `end` is kept, which includes a trailing
// partial line.
let roll_len = self.end - self.pos;
self.buf.copy_within_str(self.pos..self.end, 0);
self.pos = 0;
self.last_lineterm = roll_len;
self.end = roll_len;
}
/// Ensures that the internal buffer has a non-zero amount of free space
/// in which to read more data. If there is no free space, then more is
/// allocated. If the allocation must exceed the configured limit, then
/// this returns an error.
fn ensure_capacity(&mut self) -> Result<(), io::Error> {
if !self.free_buffer().is_empty() {
return Ok(());
}
// `len` is used for computing the next allocation size. The capacity
// is permitted to start at `0`, so we make sure it's at least `1`.
let len = cmp::max(1, self.buf.len());
let additional = match self.config.buffer_alloc {
// Adds twice the current length, so a non-empty buffer triples in size.
BufferAllocation::Eager => len * 2,
BufferAllocation::Error(limit) => {
// `used` is how far the buffer has already grown past its
// configured capacity; only the rest of `limit` may still be added.
let used = self.buf.len() - self.config.capacity;
let n = cmp::min(len * 2, limit - used);
if n == 0 {
return Err(alloc_error(self.config.capacity + limit));
}
n
}
};
assert!(additional > 0);
let newlen = self.buf.len() + additional;
self.buf.resize(newlen, 0);
assert!(!self.free_buffer().is_empty());
Ok(())
}
}
/// Overwrite every occurrence of `src` in `bytes` with `replacement`.
///
/// Returns the offset of the first byte that was replaced, or `None` when
/// `bytes` does not contain `src`. When `src` and `replacement` are the same
/// byte, nothing is touched and `None` is returned.
fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option<usize> {
    if src == replacement {
        return None;
    }
    let mut first = None;
    let mut start = 0;
    while let Some(offset) = bytes[start..].find_byte(src) {
        let hit = start + offset;
        first = first.or(Some(hit));
        // Handle the whole run of consecutive `src` bytes in one go, so
        // that the next search resumes after the run.
        let run = bytes[hit..].iter().take_while(|&&b| b == src).count();
        for b in &mut bytes[hit..hit + run] {
            *b = replacement;
        }
        start = hit + run;
    }
    first
}
#[cfg(test)]
mod tests {
use super::*;
use bstr::{ByteSlice, ByteVec};
use std::str;
const SHERLOCK: &'static str = "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.\
";
fn s(slice: &str) -> String {
slice.to_string()
}
fn replace_str(
slice: &str,
src: u8,
replacement: u8,
) -> (String, Option<usize>) {
let mut dst = Vec::from(slice);
let result = replace_bytes(&mut dst, src, replacement);
(dst.into_string().unwrap(), result)
}
#[test]
fn replace() {
assert_eq!(replace_str("abc", b'b', b'z'), (s("azc"), Some(1)));
assert_eq!(replace_str("abb", b'b', b'z'), (s("azz"), Some(1)));
assert_eq!(replace_str("aba", b'a', b'z'), (s("zbz"), Some(0)));
assert_eq!(replace_str("bbb", b'b', b'z'), (s("zzz"), Some(0)));
assert_eq!(replace_str("bac", b'b', b'z'), (s("zac"), Some(0)));
}
#[test]
fn buffer_basics1() {
let bytes = "homer\nlisa\nmaggie";
let mut linebuf = LineBufferBuilder::new().build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "homer\nlisa\n");
assert_eq!(rdr.absolute_byte_offset(), 0);
rdr.consume(5);
assert_eq!(rdr.absolute_byte_offset(), 5);
rdr.consume_all();
assert_eq!(rdr.absolute_byte_offset(), 11);
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "maggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
assert_eq!(rdr.binary_byte_offset(), None);
}
#[test]
fn buffer_basics2() {
let bytes = "homer\nlisa\nmaggie\n";
let mut linebuf = LineBufferBuilder::new().build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
assert_eq!(rdr.binary_byte_offset(), None);
}
#[test]
fn buffer_basics3() {
let bytes = "\n";
let mut linebuf = LineBufferBuilder::new().build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
assert_eq!(rdr.binary_byte_offset(), None);
}
#[test]
fn buffer_basics4() {
let bytes = "\n\n";
let mut linebuf = LineBufferBuilder::new().build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "\n\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
assert_eq!(rdr.binary_byte_offset(), None);
}
#[test]
fn buffer_empty() {
let bytes = "";
let mut linebuf = LineBufferBuilder::new().build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(!rdr.fill().unwrap());
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
assert_eq!(rdr.binary_byte_offset(), None);
}
#[test]
fn buffer_zero_capacity() {
let bytes = "homer\nlisa\nmaggie";
let mut linebuf = LineBufferBuilder::new().capacity(0).build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
while rdr.fill().unwrap() {
rdr.consume_all();
}
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
assert_eq!(rdr.binary_byte_offset(), None);
}
#[test]
fn buffer_small_capacity() {
let bytes = "homer\nlisa\nmaggie";
let mut linebuf = LineBufferBuilder::new().capacity(1).build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
let mut got = vec![];
while rdr.fill().unwrap() {
got.push_str(rdr.buffer());
rdr.consume_all();
}
assert_eq!(bytes, got.as_bstr());
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
assert_eq!(rdr.binary_byte_offset(), None);
}
#[test]
fn buffer_limited_capacity1() {
let bytes = "homer\nlisa\nmaggie";
let mut linebuf = LineBufferBuilder::new()
.capacity(1)
.buffer_alloc(BufferAllocation::Error(5))
.build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "homer\n");
rdr.consume_all();
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "lisa\n");
rdr.consume_all();
// This returns an error because while we have just enough room to
// store maggie in the buffer, we *don't* have enough room to read one
// more byte, so we don't know whether we're at EOF or not, and
// therefore must give up.
assert!(rdr.fill().is_err());
// We can mush on though!
assert_eq!(rdr.bstr(), "m");
rdr.consume_all();
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "aggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
}
#[test]
fn buffer_limited_capacity2() {
let bytes = "homer\nlisa\nmaggie";
let mut linebuf = LineBufferBuilder::new()
.capacity(1)
.buffer_alloc(BufferAllocation::Error(6))
.build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "homer\n");
rdr.consume_all();
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "lisa\n");
rdr.consume_all();
// We have just enough space.
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "maggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
}
#[test]
fn buffer_limited_capacity3() {
let bytes = "homer\nlisa\nmaggie";
let mut linebuf = LineBufferBuilder::new()
.capacity(1)
.buffer_alloc(BufferAllocation::Error(0))
.build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().is_err());
assert_eq!(rdr.bstr(), "");
}
#[test]
fn buffer_binary_none() {
let bytes = "homer\nli\x00sa\nmaggie\n";
let mut linebuf = LineBufferBuilder::new().build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "homer\nli\x00sa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
assert_eq!(rdr.binary_byte_offset(), None);
}
#[test]
fn buffer_binary_quit1() {
let bytes = "homer\nli\x00sa\nmaggie\n";
let mut linebuf = LineBufferBuilder::new()
.binary_detection(BinaryDetection::Quit(b'\x00'))
.build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "homer\nli");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
assert_eq!(rdr.absolute_byte_offset(), 8);
assert_eq!(rdr.binary_byte_offset(), Some(8));
}
#[test]
fn buffer_binary_quit2() {
let bytes = "\x00homer\nlisa\nmaggie\n";
let mut linebuf = LineBufferBuilder::new()
.binary_detection(BinaryDetection::Quit(b'\x00'))
.build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(!rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "");
assert_eq!(rdr.absolute_byte_offset(), 0);
assert_eq!(rdr.binary_byte_offset(), Some(0));
}
#[test]
fn buffer_binary_quit3() {
let bytes = "homer\nlisa\nmaggie\n\x00";
let mut linebuf = LineBufferBuilder::new()
.binary_detection(BinaryDetection::Quit(b'\x00'))
.build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64 - 1);
assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 1));
}
#[test]
fn buffer_binary_quit4() {
let bytes = "homer\nlisa\nmaggie\x00\n";
let mut linebuf = LineBufferBuilder::new()
.binary_detection(BinaryDetection::Quit(b'\x00'))
.build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64 - 2);
assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 2));
}
#[test]
fn buffer_binary_quit5() {
let mut linebuf = LineBufferBuilder::new()
.binary_detection(BinaryDetection::Quit(b'u'))
.build();
let mut rdr = LineBufferReader::new(SHERLOCK.as_bytes(), &mut linebuf);
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(
rdr.bstr(),
"\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, s\
"
);
rdr.consume_all();
assert!(!rdr.fill().unwrap());
assert_eq!(rdr.absolute_byte_offset(), 76);
assert_eq!(rdr.binary_byte_offset(), Some(76));
assert_eq!(SHERLOCK.as_bytes()[76], b'u');
}
#[test]
fn buffer_binary_convert1() {
let bytes = "homer\nli\x00sa\nmaggie\n";
let mut linebuf = LineBufferBuilder::new()
.binary_detection(BinaryDetection::Convert(b'\x00'))
.build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(rdr.bstr(), "homer\nli\nsa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
assert_eq!(rdr.binary_byte_offset(), Some(8));
}
#[test]
fn buffer_binary_convert2() {
    // NUL byte as the very first byte of the input.
    let input = "\x00homer\nlisa\nmaggie\n";
    let mut line_buffer = LineBufferBuilder::new()
        .binary_detection(BinaryDetection::Convert(b'\x00'))
        .build();
    let mut reader = LineBufferReader::new(input.as_bytes(), &mut line_buffer);

    assert!(reader.buffer().is_empty());
    assert!(reader.fill().unwrap());
    assert_eq!(reader.bstr(), "\nhomer\nlisa\nmaggie\n");

    reader.consume_all();
    assert!(!reader.fill().unwrap());
    assert_eq!(reader.absolute_byte_offset(), input.len() as u64);
    assert_eq!(reader.binary_byte_offset(), Some(0));
}
#[test]
fn buffer_binary_convert3() {
    // NUL byte as the very last byte of the input.
    let input = "homer\nlisa\nmaggie\n\x00";
    let nul_at = input.len() as u64 - 1;
    let mut line_buffer = LineBufferBuilder::new()
        .binary_detection(BinaryDetection::Convert(b'\x00'))
        .build();
    let mut reader = LineBufferReader::new(input.as_bytes(), &mut line_buffer);

    assert!(reader.buffer().is_empty());
    assert!(reader.fill().unwrap());
    assert_eq!(reader.bstr(), "homer\nlisa\nmaggie\n\n");

    reader.consume_all();
    assert!(!reader.fill().unwrap());
    assert_eq!(reader.absolute_byte_offset(), input.len() as u64);
    assert_eq!(reader.binary_byte_offset(), Some(nul_at));
}
#[test]
fn buffer_binary_convert4() {
    // NUL byte immediately before the final line terminator.
    let input = "homer\nlisa\nmaggie\x00\n";
    let nul_at = input.len() as u64 - 2;
    let mut line_buffer = LineBufferBuilder::new()
        .binary_detection(BinaryDetection::Convert(b'\x00'))
        .build();
    let mut reader = LineBufferReader::new(input.as_bytes(), &mut line_buffer);

    assert!(reader.buffer().is_empty());
    assert!(reader.fill().unwrap());
    assert_eq!(reader.bstr(), "homer\nlisa\nmaggie\n\n");

    reader.consume_all();
    assert!(!reader.fill().unwrap());
    assert_eq!(reader.absolute_byte_offset(), input.len() as u64);
    assert_eq!(reader.binary_byte_offset(), Some(nul_at));
}
}

View File

@@ -0,0 +1,467 @@
/*!
A collection of routines for performing operations on lines.
*/
use bstr::ByteSlice;
use bytecount;
use grep_matcher::{LineTerminator, Match};
/// An iterator over lines in a particular slice of bytes.
///
/// Line terminators are considered part of the line they terminate. All lines
/// yielded by the iterator are guaranteed to be non-empty.
///
/// `'b` refers to the lifetime of the underlying bytes.
#[derive(Debug)]
pub struct LineIter<'b> {
// The bytes being split into lines; every yielded line borrows from here.
bytes: &'b [u8],
// Tracks the current position and produces the range of each line.
stepper: LineStep,
}
impl<'b> LineIter<'b> {
    /// Build an iterator over the lines of `bytes`, where each line ends
    /// with the byte `line_term`.
    ///
    /// Iteration covers the entire slice, from offset `0` through
    /// `bytes.len()`.
    pub fn new(line_term: u8, bytes: &'b [u8]) -> LineIter<'b> {
        let stepper = LineStep::new(line_term, 0, bytes.len());
        LineIter { bytes, stepper }
    }
}
impl<'b> Iterator for LineIter<'b> {
    type Item = &'b [u8];

    /// Yield the next line (terminator included), or `None` once the
    /// underlying stepper is exhausted.
    fn next(&mut self) -> Option<&'b [u8]> {
        let bytes = self.bytes;
        let line = self.stepper.next_match(bytes)?;
        Some(&bytes[line])
    }
}
/// An explicit iterator over lines in a particular slice of bytes.
///
/// This iterator avoids borrowing the bytes themselves, and instead requires
/// callers to explicitly provide the bytes when moving through the iterator.
/// While not idiomatic, this provides a simple way of iterating over lines
/// that doesn't require borrowing the slice itself, which can be convenient.
///
/// Line terminators are considered part of the line they terminate. All lines
/// yielded by the iterator are guaranteed to be non-empty.
#[derive(Debug)]
pub struct LineStep {
// The byte that terminates each line.
line_term: u8,
// Offset at which the next line starts; advanced past each yielded line.
pos: usize,
// Exclusive upper bound of the region being iterated.
end: usize,
}
impl LineStep {
    /// Create a new line iterator over the given range of bytes using the
    /// given line terminator.
    ///
    /// Callers should provide the actual bytes for each call to `next`. The
    /// same slice must be provided to each call.
    ///
    /// # Panics
    ///
    /// This panics if `start` is not less than or equal to `end`.
    pub fn new(line_term: u8, start: usize, end: usize) -> LineStep {
        // Enforce the documented contract up front. Without this check an
        // inverted range only fails later, inside `next`, as an opaque
        // slice index panic.
        assert!(
            start <= end,
            "line step start ({}) must not exceed end ({})",
            start,
            end
        );
        LineStep { line_term, pos: start, end }
    }

    /// Return the start and end position of the next line in the given bytes.
    ///
    /// The caller must pass exactly the same slice of bytes for each call to
    /// `next`.
    ///
    /// The range returned includes the line terminator. Ranges are always
    /// non-empty.
    ///
    /// # Panics
    ///
    /// This panics if `bytes` is shorter than the `end` given to `new`.
    pub fn next(&mut self, bytes: &[u8]) -> Option<(usize, usize)> {
        self.next_impl(bytes)
    }

    /// Like next, but returns a `Match` instead of a tuple.
    #[inline(always)]
    pub(crate) fn next_match(&mut self, bytes: &[u8]) -> Option<Match> {
        self.next_impl(bytes).map(|(s, e)| Match::new(s, e))
    }

    /// Shared implementation of `next` and `next_match`: find the next line
    /// at or after the current position and advance past it.
    #[inline(always)]
    fn next_impl(&mut self, bytes: &[u8]) -> Option<(usize, usize)> {
        // Never look beyond the end of the configured range.
        let bytes = &bytes[..self.end];
        let line_end = match bytes[self.pos..].find_byte(self.line_term) {
            // The terminator is part of the line it terminates.
            Some(i) => self.pos + i + 1,
            // A final line without a terminator runs to the end.
            None if self.pos < bytes.len() => bytes.len(),
            // Nothing left to yield.
            None => return None,
        };
        let m = (self.pos, line_end);
        assert!(m.0 <= m.1);
        self.pos = m.1;
        Some(m)
    }
}
/// Return how many times the byte `line_term` appears in `bytes`.
pub fn count(bytes: &[u8], line_term: u8) -> u64 {
    let occurrences: usize = bytecount::count(bytes, line_term);
    occurrences as u64
}
/// Strip a trailing line terminator from `bytes`, if one is present.
///
/// When `bytes` does not end with the bytes of `line_term`, it is returned
/// unchanged.
#[inline(always)]
pub fn without_terminator(bytes: &[u8], line_term: LineTerminator) -> &[u8] {
    let term = line_term.as_bytes();
    if bytes.ends_with(term) {
        &bytes[..bytes.len() - term.len()]
    } else {
        bytes
    }
}
/// Return the start and end offsets of the lines containing the given range
/// of bytes.
///
/// Line terminators are considered part of the line they terminate.
#[inline(always)]
pub fn locate(bytes: &[u8], line_term: u8, range: Match) -> Match {
let line_start =
bytes[..range.start()].rfind_byte(line_term).map_or(0, |i| i + 1);
let line_end =
if range.end() > line_start && bytes[range.end() - 1] == line_term {
range.end()
} else {
bytes[range.end()..]
.find_byte(line_term)
.map_or(bytes.len(), |i| range.end() + i + 1)
};
Match::new(line_start, line_end)
}
/// Returns the starting offset of the line found `count` lines before the
/// last line in `bytes`, where lines are terminated by `line_term`.
///
/// A `count` of zero yields the starting offset of the last line itself.
///
/// A line terminator at the very end of `bytes` is treated as part of the
/// last line rather than as the start of a new, empty one.
pub fn preceding(bytes: &[u8], line_term: u8, count: usize) -> usize {
    let end = bytes.len();
    preceding_by_pos(bytes, end, line_term, count)
}
/// Returns the starting offset of the line found `count` lines before the
/// line containing `pos`, where lines are terminated by `line_term`. A
/// `count` of zero yields the start of the line containing `pos` itself.
///
/// If `pos` points just past a line terminator, then it is considered part of
/// the line that it terminates. For example, given `bytes = b"abc\nxyz\n"`
/// and `pos = 7`, `preceding_by_pos(bytes, pos, b'\n', 0)` returns `4` (as
/// does `pos = 8`) and `preceding_by_pos(bytes, pos, b'\n', 1)` returns `0`.
///
/// If there are fewer than `count` lines before the one containing `pos`,
/// then `0` is returned.
fn preceding_by_pos(
    bytes: &[u8],
    mut pos: usize,
    line_term: u8,
    mut count: usize,
) -> usize {
    if pos == 0 {
        return 0;
    }
    // Step back over a terminator directly before `pos` so that it counts
    // as the end of the current line.
    if bytes[pos - 1] == line_term {
        pos -= 1;
    }
    // Each terminator found while scanning backwards ends one earlier line.
    while let Some(i) = bytes[..pos].rfind_byte(line_term) {
        if count == 0 {
            return i + 1;
        }
        if i == 0 {
            return 0;
        }
        count -= 1;
        pos = i;
    }
    // Ran out of terminators: the wanted line starts at the beginning.
    0
}
#[cfg(test)]
mod tests {
use super::*;
use grep_matcher::Match;
use std::ops::Range;
use std::str;
// Six lines of text; the last line has no line terminator.
const SHERLOCK: &'static str = "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.\
";
// Shorthand for building a `Match`.
fn m(start: usize, end: usize) -> Match {
Match::new(start, end)
}
// Split `text` into its lines (terminators included) using `LineStep`.
fn lines(text: &str) -> Vec<&str> {
let mut results = vec![];
let mut it = LineStep::new(b'\n', 0, text.len());
while let Some(m) = it.next_match(text.as_bytes()) {
results.push(&text[m]);
}
results
}
// Like `lines`, but returns the byte range of each line instead.
fn line_ranges(text: &str) -> Vec<Range<usize>> {
let mut results = vec![];
let mut it = LineStep::new(b'\n', 0, text.len());
while let Some(m) = it.next_match(text.as_bytes()) {
results.push(m.start()..m.end());
}
results
}
// Shorthand for `preceding_by_pos` with `\n` as the line terminator.
fn prev(text: &str, pos: usize, count: usize) -> usize {
preceding_by_pos(text.as_bytes(), pos, b'\n', count)
}
// Shorthand for `locate` with `\n` as the line terminator.
fn loc(text: &str, start: usize, end: usize) -> Match {
locate(text.as_bytes(), b'\n', Match::new(start, end))
}
#[test]
fn line_count() {
assert_eq!(0, count(b"", b'\n'));
assert_eq!(1, count(b"\n", b'\n'));
assert_eq!(2, count(b"\n\n", b'\n'));
assert_eq!(2, count(b"a\nb\nc", b'\n'));
}
#[test]
fn line_locate() {
let t = SHERLOCK;
let lines = line_ranges(t);
assert_eq!(
loc(t, lines[0].start, lines[0].end),
m(lines[0].start, lines[0].end)
);
assert_eq!(
loc(t, lines[0].start + 1, lines[0].end),
m(lines[0].start, lines[0].end)
);
assert_eq!(
loc(t, lines[0].end - 1, lines[0].end),
m(lines[0].start, lines[0].end)
);
assert_eq!(
loc(t, lines[0].end, lines[0].end),
m(lines[1].start, lines[1].end)
);
assert_eq!(
loc(t, lines[5].start, lines[5].end),
m(lines[5].start, lines[5].end)
);
assert_eq!(
loc(t, lines[5].start + 1, lines[5].end),
m(lines[5].start, lines[5].end)
);
assert_eq!(
loc(t, lines[5].end - 1, lines[5].end),
m(lines[5].start, lines[5].end)
);
assert_eq!(
loc(t, lines[5].end, lines[5].end),
m(lines[5].start, lines[5].end)
);
}
// Edge cases for `locate`: empty input, empty lines and empty ranges.
#[test]
fn line_locate_weird() {
assert_eq!(loc("", 0, 0), m(0, 0));
assert_eq!(loc("\n", 0, 1), m(0, 1));
assert_eq!(loc("\n", 1, 1), m(1, 1));
assert_eq!(loc("\n\n", 0, 0), m(0, 1));
assert_eq!(loc("\n\n", 0, 1), m(0, 1));
assert_eq!(loc("\n\n", 1, 1), m(1, 2));
assert_eq!(loc("\n\n", 1, 2), m(1, 2));
assert_eq!(loc("\n\n", 2, 2), m(2, 2));
assert_eq!(loc("a\nb\nc", 0, 1), m(0, 2));
assert_eq!(loc("a\nb\nc", 1, 2), m(0, 2));
assert_eq!(loc("a\nb\nc", 2, 3), m(2, 4));
assert_eq!(loc("a\nb\nc", 3, 4), m(2, 4));
assert_eq!(loc("a\nb\nc", 4, 5), m(4, 5));
assert_eq!(loc("a\nb\nc", 5, 5), m(4, 5));
}
#[test]
fn line_iter() {
assert_eq!(lines("abc"), vec!["abc"]);
assert_eq!(lines("abc\n"), vec!["abc\n"]);
assert_eq!(lines("abc\nxyz"), vec!["abc\n", "xyz"]);
assert_eq!(lines("abc\nxyz\n"), vec!["abc\n", "xyz\n"]);
assert_eq!(lines("abc\n\n"), vec!["abc\n", "\n"]);
assert_eq!(lines("abc\n\n\n"), vec!["abc\n", "\n", "\n"]);
assert_eq!(lines("abc\n\nxyz"), vec!["abc\n", "\n", "xyz"]);
assert_eq!(lines("abc\n\nxyz\n"), vec!["abc\n", "\n", "xyz\n"]);
assert_eq!(lines("abc\nxyz\n\n"), vec!["abc\n", "xyz\n", "\n"]);
assert_eq!(lines("\n"), vec!["\n"]);
assert_eq!(lines(""), Vec::<&str>::new());
}
// An empty range yields no lines even when the given slice is non-empty.
#[test]
fn line_iter_empty() {
let mut it = LineStep::new(b'\n', 0, 0);
assert_eq!(it.next(b"abc"), None);
}
#[test]
fn preceding_lines_doc() {
// These are the examples mentioned in the documentation of
// `preceding_by_pos`.
let bytes = b"abc\nxyz\n";
assert_eq!(4, preceding_by_pos(bytes, 7, b'\n', 0));
assert_eq!(4, preceding_by_pos(bytes, 8, b'\n', 0));
assert_eq!(0, preceding_by_pos(bytes, 7, b'\n', 1));
assert_eq!(0, preceding_by_pos(bytes, 8, b'\n', 1));
}
#[test]
fn preceding_lines_sherlock() {
let t = SHERLOCK;
let lines = line_ranges(t);
// The following tests check the count == 0 case, i.e., finding the
// beginning of the line containing the given position.
assert_eq!(0, prev(t, 0, 0));
assert_eq!(0, prev(t, 1, 0));
// The line terminator is addressed by `end-1` and terminates the line
// it is part of.
assert_eq!(0, prev(t, lines[0].end - 1, 0));
assert_eq!(lines[0].start, prev(t, lines[0].end, 0));
// The end position of line addresses the byte immediately following a
// line terminator, which puts it on the following line.
assert_eq!(lines[1].start, prev(t, lines[0].end + 1, 0));
// Now tests for count > 0.
assert_eq!(0, prev(t, 0, 1));
assert_eq!(0, prev(t, 0, 2));
assert_eq!(0, prev(t, 1, 1));
assert_eq!(0, prev(t, 1, 2));
assert_eq!(0, prev(t, lines[0].end - 1, 1));
assert_eq!(0, prev(t, lines[0].end - 1, 2));
assert_eq!(0, prev(t, lines[0].end, 1));
assert_eq!(0, prev(t, lines[0].end, 2));
assert_eq!(lines[3].start, prev(t, lines[4].end - 1, 1));
assert_eq!(lines[3].start, prev(t, lines[4].end, 1));
assert_eq!(lines[4].start, prev(t, lines[4].end + 1, 1));
// The last line has no line terminator.
assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
assert_eq!(lines[5].start, prev(t, lines[5].end - 1, 0));
assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
}
// Six one-character lines, each with a terminator.
#[test]
fn preceding_lines_short() {
let t = "a\nb\nc\nd\ne\nf\n";
let lines = line_ranges(t);
assert_eq!(12, t.len());
assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
assert_eq!(lines[3].start, prev(t, lines[5].end, 2));
assert_eq!(lines[2].start, prev(t, lines[5].end, 3));
assert_eq!(lines[1].start, prev(t, lines[5].end, 4));
assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
assert_eq!(lines[0].start, prev(t, lines[5].end, 6));
assert_eq!(lines[5].start, prev(t, lines[5].end - 1, 0));
assert_eq!(lines[4].start, prev(t, lines[5].end - 1, 1));
assert_eq!(lines[3].start, prev(t, lines[5].end - 1, 2));
assert_eq!(lines[2].start, prev(t, lines[5].end - 1, 3));
assert_eq!(lines[1].start, prev(t, lines[5].end - 1, 4));
assert_eq!(lines[0].start, prev(t, lines[5].end - 1, 5));
assert_eq!(lines[0].start, prev(t, lines[5].end - 1, 6));
assert_eq!(lines[4].start, prev(t, lines[5].start, 0));
assert_eq!(lines[3].start, prev(t, lines[5].start, 1));
assert_eq!(lines[2].start, prev(t, lines[5].start, 2));
assert_eq!(lines[1].start, prev(t, lines[5].start, 3));
assert_eq!(lines[0].start, prev(t, lines[5].start, 4));
assert_eq!(lines[0].start, prev(t, lines[5].start, 5));
assert_eq!(lines[3].start, prev(t, lines[4].end - 1, 1));
assert_eq!(lines[2].start, prev(t, lines[4].start, 1));
assert_eq!(lines[2].start, prev(t, lines[3].end - 1, 1));
assert_eq!(lines[1].start, prev(t, lines[3].start, 1));
assert_eq!(lines[1].start, prev(t, lines[2].end - 1, 1));
assert_eq!(lines[0].start, prev(t, lines[2].start, 1));
assert_eq!(lines[0].start, prev(t, lines[1].end - 1, 1));
assert_eq!(lines[0].start, prev(t, lines[1].start, 1));
assert_eq!(lines[0].start, prev(t, lines[0].end - 1, 1));
assert_eq!(lines[0].start, prev(t, lines[0].start, 1));
}
// Input that begins with empty lines.
#[test]
fn preceding_lines_empty1() {
let t = "\n\n\nd\ne\nf\n";
let lines = line_ranges(t);
assert_eq!(9, t.len());
assert_eq!(lines[0].start, prev(t, lines[0].end, 0));
assert_eq!(lines[0].start, prev(t, lines[0].end, 1));
assert_eq!(lines[1].start, prev(t, lines[1].end, 0));
assert_eq!(lines[0].start, prev(t, lines[1].end, 1));
assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
assert_eq!(lines[3].start, prev(t, lines[5].end, 2));
assert_eq!(lines[2].start, prev(t, lines[5].end, 3));
assert_eq!(lines[1].start, prev(t, lines[5].end, 4));
assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
assert_eq!(lines[0].start, prev(t, lines[5].end, 6));
}
// Input with empty lines following a non-empty first line.
#[test]
fn preceding_lines_empty2() {
let t = "a\n\n\nd\ne\nf\n";
let lines = line_ranges(t);
assert_eq!(10, t.len());
assert_eq!(lines[0].start, prev(t, lines[0].end, 0));
assert_eq!(lines[0].start, prev(t, lines[0].end, 1));
assert_eq!(lines[1].start, prev(t, lines[1].end, 0));
assert_eq!(lines[0].start, prev(t, lines[1].end, 1));
assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
assert_eq!(lines[3].start, prev(t, lines[5].end, 2));
assert_eq!(lines[2].start, prev(t, lines[5].end, 3));
assert_eq!(lines[1].start, prev(t, lines[5].end, 4));
assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
assert_eq!(lines[0].start, prev(t, lines[5].end, 6));
}
}

View File

@@ -0,0 +1,25 @@
/// Like assert_eq, but nicer output for long strings.
///
/// The first two arguments are the expected and actual values; both are
/// reborrowed with `&*` before being compared and, on mismatch, printed with
/// `{}`. The remaining arguments are a `format!`-style label that is
/// included in the panic message to identify the failing comparison.
///
/// The expansion is a single block, so the macro can be used both as a
/// statement and in expression position. The label is only formatted when
/// the comparison fails.
#[cfg(test)]
#[macro_export]
macro_rules! assert_eq_printed {
    ($expected:expr, $got:expr, $($tt:tt)*) => {{
        let expected = &*$expected;
        let got = &*$got;
        if expected != got {
            // Only build the label on the failure path.
            let label = format!($($tt)*);
            panic!("
printed outputs differ! (label: {})
expected:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
got:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
", label, expected, got);
        }
    }}
}

View File

@@ -0,0 +1,591 @@
use std::cmp;
use bstr::ByteSlice;
use grep_matcher::{LineMatchKind, Matcher};
use line_buffer::BinaryDetection;
use lines::{self, LineStep};
use searcher::{Config, Range, Searcher};
use sink::{
Sink, SinkContext, SinkContextKind, SinkError, SinkFinish, SinkMatch,
};
/// The shared state used while searching a buffer and reporting results to
/// a sink.
#[derive(Debug)]
pub struct Core<'s, M: 's, S> {
// The configuration of `searcher`, borrowed for convenient access.
config: &'s Config,
// The matcher used to find matching lines.
matcher: M,
// The searcher driving this core; passed along to every sink call.
searcher: &'s Searcher,
// The destination for matches, context lines and other events.
sink: S,
// When true, every range is checked by `detect_binary` before it is sunk.
binary: bool,
// The offset in the current buffer at which searching resumes.
pos: usize,
// The number of bytes consumed by `roll` so far. This is added to
// buffer-relative offsets when reporting offsets to the sink.
absolute_byte_offset: u64,
// The offset of the first binary byte found by `detect_binary`, if any.
binary_byte_offset: Option<usize>,
// The current line number. This is `None` when line counting is disabled
// in the configuration, and otherwise starts at 1.
line_number: Option<u64>,
// The buffer offset up to which line terminators have been counted.
last_line_counted: usize,
// The end offset of the most recent range given to the sink.
last_line_visited: usize,
// How many "after" context lines may still be sunk for the last match.
after_context_left: usize,
// Whether anything has been sunk yet; consulted when deciding whether to
// emit a context break.
has_sunk: bool,
}
impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
/// Create a new search core for the given searcher, matcher and sink.
///
/// `binary` controls whether ranges are checked for binary data before
/// they are handed to the sink.
pub fn new(
    searcher: &'s Searcher,
    matcher: M,
    sink: S,
    binary: bool,
) -> Core<'s, M, S> {
    let config = &searcher.config;
    // Line numbers start at 1 and are only tracked when requested.
    let line_number = if config.line_number { Some(1) } else { None };
    let core = Core {
        config,
        matcher,
        searcher,
        sink,
        binary,
        pos: 0,
        absolute_byte_offset: 0,
        binary_byte_offset: None,
        line_number,
        last_line_counted: 0,
        last_line_visited: 0,
        after_context_left: 0,
        has_sunk: false,
    };
    // Diagnostics only: report which line oriented strategy will be used.
    // The fast/slow distinction is not meaningful for multi-line search.
    let multi_line = core.searcher.multi_line_with_matcher(&core.matcher);
    if !multi_line {
        if core.is_line_by_line_fast() {
            trace!("searcher core: will use fast line searcher");
        } else {
            trace!("searcher core: will use slow line searcher");
        }
    }
    core
}
/// Return the offset in the current buffer at which searching resumes.
pub fn pos(&self) -> usize {
self.pos
}
/// Set the offset in the current buffer at which searching resumes.
pub fn set_pos(&mut self, pos: usize) {
self.pos = pos;
}
/// Return the offset recorded by `detect_binary` for the first binary byte
/// it found, or `None` if no binary byte has been recorded.
pub fn binary_byte_offset(&self) -> Option<u64> {
self.binary_byte_offset.map(|offset| offset as u64)
}
/// Return a reference to the matcher used by this core.
pub fn matcher(&self) -> &M {
&self.matcher
}
/// Report the given range of `buf` to the sink as a match.
///
/// This returns `Ok(false)` when searching should stop, either because
/// binary detection said so or because the sink asked to stop.
pub fn matched(
&mut self,
buf: &[u8],
range: &Range,
) -> Result<bool, S::Error> {
self.sink_matched(buf, range)
}
/// Tell the sink that binary data was found at the given offset and return
/// the sink's answer unchanged.
pub fn binary_data(
&mut self,
binary_byte_offset: u64,
) -> Result<bool, S::Error> {
self.sink.binary_data(&self.searcher, binary_byte_offset)
}
/// Tell the sink that a search is beginning and return the sink's answer
/// unchanged.
pub fn begin(&mut self) -> Result<bool, S::Error> {
self.sink.begin(&self.searcher)
}
/// Tell the sink that the search has finished, passing along the given
/// byte count and binary byte offset (if any).
pub fn finish(
    &mut self,
    byte_count: u64,
    binary_byte_offset: Option<u64>,
) -> Result<(), S::Error> {
    let summary = SinkFinish { byte_count, binary_byte_offset };
    self.sink.finish(&self.searcher, &summary)
}
/// Run a line oriented search over `buf`, choosing between the fast and
/// slow implementations based on `is_line_by_line_fast`.
///
/// This returns `Ok(false)` when searching should stop.
pub fn match_by_line(&mut self, buf: &[u8]) -> Result<bool, S::Error> {
    if !self.is_line_by_line_fast() {
        return self.match_by_line_slow(buf);
    }
    self.match_by_line_fast(buf)
}
/// Account for the leading portion of `buf` that is finished with and
/// return its length in bytes.
///
/// Line counts and the absolute byte offset are advanced over the consumed
/// bytes, per-buffer bookkeeping is reset, and the search position is set
/// to the length of what remains.
/// NOTE(review): presumably the caller then discards the consumed bytes
/// from the front of its buffer; confirm against the caller.
pub fn roll(&mut self, buf: &[u8]) -> usize {
    let max_context = self.config.max_context();
    let consumed = if max_context == 0 {
        buf.len()
    } else {
        // It might seem like all we need to care about here is just
        // the "before context," but in order to sink the context
        // separator (when before_context==0 and after_context>0), we
        // need to know something about the position of the previous
        // line visited, even if we're at the beginning of the buffer.
        let context_start = lines::preceding(
            buf,
            self.config.line_term.as_byte(),
            max_context,
        );
        cmp::max(context_start, self.last_line_visited)
    };
    self.count_lines(buf, consumed);
    self.absolute_byte_offset += consumed as u64;
    self.last_line_counted = 0;
    self.last_line_visited = 0;
    self.set_pos(buf.len() - consumed);
    consumed
}
/// Check the given range of `buf` for the configured binary byte.
///
/// This returns `Ok(true)` when searching should stop. If a binary byte
/// was already recorded, no scan is done and the answer is whether a quit
/// byte is configured. Otherwise, the first binary byte found is recorded
/// and reported to the sink; the search stops if the sink says so or if a
/// quit byte is configured.
pub fn detect_binary(
    &mut self,
    buf: &[u8],
    range: &Range,
) -> Result<bool, S::Error> {
    if self.binary_byte_offset.is_some() {
        return Ok(self.config.binary.quit_byte().is_some());
    }
    let binary_byte = match self.config.binary.0 {
        BinaryDetection::Quit(b) | BinaryDetection::Convert(b) => b,
        _ => return Ok(false),
    };
    let i = match buf[*range].find_byte(binary_byte) {
        Some(i) => i,
        None => return Ok(false),
    };
    // `i` is relative to the start of `range`.
    let offset = range.start() + i;
    self.binary_byte_offset = Some(offset);
    if !self.binary_data(offset as u64)? {
        return Ok(true);
    }
    Ok(self.config.binary.quit_byte().is_some())
}
/// Sink the "before" context lines that immediately precede `upto`.
///
/// Only lines between the last visited line and `upto` are considered, and
/// at most `before_context` of them (those closest to `upto`) are sunk. A
/// context break is emitted first where one is due.
///
/// This returns `Ok(false)` when searching should stop.
pub fn before_context_by_line(
    &mut self,
    buf: &[u8],
    upto: usize,
) -> Result<bool, S::Error> {
    let before_context = self.config.before_context;
    if before_context == 0 {
        return Ok(true);
    }
    let unvisited = Range::new(self.last_line_visited, upto);
    if unvisited.is_empty() {
        return Ok(true);
    }
    let line_term = self.config.line_term.as_byte();
    // `preceding` is relative to the sub-slice, so shift the result back
    // into buffer coordinates.
    let context_start = unvisited.start()
        + lines::preceding(&buf[unvisited], line_term, before_context - 1);
    let context = Range::new(context_start, unvisited.end());
    let mut stepper =
        LineStep::new(line_term, context.start(), context.end());
    while let Some(line) = stepper.next_match(buf) {
        let keep_going = self.sink_break_context(line.start())?
            && self.sink_before_context(buf, &line)?;
        if !keep_going {
            return Ok(false);
        }
    }
    Ok(true)
}
/// Sink pending "after" context lines, starting at the last visited line
/// and going no further than `upto`.
///
/// Sinking ends once the remaining after-context budget reaches zero.
///
/// This returns `Ok(false)` when searching should stop.
pub fn after_context_by_line(
    &mut self,
    buf: &[u8],
    upto: usize,
) -> Result<bool, S::Error> {
    if self.after_context_left == 0 {
        return Ok(true);
    }
    let span = Range::new(self.last_line_visited, upto);
    let line_term = self.config.line_term.as_byte();
    let mut stepper = LineStep::new(line_term, span.start(), span.end());
    while self.after_context_left > 0 {
        let line = match stepper.next_match(buf) {
            Some(line) => line,
            None => break,
        };
        if !self.sink_after_context(buf, &line)? {
            return Ok(false);
        }
    }
    Ok(true)
}
/// Sink every line between the last visited line and `upto` as "other"
/// context.
///
/// This returns `Ok(false)` when searching should stop.
pub fn other_context_by_line(
    &mut self,
    buf: &[u8],
    upto: usize,
) -> Result<bool, S::Error> {
    let span = Range::new(self.last_line_visited, upto);
    let line_term = self.config.line_term.as_byte();
    let mut stepper = LineStep::new(line_term, span.start(), span.end());
    loop {
        let line = match stepper.next_match(buf) {
            Some(line) => line,
            None => return Ok(true),
        };
        if !self.sink_other_context(buf, &line)? {
            return Ok(false);
        }
    }
}
/// Search `buf` one line at a time, starting at the current position, and
/// run the matcher on every line.
///
/// Lines whose match status differs from `invert_match` are sunk as
/// matches (preceded by any before context). Other lines are sunk as after
/// context while the after-context budget lasts, or as "other" context in
/// passthru mode.
///
/// This returns `Ok(false)` when searching should stop.
fn match_by_line_slow(&mut self, buf: &[u8]) -> Result<bool, S::Error> {
debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher));
let range = Range::new(self.pos(), buf.len());
let mut stepper = LineStep::new(
self.config.line_term.as_byte(),
range.start(),
range.end(),
);
while let Some(line) = stepper.next_match(buf) {
let matched = {
// Stripping the line terminator is necessary to prevent some
// classes of regexes from matching the empty position *after*
// the end of the line. For example, `(?m)^$` will match at
// position (2, 2) in the string `a\n`.
let slice = lines::without_terminator(
&buf[line],
self.config.line_term,
);
match self.matcher.shortest_match(slice) {
Err(err) => return Err(S::Error::error_message(err)),
Ok(result) => result.is_some(),
}
};
// The position is advanced before sinking, so it is already past this
// line if a sink call below stops the search.
self.set_pos(line.end());
if matched != self.config.invert_match {
if !self.before_context_by_line(buf, line.start())? {
return Ok(false);
}
if !self.sink_matched(buf, &line)? {
return Ok(false);
}
} else if self.after_context_left >= 1 {
if !self.sink_after_context(buf, &line)? {
return Ok(false);
}
} else if self.config.passthru {
if !self.sink_other_context(buf, &line)? {
return Ok(false);
}
}
}
Ok(true)
}
/// Search `buf` from the current position by asking the matcher for whole
/// matching lines instead of testing every line.
///
/// Each matching line is sunk along with any surrounding context. When the
/// loop ends without being stopped, any remaining after context is sunk
/// and the position is moved to the end of `buf`.
///
/// This returns `Ok(false)` when searching should stop.
fn match_by_line_fast(&mut self, buf: &[u8]) -> Result<bool, S::Error> {
debug_assert!(!self.config.passthru);
while !buf[self.pos()..].is_empty() {
if self.config.invert_match {
if !self.match_by_line_fast_invert(buf)? {
return Ok(false);
}
} else if let Some(line) = self.find_by_line_fast(buf)? {
if self.config.max_context() > 0 {
// After context of the previous match is sunk before the before
// context of this one.
if !self.after_context_by_line(buf, line.start())? {
return Ok(false);
}
if !self.before_context_by_line(buf, line.start())? {
return Ok(false);
}
}
self.set_pos(line.end());
if !self.sink_matched(buf, &line)? {
return Ok(false);
}
} else {
// No more matching lines in this buffer.
break;
}
}
if !self.after_context_by_line(buf, buf.len())? {
return Ok(false);
}
self.set_pos(buf.len());
Ok(true)
}
/// Perform one step of an inverted fast search.
///
/// The next line matched by the matcher is found, and every line between
/// the current position and that line (or the end of `buf` when there is
/// no such line) is sunk as a match. The position is moved past the line
/// the matcher found, or to the end of `buf`.
///
/// This returns `Ok(false)` when searching should stop.
#[inline(always)]
fn match_by_line_fast_invert(
&mut self,
buf: &[u8],
) -> Result<bool, S::Error> {
assert!(self.config.invert_match);
// The range of lines that did NOT match, i.e. the inverted match.
let invert_match = match self.find_by_line_fast(buf)? {
None => {
let range = Range::new(self.pos(), buf.len());
self.set_pos(range.end());
range
}
Some(line) => {
let range = Range::new(self.pos(), line.start());
self.set_pos(line.end());
range
}
};
if invert_match.is_empty() {
return Ok(true);
}
if !self.after_context_by_line(buf, invert_match.start())? {
return Ok(false);
}
if !self.before_context_by_line(buf, invert_match.start())? {
return Ok(false);
}
let mut stepper = LineStep::new(
self.config.line_term.as_byte(),
invert_match.start(),
invert_match.end(),
);
while let Some(line) = stepper.next_match(buf) {
if !self.sink_matched(buf, &line)? {
return Ok(false);
}
}
Ok(true)
}
/// Find the next matching line in `buf` at or after the current position.
///
/// This does not change any state; callers are responsible for advancing
/// the position. `Ok(None)` is returned when there are no more matching
/// lines in `buf`.
#[inline(always)]
fn find_by_line_fast(
&self,
buf: &[u8],
) -> Result<Option<Range>, S::Error> {
debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher));
debug_assert!(self.is_line_by_line_fast());
let mut pos = self.pos();
while !buf[pos..].is_empty() {
match self.matcher.find_candidate_line(&buf[pos..]) {
Err(err) => return Err(S::Error::error_message(err)),
Ok(None) => return Ok(None),
Ok(Some(LineMatchKind::Confirmed(i))) => {
// `i` is relative to `pos`; expand it to the line containing it.
let line = lines::locate(
buf,
self.config.line_term.as_byte(),
Range::zero(i).offset(pos),
);
// If we matched beyond the end of the buffer, then we
// don't report this as a match.
if line.start() == buf.len() {
pos = buf.len();
continue;
}
return Ok(Some(line));
}
Ok(Some(LineMatchKind::Candidate(i))) => {
// A candidate must still be verified by running the matcher on
// the line itself.
let line = lines::locate(
buf,
self.config.line_term.as_byte(),
Range::zero(i).offset(pos),
);
// We need to strip the line terminator here to match the
// semantics of line-by-line searching. Namely, regexes
// like `(?m)^$` can match at the final position beyond a
// line terminator, which is non-sensical in line oriented
// matching.
let slice = lines::without_terminator(
&buf[line],
self.config.line_term,
);
match self.matcher.is_match(slice) {
Err(err) => return Err(S::Error::error_message(err)),
Ok(true) => return Ok(Some(line)),
Ok(false) => {
// False positive; resume after this line.
pos = line.end();
continue;
}
}
}
}
}
Ok(None)
}
/// Report the given range of `buf` to the sink as a match.
///
/// Binary detection (when enabled) and the context break check run first.
/// If the sink accepts the match, the range is recorded as visited and the
/// after-context budget is reset to its configured value.
///
/// This returns `Ok(false)` when searching should stop.
#[inline(always)]
fn sink_matched(
    &mut self,
    buf: &[u8],
    range: &Range,
) -> Result<bool, S::Error> {
    if self.binary && self.detect_binary(buf, range)? {
        return Ok(false);
    }
    if !self.sink_break_context(range.start())? {
        return Ok(false);
    }
    // Bring the line number up to date with the start of this match.
    self.count_lines(buf, range.start());
    let mat = SinkMatch {
        line_term: self.config.line_term,
        bytes: &buf[*range],
        absolute_byte_offset: self.absolute_byte_offset
            + range.start() as u64,
        line_number: self.line_number,
    };
    if !self.sink.matched(&self.searcher, &mat)? {
        return Ok(false);
    }
    self.last_line_visited = range.end();
    self.after_context_left = self.config.after_context;
    self.has_sunk = true;
    Ok(true)
}
/// Report the given range of `buf` to the sink as a "before" context line.
///
/// Binary detection (when enabled) runs first. If the sink accepts the
/// line, the range is recorded as visited.
///
/// This returns `Ok(false)` when searching should stop.
fn sink_before_context(
    &mut self,
    buf: &[u8],
    range: &Range,
) -> Result<bool, S::Error> {
    if self.binary && self.detect_binary(buf, range)? {
        return Ok(false);
    }
    // Bring the line number up to date with the start of this line.
    self.count_lines(buf, range.start());
    let context = SinkContext {
        line_term: self.config.line_term,
        bytes: &buf[*range],
        kind: SinkContextKind::Before,
        absolute_byte_offset: self.absolute_byte_offset
            + range.start() as u64,
        line_number: self.line_number,
    };
    if !self.sink.context(&self.searcher, &context)? {
        return Ok(false);
    }
    self.last_line_visited = range.end();
    self.has_sunk = true;
    Ok(true)
}
/// Report the given range of `buf` to the sink as an "after" context line.
///
/// Binary detection (when enabled) runs first. If the sink accepts the
/// line, the range is recorded as visited and the after-context budget is
/// reduced by one.
///
/// This returns `Ok(false)` when searching should stop.
///
/// # Panics
///
/// This panics if there is no after-context budget left.
fn sink_after_context(
    &mut self,
    buf: &[u8],
    range: &Range,
) -> Result<bool, S::Error> {
    assert!(self.after_context_left >= 1);
    if self.binary && self.detect_binary(buf, range)? {
        return Ok(false);
    }
    // Bring the line number up to date with the start of this line.
    self.count_lines(buf, range.start());
    let context = SinkContext {
        line_term: self.config.line_term,
        bytes: &buf[*range],
        kind: SinkContextKind::After,
        absolute_byte_offset: self.absolute_byte_offset
            + range.start() as u64,
        line_number: self.line_number,
    };
    if !self.sink.context(&self.searcher, &context)? {
        return Ok(false);
    }
    self.last_line_visited = range.end();
    self.after_context_left -= 1;
    self.has_sunk = true;
    Ok(true)
}
/// Report the given range of `buf` to the sink as an "other" context line.
///
/// Binary detection (when enabled) runs first. If the sink accepts the
/// line, the range is recorded as visited.
///
/// This returns `Ok(false)` when searching should stop.
fn sink_other_context(
    &mut self,
    buf: &[u8],
    range: &Range,
) -> Result<bool, S::Error> {
    if self.binary && self.detect_binary(buf, range)? {
        return Ok(false);
    }
    // Bring the line number up to date with the start of this line.
    self.count_lines(buf, range.start());
    let context = SinkContext {
        line_term: self.config.line_term,
        bytes: &buf[*range],
        kind: SinkContextKind::Other,
        absolute_byte_offset: self.absolute_byte_offset
            + range.start() as u64,
        line_number: self.line_number,
    };
    if !self.sink.context(&self.searcher, &context)? {
        return Ok(false);
    }
    self.last_line_visited = range.end();
    self.has_sunk = true;
    Ok(true)
}
/// Emit a context break to the sink if one is due before the line starting
/// at `start_of_line`.
///
/// A break is due only when context is enabled, something has already been
/// sunk, and there is a gap between the last visited line and the given
/// line. When no break is due, this returns `Ok(true)`; otherwise the
/// sink's answer is returned.
fn sink_break_context(
    &mut self,
    start_of_line: usize,
) -> Result<bool, S::Error> {
    let context_enabled =
        self.config.before_context > 0 || self.config.after_context > 0;
    let has_gap = self.last_line_visited < start_of_line;
    if context_enabled && self.has_sunk && has_gap {
        self.sink.context_break(&self.searcher)
    } else {
        Ok(true)
    }
}
/// Advance the line number over the part of `buf` between the last counted
/// offset and `upto`.
///
/// This does nothing when line counting is disabled or when everything up
/// to `upto` has already been counted.
fn count_lines(&mut self, buf: &[u8], upto: usize) {
    let line_number = match self.line_number {
        Some(ref mut line_number) => line_number,
        None => return,
    };
    if self.last_line_counted >= upto {
        return;
    }
    let uncounted = &buf[self.last_line_counted..upto];
    *line_number +=
        lines::count(uncounted, self.config.line_term.as_byte());
    self.last_line_counted = upto;
}
/// Report whether the fast line oriented search can be used.
///
/// The fast path is never used in passthru mode. Otherwise it is used when
/// the matcher reports the same line terminator as the configuration, or
/// when the matcher reports that it can never match the line terminator
/// byte.
fn is_line_by_line_fast(&self) -> bool {
    debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher));

    if self.config.passthru {
        return false;
    }
    let line_term = self.config.line_term;
    if self.matcher.line_terminator() == Some(line_term) {
        return true;
    }
    // If the line terminator is CRLF, we don't actually need to care
    // whether the regex can match `\r` or not. Namely, a `\r` is
    // neither necessary nor sufficient to terminate a line. A `\n` is
    // always required.
    self.matcher
        .non_matching_bytes()
        .map_or(false, |set| set.contains(line_term.as_byte()))
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,106 @@
use std::fs::File;
use std::path::Path;
use memmap::Mmap;
/// Controls the strategy used for determining when to use memory maps.
///
/// If a searcher is called in circumstances where it is possible to use memory
/// maps, and memory maps are enabled, then it will attempt to do so if it
/// believes it will make the search faster.
///
/// By default, memory maps are disabled.
///
/// Values are created with the `auto` and `never` constructors; the wrapped
/// strategy itself is private.
#[derive(Clone, Debug)]
pub struct MmapChoice(MmapChoiceImpl);
/// The private representation of a memory map strategy.
#[derive(Clone, Debug)]
enum MmapChoiceImpl {
// Memory maps are enabled; produced only by the unsafe `auto` constructor.
Auto,
// Memory maps are disabled; this is the default.
Never,
}
impl Default for MmapChoice {
    /// The default strategy never uses memory maps.
    fn default() -> MmapChoice {
        let disabled = MmapChoiceImpl::Never;
        MmapChoice(disabled)
    }
}
impl MmapChoice {
    /// Use memory maps when they are believed to be advantageous.
    ///
    /// Whether a memory map is actually used is decided by heuristics that
    /// may depend on many things, including but not limited to, file size
    /// and platform.
    ///
    /// When memory maps are unavailable, or cannot be used for a particular
    /// input, normal OS read calls are used instead.
    ///
    /// # Safety
    ///
    /// This constructor is not safe because there is no obvious way to
    /// encapsulate the safety of file backed memory maps on all platforms
    /// without simultaneously negating some or all of their benefits.
    ///
    /// The contract the caller is required to uphold isn't precise, but it
    /// basically amounts to something like, "the caller guarantees that the
    /// underlying file won't be mutated." This, of course, isn't feasible in
    /// many environments. However, command line tools may still decide to
    /// take the risk of, say, a `SIGBUS` occurring while attempting to read
    /// a memory map.
    pub unsafe fn auto() -> MmapChoice {
        MmapChoice(MmapChoiceImpl::Auto)
    }

    /// Never use memory maps, no matter what. This is the default.
    pub fn never() -> MmapChoice {
        MmapChoice(MmapChoiceImpl::Never)
    }

    /// Return a memory map if memory maps are enabled and if creating a
    /// memory map from the given file succeeded and if memory maps are
    /// believed to be advantageous for performance.
    ///
    /// If this does attempt to open a memory map and it fails, then `None`
    /// is returned and the corresponding error (along with the file path, if
    /// present) is logged at the debug level.
    pub(crate) fn open(
        &self,
        file: &File,
        path: Option<&Path>,
    ) -> Option<Mmap> {
        // I guess memory maps on macOS aren't great. Should re-evaluate.
        if !self.is_enabled() || cfg!(target_os = "macos") {
            return None;
        }
        // SAFETY: This is acceptable because the only way `MmapChoiceImpl` can
        // be `Auto` is if the caller invoked the `auto` constructor, which
        // is itself not safe. Thus, this is a propagation of the caller's
        // assertion that using memory maps is safe.
        let err = match unsafe { Mmap::map(file) } {
            Ok(mmap) => return Some(mmap),
            Err(err) => err,
        };
        match path {
            Some(path) => debug!(
                "{}: failed to open memory map: {}",
                path.display(),
                err
            ),
            None => debug!("failed to open memory map: {}", err),
        }
        None
    }

    /// Whether this strategy may employ memory maps or not.
    pub(crate) fn is_enabled(&self) -> bool {
        match self.0 {
            MmapChoiceImpl::Never => false,
            MmapChoiceImpl::Auto => true,
        }
    }
}

File diff suppressed because it is too large Load Diff

644
crates/searcher/src/sink.rs Normal file
View File

@@ -0,0 +1,644 @@
use std::error;
use std::fmt;
use std::io;
use grep_matcher::LineTerminator;
use lines::LineIter;
use searcher::{ConfigError, Searcher};
/// A trait that describes errors that can be reported by searchers and
/// implementations of `Sink`.
///
/// Unless you have a specialized use case, you probably don't need to
/// implement this trait explicitly. It's likely that using `io::Error` (which
/// implements this trait) for your error type is good enough, largely because
/// most errors that occur during search will likely be an `io::Error`.
///
/// Only `error_message` must be provided by implementors. Both `error_io` and
/// `error_config` have default implementations that forward to it.
pub trait SinkError: Sized {
/// A constructor for converting any value that satisfies the
/// `fmt::Display` trait into an error.
fn error_message<T: fmt::Display>(message: T) -> Self;
/// A constructor for converting I/O errors that occur while searching into
/// an error of this type.
///
/// By default, this is implemented via the `error_message` constructor.
fn error_io(err: io::Error) -> Self {
Self::error_message(err)
}
/// A constructor for converting configuration errors that occur while
/// building a searcher into an error of this type.
///
/// By default, this is implemented via the `error_message` constructor.
fn error_config(err: ConfigError) -> Self {
Self::error_message(err)
}
}
/// `io::Error` is usable as the error type of a `Sink` implementation out of
/// the box.
impl SinkError for io::Error {
    /// An I/O error is already of the right type, so it is handed back
    /// untouched instead of being converted to a message.
    fn error_io(err: io::Error) -> io::Error {
        err
    }

    /// Render `message` to a string and wrap it in an I/O error whose kind
    /// is `io::ErrorKind::Other`.
    fn error_message<T: fmt::Display>(message: T) -> io::Error {
        let rendered = message.to_string();
        io::Error::new(io::ErrorKind::Other, rendered)
    }
}
/// A `Box<dyn std::error::Error>` is usable as the error type of a `Sink`
/// implementation out of the box.
impl SinkError for Box<dyn error::Error> {
    /// Render `message` to a string and box that string as an error.
    fn error_message<T: fmt::Display>(message: T) -> Box<dyn error::Error> {
        message.to_string().into()
    }
}
/// A trait that defines how results from searchers are handled.
///
/// In this crate, a searcher follows the "push" model. What that means is that
/// the searcher drives execution, and pushes results back to the caller. This
/// is in contrast to a "pull" model where the caller drives execution and
/// takes results as they need them. These are also known as "internal" and
/// "external" iteration strategies, respectively.
///
/// For a variety of reasons, including the complexity of the searcher
/// implementation, this crate chooses the "push" or "internal" model of
/// execution. Thus, in order to act on search results, callers must provide
/// an implementation of this trait to a searcher, and the searcher is then
/// responsible for calling the methods on this trait.
///
/// This trait defines several behaviors:
///
/// * What to do when a match is found. Callers must provide this.
/// * What to do when an error occurs. Callers must provide this via the
/// [`SinkError`](trait.SinkError.html) trait. Generally, callers can just
/// use `io::Error` for this, which already implements `SinkError`.
/// * What to do when a contextual line is found. By default, these are
/// ignored.
/// * What to do when a gap between contextual lines has been found. By
/// default, this is ignored.
/// * What to do when binary data has been found. By default, this is
/// ignored.
/// * What to do when a search has started. By default, this does nothing.
/// * What to do when a search has finished successfully. By default, this does
/// nothing.
///
/// Callers must, at minimum, specify the behavior when an error occurs and
/// the behavior when a match occurs. The rest is optional. For each behavior,
/// callers may report an error (say, if writing the result to another
/// location failed) or simply return `false` if they want the search to stop
/// (e.g., when implementing a cap on the number of search results to show).
///
/// When errors are reported (whether in the searcher or in the implementation
/// of `Sink`), then searchers quit immediately without calling `finish`.
///
/// For simpler uses of `Sink`, callers may elect to use one of
/// the more convenient but less flexible implementations in the
/// [`sinks`](sinks/index.html) module.
pub trait Sink {
/// The type of an error that should be reported by a searcher.
///
/// Errors of this type are not only returned by the methods on this
/// trait, but the constructors defined in `SinkError` are also used in
/// the searcher implementation itself, e.g., when an I/O error occurs
/// while reading data from a file.
type Error: SinkError;
/// This method is called whenever a match is found.
///
/// If multi line is enabled on the searcher, then the match reported here
/// may span multiple lines and it may include multiple matches. When multi
/// line is disabled, then the match is guaranteed to span exactly one
/// non-empty line (where a single line is, at minimum, a line terminator).
///
/// If this returns `true`, then searching continues. If this returns
/// `false`, then searching is stopped immediately and `finish` is called.
///
/// If this returns an error, then searching is stopped immediately,
/// `finish` is not called and the error is bubbled back up to the caller
/// of the searcher.
fn matched(
&mut self,
_searcher: &Searcher,
_mat: &SinkMatch,
) -> Result<bool, Self::Error>;
/// This method is called whenever a context line is found, and is optional
/// to implement. By default, it does nothing and returns `true`.
///
/// In all cases, the context given is guaranteed to span exactly one
/// non-empty line (where a single line is, at minimum, a line terminator).
///
/// If this returns `true`, then searching continues. If this returns
/// `false`, then searching is stopped immediately and `finish` is called.
///
/// If this returns an error, then searching is stopped immediately,
/// `finish` is not called and the error is bubbled back up to the caller
/// of the searcher.
#[inline]
fn context(
&mut self,
_searcher: &Searcher,
_context: &SinkContext,
) -> Result<bool, Self::Error> {
Ok(true)
}
/// This method is called whenever a break in contextual lines is found,
/// and is optional to implement. By default, it does nothing and returns
/// `true`.
///
/// A break can only occur when context reporting is enabled (that is,
/// either or both of `before_context` or `after_context` are greater than
/// `0`). More precisely, a break occurs between non-contiguous groups of
/// lines.
///
/// If this returns `true`, then searching continues. If this returns
/// `false`, then searching is stopped immediately and `finish` is called.
///
/// If this returns an error, then searching is stopped immediately,
/// `finish` is not called and the error is bubbled back up to the caller
/// of the searcher.
#[inline]
fn context_break(
&mut self,
_searcher: &Searcher,
) -> Result<bool, Self::Error> {
Ok(true)
}
/// This method is called whenever binary detection is enabled and binary
/// data is found, and is optional to implement. If binary data is found,
/// then this is called at least once for the first occurrence with the
/// absolute byte offset at which the binary data begins.
///
/// If this returns `true`, then searching continues. If this returns
/// `false`, then searching is stopped immediately and `finish` is called.
///
/// If this returns an error, then searching is stopped immediately,
/// `finish` is not called and the error is bubbled back up to the caller
/// of the searcher.
///
/// By default, it does nothing and returns `true`.
#[inline]
fn binary_data(
&mut self,
_searcher: &Searcher,
_binary_byte_offset: u64,
) -> Result<bool, Self::Error> {
Ok(true)
}
/// This method is called when a search has begun, before any search is
/// executed. By default, this does nothing and returns `true`.
///
/// If this returns `true`, then searching continues. If this returns
/// `false`, then searching is stopped immediately and `finish` is called.
///
/// If this returns an error, then searching is stopped immediately,
/// `finish` is not called and the error is bubbled back up to the caller
/// of the searcher.
#[inline]
fn begin(&mut self, _searcher: &Searcher) -> Result<bool, Self::Error> {
Ok(true)
}
/// This method is called when a search has completed. By default, this
/// does nothing.
///
/// The `SinkFinish` value carries summary data about the search, such as
/// the number of bytes searched.
///
/// If this returns an error, the error is bubbled back up to the caller of
/// the searcher.
#[inline]
fn finish(
&mut self,
_searcher: &Searcher,
_: &SinkFinish,
) -> Result<(), Self::Error> {
Ok(())
}
}
/// A mutable reference to a `Sink` is itself a `Sink`.
///
/// Every method forwards to the referenced sink, so a caller can lend a sink
/// to a searcher and keep using it afterwards.
///
/// `S` may be unsized, mirroring the `Box<S>` implementation. This means a
/// `&mut dyn Sink<Error = E>` can be used wherever a `Sink` is expected.
impl<'a, S: Sink + ?Sized> Sink for &'a mut S {
    type Error = S::Error;

    /// Forwards to the `matched` method of the referenced sink.
    #[inline]
    fn matched(
        &mut self,
        searcher: &Searcher,
        mat: &SinkMatch,
    ) -> Result<bool, S::Error> {
        (**self).matched(searcher, mat)
    }

    /// Forwards to the `context` method of the referenced sink.
    #[inline]
    fn context(
        &mut self,
        searcher: &Searcher,
        context: &SinkContext,
    ) -> Result<bool, S::Error> {
        (**self).context(searcher, context)
    }

    /// Forwards to the `context_break` method of the referenced sink.
    #[inline]
    fn context_break(
        &mut self,
        searcher: &Searcher,
    ) -> Result<bool, S::Error> {
        (**self).context_break(searcher)
    }

    /// Forwards to the `binary_data` method of the referenced sink.
    #[inline]
    fn binary_data(
        &mut self,
        searcher: &Searcher,
        binary_byte_offset: u64,
    ) -> Result<bool, S::Error> {
        (**self).binary_data(searcher, binary_byte_offset)
    }

    /// Forwards to the `begin` method of the referenced sink.
    #[inline]
    fn begin(&mut self, searcher: &Searcher) -> Result<bool, S::Error> {
        (**self).begin(searcher)
    }

    /// Forwards to the `finish` method of the referenced sink.
    #[inline]
    fn finish(
        &mut self,
        searcher: &Searcher,
        sink_finish: &SinkFinish,
    ) -> Result<(), S::Error> {
        (**self).finish(searcher, sink_finish)
    }
}
/// A boxed `Sink` (including a boxed trait object) is itself a `Sink`.
///
/// Every method forwards to the sink stored in the box.
impl<S: Sink + ?Sized> Sink for Box<S> {
    type Error = S::Error;

    /// Forwards to the `matched` method of the boxed sink.
    #[inline]
    fn matched(
        &mut self,
        searcher: &Searcher,
        mat: &SinkMatch,
    ) -> Result<bool, S::Error> {
        S::matched(&mut **self, searcher, mat)
    }

    /// Forwards to the `context` method of the boxed sink.
    #[inline]
    fn context(
        &mut self,
        searcher: &Searcher,
        context: &SinkContext,
    ) -> Result<bool, S::Error> {
        S::context(&mut **self, searcher, context)
    }

    /// Forwards to the `context_break` method of the boxed sink.
    #[inline]
    fn context_break(
        &mut self,
        searcher: &Searcher,
    ) -> Result<bool, S::Error> {
        S::context_break(&mut **self, searcher)
    }

    /// Forwards to the `binary_data` method of the boxed sink.
    #[inline]
    fn binary_data(
        &mut self,
        searcher: &Searcher,
        binary_byte_offset: u64,
    ) -> Result<bool, S::Error> {
        S::binary_data(&mut **self, searcher, binary_byte_offset)
    }

    /// Forwards to the `begin` method of the boxed sink.
    #[inline]
    fn begin(&mut self, searcher: &Searcher) -> Result<bool, S::Error> {
        S::begin(&mut **self, searcher)
    }

    /// Forwards to the `finish` method of the boxed sink.
    #[inline]
    fn finish(
        &mut self,
        searcher: &Searcher,
        sink_finish: &SinkFinish,
    ) -> Result<(), S::Error> {
        S::finish(&mut **self, searcher, sink_finish)
    }
}
/// Summary data reported at the end of a search.
///
/// It carries the total number of bytes searched and, if any binary data was
/// found, the absolute offset of its first occurrence.
///
/// A searcher that stops early because of an error does not call `finish`.
/// A searcher that stops early because the `Sink` implementor instructed it
/// to will still call `finish`.
#[derive(Clone, Debug)]
pub struct SinkFinish {
    pub(crate) byte_count: u64,
    pub(crate) binary_byte_offset: Option<u64>,
}

impl SinkFinish {
    /// If binary detection is enabled and if binary data was found, then
    /// this returns the absolute byte offset of the first detected byte of
    /// binary data.
    ///
    /// Note that since this is an absolute byte offset, it cannot be relied
    /// upon to index into any addressable memory.
    #[inline]
    pub fn binary_byte_offset(&self) -> Option<u64> {
        let SinkFinish { binary_byte_offset, .. } = *self;
        binary_byte_offset
    }

    /// Return the total number of bytes searched.
    #[inline]
    pub fn byte_count(&self) -> u64 {
        let SinkFinish { byte_count, .. } = *self;
        byte_count
    }
}
/// A type that describes a match reported by a searcher.
///
/// The lifetime `'b` is the lifetime of the matched bytes.
#[derive(Clone, Debug)]
pub struct SinkMatch<'b> {
/// The line terminator used to split `bytes` into lines.
pub(crate) line_term: LineTerminator,
/// The bytes of all matching lines.
pub(crate) bytes: &'b [u8],
/// The offset of the start of this match, relative to the beginning of
/// the input.
pub(crate) absolute_byte_offset: u64,
/// The line number of the first line in this match, if available.
pub(crate) line_number: Option<u64>,
}
impl<'b> SinkMatch<'b> {
    /// Returns the bytes for all matching lines, including the line
    /// terminators, if they exist.
    #[inline]
    pub fn bytes(&self) -> &'b [u8] {
        self.bytes
    }

    /// Return an iterator over the lines in this match.
    ///
    /// With multi line search enabled this may yield more than one line
    /// (but always at least one). With multi line search disabled this
    /// always reports exactly one line (which may consist of just the line
    /// terminator).
    ///
    /// Lines yielded by this iterator include their terminators.
    #[inline]
    pub fn lines(&self) -> LineIter<'b> {
        let terminator = self.line_term.as_byte();
        LineIter::new(terminator, self.bytes)
    }

    /// Returns the absolute byte offset of the start of this match.
    ///
    /// The offset is absolute in that it is relative to the very beginning
    /// of the input in a search, and can never be relied upon to be a valid
    /// index into an in-memory slice.
    #[inline]
    pub fn absolute_byte_offset(&self) -> u64 {
        self.absolute_byte_offset
    }

    /// Returns the line number of the first line in this match, if
    /// available.
    ///
    /// Line numbers are only available when the search builder is
    /// instructed to compute them.
    #[inline]
    pub fn line_number(&self) -> Option<u64> {
        self.line_number
    }
}
/// The type of context reported by a searcher.
///
/// This is the value returned by `SinkContext::kind`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum SinkContextKind {
/// The line reported occurred before a match.
Before,
/// The line reported occurred after a match.
After,
/// Any other type of context reported, e.g., as a result of a searcher's
/// "passthru" mode.
Other,
}
/// A type that describes a contextual line reported by a searcher.
///
/// The lifetime `'b` is the lifetime of the context bytes.
#[derive(Clone, Debug)]
pub struct SinkContext<'b> {
/// The line terminator used to split `bytes` into lines.
pub(crate) line_term: LineTerminator,
/// The context bytes, including line terminators.
pub(crate) bytes: &'b [u8],
/// Whether this context came before a match, after a match or otherwise.
pub(crate) kind: SinkContextKind,
/// The offset of the start of this context, relative to the beginning of
/// the input.
pub(crate) absolute_byte_offset: u64,
/// The line number of the first line in this context, if available.
pub(crate) line_number: Option<u64>,
}
impl<'b> SinkContext<'b> {
    /// Returns the context bytes, including line terminators.
    #[inline]
    pub fn bytes(&self) -> &'b [u8] {
        self.bytes
    }

    /// Returns the type of context.
    #[inline]
    pub fn kind(&self) -> &SinkContextKind {
        &self.kind
    }

    /// Return an iterator over the lines in this context.
    ///
    /// This always yields exactly one line (and that one line may contain
    /// just the line terminator).
    ///
    /// Lines yielded by this iterator include their terminators.
    #[cfg(test)]
    pub(crate) fn lines(&self) -> LineIter<'b> {
        let terminator = self.line_term.as_byte();
        LineIter::new(terminator, self.bytes)
    }

    /// Returns the absolute byte offset of the start of this context.
    ///
    /// The offset is absolute in that it is relative to the very beginning
    /// of the input in a search, and can never be relied upon to be a valid
    /// index into an in-memory slice.
    #[inline]
    pub fn absolute_byte_offset(&self) -> u64 {
        self.absolute_byte_offset
    }

    /// Returns the line number of the first line in this context, if
    /// available.
    ///
    /// Line numbers are only available when the search builder is
    /// instructed to compute them.
    #[inline]
    pub fn line_number(&self) -> Option<u64> {
        self.line_number
    }
}
/// A collection of convenience implementations of `Sink`.
///
/// Each implementation in this module makes some kind of sacrifice in the name
/// of making common cases easier to use. Most frequently, each type is a
/// wrapper around a closure specified by the caller that provides limited
/// access to the full suite of information available to implementors of
/// `Sink`.
///
/// For example, the `UTF8` sink makes the following sacrifices:
///
/// * All matches must be UTF-8. An arbitrary `Sink` does not have this
///   restriction and can deal with arbitrary data. If this sink sees invalid
///   UTF-8, then an error is returned and searching stops. (Use the `Lossy`
///   sink instead to suppress this error.)
/// * The searcher must be configured to report line numbers. If it isn't,
///   an error is reported at the first match and searching stops.
/// * Context lines, context breaks and summary data reported at the end of
///   a search are all ignored.
/// * Implementors are forced to use `io::Error` as their error type.
///
/// If you need more flexibility, then you're advised to implement the `Sink`
/// trait directly.
pub mod sinks {
    use std::io;
    use std::str;

    use super::{Sink, SinkError, SinkMatch};
    use searcher::Searcher;

    /// Return the line number attached to the given match, or an error if
    /// the searcher was not configured to count lines.
    fn required_line_number(mat: &SinkMatch) -> Result<u64, io::Error> {
        mat.line_number().ok_or_else(|| {
            io::Error::error_message("line numbers not enabled")
        })
    }

    /// A sink that provides line numbers and matches as strings while
    /// ignoring everything else.
    ///
    /// This implementation will return an error if a match contains invalid
    /// UTF-8 or if the searcher was not configured to count lines. Errors
    /// on invalid UTF-8 can be suppressed by using the `Lossy` sink instead
    /// of this one.
    ///
    /// The closure accepts two parameters: a line number and a UTF-8 string
    /// containing the matched data. The closure returns a
    /// `Result<bool, io::Error>`. If the `bool` is `false`, then the search
    /// stops immediately. Otherwise, searching continues.
    ///
    /// If multi line mode was enabled, the line number refers to the line
    /// number of the first line in the match.
    #[derive(Clone, Debug)]
    pub struct UTF8<F>(pub F)
    where
        F: FnMut(u64, &str) -> Result<bool, io::Error>;

    impl<F> Sink for UTF8<F>
    where
        F: FnMut(u64, &str) -> Result<bool, io::Error>,
    {
        type Error = io::Error;

        fn matched(
            &mut self,
            _searcher: &Searcher,
            mat: &SinkMatch,
        ) -> Result<bool, io::Error> {
            // UTF-8 validity is checked before the line number, so invalid
            // UTF-8 is the error reported when both checks would fail.
            let text = str::from_utf8(mat.bytes())
                .map_err(io::Error::error_message)?;
            let line_number = required_line_number(mat)?;
            (self.0)(line_number, text)
        }
    }

    /// A sink that provides line numbers and matches as (lossily converted)
    /// strings while ignoring everything else.
    ///
    /// This is like `UTF8`, except that if a match contains invalid UTF-8,
    /// then it will be lossily converted to valid UTF-8 by substituting
    /// invalid UTF-8 with Unicode replacement characters.
    ///
    /// This implementation will return an error on the first match if the
    /// searcher was not configured to count lines.
    ///
    /// The closure accepts two parameters: a line number and a UTF-8 string
    /// containing the matched data. The closure returns a
    /// `Result<bool, io::Error>`. If the `bool` is `false`, then the search
    /// stops immediately. Otherwise, searching continues.
    ///
    /// If multi line mode was enabled, the line number refers to the line
    /// number of the first line in the match.
    #[derive(Clone, Debug)]
    pub struct Lossy<F>(pub F)
    where
        F: FnMut(u64, &str) -> Result<bool, io::Error>;

    impl<F> Sink for Lossy<F>
    where
        F: FnMut(u64, &str) -> Result<bool, io::Error>,
    {
        type Error = io::Error;

        fn matched(
            &mut self,
            _searcher: &Searcher,
            mat: &SinkMatch,
        ) -> Result<bool, io::Error> {
            use std::borrow::Cow;

            let bytes = mat.bytes();
            // TODO: In theory, it should be possible to amortize
            // allocation here, but `std` doesn't provide such an API.
            // Regardless, this only happens on matches with invalid UTF-8,
            // which should be pretty rare.
            let text: Cow<str> = str::from_utf8(bytes)
                .map(Cow::Borrowed)
                .unwrap_or_else(|_| String::from_utf8_lossy(bytes));
            let line_number = required_line_number(mat)?;
            (self.0)(line_number, &text)
        }
    }

    /// A sink that provides line numbers and matches as raw bytes while
    /// ignoring everything else.
    ///
    /// This implementation will return an error on the first match if the
    /// searcher was not configured to count lines.
    ///
    /// The closure accepts two parameters: a line number and a raw byte
    /// string containing the matched data. The closure returns a
    /// `Result<bool, io::Error>`. If the `bool` is `false`, then the search
    /// stops immediately. Otherwise, searching continues.
    ///
    /// If multi line mode was enabled, the line number refers to the line
    /// number of the first line in the match.
    #[derive(Clone, Debug)]
    pub struct Bytes<F>(pub F)
    where
        F: FnMut(u64, &[u8]) -> Result<bool, io::Error>;

    impl<F> Sink for Bytes<F>
    where
        F: FnMut(u64, &[u8]) -> Result<bool, io::Error>,
    {
        type Error = io::Error;

        fn matched(
            &mut self,
            _searcher: &Searcher,
            mat: &SinkMatch,
        ) -> Result<bool, io::Error> {
            let line_number = required_line_number(mat)?;
            (self.0)(line_number, mat.bytes())
        }
    }
}

View File

@@ -0,0 +1,788 @@
use std::io::{self, Write};
use std::str;
use bstr::ByteSlice;
use grep_matcher::{
LineMatchKind, LineTerminator, Match, Matcher, NoCaptures, NoError,
};
use regex::bytes::{Regex, RegexBuilder};
use searcher::{BinaryDetection, Searcher, SearcherBuilder};
use sink::{Sink, SinkContext, SinkFinish, SinkMatch};
/// A simple regex matcher.
///
/// This supports setting the matcher's line terminator configuration directly,
/// which we use for testing purposes. That is, the caller explicitly
/// determines whether the line terminator optimization is enabled. (In reality
/// this optimization is detected automatically by inspecting and possibly
/// modifying the regex itself.)
#[derive(Clone, Debug)]
pub struct RegexMatcher {
/// The compiled regex used to find matches.
regex: Regex,
/// The line terminator reported by this matcher, if one was set via
/// `set_line_term`.
line_term: Option<LineTerminator>,
/// When true, `find_candidate_line` reports a candidate for every line
/// instead of running the regex.
every_line_is_candidate: bool,
}
impl RegexMatcher {
    /// Create a new regex matcher.
    ///
    /// The matcher starts with no line terminator set and does not treat
    /// every line as a candidate. This panics if `pattern` fails to compile.
    pub fn new(pattern: &str) -> RegexMatcher {
        let mut builder = RegexBuilder::new(pattern);
        // permits ^ and $ to match at \n boundaries
        builder.multi_line(true);
        RegexMatcher {
            regex: builder.build().unwrap(),
            line_term: None,
            every_line_is_candidate: false,
        }
    }

    /// Forcefully set the line terminator of this matcher.
    ///
    /// By default, this matcher has no line terminator set.
    pub fn set_line_term(
        &mut self,
        line_term: Option<LineTerminator>,
    ) -> &mut RegexMatcher {
        self.line_term = line_term;
        self
    }

    /// Whether to return every line as a candidate or not.
    ///
    /// This forces searchers to handle the case of reporting a false
    /// positive.
    pub fn every_line_is_candidate(
        &mut self,
        yes: bool,
    ) -> &mut RegexMatcher {
        self.every_line_is_candidate = yes;
        self
    }
}
impl Matcher for RegexMatcher {
    type Captures = NoCaptures;
    type Error = NoError;

    /// Find the first regex match in `haystack` starting the search at
    /// offset `at`.
    fn find_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, NoError> {
        let found = self.regex.find_at(haystack, at);
        Ok(found.map(|m| Match::new(m.start(), m.end())))
    }

    /// This matcher does not support capture groups.
    fn new_captures(&self) -> Result<NoCaptures, NoError> {
        Ok(NoCaptures::new())
    }

    /// Report whichever line terminator was set via `set_line_term`.
    fn line_terminator(&self) -> Option<LineTerminator> {
        self.line_term
    }

    /// Find a line that may (or does) contain a match.
    ///
    /// When `every_line_is_candidate` is enabled, the first line of a
    /// non-empty haystack is always reported as an unconfirmed candidate.
    /// That mode requires a line terminator to be set and panics otherwise.
    /// When disabled, the regex is run and any match is reported as
    /// confirmed.
    fn find_candidate_line(
        &self,
        haystack: &[u8],
    ) -> Result<Option<LineMatchKind>, NoError> {
        if !self.every_line_is_candidate {
            let shortest = self.shortest_match(haystack)?;
            return Ok(shortest.map(LineMatchKind::Confirmed));
        }
        assert!(self.line_term.is_some());
        if haystack.is_empty() {
            return Ok(None);
        }
        // Make it interesting and return the last byte in the current
        // line.
        let terminator = self.line_term.unwrap().as_byte();
        let last = match haystack.find_byte(terminator) {
            Some(i) => i,
            // No terminator present, so the line runs to the end of the
            // (non-empty) haystack.
            None => haystack.len() - 1,
        };
        Ok(Some(LineMatchKind::Candidate(last)))
    }
}
/// An implementation of Sink that prints all available information.
///
/// Tests use this to confirm that data is being passed to a Sink correctly.
/// Everything reported is appended to an in-memory byte buffer.
#[derive(Clone, Debug)]
pub struct KitchenSink(Vec<u8>);

impl KitchenSink {
    /// Create a new implementation of Sink that includes everything in the
    /// kitchen. Its buffer starts out empty.
    pub fn new() -> KitchenSink {
        KitchenSink(Vec::new())
    }

    /// Return the data written to this sink.
    pub fn as_bytes(&self) -> &[u8] {
        self.0.as_slice()
    }
}
impl Sink for KitchenSink {
    type Error = io::Error;

    /// Write every line of the match, each prefixed by `<line number>:`
    /// (when line numbers are available) and `<absolute byte offset>:`.
    fn matched(
        &mut self,
        _searcher: &Searcher,
        mat: &SinkMatch,
    ) -> Result<bool, io::Error> {
        assert!(!mat.bytes().is_empty());
        assert!(mat.lines().count() >= 1);

        let mut offset = mat.absolute_byte_offset();
        let mut next_line_number = mat.line_number();
        for line in mat.lines() {
            if let Some(n) = next_line_number {
                write!(self.0, "{}:", n)?;
                next_line_number = Some(n + 1);
            }
            write!(self.0, "{}:", offset)?;
            offset += line.len() as u64;
            self.0.write_all(line)?;
        }
        Ok(true)
    }

    /// Write the context line, prefixed by `<line number>-` (when line
    /// numbers are available) and `<absolute byte offset>-`.
    fn context(
        &mut self,
        _searcher: &Searcher,
        context: &SinkContext,
    ) -> Result<bool, io::Error> {
        assert!(!context.bytes().is_empty());
        assert!(context.lines().count() == 1);

        if let Some(n) = context.line_number() {
            write!(self.0, "{}-", n)?;
        }
        write!(self.0, "{}-", context.absolute_byte_offset())?;
        self.0.write_all(context.bytes())?;
        Ok(true)
    }

    /// Write a `--` separator line.
    fn context_break(
        &mut self,
        _searcher: &Searcher,
    ) -> Result<bool, io::Error> {
        self.0.write_all(b"--\n")?;
        Ok(true)
    }

    /// Write a blank line followed by the summary: the byte count and, if
    /// binary data was found, its offset.
    fn finish(
        &mut self,
        _searcher: &Searcher,
        sink_finish: &SinkFinish,
    ) -> Result<(), io::Error> {
        writeln!(self.0)?;
        writeln!(self.0, "byte count:{}", sink_finish.byte_count())?;
        if let Some(offset) = sink_finish.binary_byte_offset() {
            writeln!(self.0, "binary offset:{}", offset)?;
        }
        Ok(())
    }
}
/// A type for expressing tests on a searcher.
///
/// The searcher code has a lot of different code paths, mostly for the
/// purposes of optimizing a bunch of different use cases. The intent of the
/// searcher is to pick the best code path based on the configuration, which
/// means there is no obviously direct way to ask that a specific code path
/// be exercised. Thus, the purpose of this tester is to explicitly check as
/// many code paths that make sense.
///
/// The tester works by assuming you want to test all pertinent code paths.
/// These can be trimmed down as necessary via the various builder methods.
#[derive(Debug)]
pub struct SearcherTester {
/// The text to search, as given to `new`.
haystack: String,
/// The pattern to search for, as given to `new`.
pattern: String,
/// When set, only test configurations whose label matches are run.
filter: Option<::regex::Regex>,
/// Whether to print the label of every test configuration before running.
print_labels: bool,
/// Expected output without line numbers. `test` panics if this is unset.
expected_no_line_number: Option<String>,
/// Expected output with line numbers. Required by `test` whenever
/// `line_number` is enabled.
expected_with_line_number: Option<String>,
/// Expected output without line numbers when searching a slice. When not
/// present, `expected_no_line_number` is used instead.
expected_slice_no_line_number: Option<String>,
/// Expected output with line numbers when searching a slice. When not
/// present, `expected_with_line_number` is used instead.
expected_slice_with_line_number: Option<String>,
/// Whether to test the line-by-line searcher. Enabled by default.
by_line: bool,
/// Whether to test the multi line searcher. Enabled by default.
multi_line: bool,
/// Whether to perform an inverted search. Disabled by default.
invert_match: bool,
/// Whether to test search with line numbers. Enabled by default.
line_number: bool,
/// The binary detection to use on all searches. None by default.
binary: BinaryDetection,
/// Whether to also test with the heap limit set to its minimal value.
/// Enabled by default.
auto_heap_limit: bool,
/// The number of lines in the "after" context. Defaults to `0`.
after_context: usize,
/// The number of lines in the "before" context. Defaults to `0`.
before_context: usize,
/// Whether the "passthru" feature is enabled. Disabled by default.
passthru: bool,
}
impl SearcherTester {
/// Create a new tester that searches `haystack` for `pattern`.
///
/// By default both the line-by-line and multi line searchers are tested,
/// with line numbers and the automatic heap limit test enabled. Inverted
/// matching, binary detection, context lines and passthru are all off, and
/// no expected output or label filter is set.
pub fn new(haystack: &str, pattern: &str) -> SearcherTester {
    SearcherTester {
        // What to search.
        haystack: haystack.to_owned(),
        pattern: pattern.to_owned(),
        // Test selection and diagnostics.
        filter: None,
        print_labels: false,
        // Expected results must be supplied by the caller.
        expected_no_line_number: None,
        expected_with_line_number: None,
        expected_slice_no_line_number: None,
        expected_slice_with_line_number: None,
        // Searcher configuration.
        by_line: true,
        multi_line: true,
        invert_match: false,
        line_number: true,
        binary: BinaryDetection::none(),
        auto_heap_limit: true,
        after_context: 0,
        before_context: 0,
        passthru: false,
    }
}
/// Execute the test. If the test succeeds, then this returns successfully.
/// If the test fails, then it panics with an informative message.
///
/// Each generated configuration is run twice, once through the reader
/// based search and once through the slice based search, unless its label
/// is excluded by the filter.
pub fn test(&self) {
    // Check for configuration errors.
    if self.expected_no_line_number.is_none() {
        panic!("an 'expected' string with NO line numbers must be given");
    }
    if self.line_number && self.expected_with_line_number.is_none() {
        panic!(
            "an 'expected' string with line numbers must be given, \
             or disable testing with line numbers"
        );
    }

    let configs = self.configs();
    if configs.is_empty() {
        panic!("test configuration resulted in nothing being tested");
    }

    if self.print_labels {
        for config in &configs {
            let labels = [
                format!("reader-{}", config.label),
                format!("slice-{}", config.label),
            ];
            for label in labels.iter() {
                if self.include(label) {
                    println!("{}", label);
                } else {
                    println!("{} (ignored)", label);
                }
            }
        }
    }

    for config in &configs {
        let reader_label = format!("reader-{}", config.label);
        if self.include(&reader_label) {
            let got = config.search_reader(&self.haystack);
            assert_eq_printed!(
                config.expected_reader,
                got,
                "{}",
                reader_label
            );
        }

        let slice_label = format!("slice-{}", config.label);
        if self.include(&slice_label) {
            let got = config.search_slice(&self.haystack);
            assert_eq_printed!(
                config.expected_slice,
                got,
                "{}",
                slice_label
            );
        }
    }
}
/// Set a regex pattern to filter the tests that are run.
///
/// No filter is present by default. Once a filter is set, only test
/// configurations whose label matches the given pattern will be run.
///
/// This is often useful when debugging tests, e.g., when you want to do
/// printf debugging and only want one particular test configuration to
/// execute.
///
/// This panics if `pattern` is not a valid regex.
#[allow(dead_code)]
pub fn filter(&mut self, pattern: &str) -> &mut SearcherTester {
    let re = ::regex::Regex::new(pattern).unwrap();
    self.filter = Some(re);
    self
}
/// When enabled, the label of every test configuration is printed before
/// any test is executed.
///
/// Note that in order to see these in tests that aren't failing, you'll
/// want to use `cargo test -- --nocapture`.
#[allow(dead_code)]
pub fn print_labels(&mut self, yes: bool) -> &mut SearcherTester {
    self.print_labels = yes;
    self
}
/// Set the search results expected when line numbers are not reported.
pub fn expected_no_line_number(
    &mut self,
    exp: &str,
) -> &mut SearcherTester {
    self.expected_no_line_number = Some(String::from(exp));
    self
}
/// Set the search results expected when line numbers are reported.
pub fn expected_with_line_number(
    &mut self,
    exp: &str,
) -> &mut SearcherTester {
    self.expected_with_line_number = Some(String::from(exp));
    self
}
/// Set the search results expected, without line numbers, when the search
/// is performed on a slice. When not present, `expected_no_line_number` is
/// used instead.
pub fn expected_slice_no_line_number(
    &mut self,
    exp: &str,
) -> &mut SearcherTester {
    self.expected_slice_no_line_number = Some(String::from(exp));
    self
}
/// Set the search results expected, with line numbers, when the search is
/// performed on a slice. When not present, `expected_with_line_number` is
/// used instead.
#[allow(dead_code)]
pub fn expected_slice_with_line_number(
    &mut self,
    exp: &str,
) -> &mut SearcherTester {
    self.expected_slice_with_line_number = Some(String::from(exp));
    self
}
/// Whether to test search with line numbers or not.
///
/// Enabled by default. While enabled, the expected string for output with
/// line numbers must be provided; when disabled, that string isn't
/// required.
pub fn line_number(&mut self, yes: bool) -> &mut SearcherTester {
    self.line_number = yes;
    self
}
/// Whether to test search using the line-by-line searcher or not.
///
/// Enabled by default.
pub fn by_line(&mut self, yes: bool) -> &mut SearcherTester {
    self.by_line = yes;
    self
}
/// Whether to test search using the multi line searcher or not.
///
/// Enabled by default.
#[allow(dead_code)]
pub fn multi_line(&mut self, yes: bool) -> &mut SearcherTester {
    self.multi_line = yes;
    self
}
/// Whether to perform an inverted search or not.
///
/// Disabled by default.
pub fn invert_match(&mut self, yes: bool) -> &mut SearcherTester {
    self.invert_match = yes;
    self
}
/// Set the binary detection to use on all searches.
///
/// By default, binary detection is disabled.
pub fn binary_detection(
    &mut self,
    detection: BinaryDetection,
) -> &mut SearcherTester {
    self.binary = detection;
    self
}
/// Whether to automatically attempt to test the heap limit setting or not.
///
/// By default, one of the test configurations sets the heap limit to its
/// minimal value for normal operation, which checks that everything works
/// even at the extremes. However, in some cases, the heap limit can
/// (expectedly) alter the output slightly. For example, it can impact the
/// number of bytes searched when performing binary detection. For
/// convenience, it can be useful to disable the automatic heap limit test.
pub fn auto_heap_limit(&mut self, yes: bool) -> &mut SearcherTester {
    self.auto_heap_limit = yes;
    self
}
/// Set the number of lines to include in the "after" context.
///
/// Defaults to `0`, which is equivalent to not printing any context.
pub fn after_context(&mut self, lines: usize) -> &mut SearcherTester {
    self.after_context = lines;
    self
}
/// Set the number of lines to include in the "before" context.
///
/// Defaults to `0`, which is equivalent to not printing any context.
pub fn before_context(&mut self, lines: usize) -> &mut SearcherTester {
    self.before_context = lines;
    self
}
/// Whether to enable the "passthru" feature or not.
///
/// With passthru enabled, all non-matching lines are effectively treated
/// as contextual lines. In other words, enabling this is akin to
/// requesting an unbounded number of before and after contextual lines.
///
/// Disabled by default.
pub fn passthru(&mut self, yes: bool) -> &mut SearcherTester {
    self.passthru = yes;
    self
}
/// Computes the smallest heap limit with which a search of the configured
/// haystack is still expected to succeed.
///
/// * For multi line search, the entire haystack has to fit in memory, so
///   the limit is the haystack length plus one.
/// * Without any before/after context, the limit is the length of the
///   longest line plus one.
/// * Otherwise, the limit is the combined length of the longest N lines,
///   counting one extra byte per line for its terminator. N is the total
///   number of lines when passthru is enabled, and
///   `2 + before_context + after_context` when it is not.
fn minimal_heap_limit(&self, multi_line: bool) -> usize {
    if multi_line {
        return self.haystack.len() + 1;
    }

    // `str::lines` strips line terminators, so these lengths exclude them.
    let line_lens = self.haystack.lines().map(|line| line.len());
    if self.before_context == 0 && self.after_context == 0 {
        return line_lens.max().unwrap_or(0) + 1;
    }

    // Longest lines first, so that taking a prefix yields the worst case.
    let mut longest_first: Vec<usize> = line_lens.collect();
    longest_first.sort_unstable_by(|a, b| b.cmp(a));

    let lines_needed = if self.passthru {
        // Every line is treated as context, so all of them count.
        longest_first.len()
    } else {
        // The extra 2 breaks down as follows: one line is needed so that
        // there is room to search at all, and one more because the
        // implementation occasionally holds on to an additional line
        // while handling context (which keeps that implementation
        // simple).
        2 + self.before_context + self.after_context
    };

    // Put back the terminator byte that `str::lines` dropped from each line.
    longest_first.iter().take(lines_needed).map(|&len| len + 1).sum()
}
/// Returns true if and only if the given label should be included as part
/// of executing `test`.
///
/// Inclusion is determined by the filter specified. If no filter has been
/// given, then this always returns `true`.
fn include(&self, label: &str) -> bool {
let re = match self.filter {
None => return true,
Some(ref re) => re,
};
re.is_match(label)
}
/// Configs generates a set of all search configurations that should be
/// tested. The configs generated are based on the configuration in this
/// builder.
fn configs(&self) -> Vec<TesterConfig> {
let mut configs = vec![];
let matcher = RegexMatcher::new(&self.pattern);
let mut builder = SearcherBuilder::new();
builder
.line_number(false)
.invert_match(self.invert_match)
.binary_detection(self.binary.clone())
.after_context(self.after_context)
.before_context(self.before_context)
.passthru(self.passthru);
if self.by_line {
let mut matcher = matcher.clone();
let mut builder = builder.clone();
let expected_reader =
self.expected_no_line_number.as_ref().unwrap().to_string();
let expected_slice = match self.expected_slice_no_line_number {
None => expected_reader.clone(),
Some(ref e) => e.to_string(),
};
configs.push(TesterConfig {
label: "byline-noterm-nonumber".to_string(),
expected_reader: expected_reader.clone(),
expected_slice: expected_slice.clone(),
builder: builder.clone(),
matcher: matcher.clone(),
});
if self.auto_heap_limit {
builder.heap_limit(Some(self.minimal_heap_limit(false)));
configs.push(TesterConfig {
label: "byline-noterm-nonumber-heaplimit".to_string(),
expected_reader: expected_reader.clone(),
expected_slice: expected_slice.clone(),
builder: builder.clone(),
matcher: matcher.clone(),
});
builder.heap_limit(None);
}
matcher.set_line_term(Some(LineTerminator::byte(b'\n')));
configs.push(TesterConfig {
label: "byline-term-nonumber".to_string(),
expected_reader: expected_reader.clone(),
expected_slice: expected_slice.clone(),
builder: builder.clone(),
matcher: matcher.clone(),
});
matcher.every_line_is_candidate(true);
configs.push(TesterConfig {
label: "byline-term-nonumber-candidates".to_string(),
expected_reader: expected_reader.clone(),
expected_slice: expected_slice.clone(),
builder: builder.clone(),
matcher: matcher.clone(),
});
}
if self.by_line && self.line_number {
let mut matcher = matcher.clone();
let mut builder = builder.clone();
let expected_reader =
self.expected_with_line_number.as_ref().unwrap().to_string();
let expected_slice = match self.expected_slice_with_line_number {
None => expected_reader.clone(),
Some(ref e) => e.to_string(),
};
builder.line_number(true);
configs.push(TesterConfig {
label: "byline-noterm-number".to_string(),
expected_reader: expected_reader.clone(),
expected_slice: expected_slice.clone(),
builder: builder.clone(),
matcher: matcher.clone(),
});
matcher.set_line_term(Some(LineTerminator::byte(b'\n')));
configs.push(TesterConfig {
label: "byline-term-number".to_string(),
expected_reader: expected_reader.clone(),
expected_slice: expected_slice.clone(),
builder: builder.clone(),
matcher: matcher.clone(),
});
matcher.every_line_is_candidate(true);
configs.push(TesterConfig {
label: "byline-term-number-candidates".to_string(),
expected_reader: expected_reader.clone(),
expected_slice: expected_slice.clone(),
builder: builder.clone(),
matcher: matcher.clone(),
});
}
if self.multi_line {
let mut builder = builder.clone();
let expected_slice = match self.expected_slice_no_line_number {
None => {
self.expected_no_line_number.as_ref().unwrap().to_string()
}
Some(ref e) => e.to_string(),
};
builder.multi_line(true);
configs.push(TesterConfig {
label: "multiline-nonumber".to_string(),
expected_reader: expected_slice.clone(),
expected_slice: expected_slice.clone(),
builder: builder.clone(),
matcher: matcher.clone(),
});
if self.auto_heap_limit {
builder.heap_limit(Some(self.minimal_heap_limit(true)));
configs.push(TesterConfig {
label: "multiline-nonumber-heaplimit".to_string(),
expected_reader: expected_slice.clone(),
expected_slice: expected_slice.clone(),
builder: builder.clone(),
matcher: matcher.clone(),
});
builder.heap_limit(None);
}
}
if self.multi_line && self.line_number {
let mut builder = builder.clone();
let expected_slice = match self.expected_slice_with_line_number {
None => self
.expected_with_line_number
.as_ref()
.unwrap()
.to_string(),
Some(ref e) => e.to_string(),
};
builder.multi_line(true);
builder.line_number(true);
configs.push(TesterConfig {
label: "multiline-number".to_string(),
expected_reader: expected_slice.clone(),
expected_slice: expected_slice.clone(),
builder: builder.clone(),
matcher: matcher.clone(),
});
builder.heap_limit(Some(self.minimal_heap_limit(true)));
configs.push(TesterConfig {
label: "multiline-number-heaplimit".to_string(),
expected_reader: expected_slice.clone(),
expected_slice: expected_slice.clone(),
builder: builder.clone(),
matcher: matcher.clone(),
});
builder.heap_limit(None);
}
configs
}
}
/// A single, fully specified search configuration, as generated by
/// `SearcherTester::configs`.
#[derive(Debug)]
struct TesterConfig {
/// Name of this configuration. It is included (with a `reader-` or
/// `slice-` prefix) in the panic message when a search returns an error.
label: String,
/// Expected output for the reader-based search.
// NOTE(review): presumably compared against `search_reader`'s result by
// the caller; the comparison is not visible in this part of the file.
expected_reader: String,
/// Expected output for the slice-based search.
// NOTE(review): presumably compared against `search_slice`'s result by
// the caller; the comparison is not visible in this part of the file.
expected_slice: String,
/// Builder from which a fresh searcher is built for every search.
builder: SearcherBuilder,
/// Matcher handed to the searcher for every search.
matcher: RegexMatcher,
}
impl TesterConfig {
    /// Runs this configuration over `haystack` through the reader-based
    /// search path and returns everything written to the sink.
    ///
    /// This exercises the incremental search strategy, where the entire
    /// contents of the corpus aren't necessarily in memory at once.
    ///
    /// # Panics
    ///
    /// Panics if the search returns an error, or if the sink's contents
    /// are not valid UTF-8.
    fn search_reader(&self, haystack: &str) -> String {
        let mut output = KitchenSink::new();
        let mut searcher = self.builder.build();
        match searcher.search_reader(
            &self.matcher,
            haystack.as_bytes(),
            &mut output,
        ) {
            Ok(_) => String::from_utf8(output.as_bytes().to_vec()).unwrap(),
            Err(err) => {
                let label = format!("reader-{}", self.label);
                panic!("error running '{}': {}", label, err);
            }
        }
    }

    /// Runs this configuration over `haystack` through the slice-based
    /// search path and returns everything written to the sink.
    ///
    /// This exercises the search routines that have the entire contents
    /// of the corpus in memory at one time.
    ///
    /// # Panics
    ///
    /// Panics if the search returns an error, or if the sink's contents
    /// are not valid UTF-8.
    fn search_slice(&self, haystack: &str) -> String {
        let mut output = KitchenSink::new();
        let mut searcher = self.builder.build();
        match searcher.search_slice(
            &self.matcher,
            haystack.as_bytes(),
            &mut output,
        ) {
            Ok(_) => String::from_utf8(output.as_bytes().to_vec()).unwrap(),
            Err(err) => {
                let label = format!("slice-{}", self.label);
                panic!("error running '{}': {}", label, err);
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use grep_matcher::{Match, Matcher};

    use super::*;

    /// Shorthand for the match spanning `start..end`.
    fn m(start: usize, end: usize) -> Match {
        Match::new(start, end)
    }

    /// Searches `haystack` for `^$` starting at every offset in
    /// `0..expected.len()`, and checks that the search starting at offset
    /// `i` reports `expected[i]`.
    fn assert_empty_line_finds(haystack: &[u8], expected: &[Option<Match>]) {
        let matcher = RegexMatcher::new(r"^$");
        for (at, &want) in expected.iter().enumerate() {
            assert_eq!(
                matcher.find_at(haystack, at),
                Ok(want),
                "searching from offset {}",
                at
            );
        }
    }

    #[test]
    fn empty_line1() {
        assert_empty_line_finds(b"", &[Some(m(0, 0))]);
    }

    #[test]
    fn empty_line2() {
        assert_empty_line_finds(b"\n", &[Some(m(0, 0)), Some(m(1, 1))]);
    }

    #[test]
    fn empty_line3() {
        assert_empty_line_finds(
            b"\n\n",
            &[Some(m(0, 0)), Some(m(1, 1)), Some(m(2, 2))],
        );
    }

    #[test]
    fn empty_line4() {
        assert_empty_line_finds(
            b"a\n\nb\n",
            &[
                Some(m(2, 2)),
                Some(m(2, 2)),
                Some(m(2, 2)),
                Some(m(5, 5)),
                Some(m(5, 5)),
                Some(m(5, 5)),
            ],
        );
    }

    #[test]
    fn empty_line5() {
        assert_empty_line_finds(
            b"a\n\nb\nc",
            &[
                Some(m(2, 2)),
                Some(m(2, 2)),
                Some(m(2, 2)),
                None,
                None,
                None,
                None,
            ],
        );
    }

    #[test]
    fn empty_line6() {
        assert_empty_line_finds(
            b"a\n",
            &[Some(m(2, 2)), Some(m(2, 2)), Some(m(2, 2))],
        );
    }
}