Andrew Gallant 4bce2dff5d
libripgrep: initial commit introducing libripgrep
libripgrep is not any one library, but rather, a collection of libraries
that roughly separate the following key distinct phases in a grep
implementation:

  1. Pattern matching (e.g., by a regex engine).
  2. Searching a file using a pattern matcher.
  3. Printing results.

Ultimately, both (1) and (3) are defined by de-coupled interfaces, of
which there may be multiple implementations. Namely, (1) is satisfied by
the `Matcher` trait in the `grep-matcher` crate and (3) is satisfied by
the `Sink` trait in the `grep2` crate. The searcher (2) ties everything
together and finds results using a matcher and reports those results
using a `Sink` implementation.
2018-08-07 18:23:13 -04:00

1073 lines
36 KiB
Rust

/*!
An interface for regular expressions, with a focus on line oriented search.
*/
#![deny(missing_docs)]
extern crate memchr;
use std::fmt;
use std::io;
use std::ops;
use std::u64;
use interpolate::interpolate;
mod interpolate;
/// A possibly empty, contiguous span of bytes reported by a matcher.
///
/// A `Match` is a pair of offsets describing the half-open range
/// `[start, end)`. Every `Match` is guaranteed to satisfy the invariant
/// that `start <= end`.
///
/// # Indexing
///
/// This type has the same shape as `std::ops::Range<usize>`, but is more
/// convenient for working with match offsets: it implements `Copy`, it
/// provides methods for deriving new `Match` values from existing ones, and
/// its constructors enforce that `start` never exceeds `end`.
///
/// A `Match` can be used to slice a `&[u8]`, `&mut [u8]` or `&str` using
/// index notation. e.g.,
///
/// ```
/// use grep_matcher::Match;
///
/// let m = Match::new(2, 5);
/// let bytes = b"abcdefghi";
/// assert_eq!(b"cde", &bytes[m]);
/// ```
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct Match {
    start: usize,
    end: usize,
}

impl Match {
    /// Build a match spanning `[start, end)`.
    ///
    /// # Panics
    ///
    /// This function panics if `start > end`.
    #[inline]
    pub fn new(start: usize, end: usize) -> Match {
        assert!(start <= end);
        Match { start: start, end: end }
    }

    /// Build an empty match whose start and end are both `offset`.
    #[inline]
    pub fn zero(offset: usize) -> Match {
        let at = offset;
        Match { start: at, end: at }
    }

    /// The offset at which this match begins.
    #[inline]
    pub fn start(&self) -> usize {
        let Match { start, .. } = *self;
        start
    }

    /// The offset at which this match ends (exclusive).
    #[inline]
    pub fn end(&self) -> usize {
        let Match { end, .. } = *self;
        end
    }

    /// Return a copy of this match whose start offset is `start`. The end
    /// offset is left untouched.
    ///
    /// # Panics
    ///
    /// This method panics if `start > self.end`.
    #[inline]
    pub fn with_start(&self, start: usize) -> Match {
        assert!(start <= self.end);
        Match { start: start, end: self.end }
    }

    /// Return a copy of this match whose end offset is `end`. The start
    /// offset is left untouched.
    ///
    /// # Panics
    ///
    /// This method panics if `self.start > end`.
    #[inline]
    pub fn with_end(&self, end: usize) -> Match {
        assert!(self.start <= end);
        Match { start: self.start, end: end }
    }

    /// Shift this match forward by `amount` and return the result.
    ///
    /// Both the start and the end offset are increased by `amount`, so the
    /// length of the match is preserved.
    ///
    /// # Panics
    ///
    /// This panics if adding `amount` to either the start or the end offset
    /// would overflow a `usize`.
    #[inline]
    pub fn offset(&self, amount: usize) -> Match {
        // Checked addition turns an overflow into a panic rather than a
        // silently wrapped offset.
        let shift = |pos: usize| pos.checked_add(amount).unwrap();
        let start = shift(self.start);
        let end = shift(self.end);
        Match { start: start, end: end }
    }

    /// The number of bytes covered by this match.
    #[inline]
    pub fn len(&self) -> usize {
        let Match { start, end } = *self;
        end - start
    }

    /// Returns true if and only if this match covers zero bytes.
    #[inline]
    pub fn is_empty(&self) -> bool {
        let width = self.len();
        width == 0
    }
}
/// Lets a `Match` be used directly as an index into a byte slice.
impl ops::Index<Match> for [u8] {
    type Output = [u8];

    /// Returns the bytes in `[index.start, index.end)`. Like any range
    /// index on a slice, this panics when the range is out of bounds.
    #[inline]
    fn index(&self, index: Match) -> &[u8] {
        let span = ops::Range { start: index.start, end: index.end };
        &self[span]
    }
}
/// Lets a `Match` be used directly as a mutable index into a byte slice.
impl ops::IndexMut<Match> for [u8] {
    /// Returns the bytes in `[index.start, index.end)` mutably. Like any
    /// range index on a slice, this panics when the range is out of bounds.
    #[inline]
    fn index_mut(&mut self, index: Match) -> &mut [u8] {
        let span = ops::Range { start: index.start, end: index.end };
        &mut self[span]
    }
}
/// Lets a `Match` be used directly as an index into a string slice.
impl ops::Index<Match> for str {
    type Output = str;

    /// Returns the sub-string in `[index.start, index.end)`. This is plain
    /// range indexing on a `str`, so the usual `str` slicing panics apply.
    #[inline]
    fn index(&self, index: Match) -> &str {
        let span = ops::Range { start: index.start, end: index.end };
        &self[span]
    }
}
/// A line terminator.
///
/// A line terminator marks where a line ends. Generally, every line is
/// either "terminated" by the end of a stream or by a specific byte (or
/// sequence of bytes).
///
/// On Unix-like systems the terminator is generally the single byte `\n`.
/// On Windows it is `\r\n` (referred to as `CRLF`, for `Carriage Return;
/// Line Feed`).
///
/// The default line terminator is `\n` on all platforms.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct LineTerminator(LineTerminatorImp);

#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
enum LineTerminatorImp {
    /// Any single byte representing a line terminator.
    ///
    /// The byte is stored in a one element array so that it can be handed
    /// out as a slice without any unsafe code. `std::slice::from_ref` could
    /// replace this at some point.
    Byte([u8; 1]),
    /// A line terminator represented by `\r\n`.
    ///
    /// When this option is used, consumers may generally treat a lone `\n`
    /// as a line terminator in addition to `\r\n`.
    CRLF,
}

impl LineTerminator {
    /// Build a line terminator consisting of exactly one byte. Every byte
    /// value is accepted.
    pub fn byte(byte: u8) -> LineTerminator {
        let imp = LineTerminatorImp::Byte([byte]);
        LineTerminator(imp)
    }

    /// Build the `\r\n` line terminator.
    ///
    /// When this option is used, consumers may generally treat a lone `\n`
    /// as a line terminator in addition to `\r\n`.
    pub fn crlf() -> LineTerminator {
        let imp = LineTerminatorImp::CRLF;
        LineTerminator(imp)
    }

    /// Returns true if and only if this line terminator is CRLF.
    pub fn is_crlf(&self) -> bool {
        match self.0 {
            LineTerminatorImp::CRLF => true,
            LineTerminatorImp::Byte(_) => false,
        }
    }

    /// Returns this line terminator as a single byte.
    ///
    /// For CRLF this is `\n`, which suits routines that locate line
    /// boundaries by looking for `\n` whether or not a `\r` precedes it.
    pub fn as_byte(&self) -> u8 {
        match self.0 {
            LineTerminatorImp::CRLF => b'\n',
            LineTerminatorImp::Byte(ref one) => one[0],
        }
    }

    /// Returns this line terminator as a sequence of bytes.
    ///
    /// Single byte terminators yield a slice of length one; `CRLF` yields
    /// `\r\n`.
    ///
    /// The slice returned is guaranteed to have length at least `1`.
    pub fn as_bytes(&self) -> &[u8] {
        const CRLF_BYTES: &'static [u8] = b"\r\n";
        match self.0 {
            LineTerminatorImp::CRLF => CRLF_BYTES,
            LineTerminatorImp::Byte(ref one) => &one[..],
        }
    }
}

impl Default for LineTerminator {
    /// The default line terminator is the single byte `\n`.
    fn default() -> LineTerminator {
        LineTerminator(LineTerminatorImp::Byte([b'\n']))
    }
}
/// A set of bytes.
///
/// In this crate, a byte set describes bytes that can never appear anywhere
/// in a match produced by a particular implementation of the `Matcher`
/// trait. When such a set can be determined, callers may perform additional
/// operations on the basis that certain bytes never match.
///
/// For example, if a search is configured to possibly produce results that
/// span multiple lines but the caller provided pattern can never match
/// across multiple lines, then it may make sense to divert to more optimized
/// line oriented routines that don't need to handle the multi-line case.
#[derive(Clone, Debug)]
pub struct ByteSet(BitSet);

/// 256 bits, one per byte value, stored as four 64-bit words.
#[derive(Clone, Copy)]
struct BitSet([u64; 4]);

impl fmt::Debug for BitSet {
    /// Formats the set as the list of byte values it contains.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let set = ByteSet(*self);
        let members = (0u16..256)
            .map(|b| b as u8)
            .filter(|&b| set.contains(b));
        f.debug_set().entries(members).finish()
    }
}

impl ByteSet {
    /// Create a set that contains no bytes.
    pub fn empty() -> ByteSet {
        let words = [0u64; 4];
        ByteSet(BitSet(words))
    }

    /// Create a set that contains every possible byte.
    pub fn full() -> ByteSet {
        let words = [u64::MAX; 4];
        ByteSet(BitSet(words))
    }

    /// Maps a byte to the index of the word holding its bit and a mask
    /// selecting that bit within the word.
    fn locate(byte: u8) -> (usize, u64) {
        let bucket = (byte / 64) as usize;
        let mask = 1u64 << (byte % 64);
        (bucket, mask)
    }

    /// Add a byte to this set.
    ///
    /// If the given byte already belongs to this set, then this is a no-op.
    pub fn add(&mut self, byte: u8) {
        let (bucket, mask) = ByteSet::locate(byte);
        (self.0).0[bucket] |= mask;
    }

    /// Add every byte in the inclusive range `start..=end`.
    ///
    /// Nothing is added when `start > end`.
    pub fn add_all(&mut self, start: u8, end: u8) {
        // Widen before adding one so that `end == 255` cannot overflow.
        for b in (start as u16)..(end as u16 + 1) {
            self.add(b as u8);
        }
    }

    /// Remove a byte from this set.
    ///
    /// If the given byte is not in this set, then this is a no-op.
    pub fn remove(&mut self, byte: u8) {
        let (bucket, mask) = ByteSet::locate(byte);
        (self.0).0[bucket] &= !mask;
    }

    /// Remove every byte in the inclusive range `start..=end`.
    ///
    /// Nothing is removed when `start > end`.
    pub fn remove_all(&mut self, start: u8, end: u8) {
        // Widen before adding one so that `end == 255` cannot overflow.
        for b in (start as u16)..(end as u16 + 1) {
            self.remove(b as u8);
        }
    }

    /// Return true if and only if the given byte is in this set.
    pub fn contains(&self, byte: u8) -> bool {
        let (bucket, mask) = ByteSet::locate(byte);
        (self.0).0[bucket] & mask != 0
    }
}
/// A trait that describes implementations of capturing groups.
///
/// A matcher that supports extracting capturing groups is responsible for
/// providing an implementation of this trait.
///
/// The point of this trait is to give uniform access to capturing groups
/// without prescribing how they are stored. Different matcher
/// implementations may need different in-memory representations of their
/// capturing groups, and this trait lets each of them keep its own.
///
/// Note that this trait deliberately offers no way to construct a new
/// captures value. Building one is the job of a `Matcher`, since doing so
/// might require knowledge of the matcher's internal implementation
/// details.
pub trait Captures {
    /// Return the total number of capturing groups. Groups that did not
    /// participate in the match are counted too.
    fn len(&self) -> usize;

    /// Return the match for the capturing group at index `i`, or `None` if
    /// that group did not match anything.
    ///
    /// When a matcher reports a match with capturing groups, the group at
    /// index `0` must always correspond to the offsets of the overall
    /// match.
    fn get(&self, i: usize) -> Option<Match>;

    /// Returns true if and only if these captures are empty, i.e., when
    /// `len` is `0`.
    ///
    /// Note that captures with a non-zero `len` in which no group matched
    /// are *not* empty.
    fn is_empty(&self) -> bool {
        match self.len() {
            0 => true,
            _ => false,
        }
    }

    /// Expands all instances of `$name` in `replacement` to the
    /// corresponding capture group `name`, and writes the result to the
    /// `dst` buffer given.
    ///
    /// (Note: If you're looking for a convenient way to perform
    /// replacements with interpolation, then you'll want to use the
    /// `replace_with_captures` method on the `Matcher` trait.)
    ///
    /// `name` may be an integer corresponding to the index of the capture
    /// group (counted by order of opening parenthesis where `0` is the
    /// entire match) or it can be a name (consisting of letters, digits or
    /// underscores) corresponding to a named capture group.
    ///
    /// A `name` is translated to a capture group index via the given
    /// `name_to_index` function. If `name` isn't a valid capture group
    /// (whether the name doesn't exist or isn't a valid index), then it is
    /// replaced with the empty string.
    ///
    /// The longest possible name is used. e.g., `$1a` looks up the capture
    /// group named `1a` and not the capture group at index `1`. To exert
    /// more precise control over the name, use braces, e.g., `${1}a`. In
    /// all cases, capture group names are limited to ASCII letters, numbers
    /// and underscores.
    ///
    /// To write a literal `$` use `$$`.
    ///
    /// The text of each capture group is obtained by slicing `haystack`
    /// with that group's offsets. Generally, this means that `haystack`
    /// should be the same slice that was searched to produce the current
    /// capture group matches.
    fn interpolate<F>(
        &self,
        name_to_index: F,
        haystack: &[u8],
        replacement: &[u8],
        dst: &mut Vec<u8>,
    ) where
        F: FnMut(&str) -> Option<usize>,
    {
        interpolate(
            replacement,
            // Copies the text of group `index` into the output; a group
            // without a match contributes nothing.
            |index, buf| {
                if let Some(m) = self.get(index) {
                    buf.extend_from_slice(&haystack[m.start()..m.end()]);
                }
            },
            name_to_index,
            dst,
        )
    }
}
/// NoCaptures provides an always-empty implementation of the `Captures`
/// trait.
///
/// This type is useful for implementations of `Matcher` that don't support
/// capturing groups.
///
/// A value can be obtained either through `NoCaptures::new()` or through
/// `Default::default()`; the two are equivalent.
#[derive(Clone, Debug, Default)]
pub struct NoCaptures(());

impl NoCaptures {
    /// Create an empty set of capturing groups.
    pub fn new() -> NoCaptures {
        NoCaptures(())
    }
}
/// `NoCaptures` never holds any capturing group.
impl Captures for NoCaptures {
    /// Always `0`.
    fn len(&self) -> usize {
        0
    }

    /// Always `None`, whatever index is requested.
    fn get(&self, _index: usize) -> Option<Match> {
        None
    }
}
/// NoError provides an error type for matchers that never produce errors.
///
/// This type implements the `std::error::Error` and `fmt::Display` traits
/// so that it can be used by matcher implementations that can never fail.
///
/// Since no value of this type is ever expected to be observed, both the
/// `fmt::Display` impl and the conversion into `io::Error` panic.
#[derive(Debug, Eq, PartialEq)]
pub struct NoError(());

impl NoError {
    /// Shared panic for code paths that could only run if an "impossible"
    /// error had been produced.
    fn bug() -> ! {
        panic!("BUG for NoError: an impossible error occurred")
    }
}

impl ::std::error::Error for NoError {
    fn description(&self) -> &str {
        "no error"
    }
}

impl fmt::Display for NoError {
    /// Always panics.
    fn fmt(&self, _f: &mut fmt::Formatter) -> fmt::Result {
        NoError::bug()
    }
}

impl From<NoError> for io::Error {
    /// Always panics.
    fn from(_err: NoError) -> io::Error {
        NoError::bug()
    }
}
/// The type of match for a line oriented matcher.
///
/// Values of this type are returned by `Matcher::find_candidate_line`. Both
/// variants carry a `usize` position; they differ only in whether the line
/// at that position still has to be verified by the caller.
#[derive(Clone, Copy, Debug)]
pub enum LineMatchKind {
/// A position inside a line that is known to contain a match.
///
/// This position can be anywhere in the line. It does not need to point
/// at the location of the match.
///
/// The default implementation of `Matcher::find_candidate_line` only ever
/// produces this variant, wrapping the offset reported by `shortest_match`.
Confirmed(usize),
/// A position inside a line that may contain a match, and must be searched
/// for verification.
///
/// This position can be anywhere in the line. It does not need to point
/// at the location of the match.
Candidate(usize),
}
/// A matcher defines an interface for regular expression implementations.
pub trait Matcher {
/// The concrete type of capturing groups used for this matcher.
///
/// If this implementation does not support capturing groups, then set
/// this to `NoCaptures`.
type Captures: Captures;
/// The error type used by this matcher.
///
/// For matchers in which an error is not possible, they are encouraged to
/// use the `NoError` type in this crate. In the future, when the "never"
/// (spelled `!`) type is stabilized, then it should probably be used
/// instead.
type Error: fmt::Display;
/// Returns the start and end byte range of the first match in `haystack`
/// after `at`, where the byte offsets are relative to that start of
/// `haystack` (and not `at`). If no match exists, then `None` is returned.
///
/// The text encoding of `haystack` is not strictly specified. Matchers are
/// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
///
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `at == 0`.
fn find_at(
&self,
haystack: &[u8],
at: usize,
) -> Result<Option<Match>, Self::Error>;
/// Creates an empty group of captures suitable for use with the capturing
/// APIs of this trait.
///
/// Implementations that don't support capturing groups should use
/// the `NoCaptures` type and implement this method by calling
/// `NoCaptures::new()`.
fn new_captures(&self) -> Result<Self::Captures, Self::Error>;
/// Returns the total number of capturing groups in this matcher.
///
/// If a matcher supports capturing groups, then this value must always be
/// at least 1, where the first capturing group always corresponds to the
/// overall match.
///
/// If a matcher does not support capturing groups, then this should
/// always return 0.
///
/// By default, capturing groups are not supported, so this always
/// returns 0.
fn capture_count(&self) -> usize {
0
}
/// Maps the given capture group name to its corresponding capture group
/// index, if one exists. If one does not exist, then `None` is returned.
///
/// If the given capture group name maps to multiple indices, then it is
/// not specified which one is returned. However, it is guaranteed that
/// one of them is returned.
///
/// By default, capturing groups are not supported, so this always returns
/// `None`.
fn capture_index(&self, _name: &str) -> Option<usize> {
None
}
/// Returns the start and end byte range of the first match in `haystack`.
/// If no match exists, then `None` is returned.
///
/// The text encoding of `haystack` is not strictly specified. Matchers are
/// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
fn find(
&self,
haystack: &[u8],
) -> Result<Option<Match>, Self::Error> {
self.find_at(haystack, 0)
}
/// Executes the given function over successive non-overlapping matches
/// in `haystack`. If no match exists, then the given function is never
/// called. If the function returns `false`, then iteration stops.
fn find_iter<F>(
&self,
haystack: &[u8],
mut matched: F,
) -> Result<(), Self::Error>
where F: FnMut(Match) -> bool
{
self.try_find_iter(haystack, |m| Ok(matched(m)))
.map(|r: Result<(), ()>| r.unwrap())
}
/// Executes the given function over successive non-overlapping matches
/// in `haystack`. If no match exists, then the given function is never
/// called. If the function returns `false`, then iteration stops.
/// Similarly, if the function returns an error then iteration stops and
/// the error is yielded. If an error occurs while executing the search,
/// then it is converted to
/// `E`.
fn try_find_iter<F, E>(
&self,
haystack: &[u8],
mut matched: F,
) -> Result<Result<(), E>, Self::Error>
where F: FnMut(Match) -> Result<bool, E>
{
let mut last_end = 0;
let mut last_match = None;
loop {
if last_end > haystack.len() {
return Ok(Ok(()));
}
let m = match self.find_at(haystack, last_end)? {
None => return Ok(Ok(())),
Some(m) => m,
};
if m.start == m.end {
// This is an empty match. To ensure we make progress, start
// the next search at the smallest possible starting position
// of the next match following this one.
last_end = m.end + 1;
// Don't accept empty matches immediately following a match.
// Just move on to the next match.
if Some(m.end) == last_match {
continue;
}
} else {
last_end = m.end;
}
last_match = Some(m.end);
match matched(m) {
Ok(true) => continue,
Ok(false) => return Ok(Ok(())),
Err(err) => return Ok(Err(err)),
}
}
}
/// Populates the first set of capture group matches from `haystack` into
/// `caps`. If no match exists, then `false` is returned.
///
/// The text encoding of `haystack` is not strictly specified. Matchers are
/// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
fn captures(
&self,
haystack: &[u8],
caps: &mut Self::Captures,
) -> Result<bool, Self::Error> {
self.captures_at(haystack, 0, caps)
}
/// Executes the given function over successive non-overlapping matches
/// in `haystack` with capture groups extracted from each match. If no
/// match exists, then the given function is never called. If the function
/// returns `false`, then iteration stops.
fn captures_iter<F>(
&self,
haystack: &[u8],
caps: &mut Self::Captures,
mut matched: F,
) -> Result<(), Self::Error>
where F: FnMut(&Self::Captures) -> bool
{
self.try_captures_iter(haystack, caps, |caps| Ok(matched(caps)))
.map(|r: Result<(), ()>| r.unwrap())
}
/// Executes the given function over successive non-overlapping matches
/// in `haystack` with capture groups extracted from each match. If no
/// match exists, then the given function is never called. If the function
/// returns `false`, then iteration stops. Similarly, if the function
/// returns an error then iteration stops and the error is yielded. If
/// an error occurs while executing the search, then it is converted to
/// `E`.
fn try_captures_iter<F, E>(
&self,
haystack: &[u8],
caps: &mut Self::Captures,
mut matched: F,
) -> Result<Result<(), E>, Self::Error>
where F: FnMut(&Self::Captures) -> Result<bool, E>
{
let mut last_end = 0;
let mut last_match = None;
loop {
if last_end > haystack.len() {
return Ok(Ok(()));
}
if !self.captures_at(haystack, last_end, caps)? {
return Ok(Ok(()));
}
let m = caps.get(0).unwrap();
if m.start == m.end {
// This is an empty match. To ensure we make progress, start
// the next search at the smallest possible starting position
// of the next match following this one.
last_end = m.end + 1;
// Don't accept empty matches immediately following a match.
// Just move on to the next match.
if Some(m.end) == last_match {
continue;
}
} else {
last_end = m.end;
}
last_match = Some(m.end);
match matched(caps) {
Ok(true) => continue,
Ok(false) => return Ok(Ok(())),
Err(err) => return Ok(Err(err)),
}
}
}
/// Populates the first set of capture group matches from `haystack`
/// into `matches` after `at`, where the byte offsets in each capturing
/// group are relative to the start of `haystack` (and not `at`). If no
/// match exists, then `false` is returned and the contents of the given
/// capturing groups are unspecified.
///
/// The text encoding of `haystack` is not strictly specified. Matchers are
/// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
///
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `at == 0`.
///
/// By default, capturing groups aren't supported, and this implementation
/// will always behave as if a match were impossible.
///
/// Implementors that provide support for capturing groups must guarantee
/// that when a match occurs, the first capture match (at index `0`) is
/// always set to the overall match offsets.
///
/// Note that if implementors seek to support capturing groups, then they
/// should implement this method. Other methods that match based on
/// captures will then work automatically.
fn captures_at(
&self,
_haystack: &[u8],
_at: usize,
_caps: &mut Self::Captures,
) -> Result<bool, Self::Error> {
Ok(false)
}
/// Replaces every match in the given haystack with the result of calling
/// `append`. `append` is given the start and end of a match, along with
/// a handle to the `dst` buffer provided.
///
/// If the given `append` function returns `false`, then replacement stops.
fn replace<F>(
&self,
haystack: &[u8],
dst: &mut Vec<u8>,
mut append: F,
) -> Result<(), Self::Error>
where F: FnMut(Match, &mut Vec<u8>) -> bool
{
let mut last_match = 0;
self.find_iter(haystack, |m| {
dst.extend(&haystack[last_match..m.start]);
last_match = m.end;
append(m, dst)
})?;
dst.extend(&haystack[last_match..]);
Ok(())
}
/// Replaces every match in the given haystack with the result of calling
/// `append` with the matching capture groups.
///
/// If the given `append` function returns `false`, then replacement stops.
fn replace_with_captures<F>(
&self,
haystack: &[u8],
caps: &mut Self::Captures,
dst: &mut Vec<u8>,
mut append: F,
) -> Result<(), Self::Error>
where F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool
{
let mut last_match = 0;
self.captures_iter(haystack, caps, |caps| {
let m = caps.get(0).unwrap();
dst.extend(&haystack[last_match..m.start]);
last_match = m.end;
append(caps, dst)
})?;
dst.extend(&haystack[last_match..]);
Ok(())
}
/// Returns true if and only if the matcher matches the given haystack.
///
/// By default, this method is implemented by calling `shortest_match`.
fn is_match(&self, haystack: &[u8]) -> Result<bool, Self::Error> {
self.is_match_at(haystack, 0)
}
/// Returns true if and only if the matcher matches the given haystack
/// starting at the given position.
///
/// By default, this method is implemented by calling `shortest_match_at`.
///
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `at == 0`.
fn is_match_at(
&self,
haystack: &[u8],
at: usize,
) -> Result<bool, Self::Error> {
Ok(self.shortest_match_at(haystack, at)?.is_some())
}
/// Returns an end location of the first match in `haystack`. If no match
/// exists, then `None` is returned.
///
/// Note that the end location reported by this method may be less than the
/// same end location reported by `find`. For example, running `find` with
/// the pattern `a+` on the haystack `aaa` should report a range of `[0,
/// 3)`, but `shortest_match` may report `1` as the ending location since
/// that is the place at which a match is guaranteed to occur.
///
/// This method should never report false positives or false negatives. The
/// point of this method is that some implementors may be able to provide
/// a faster implementation of this than what `find` does.
///
/// By default, this method is implemented by calling `find`.
fn shortest_match(
&self,
haystack: &[u8],
) -> Result<Option<usize>, Self::Error> {
self.shortest_match_at(haystack, 0)
}
/// Returns an end location of the first match in `haystack` starting at
/// the given position. If no match exists, then `None` is returned.
///
/// Note that the end location reported by this method may be less than the
/// same end location reported by `find`. For example, running `find` with
/// the pattern `a+` on the haystack `aaa` should report a range of `[0,
/// 3)`, but `shortest_match` may report `1` as the ending location since
/// that is the place at which a match is guaranteed to occur.
///
/// This method should never report false positives or false negatives. The
/// point of this method is that some implementors may be able to provide
/// a faster implementation of this than what `find` does.
///
/// By default, this method is implemented by calling `find_at`.
///
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `at == 0`.
fn shortest_match_at(
&self,
haystack: &[u8],
at: usize,
) -> Result<Option<usize>, Self::Error> {
Ok(self.find_at(haystack, at)?.map(|m| m.end))
}
/// If available, return a set of bytes that will never appear in a match
/// produced by an implementation.
///
/// Specifically, if such a set can be determined, then it's possible for
/// callers to perform additional operations on the basis that certain
/// bytes may never match.
///
/// For example, if a search is configured to possibly produce results
/// that span multiple lines but a caller provided pattern can never
/// match across multiple lines, then it may make sense to divert to
/// more optimized line oriented routines that don't need to handle the
/// multi-line match case.
///
/// Implementations that produce this set must never report false
/// positives, but may produce false negatives. That is, is a byte is in
/// this set then it must be guaranteed that it is never in a match. But,
/// if a byte is not in this set, then callers cannot assume that a match
/// exists with that byte.
///
/// By default, this returns `None`.
fn non_matching_bytes(&self) -> Option<&ByteSet> {
None
}
/// If this matcher was compiled as a line oriented matcher, then this
/// method returns the line terminator if and only if the line terminator
/// never appears in any match produced by this matcher. If this wasn't
/// compiled as a line oriented matcher, or if the aforementioned guarantee
/// cannot be made, then this must return `None`, which is the default.
/// It is **never wrong** to return `None`, but returning a line terminator
/// when it can appear in a match results in unspecified behavior.
///
/// The line terminator is typically `b'\n'`, but can be any single byte or
/// `CRLF`.
///
/// By default, this returns `None`.
fn line_terminator(&self) -> Option<LineTerminator> {
None
}
/// Return one of the following: a confirmed line match, a candidate line
/// match (which may be a false positive) or no match at all (which **must
/// not** be a false negative). When reporting a confirmed or candidate
/// match, the position returned can be any position in the line.
///
/// By default, this never returns a candidate match, and always either
/// returns a confirmed match or no match at all.
///
/// When a matcher can match spans over multiple lines, then the behavior
/// of this method is unspecified. Namely, use of this method only
/// makes sense in a context where the caller is looking for the next
/// matching line. That is, callers should only use this method when
/// `line_terminator` does not return `None`.
///
/// # Design rationale
///
/// A line matcher is, fundamentally, a normal matcher with the addition
/// of one optional method: finding a line. By default, this routine
/// is implemented via the matcher's `shortest_match` method, which
/// always yields either no match or a `LineMatchKind::Confirmed`. However,
/// implementors may provide a routine for this that can return candidate
/// lines that need subsequent verification to be confirmed as a match.
/// This can be useful in cases where it may be quicker to find candidate
/// lines via some other means instead of relying on the more general
/// implementations for `find` and `shortest_match`.
///
/// For example, consider the regex `\w+foo\s+`. Both `find` and
/// `shortest_match` must consider the entire regex, including the `\w+`
/// and `\s+`, while searching. However, this method could look for lines
/// containing `foo` and return them as candidates. Finding `foo` might
/// be implemented as a highly optimized substring search routine (like
/// `memmem`), which is likely to be faster than whatever more generalized
/// routine is required for resolving `\w+foo\s+`. The caller is then
/// responsible for confirming whether a match exists or not.
///
/// Note that while this method may report false positives, it must never
/// report false negatives. That is, it can never skip over lines that
/// contain a match.
fn find_candidate_line(
&self,
haystack: &[u8],
) -> Result<Option<LineMatchKind>, Self::Error> {
Ok(self.shortest_match(haystack)?.map(LineMatchKind::Confirmed))
}
}
impl<'a, M: Matcher> Matcher for &'a M {
type Captures = M::Captures;
type Error = M::Error;
fn find_at(
&self,
haystack: &[u8],
at: usize,
) -> Result<Option<Match>, Self::Error> {
(*self).find_at(haystack, at)
}
fn new_captures(&self) -> Result<Self::Captures, Self::Error> {
(*self).new_captures()
}
fn captures_at(
&self,
haystack: &[u8],
at: usize,
caps: &mut Self::Captures,
) -> Result<bool, Self::Error> {
(*self).captures_at(haystack, at, caps)
}
fn capture_index(&self, name: &str) -> Option<usize> {
(*self).capture_index(name)
}
fn capture_count(&self) -> usize {
(*self).capture_count()
}
fn find(
&self,
haystack: &[u8]
) -> Result<Option<Match>, Self::Error> {
(*self).find(haystack)
}
fn find_iter<F>(
&self,
haystack: &[u8],
matched: F,
) -> Result<(), Self::Error>
where F: FnMut(Match) -> bool
{
(*self).find_iter(haystack, matched)
}
fn try_find_iter<F, E>(
&self,
haystack: &[u8],
matched: F,
) -> Result<Result<(), E>, Self::Error>
where F: FnMut(Match) -> Result<bool, E>
{
(*self).try_find_iter(haystack, matched)
}
fn captures(
&self,
haystack: &[u8],
caps: &mut Self::Captures,
) -> Result<bool, Self::Error> {
(*self).captures(haystack, caps)
}
fn captures_iter<F>(
&self,
haystack: &[u8],
caps: &mut Self::Captures,
matched: F,
) -> Result<(), Self::Error>
where F: FnMut(&Self::Captures) -> bool
{
(*self).captures_iter(haystack, caps, matched)
}
fn try_captures_iter<F, E>(
&self,
haystack: &[u8],
caps: &mut Self::Captures,
matched: F,
) -> Result<Result<(), E>, Self::Error>
where F: FnMut(&Self::Captures) -> Result<bool, E>
{
(*self).try_captures_iter(haystack, caps, matched)
}
fn replace<F>(
&self,
haystack: &[u8],
dst: &mut Vec<u8>,
append: F,
) -> Result<(), Self::Error>
where F: FnMut(Match, &mut Vec<u8>) -> bool
{
(*self).replace(haystack, dst, append)
}
fn replace_with_captures<F>(
&self,
haystack: &[u8],
caps: &mut Self::Captures,
dst: &mut Vec<u8>,
append: F,
) -> Result<(), Self::Error>
where F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool
{
(*self).replace_with_captures(haystack, caps, dst, append)
}
fn is_match(&self, haystack: &[u8]) -> Result<bool, Self::Error> {
(*self).is_match(haystack)
}
fn is_match_at(
&self,
haystack: &[u8],
at: usize
) -> Result<bool, Self::Error> {
(*self).is_match_at(haystack, at)
}
fn shortest_match(
&self,
haystack: &[u8],
) -> Result<Option<usize>, Self::Error> {
(*self).shortest_match(haystack)
}
fn shortest_match_at(
&self,
haystack: &[u8],
at: usize,
) -> Result<Option<usize>, Self::Error> {
(*self).shortest_match_at(haystack, at)
}
fn non_matching_bytes(&self) -> Option<&ByteSet> {
(*self).non_matching_bytes()
}
fn line_terminator(&self) -> Option<LineTerminator> {
(*self).line_terminator()
}
fn find_candidate_line(
&self,
haystack: &[u8],
) -> Result<Option<LineMatchKind>, Self::Error> {
(*self).find_candidate_line(haystack)
}
}