mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-07-31 04:02:00 -07:00
This undoes the patch to stop using bytecount on big-endian
architectures. In particular, we bump our bytecount dependency to the
latest release, which has a fix.
This reverts commit a4868b8835
.
Fixes #1144 (again), Closes #1194
463 lines
16 KiB
Rust
463 lines
16 KiB
Rust
/*!
|
|
A collection of routines for performing operations on lines.
|
|
*/
|
|
|
|
use bytecount;
|
|
use memchr::{memchr, memrchr};
|
|
use grep_matcher::{LineTerminator, Match};
|
|
|
|
/// An iterator over lines in a particular slice of bytes.
|
|
///
|
|
/// Line terminators are considered part of the line they terminate. All lines
|
|
/// yielded by the iterator are guaranteed to be non-empty.
|
|
///
|
|
/// `'b` refers to the lifetime of the underlying bytes.
|
|
#[derive(Debug)]
|
|
pub struct LineIter<'b> {
|
|
bytes: &'b [u8],
|
|
stepper: LineStep,
|
|
}
|
|
|
|
impl<'b> LineIter<'b> {
|
|
/// Create a new line iterator that yields lines in the given bytes that
|
|
/// are terminated by `line_term`.
|
|
pub fn new(line_term: u8, bytes: &'b [u8]) -> LineIter<'b> {
|
|
LineIter {
|
|
bytes: bytes,
|
|
stepper: LineStep::new(line_term, 0, bytes.len()),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<'b> Iterator for LineIter<'b> {
|
|
type Item = &'b [u8];
|
|
|
|
fn next(&mut self) -> Option<&'b [u8]> {
|
|
self.stepper.next_match(self.bytes).map(|m| &self.bytes[m])
|
|
}
|
|
}
|
|
|
|
/// An explicit iterator over lines in a particular slice of bytes.
|
|
///
|
|
/// This iterator avoids borrowing the bytes themselves, and instead requires
|
|
/// callers to explicitly provide the bytes when moving through the iterator.
|
|
/// While not idiomatic, this provides a simple way of iterating over lines
|
|
/// that doesn't require borrowing the slice itself, which can be convenient.
|
|
///
|
|
/// Line terminators are considered part of the line they terminate. All lines
|
|
/// yielded by the iterator are guaranteed to be non-empty.
|
|
#[derive(Debug)]
|
|
pub struct LineStep {
|
|
line_term: u8,
|
|
pos: usize,
|
|
end: usize,
|
|
}
|
|
|
|
impl LineStep {
|
|
/// Create a new line iterator over the given range of bytes using the
|
|
/// given line terminator.
|
|
///
|
|
/// Callers should provide the actual bytes for each call to `next`. The
|
|
/// same slice must be provided to each call.
|
|
///
|
|
/// This panics if `start` is not less than or equal to `end`.
|
|
pub fn new(line_term: u8, start: usize, end: usize) -> LineStep {
|
|
LineStep { line_term, pos: start, end: end }
|
|
}
|
|
|
|
/// Return the start and end position of the next line in the given bytes.
|
|
///
|
|
/// The caller must past exactly the same slice of bytes for each call to
|
|
/// `next`.
|
|
///
|
|
/// The range returned includes the line terminator. Ranges are always
|
|
/// non-empty.
|
|
pub fn next(&mut self, bytes: &[u8]) -> Option<(usize, usize)> {
|
|
self.next_impl(bytes)
|
|
}
|
|
|
|
/// Like next, but returns a `Match` instead of a tuple.
|
|
#[inline(always)]
|
|
pub(crate) fn next_match(&mut self, bytes: &[u8]) -> Option<Match> {
|
|
self.next_impl(bytes).map(|(s, e)| Match::new(s, e))
|
|
}
|
|
|
|
#[inline(always)]
|
|
fn next_impl(&mut self, mut bytes: &[u8]) -> Option<(usize, usize)> {
|
|
bytes = &bytes[..self.end];
|
|
match memchr(self.line_term, &bytes[self.pos..]) {
|
|
None => {
|
|
if self.pos < bytes.len() {
|
|
let m = (self.pos, bytes.len());
|
|
assert!(m.0 <= m.1);
|
|
|
|
self.pos = m.1;
|
|
Some(m)
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
Some(line_end) => {
|
|
let m = (self.pos, self.pos + line_end + 1);
|
|
assert!(m.0 <= m.1);
|
|
|
|
self.pos = m.1;
|
|
Some(m)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Count the number of occurrences of `line_term` in `bytes`.
|
|
pub fn count(bytes: &[u8], line_term: u8) -> u64 {
|
|
bytecount::count(bytes, line_term) as u64
|
|
}
|
|
|
|
/// Given a line that possibly ends with a terminator, return that line without
|
|
/// the terminator.
|
|
#[inline(always)]
|
|
pub fn without_terminator(bytes: &[u8], line_term: LineTerminator) -> &[u8] {
|
|
let line_term = line_term.as_bytes();
|
|
let start = bytes.len().saturating_sub(line_term.len());
|
|
if bytes.get(start..) == Some(line_term) {
|
|
return &bytes[..bytes.len() - line_term.len()];
|
|
}
|
|
bytes
|
|
}
|
|
|
|
/// Return the start and end offsets of the lines containing the given range
|
|
/// of bytes.
|
|
///
|
|
/// Line terminators are considered part of the line they terminate.
|
|
#[inline(always)]
|
|
pub fn locate(
|
|
bytes: &[u8],
|
|
line_term: u8,
|
|
range: Match,
|
|
) -> Match {
|
|
let line_start = memrchr(line_term, &bytes[0..range.start()])
|
|
.map_or(0, |i| i + 1);
|
|
let line_end =
|
|
if range.end() > line_start && bytes[range.end() - 1] == line_term {
|
|
range.end()
|
|
} else {
|
|
memchr(line_term, &bytes[range.end()..])
|
|
.map_or(bytes.len(), |i| range.end() + i + 1)
|
|
};
|
|
Match::new(line_start, line_end)
|
|
}
|
|
|
|
/// Returns the minimal starting offset of the line that occurs `count` lines
|
|
/// before the last line in `bytes`.
|
|
///
|
|
/// Lines are terminated by `line_term`. If `count` is zero, then this returns
|
|
/// the starting offset of the last line in `bytes`.
|
|
///
|
|
/// If `bytes` ends with a line terminator, then the terminator itself is
|
|
/// considered part of the last line.
|
|
pub fn preceding(bytes: &[u8], line_term: u8, count: usize) -> usize {
|
|
preceding_by_pos(bytes, bytes.len(), line_term, count)
|
|
}
|
|
|
|
/// Returns the minimal starting offset of the line that occurs `count` lines
|
|
/// before the line containing `pos`. Lines are terminated by `line_term`.
|
|
/// If `count` is zero, then this returns the starting offset of the line
|
|
/// containing `pos`.
|
|
///
|
|
/// If `pos` points just past a line terminator, then it is considered part of
|
|
/// the line that it terminates. For example, given `bytes = b"abc\nxyz\n"`
|
|
/// and `pos = 7`, `preceding(bytes, pos, b'\n', 0)` returns `4` (as does `pos
|
|
/// = 8`) and `preceding(bytes, pos, `b'\n', 1)` returns `0`.
|
|
fn preceding_by_pos(
|
|
bytes: &[u8],
|
|
mut pos: usize,
|
|
line_term: u8,
|
|
mut count: usize,
|
|
) -> usize {
|
|
if pos == 0 {
|
|
return 0;
|
|
} else if bytes[pos - 1] == line_term {
|
|
pos -= 1;
|
|
}
|
|
loop {
|
|
match memrchr(line_term, &bytes[..pos]) {
|
|
None => {
|
|
return 0;
|
|
}
|
|
Some(i) => {
|
|
if count == 0 {
|
|
return i + 1;
|
|
} else if i == 0 {
|
|
return 0;
|
|
}
|
|
count -= 1;
|
|
pos = i;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use std::ops::Range;
|
|
use std::str;
|
|
use grep_matcher::Match;
|
|
use super::*;
|
|
|
|
const SHERLOCK: &'static str = "\
|
|
For the Doctor Watsons of this world, as opposed to the Sherlock
|
|
Holmeses, success in the province of detective work must always
|
|
be, to a very large extent, the result of luck. Sherlock Holmes
|
|
can extract a clew from a wisp of straw or a flake of cigar ash;
|
|
but Doctor Watson has to have it taken out for him and dusted,
|
|
and exhibited clearly, with a label attached.\
|
|
";
|
|
|
|
fn m(start: usize, end: usize) -> Match {
|
|
Match::new(start, end)
|
|
}
|
|
|
|
fn lines(text: &str) -> Vec<&str> {
|
|
let mut results = vec![];
|
|
let mut it = LineStep::new(b'\n', 0, text.len());
|
|
while let Some(m) = it.next_match(text.as_bytes()) {
|
|
results.push(&text[m]);
|
|
}
|
|
results
|
|
}
|
|
|
|
fn line_ranges(text: &str) -> Vec<Range<usize>> {
|
|
let mut results = vec![];
|
|
let mut it = LineStep::new(b'\n', 0, text.len());
|
|
while let Some(m) = it.next_match(text.as_bytes()) {
|
|
results.push(m.start()..m.end());
|
|
}
|
|
results
|
|
}
|
|
|
|
fn prev(text: &str, pos: usize, count: usize) -> usize {
|
|
preceding_by_pos(text.as_bytes(), pos, b'\n', count)
|
|
}
|
|
|
|
fn loc(text: &str, start: usize, end: usize) -> Match {
|
|
locate(text.as_bytes(), b'\n', Match::new(start, end))
|
|
}
|
|
|
|
#[test]
|
|
fn line_count() {
|
|
assert_eq!(0, count(b"", b'\n'));
|
|
assert_eq!(1, count(b"\n", b'\n'));
|
|
assert_eq!(2, count(b"\n\n", b'\n'));
|
|
assert_eq!(2, count(b"a\nb\nc", b'\n'));
|
|
}
|
|
|
|
#[test]
|
|
fn line_locate() {
|
|
let t = SHERLOCK;
|
|
let lines = line_ranges(t);
|
|
|
|
assert_eq!(
|
|
loc(t, lines[0].start, lines[0].end),
|
|
m(lines[0].start, lines[0].end));
|
|
assert_eq!(
|
|
loc(t, lines[0].start + 1, lines[0].end),
|
|
m(lines[0].start, lines[0].end));
|
|
assert_eq!(
|
|
loc(t, lines[0].end - 1, lines[0].end),
|
|
m(lines[0].start, lines[0].end));
|
|
assert_eq!(
|
|
loc(t, lines[0].end, lines[0].end),
|
|
m(lines[1].start, lines[1].end));
|
|
|
|
assert_eq!(
|
|
loc(t, lines[5].start, lines[5].end),
|
|
m(lines[5].start, lines[5].end));
|
|
assert_eq!(
|
|
loc(t, lines[5].start + 1, lines[5].end),
|
|
m(lines[5].start, lines[5].end));
|
|
assert_eq!(
|
|
loc(t, lines[5].end - 1, lines[5].end),
|
|
m(lines[5].start, lines[5].end));
|
|
assert_eq!(
|
|
loc(t, lines[5].end, lines[5].end),
|
|
m(lines[5].start, lines[5].end));
|
|
}
|
|
|
|
#[test]
|
|
fn line_locate_weird() {
|
|
assert_eq!(loc("", 0, 0), m(0, 0));
|
|
|
|
assert_eq!(loc("\n", 0, 1), m(0, 1));
|
|
assert_eq!(loc("\n", 1, 1), m(1, 1));
|
|
|
|
assert_eq!(loc("\n\n", 0, 0), m(0, 1));
|
|
assert_eq!(loc("\n\n", 0, 1), m(0, 1));
|
|
assert_eq!(loc("\n\n", 1, 1), m(1, 2));
|
|
assert_eq!(loc("\n\n", 1, 2), m(1, 2));
|
|
assert_eq!(loc("\n\n", 2, 2), m(2, 2));
|
|
|
|
assert_eq!(loc("a\nb\nc", 0, 1), m(0, 2));
|
|
assert_eq!(loc("a\nb\nc", 1, 2), m(0, 2));
|
|
assert_eq!(loc("a\nb\nc", 2, 3), m(2, 4));
|
|
assert_eq!(loc("a\nb\nc", 3, 4), m(2, 4));
|
|
assert_eq!(loc("a\nb\nc", 4, 5), m(4, 5));
|
|
assert_eq!(loc("a\nb\nc", 5, 5), m(4, 5));
|
|
}
|
|
|
|
#[test]
|
|
fn line_iter() {
|
|
assert_eq!(lines("abc"), vec!["abc"]);
|
|
|
|
assert_eq!(lines("abc\n"), vec!["abc\n"]);
|
|
assert_eq!(lines("abc\nxyz"), vec!["abc\n", "xyz"]);
|
|
assert_eq!(lines("abc\nxyz\n"), vec!["abc\n", "xyz\n"]);
|
|
|
|
assert_eq!(lines("abc\n\n"), vec!["abc\n", "\n"]);
|
|
assert_eq!(lines("abc\n\n\n"), vec!["abc\n", "\n", "\n"]);
|
|
assert_eq!(lines("abc\n\nxyz"), vec!["abc\n", "\n", "xyz"]);
|
|
assert_eq!(lines("abc\n\nxyz\n"), vec!["abc\n", "\n", "xyz\n"]);
|
|
assert_eq!(lines("abc\nxyz\n\n"), vec!["abc\n", "xyz\n", "\n"]);
|
|
|
|
assert_eq!(lines("\n"), vec!["\n"]);
|
|
assert_eq!(lines(""), Vec::<&str>::new());
|
|
}
|
|
|
|
#[test]
|
|
fn line_iter_empty() {
|
|
let mut it = LineStep::new(b'\n', 0, 0);
|
|
assert_eq!(it.next(b"abc"), None);
|
|
}
|
|
|
|
#[test]
|
|
fn preceding_lines_doc() {
|
|
// These are the examples mentions in the documentation of `preceding`.
|
|
let bytes = b"abc\nxyz\n";
|
|
assert_eq!(4, preceding_by_pos(bytes, 7, b'\n', 0));
|
|
assert_eq!(4, preceding_by_pos(bytes, 8, b'\n', 0));
|
|
assert_eq!(0, preceding_by_pos(bytes, 7, b'\n', 1));
|
|
assert_eq!(0, preceding_by_pos(bytes, 8, b'\n', 1));
|
|
}
|
|
|
|
#[test]
|
|
fn preceding_lines_sherlock() {
|
|
let t = SHERLOCK;
|
|
let lines = line_ranges(t);
|
|
|
|
// The following tests check the count == 0 case, i.e., finding the
|
|
// beginning of the line containing the given position.
|
|
assert_eq!(0, prev(t, 0, 0));
|
|
assert_eq!(0, prev(t, 1, 0));
|
|
// The line terminator is addressed by `end-1` and terminates the line
|
|
// it is part of.
|
|
assert_eq!(0, prev(t, lines[0].end - 1, 0));
|
|
assert_eq!(lines[0].start, prev(t, lines[0].end, 0));
|
|
// The end position of line addresses the byte immediately following a
|
|
// line terminator, which puts it on the following line.
|
|
assert_eq!(lines[1].start, prev(t, lines[0].end + 1, 0));
|
|
|
|
// Now tests for count > 0.
|
|
assert_eq!(0, prev(t, 0, 1));
|
|
assert_eq!(0, prev(t, 0, 2));
|
|
assert_eq!(0, prev(t, 1, 1));
|
|
assert_eq!(0, prev(t, 1, 2));
|
|
assert_eq!(0, prev(t, lines[0].end - 1, 1));
|
|
assert_eq!(0, prev(t, lines[0].end - 1, 2));
|
|
assert_eq!(0, prev(t, lines[0].end, 1));
|
|
assert_eq!(0, prev(t, lines[0].end, 2));
|
|
assert_eq!(lines[3].start, prev(t, lines[4].end - 1, 1));
|
|
assert_eq!(lines[3].start, prev(t, lines[4].end, 1));
|
|
assert_eq!(lines[4].start, prev(t, lines[4].end + 1, 1));
|
|
|
|
// The last line has no line terminator.
|
|
assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
|
|
assert_eq!(lines[5].start, prev(t, lines[5].end - 1, 0));
|
|
assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
|
|
assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
|
|
}
|
|
|
|
#[test]
|
|
fn preceding_lines_short() {
|
|
let t = "a\nb\nc\nd\ne\nf\n";
|
|
let lines = line_ranges(t);
|
|
assert_eq!(12, t.len());
|
|
|
|
assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
|
|
assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
|
|
assert_eq!(lines[3].start, prev(t, lines[5].end, 2));
|
|
assert_eq!(lines[2].start, prev(t, lines[5].end, 3));
|
|
assert_eq!(lines[1].start, prev(t, lines[5].end, 4));
|
|
assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
|
|
assert_eq!(lines[0].start, prev(t, lines[5].end, 6));
|
|
|
|
assert_eq!(lines[5].start, prev(t, lines[5].end - 1, 0));
|
|
assert_eq!(lines[4].start, prev(t, lines[5].end - 1, 1));
|
|
assert_eq!(lines[3].start, prev(t, lines[5].end - 1, 2));
|
|
assert_eq!(lines[2].start, prev(t, lines[5].end - 1, 3));
|
|
assert_eq!(lines[1].start, prev(t, lines[5].end - 1, 4));
|
|
assert_eq!(lines[0].start, prev(t, lines[5].end - 1, 5));
|
|
assert_eq!(lines[0].start, prev(t, lines[5].end - 1, 6));
|
|
|
|
assert_eq!(lines[4].start, prev(t, lines[5].start, 0));
|
|
assert_eq!(lines[3].start, prev(t, lines[5].start, 1));
|
|
assert_eq!(lines[2].start, prev(t, lines[5].start, 2));
|
|
assert_eq!(lines[1].start, prev(t, lines[5].start, 3));
|
|
assert_eq!(lines[0].start, prev(t, lines[5].start, 4));
|
|
assert_eq!(lines[0].start, prev(t, lines[5].start, 5));
|
|
|
|
assert_eq!(lines[3].start, prev(t, lines[4].end - 1, 1));
|
|
assert_eq!(lines[2].start, prev(t, lines[4].start, 1));
|
|
|
|
assert_eq!(lines[2].start, prev(t, lines[3].end - 1, 1));
|
|
assert_eq!(lines[1].start, prev(t, lines[3].start, 1));
|
|
|
|
assert_eq!(lines[1].start, prev(t, lines[2].end - 1, 1));
|
|
assert_eq!(lines[0].start, prev(t, lines[2].start, 1));
|
|
|
|
assert_eq!(lines[0].start, prev(t, lines[1].end - 1, 1));
|
|
assert_eq!(lines[0].start, prev(t, lines[1].start, 1));
|
|
|
|
assert_eq!(lines[0].start, prev(t, lines[0].end - 1, 1));
|
|
assert_eq!(lines[0].start, prev(t, lines[0].start, 1));
|
|
}
|
|
|
|
#[test]
|
|
fn preceding_lines_empty1() {
|
|
let t = "\n\n\nd\ne\nf\n";
|
|
let lines = line_ranges(t);
|
|
assert_eq!(9, t.len());
|
|
|
|
assert_eq!(lines[0].start, prev(t, lines[0].end, 0));
|
|
assert_eq!(lines[0].start, prev(t, lines[0].end, 1));
|
|
assert_eq!(lines[1].start, prev(t, lines[1].end, 0));
|
|
assert_eq!(lines[0].start, prev(t, lines[1].end, 1));
|
|
|
|
assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
|
|
assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
|
|
assert_eq!(lines[3].start, prev(t, lines[5].end, 2));
|
|
assert_eq!(lines[2].start, prev(t, lines[5].end, 3));
|
|
assert_eq!(lines[1].start, prev(t, lines[5].end, 4));
|
|
assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
|
|
assert_eq!(lines[0].start, prev(t, lines[5].end, 6));
|
|
}
|
|
|
|
#[test]
|
|
fn preceding_lines_empty2() {
|
|
let t = "a\n\n\nd\ne\nf\n";
|
|
let lines = line_ranges(t);
|
|
assert_eq!(10, t.len());
|
|
|
|
assert_eq!(lines[0].start, prev(t, lines[0].end, 0));
|
|
assert_eq!(lines[0].start, prev(t, lines[0].end, 1));
|
|
assert_eq!(lines[1].start, prev(t, lines[1].end, 0));
|
|
assert_eq!(lines[0].start, prev(t, lines[1].end, 1));
|
|
|
|
assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
|
|
assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
|
|
assert_eq!(lines[3].start, prev(t, lines[5].end, 2));
|
|
assert_eq!(lines[2].start, prev(t, lines[5].end, 3));
|
|
assert_eq!(lines[1].start, prev(t, lines[5].end, 4));
|
|
assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
|
|
assert_eq!(lines[0].start, prev(t, lines[5].end, 6));
|
|
}
|
|
}
|