diff --git a/Cargo.lock b/Cargo.lock index 8f8965d3..20447b8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -34,6 +34,13 @@ name = "bitflags" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "bstr" +version = "0.0.1" +dependencies = [ + "memchr 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "bytecount" version = "0.5.0" @@ -180,7 +187,7 @@ dependencies = [ name = "grep-matcher" version = "0.1.1" dependencies = [ - "memchr 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "bstr 0.0.1", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -222,13 +229,13 @@ dependencies = [ name = "grep-searcher" version = "0.1.1" dependencies = [ + "bstr 0.0.1", "bytecount 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "encoding_rs 0.8.14 (registry+https://github.com/rust-lang/crates.io-index)", "encoding_rs_io 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", "grep-matcher 0.1.1", "grep-regex 0.1.1", "log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", - "memchr 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] diff --git a/grep-matcher/Cargo.toml b/grep-matcher/Cargo.toml index 39391291..d65ee505 100644 --- a/grep-matcher/Cargo.toml +++ b/grep-matcher/Cargo.toml @@ -13,8 +13,11 @@ keywords = ["regex", "pattern", "trait"] license = "Unlicense/MIT" autotests = false -[dependencies] -memchr = "2.1" +[dependencies.bstr] +version = "*" +path = "/home/andrew/rust/bstr" +default-features = false +features = ["std"] [dev-dependencies] regex = "1.1" diff --git a/grep-matcher/src/interpolate.rs b/grep-matcher/src/interpolate.rs index 168dd343..126ce521 100644 --- a/grep-matcher/src/interpolate.rs +++ b/grep-matcher/src/interpolate.rs @@ -1,6 +1,6 @@ use std::str; -use memchr::memchr; +use bstr::B; /// Interpolate capture references in `replacement` and write the interpolation /// result to `dst`. References in `replacement` take the form of $N or $name, @@ -22,7 +22,7 @@ pub fn interpolate( N: FnMut(&str) -> Option { while !replacement.is_empty() { - match memchr(b'$', replacement) { + match B(replacement).find_byte(b'$') { None => break, Some(i) => { dst.extend(&replacement[..i]); diff --git a/grep-matcher/src/lib.rs b/grep-matcher/src/lib.rs index 9a067efa..ba59b923 100644 --- a/grep-matcher/src/lib.rs +++ b/grep-matcher/src/lib.rs @@ -38,13 +38,15 @@ implementations. #![deny(missing_docs)] -extern crate memchr; +extern crate bstr; use std::fmt; use std::io; use std::ops; use std::u64; +use bstr::BStr; + use interpolate::interpolate; mod interpolate; @@ -180,6 +182,22 @@ impl ops::IndexMut for [u8] { } } +impl ops::Index for BStr { + type Output = BStr; + + #[inline] + fn index(&self, index: Match) -> &BStr { + &self[index.start..index.end] + } +} + +impl ops::IndexMut for BStr { + #[inline] + fn index_mut(&mut self, index: Match) -> &mut BStr { + &mut self[index.start..index.end] + } +} + impl ops::Index for str { type Output = str; diff --git a/grep-searcher/Cargo.toml b/grep-searcher/Cargo.toml index 4cf5fde3..1e5c4243 100644 --- a/grep-searcher/Cargo.toml +++ b/grep-searcher/Cargo.toml @@ -18,9 +18,14 @@ encoding_rs = "0.8.14" encoding_rs_io = "0.1.3" grep-matcher = { version = "0.1.1", path = "../grep-matcher" } log = "0.4.5" -memchr = "2.1" memmap = "0.7" +[dependencies.bstr] +version = "*" +path = "/home/andrew/rust/bstr" +default-features = false +features = ["std"] + [dev-dependencies] grep-regex = { version = "0.1.1", path = "../grep-regex" } regex = "1.1" diff --git a/grep-searcher/src/lib.rs b/grep-searcher/src/lib.rs index f3ec02f2..6a9f4ba7 100644 --- a/grep-searcher/src/lib.rs +++ b/grep-searcher/src/lib.rs @@ -99,13 +99,13 @@ searches stdin. #![deny(missing_docs)] +extern crate bstr; extern crate bytecount; extern crate encoding_rs; extern crate encoding_rs_io; extern crate grep_matcher; #[macro_use] extern crate log; -extern crate memchr; extern crate memmap; #[cfg(test)] extern crate regex; diff --git a/grep-searcher/src/line_buffer.rs b/grep-searcher/src/line_buffer.rs index 0f5a2a7a..5a969743 100644 --- a/grep-searcher/src/line_buffer.rs +++ b/grep-searcher/src/line_buffer.rs @@ -1,8 +1,7 @@ use std::cmp; use std::io; -use std::ptr; -use memchr::{memchr, memrchr}; +use bstr::{BStr, BString}; /// The default buffer capacity that we use for the line buffer. pub(crate) const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1<<10); // 8 KB @@ -123,7 +122,7 @@ impl LineBufferBuilder { pub fn build(&self) -> LineBuffer { LineBuffer { config: self.config, - buf: vec![0; self.config.capacity], + buf: BString::from(vec![0; self.config.capacity]), pos: 0, last_lineterm: 0, end: 0, @@ -254,7 +253,7 @@ impl<'b, R: io::Read> LineBufferReader<'b, R> { } /// Return the contents of this buffer. - pub fn buffer(&self) -> &[u8] { + pub fn buffer(&self) -> &BStr { self.line_buffer.buffer() } @@ -284,7 +283,7 @@ pub struct LineBuffer { /// The configuration of this buffer. config: Config, /// The primary buffer with which to hold data. - buf: Vec, + buf: BString, /// The current position of this buffer. This is always a valid sliceable /// index into `buf`, and its maximum value is the length of `buf`. pos: usize, @@ -339,13 +338,13 @@ impl LineBuffer { } /// Return the contents of this buffer. - fn buffer(&self) -> &[u8] { + fn buffer(&self) -> &BStr { &self.buf[self.pos..self.last_lineterm] } /// Return the contents of the free space beyond the end of the buffer as /// a mutable slice. - fn free_buffer(&mut self) -> &mut [u8] { + fn free_buffer(&mut self) -> &mut BStr { &mut self.buf[self.end..] } @@ -396,7 +395,7 @@ impl LineBuffer { assert_eq!(self.pos, 0); loop { self.ensure_capacity()?; - let readlen = rdr.read(self.free_buffer())?; + let readlen = rdr.read(self.free_buffer().as_bytes_mut())?; if readlen == 0 { // We're only done reading for good once the caller has // consumed everything. @@ -416,7 +415,7 @@ impl LineBuffer { match self.config.binary { BinaryDetection::None => {} // nothing to do BinaryDetection::Quit(byte) => { - if let Some(i) = memchr(byte, newbytes) { + if let Some(i) = newbytes.find_byte(byte) { self.end = oldend + i; self.last_lineterm = self.end; self.binary_byte_offset = @@ -444,7 +443,7 @@ impl LineBuffer { } // Update our `last_lineterm` positions if we read one. - if let Some(i) = memrchr(self.config.lineterm, newbytes) { + if let Some(i) = newbytes.rfind_byte(self.config.lineterm) { self.last_lineterm = oldend + i + 1; return Ok(true); } @@ -467,40 +466,8 @@ impl LineBuffer { return; } - assert!(self.pos < self.end && self.end <= self.buf.len()); let roll_len = self.end - self.pos; - unsafe { - // SAFETY: A buffer contains Copy data, so there's no problem - // moving it around. Safety also depends on our indices being - // in bounds, which they should always be, and we enforce with - // an assert above. - // - // It seems like it should be possible to do this in safe code that - // results in the same codegen. I tried the obvious: - // - // for (src, dst) in (self.pos..self.end).zip(0..) { - // self.buf[dst] = self.buf[src]; - // } - // - // But the above does not work, and in fact compiles down to a slow - // byte-by-byte loop. I tried a few other minor variations, but - // alas, better minds might prevail. - // - // Overall, this doesn't save us *too* much. It mostly matters when - // the number of bytes we're copying is large, which can happen - // if the searcher is asked to produce a lot of context. We could - // decide this isn't worth it, but it does make an appreciable - // impact at or around the context=30 range on my machine. - // - // We could also use a temporary buffer that compiles down to two - // memcpys and is faster than the byte-at-a-time loop, but it - // complicates our options for limiting memory allocation a bit. - ptr::copy( - self.buf[self.pos..].as_ptr(), - self.buf.as_mut_ptr(), - roll_len, - ); - } + self.buf.copy_within(self.pos.., 0); self.pos = 0; self.last_lineterm = roll_len; self.end = roll_len; @@ -536,14 +503,15 @@ impl LineBuffer { } } -/// Replaces `src` with `replacement` in bytes. -fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option { +/// Replaces `src` with `replacement` in bytes, and return the offset of the +/// first replacement, if one exists. +fn replace_bytes(bytes: &mut BStr, src: u8, replacement: u8) -> Option { if src == replacement { return None; } let mut first_pos = None; let mut pos = 0; - while let Some(i) = memchr(src, &bytes[pos..]).map(|i| pos + i) { + while let Some(i) = bytes[pos..].find_byte(src).map(|i| pos + i) { if first_pos.is_none() { first_pos = Some(i); } @@ -560,6 +528,7 @@ fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option { #[cfg(test)] mod tests { use std::str; + use bstr::BString; use super::*; const SHERLOCK: &'static str = "\ @@ -575,18 +544,14 @@ and exhibited clearly, with a label attached.\ slice.to_string() } - fn btos(slice: &[u8]) -> &str { - str::from_utf8(slice).unwrap() - } - fn replace_str( slice: &str, src: u8, replacement: u8, ) -> (String, Option) { - let mut dst = slice.to_string().into_bytes(); + let mut dst = BString::from(slice); let result = replace_bytes(&mut dst, src, replacement); - (String::from_utf8(dst).unwrap(), result) + (dst.into_string().unwrap(), result) } #[test] @@ -607,7 +572,7 @@ and exhibited clearly, with a label attached.\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nlisa\n"); + assert_eq!(rdr.buffer(), "homer\nlisa\n"); assert_eq!(rdr.absolute_byte_offset(), 0); rdr.consume(5); assert_eq!(rdr.absolute_byte_offset(), 5); @@ -615,7 +580,7 @@ and exhibited clearly, with a label attached.\ assert_eq!(rdr.absolute_byte_offset(), 11); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "maggie"); + assert_eq!(rdr.buffer(), "maggie"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -630,7 +595,7 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n"); + assert_eq!(rdr.buffer(), "homer\nlisa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -645,7 +610,7 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "\n"); + assert_eq!(rdr.buffer(), "\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -660,7 +625,7 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "\n\n"); + assert_eq!(rdr.buffer(), "\n\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -698,12 +663,12 @@ and exhibited clearly, with a label attached.\ let mut linebuf = LineBufferBuilder::new().capacity(1).build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); - let mut got = vec![]; + let mut got = BString::new(); while rdr.fill().unwrap() { - got.extend(rdr.buffer()); + got.push(rdr.buffer()); rdr.consume_all(); } - assert_eq!(bytes, btos(&got)); + assert_eq!(bytes, got); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), None); } @@ -718,11 +683,11 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\n"); + assert_eq!(rdr.buffer(), "homer\n"); rdr.consume_all(); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "lisa\n"); + assert_eq!(rdr.buffer(), "lisa\n"); rdr.consume_all(); // This returns an error because while we have just enough room to @@ -732,11 +697,11 @@ and exhibited clearly, with a label attached.\ assert!(rdr.fill().is_err()); // We can mush on though! - assert_eq!(btos(rdr.buffer()), "m"); + assert_eq!(rdr.buffer(), "m"); rdr.consume_all(); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "aggie"); + assert_eq!(rdr.buffer(), "aggie"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -752,16 +717,16 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\n"); + assert_eq!(rdr.buffer(), "homer\n"); rdr.consume_all(); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "lisa\n"); + assert_eq!(rdr.buffer(), "lisa\n"); rdr.consume_all(); // We have just enough space. assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "maggie"); + assert_eq!(rdr.buffer(), "maggie"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -777,7 +742,7 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().is_err()); - assert_eq!(btos(rdr.buffer()), ""); + assert_eq!(rdr.buffer(), ""); } #[test] @@ -789,7 +754,7 @@ and exhibited clearly, with a label attached.\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nli\x00sa\nmaggie\n"); + assert_eq!(rdr.buffer(), "homer\nli\x00sa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -808,7 +773,7 @@ and exhibited clearly, with a label attached.\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nli"); + assert_eq!(rdr.buffer(), "homer\nli"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -825,7 +790,7 @@ and exhibited clearly, with a label attached.\ let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(!rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), ""); + assert_eq!(rdr.buffer(), ""); assert_eq!(rdr.absolute_byte_offset(), 0); assert_eq!(rdr.binary_byte_offset(), Some(0)); } @@ -841,7 +806,7 @@ and exhibited clearly, with a label attached.\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n"); + assert_eq!(rdr.buffer(), "homer\nlisa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -860,7 +825,7 @@ and exhibited clearly, with a label attached.\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie"); + assert_eq!(rdr.buffer(), "homer\nlisa\nmaggie"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -878,7 +843,7 @@ and exhibited clearly, with a label attached.\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "\ + assert_eq!(rdr.buffer(), "\ For the Doctor Watsons of this world, as opposed to the Sherlock Holmeses, s\ "); @@ -901,7 +866,7 @@ Holmeses, s\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nli\nsa\nmaggie\n"); + assert_eq!(rdr.buffer(), "homer\nli\nsa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -920,7 +885,7 @@ Holmeses, s\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "\nhomer\nlisa\nmaggie\n"); + assert_eq!(rdr.buffer(), "\nhomer\nlisa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -939,7 +904,7 @@ Holmeses, s\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n"); + assert_eq!(rdr.buffer(), "homer\nlisa\nmaggie\n\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); @@ -958,7 +923,7 @@ Holmeses, s\ assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); - assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n"); + assert_eq!(rdr.buffer(), "homer\nlisa\nmaggie\n\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); diff --git a/grep-searcher/src/lines.rs b/grep-searcher/src/lines.rs index ed225a42..aac7a343 100644 --- a/grep-searcher/src/lines.rs +++ b/grep-searcher/src/lines.rs @@ -2,8 +2,8 @@ A collection of routines for performing operations on lines. */ +use bstr::{B, BStr}; use bytecount; -use memchr::{memchr, memrchr}; use grep_matcher::{LineTerminator, Match}; /// An iterator over lines in a particular slice of bytes. @@ -14,7 +14,7 @@ use grep_matcher::{LineTerminator, Match}; /// `'b` refers to the lifetime of the underlying bytes. #[derive(Debug)] pub struct LineIter<'b> { - bytes: &'b [u8], + bytes: &'b BStr, stepper: LineStep, } @@ -23,7 +23,7 @@ impl<'b> LineIter<'b> { /// are terminated by `line_term`. pub fn new(line_term: u8, bytes: &'b [u8]) -> LineIter<'b> { LineIter { - bytes: bytes, + bytes: B(bytes), stepper: LineStep::new(line_term, 0, bytes.len()), } } @@ -33,7 +33,7 @@ impl<'b> Iterator for LineIter<'b> { type Item = &'b [u8]; fn next(&mut self) -> Option<&'b [u8]> { - self.stepper.next_match(self.bytes).map(|m| &self.bytes[m]) + self.stepper.next_match(self.bytes).map(|m| self.bytes[m].as_bytes()) } } @@ -73,19 +73,19 @@ impl LineStep { /// The range returned includes the line terminator. Ranges are always /// non-empty. pub fn next(&mut self, bytes: &[u8]) -> Option<(usize, usize)> { - self.next_impl(bytes) + self.next_impl(B(bytes)) } /// Like next, but returns a `Match` instead of a tuple. #[inline(always)] - pub(crate) fn next_match(&mut self, bytes: &[u8]) -> Option { + pub(crate) fn next_match(&mut self, bytes: &BStr) -> Option { self.next_impl(bytes).map(|(s, e)| Match::new(s, e)) } #[inline(always)] - fn next_impl(&mut self, mut bytes: &[u8]) -> Option<(usize, usize)> { + fn next_impl(&mut self, mut bytes: &BStr) -> Option<(usize, usize)> { bytes = &bytes[..self.end]; - match memchr(self.line_term, &bytes[self.pos..]) { + match bytes[self.pos..].find_byte(self.line_term) { None => { if self.pos < bytes.len() { let m = (self.pos, bytes.len()); @@ -109,15 +109,15 @@ impl LineStep { } /// Count the number of occurrences of `line_term` in `bytes`. -pub fn count(bytes: &[u8], line_term: u8) -> u64 { - bytecount::count(bytes, line_term) as u64 +pub fn count(bytes: &BStr, line_term: u8) -> u64 { + bytecount::count(bytes.as_bytes(), line_term) as u64 } /// Given a line that possibly ends with a terminator, return that line without /// the terminator. #[inline(always)] -pub fn without_terminator(bytes: &[u8], line_term: LineTerminator) -> &[u8] { - let line_term = line_term.as_bytes(); +pub fn without_terminator(bytes: &BStr, line_term: LineTerminator) -> &BStr { + let line_term = BStr::new(line_term.as_bytes()); let start = bytes.len().saturating_sub(line_term.len()); if bytes.get(start..) == Some(line_term) { return &bytes[..bytes.len() - line_term.len()]; @@ -131,18 +131,20 @@ pub fn without_terminator(bytes: &[u8], line_term: LineTerminator) -> &[u8] { /// Line terminators are considered part of the line they terminate. #[inline(always)] pub fn locate( - bytes: &[u8], + bytes: &BStr, line_term: u8, range: Match, ) -> Match { - let line_start = memrchr(line_term, &bytes[0..range.start()]) + let line_start = bytes[..range.start()] + .rfind_byte(line_term) .map_or(0, |i| i + 1); let line_end = if range.end() > line_start && bytes[range.end() - 1] == line_term { range.end() } else { - memchr(line_term, &bytes[range.end()..]) - .map_or(bytes.len(), |i| range.end() + i + 1) + bytes[range.end()..] + .find_byte(line_term) + .map_or(bytes.len(), |i| range.end() + i + 1) }; Match::new(line_start, line_end) } @@ -155,7 +157,7 @@ pub fn locate( /// /// If `bytes` ends with a line terminator, then the terminator itself is /// considered part of the last line. -pub fn preceding(bytes: &[u8], line_term: u8, count: usize) -> usize { +pub fn preceding(bytes: &BStr, line_term: u8, count: usize) -> usize { preceding_by_pos(bytes, bytes.len(), line_term, count) } @@ -169,7 +171,7 @@ pub fn preceding(bytes: &[u8], line_term: u8, count: usize) -> usize { /// and `pos = 7`, `preceding(bytes, pos, b'\n', 0)` returns `4` (as does `pos /// = 8`) and `preceding(bytes, pos, `b'\n', 1)` returns `0`. fn preceding_by_pos( - bytes: &[u8], + bytes: &BStr, mut pos: usize, line_term: u8, mut count: usize, @@ -180,7 +182,7 @@ fn preceding_by_pos( pos -= 1; } loop { - match memrchr(line_term, &bytes[..pos]) { + match bytes[..pos].rfind_byte(line_term) { None => { return 0; } @@ -201,7 +203,10 @@ fn preceding_by_pos( mod tests { use std::ops::Range; use std::str; + + use bstr::B; use grep_matcher::Match; + use super::*; const SHERLOCK: &'static str = "\ @@ -220,7 +225,7 @@ and exhibited clearly, with a label attached.\ fn lines(text: &str) -> Vec<&str> { let mut results = vec![]; let mut it = LineStep::new(b'\n', 0, text.len()); - while let Some(m) = it.next_match(text.as_bytes()) { + while let Some(m) = it.next_match(B(text)) { results.push(&text[m]); } results @@ -229,26 +234,26 @@ and exhibited clearly, with a label attached.\ fn line_ranges(text: &str) -> Vec> { let mut results = vec![]; let mut it = LineStep::new(b'\n', 0, text.len()); - while let Some(m) = it.next_match(text.as_bytes()) { + while let Some(m) = it.next_match(B(text)) { results.push(m.start()..m.end()); } results } fn prev(text: &str, pos: usize, count: usize) -> usize { - preceding_by_pos(text.as_bytes(), pos, b'\n', count) + preceding_by_pos(B(text), pos, b'\n', count) } fn loc(text: &str, start: usize, end: usize) -> Match { - locate(text.as_bytes(), b'\n', Match::new(start, end)) + locate(B(text), b'\n', Match::new(start, end)) } #[test] fn line_count() { - assert_eq!(0, count(b"", b'\n')); - assert_eq!(1, count(b"\n", b'\n')); - assert_eq!(2, count(b"\n\n", b'\n')); - assert_eq!(2, count(b"a\nb\nc", b'\n')); + assert_eq!(0, count(B(""), b'\n')); + assert_eq!(1, count(B("\n"), b'\n')); + assert_eq!(2, count(B("\n\n"), b'\n')); + assert_eq!(2, count(B("a\nb\nc"), b'\n')); } #[test] @@ -331,7 +336,7 @@ and exhibited clearly, with a label attached.\ #[test] fn preceding_lines_doc() { // These are the examples mentions in the documentation of `preceding`. - let bytes = b"abc\nxyz\n"; + let bytes = B("abc\nxyz\n"); assert_eq!(4, preceding_by_pos(bytes, 7, b'\n', 0)); assert_eq!(4, preceding_by_pos(bytes, 8, b'\n', 0)); assert_eq!(0, preceding_by_pos(bytes, 7, b'\n', 1)); diff --git a/grep-searcher/src/searcher/core.rs b/grep-searcher/src/searcher/core.rs index 21dbae37..77f8369b 100644 --- a/grep-searcher/src/searcher/core.rs +++ b/grep-searcher/src/searcher/core.rs @@ -1,6 +1,6 @@ use std::cmp; -use memchr::memchr; +use bstr::BStr; use grep_matcher::{LineMatchKind, Matcher}; use lines::{self, LineStep}; @@ -84,7 +84,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { pub fn matched( &mut self, - buf: &[u8], + buf: &BStr, range: &Range, ) -> Result { self.sink_matched(buf, range) @@ -107,7 +107,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { }) } - pub fn match_by_line(&mut self, buf: &[u8]) -> Result { + pub fn match_by_line(&mut self, buf: &BStr) -> Result { if self.is_line_by_line_fast() { self.match_by_line_fast(buf) } else { @@ -115,7 +115,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { } } - pub fn roll(&mut self, buf: &[u8]) -> usize { + pub fn roll(&mut self, buf: &BStr) -> usize { let consumed = if self.config.max_context() == 0 { buf.len() @@ -141,7 +141,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { consumed } - pub fn detect_binary(&mut self, buf: &[u8], range: &Range) -> bool { + pub fn detect_binary(&mut self, buf: &BStr, range: &Range) -> bool { if self.binary_byte_offset.is_some() { return true; } @@ -149,7 +149,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { BinaryDetection::Quit(b) => b, _ => return false, }; - if let Some(i) = memchr(binary_byte, &buf[*range]) { + if let Some(i) = buf[*range].find_byte(binary_byte) { self.binary_byte_offset = Some(range.start() + i); true } else { @@ -159,7 +159,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { pub fn before_context_by_line( &mut self, - buf: &[u8], + buf: &BStr, upto: usize, ) -> Result { if self.config.before_context == 0 { @@ -194,7 +194,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { pub fn after_context_by_line( &mut self, - buf: &[u8], + buf: &BStr, upto: usize, ) -> Result { if self.after_context_left == 0 { @@ -219,7 +219,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { pub fn other_context_by_line( &mut self, - buf: &[u8], + buf: &BStr, upto: usize, ) -> Result { let range = Range::new(self.last_line_visited, upto); @@ -236,7 +236,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { Ok(true) } - fn match_by_line_slow(&mut self, buf: &[u8]) -> Result { + fn match_by_line_slow(&mut self, buf: &BStr) -> Result { debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher)); let range = Range::new(self.pos(), buf.len()); @@ -255,7 +255,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { &buf[line], self.config.line_term, ); - match self.matcher.shortest_match(slice) { + match self.matcher.shortest_match(slice.as_bytes()) { Err(err) => return Err(S::Error::error_message(err)), Ok(result) => result.is_some(), } @@ -281,7 +281,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { Ok(true) } - fn match_by_line_fast(&mut self, buf: &[u8]) -> Result { + fn match_by_line_fast(&mut self, buf: &BStr) -> Result { debug_assert!(!self.config.passthru); while !buf[self.pos()..].is_empty() { @@ -316,7 +316,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { #[inline(always)] fn match_by_line_fast_invert( &mut self, - buf: &[u8], + buf: &BStr, ) -> Result { assert!(self.config.invert_match); @@ -357,14 +357,14 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { #[inline(always)] fn find_by_line_fast( &self, - buf: &[u8], + buf: &BStr, ) -> Result, S::Error> { debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher)); debug_assert!(self.is_line_by_line_fast()); let mut pos = self.pos(); while !buf[pos..].is_empty() { - match self.matcher.find_candidate_line(&buf[pos..]) { + match self.matcher.find_candidate_line(buf[pos..].as_bytes()) { Err(err) => return Err(S::Error::error_message(err)), Ok(None) => return Ok(None), Ok(Some(LineMatchKind::Confirmed(i))) => { @@ -396,7 +396,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { &buf[line], self.config.line_term, ); - match self.matcher.is_match(slice) { + match self.matcher.is_match(slice.as_bytes()) { Err(err) => return Err(S::Error::error_message(err)), Ok(true) => return Ok(Some(line)), Ok(false) => { @@ -413,7 +413,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { #[inline(always)] fn sink_matched( &mut self, - buf: &[u8], + buf: &BStr, range: &Range, ) -> Result { if self.binary && self.detect_binary(buf, range) { @@ -438,7 +438,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { &self.searcher, &SinkMatch { line_term: self.config.line_term, - bytes: linebuf, + bytes: linebuf.as_bytes(), absolute_byte_offset: offset, line_number: self.line_number, }, @@ -454,7 +454,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { fn sink_before_context( &mut self, - buf: &[u8], + buf: &BStr, range: &Range, ) -> Result { if self.binary && self.detect_binary(buf, range) { @@ -466,7 +466,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { &self.searcher, &SinkContext { line_term: self.config.line_term, - bytes: &buf[*range], + bytes: buf[*range].as_bytes(), kind: SinkContextKind::Before, absolute_byte_offset: offset, line_number: self.line_number, @@ -482,7 +482,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { fn sink_after_context( &mut self, - buf: &[u8], + buf: &BStr, range: &Range, ) -> Result { assert!(self.after_context_left >= 1); @@ -496,7 +496,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { &self.searcher, &SinkContext { line_term: self.config.line_term, - bytes: &buf[*range], + bytes: buf[*range].as_bytes(), kind: SinkContextKind::After, absolute_byte_offset: offset, line_number: self.line_number, @@ -513,7 +513,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { fn sink_other_context( &mut self, - buf: &[u8], + buf: &BStr, range: &Range, ) -> Result { if self.binary && self.detect_binary(buf, range) { @@ -525,7 +525,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { &self.searcher, &SinkContext { line_term: self.config.line_term, - bytes: &buf[*range], + bytes: buf[*range].as_bytes(), kind: SinkContextKind::Other, absolute_byte_offset: offset, line_number: self.line_number, @@ -555,7 +555,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { } } - fn count_lines(&mut self, buf: &[u8], upto: usize) { + fn count_lines(&mut self, buf: &BStr, upto: usize) { if let Some(ref mut line_number) = self.line_number { if self.last_line_counted >= upto { return; diff --git a/grep-searcher/src/searcher/glue.rs b/grep-searcher/src/searcher/glue.rs index 3a5d4291..5d24129a 100644 --- a/grep-searcher/src/searcher/glue.rs +++ b/grep-searcher/src/searcher/glue.rs @@ -1,7 +1,9 @@ use std::cmp; use std::io; +use bstr::BStr; use grep_matcher::Matcher; + use lines::{self, LineStep}; use line_buffer::{DEFAULT_BUFFER_CAPACITY, LineBufferReader}; use sink::{Sink, SinkError}; @@ -77,14 +79,14 @@ where M: Matcher, pub struct SliceByLine<'s, M: 's, S> { config: &'s Config, core: Core<'s, M, S>, - slice: &'s [u8], + slice: &'s BStr, } impl<'s, M: Matcher, S: Sink> SliceByLine<'s, M, S> { pub fn new( searcher: &'s Searcher, matcher: M, - slice: &'s [u8], + slice: &'s BStr, write_to: S, ) -> SliceByLine<'s, M, S> { debug_assert!(!searcher.multi_line_with_matcher(&matcher)); @@ -127,7 +129,7 @@ impl<'s, M: Matcher, S: Sink> SliceByLine<'s, M, S> { pub struct MultiLine<'s, M: 's, S> { config: &'s Config, core: Core<'s, M, S>, - slice: &'s [u8], + slice: &'s BStr, last_match: Option, } @@ -135,7 +137,7 @@ impl<'s, M: Matcher, S: Sink> MultiLine<'s, M, S> { pub fn new( searcher: &'s Searcher, matcher: M, - slice: &'s [u8], + slice: &'s BStr, write_to: S, ) -> MultiLine<'s, M, S> { debug_assert!(searcher.multi_line_with_matcher(&matcher)); @@ -306,7 +308,8 @@ impl<'s, M: Matcher, S: Sink> MultiLine<'s, M, S> { } fn find(&mut self) -> Result, S::Error> { - match self.core.matcher().find(&self.slice[self.core.pos()..]) { + let haystack = &self.slice[self.core.pos()..]; + match self.core.matcher().find(haystack.as_bytes()) { Err(err) => Err(S::Error::error_message(err)), Ok(None) => Ok(None), Ok(Some(m)) => Ok(Some(m.offset(self.core.pos()))), diff --git a/grep-searcher/src/searcher/mod.rs b/grep-searcher/src/searcher/mod.rs index bc428b68..9f77e8cd 100644 --- a/grep-searcher/src/searcher/mod.rs +++ b/grep-searcher/src/searcher/mod.rs @@ -5,6 +5,7 @@ use std::fs::File; use std::io::{self, Read}; use std::path::Path; +use bstr::{B, BStr, BString}; use encoding_rs; use encoding_rs_io::DecodeReaderBytesBuilder; use grep_matcher::{LineTerminator, Match, Matcher}; @@ -311,9 +312,9 @@ impl SearcherBuilder { Searcher { config: config, decode_builder: decode_builder, - decode_buffer: RefCell::new(vec![0; 8 * (1<<10)]), + decode_buffer: RefCell::new(BString::from(vec![0; 8 * (1<<10)])), line_buffer: RefCell::new(self.config.line_buffer()), - multi_line_buffer: RefCell::new(vec![]), + multi_line_buffer: RefCell::new(BString::new()), } } @@ -543,7 +544,7 @@ pub struct Searcher { /// through the underlying bytes with no additional overhead. decode_builder: DecodeReaderBytesBuilder, /// A buffer that is used for transcoding scratch space. - decode_buffer: RefCell>, + decode_buffer: RefCell, /// A line buffer for use in line oriented searching. /// /// We wrap it in a RefCell to permit lending out borrows of `Searcher` @@ -555,7 +556,7 @@ pub struct Searcher { /// multi line search. In particular, multi line searches cannot be /// performed incrementally, and need the entire haystack in memory at /// once. - multi_line_buffer: RefCell>, + multi_line_buffer: RefCell, } impl Searcher { @@ -666,7 +667,7 @@ impl Searcher { let mut decode_buffer = self.decode_buffer.borrow_mut(); let read_from = self.decode_builder - .build_with_buffer(read_from, &mut *decode_buffer) + .build_with_buffer(read_from, decode_buffer.as_mut_vec()) .map_err(S::Error::error_io)?; if self.multi_line_with_matcher(&matcher) { @@ -698,12 +699,13 @@ impl Searcher { where M: Matcher, S: Sink, { + let slice = B(slice); self.check_config(&matcher).map_err(S::Error::error_config)?; // We can search the slice directly, unless we need to do transcoding. if self.slice_needs_transcoding(slice) { trace!("slice reader: needs transcoding, using generic reader"); - return self.search_reader(matcher, slice, write_to); + return self.search_reader(matcher, slice.as_bytes(), write_to); } if self.multi_line_with_matcher(&matcher) { trace!("slice reader: searching via multiline strategy"); @@ -736,7 +738,7 @@ impl Searcher { } /// Returns true if and only if the given slice needs to be transcoded. - fn slice_needs_transcoding(&self, slice: &[u8]) -> bool { + fn slice_needs_transcoding(&self, slice: &BStr) -> bool { self.config.encoding.is_some() || slice_has_utf16_bom(slice) } } @@ -851,7 +853,9 @@ impl Searcher { .map(|m| m.len() as usize + 1) .unwrap_or(0); buf.reserve(cap); - read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?; + read_from + .read_to_end(buf.as_mut_vec()) + .map_err(S::Error::error_io)?; return Ok(()); } self.fill_multi_line_buffer_from_reader::<_, S>(read_from) @@ -868,6 +872,7 @@ impl Searcher { assert!(self.config.multi_line); let mut buf = self.multi_line_buffer.borrow_mut(); + let buf = buf.as_mut_vec(); buf.clear(); // If we don't have a heap limit, then we can defer to std's @@ -919,8 +924,8 @@ impl Searcher { /// /// This is used by the searcher to determine if a transcoder is necessary. /// Otherwise, it is advantageous to search the slice directly. -fn slice_has_utf16_bom(slice: &[u8]) -> bool { - let enc = match encoding_rs::Encoding::for_bom(slice) { +fn slice_has_utf16_bom(slice: &BStr) -> bool { + let enc = match encoding_rs::Encoding::for_bom(slice.as_bytes()) { None => return false, Some((enc, _)) => enc, }; diff --git a/grep-searcher/src/testutil.rs b/grep-searcher/src/testutil.rs index b51508a1..ee79f108 100644 --- a/grep-searcher/src/testutil.rs +++ b/grep-searcher/src/testutil.rs @@ -1,10 +1,10 @@ use std::io::{self, Write}; use std::str; +use bstr::B; use grep_matcher::{ LineMatchKind, LineTerminator, Match, Matcher, NoCaptures, NoError, }; -use memchr::memchr; use regex::bytes::{Regex, RegexBuilder}; use searcher::{BinaryDetection, Searcher, SearcherBuilder}; @@ -94,8 +94,8 @@ impl Matcher for RegexMatcher { } // Make it interesting and return the last byte in the current // line. - let i = memchr(self.line_term.unwrap().as_byte(), haystack) - .map(|i| i) + let i = B(haystack) + .find_byte(self.line_term.unwrap().as_byte()) .unwrap_or(haystack.len() - 1); Ok(Some(LineMatchKind::Candidate(i))) } else {