Compare commits

...

9 Commits

Author SHA1 Message Date
Andrew Gallant
a872d33714 deps: add bstr to Cargo.lock 2019-04-05 22:58:58 -04:00
Andrew Gallant
f08f274c5f regex: print out final regex in trace mode
This is useful for debugging to see what regex is actually being run.
We log this at the trace level since the regex can be quite gnarly. (It is
not pretty-printed.)
2019-04-05 22:58:58 -04:00
Andrew Gallant
db7e828989 regex: fix a perf bug when using -w flag
When looking for an inner literal to speed up searches, if only a prefix
is found, then we generally give up on inner literal optimizations, since
the regex engine will handle a prefix literal for us. Unfortunately, this
decision was being made *before* the regex is wrapped in (^|\W)...($|\W) for
the -w/--word-regexp flag, and that wrapping defeats the literal
optimizations inside the regex engine.

We fix this with a bit of a hack that says, "if we're doing a word regexp,
then give me back any literal you find, even if it's a prefix."
2019-04-05 22:58:58 -04:00
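For illustration, here is a minimal sketch (not ripgrep's actual code, which lives in grep-regex) of why the word wrapping hides a literal prefix from the regex engine, and how handing back the inner literal restores a cheap prefilter. It assumes the `regex` crate; the exact wrapping ripgrep uses differs slightly.

```rust
// A minimal sketch, assuming the `regex` crate; ripgrep's real wrapping and
// prefilter plumbing are more involved.
use regex::bytes::Regex;

fn main() {
    let pattern = "foobar";
    // Plain pattern: the engine can scan for the literal "foobar" directly.
    let plain = Regex::new(pattern).unwrap();
    // Word-wrapped pattern: the leading (?:^|\W) hides that literal from the
    // engine's prefix detection, so it falls back to a slower matching path.
    let wrapped =
        Regex::new(&format!(r"(?:^|\W)(?:{})(?:$|\W)", pattern)).unwrap();

    let haystack = b"xxxx foobar yyyy";
    assert!(plain.is_match(haystack));
    assert!(wrapped.is_match(haystack));

    // The fix: even when only a prefix literal is found, return it anyway so
    // a cheap byte search can locate candidate positions before running the
    // wrapped regex.
    let candidate = haystack
        .windows(pattern.len())
        .position(|w| w == pattern.as_bytes());
    assert_eq!(candidate, Some(5));
}
```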
Andrew Gallant
fb6cad7152 globset: small perf improvements
This tweaks the path handling functions slightly to make them a hair
faster. In particular, `file_name` is called on every path that ripgrep
visits, and it was possible to remove a few branches without changing
behavior.
2019-04-05 22:58:58 -04:00
Andrew Gallant
8e1d40ed7d globset: use bstr
This simplifies the various path-related functions and pushes more
platform-dependent code down into bstr. This likely also makes things a
bit more efficient on Windows, since we now only do a single UTF-8 check
for each file path.
2019-04-05 22:58:58 -04:00
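A rough sketch of the idea, assuming the bstr 0.1 API that appears in this diff (`BString::from_path_lossy`, `rfind_byte`): convert the path to a byte string once, then derive components from the bytes, so only one UTF-8 check/transcode happens per path. The `basename` helper below is hypothetical, not globset's actual `file_name`.

```rust
use std::path::Path;

use bstr::{BStr, BString};

// Hypothetical helper: grab everything after the last `/` in an
// already-converted byte string.
fn basename(path: &BStr) -> &BStr {
    let start = path.rfind_byte(b'/').map(|i| i + 1).unwrap_or(0);
    &path[start..]
}

fn main() {
    // One lossy conversion per path (it only costs anything on Windows);
    // from_path_lossy returns a Cow<BStr>, borrowed when no transcoding is
    // needed. All later operations work on plain bytes.
    let path = BString::from_path_lossy(Path::new("src/glob.rs"));
    assert_eq!(basename(&path), "glob.rs");
}
```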
Andrew Gallant
b1c064d5af cli: use bstr
This uses bstr in the unescaping logic. This lets us remove some
platform-specific code, and also lets us remove a hacked UTF-8 decoder on
raw bytes.
2019-04-05 20:42:33 -04:00
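The pattern this commit leans on, sketched below with bstr 0.1's `char_indices`: invalid UTF-8 decodes to U+FFFD along with the offending byte range, which replaces the hand-rolled `decode_utf8`/`utf8_len` helpers removed in this diff. The escaping here is simplified relative to grep-cli's actual `escape`.

```rust
use bstr::BStr;

fn main() {
    let bytes = BStr::new(b"foo\xFFbar");
    let mut out = String::new();
    for (start, end, ch) in bytes.char_indices() {
        if ch == '\u{FFFD}' {
            // Invalid UTF-8: the raw bytes live in bytes[start..end].
            for b in bytes[start..end].bytes() {
                out.push_str(&format!(r"\x{:02X}", b));
            }
        } else {
            out.push(ch);
        }
    }
    assert_eq!(out, r"foo\xFFbar");
}
```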
Andrew Gallant
26a83c6301 config: switch to using bstrs
This lets us implement correct Unicode trimming and also simplifies the
parsing logic a bit. This also removes the last platform-specific bits of
code in ripgrep core.
2019-04-05 20:42:32 -04:00
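A condensed sketch of the new parsing loop, assuming the bstr 0.1 `BufReadExt::for_byte_line_with_terminator` API used in this diff; the real `parse_reader` also tracks line numbers and collects per-line errors.

```rust
use std::ffi::OsString;
use std::io;

use bstr::io::BufReadExt;

fn parse_config<R: io::Read>(rdr: R) -> io::Result<Vec<OsString>> {
    let mut args = vec![];
    io::BufReader::new(rdr).for_byte_line_with_terminator(|line| {
        // trim() strips Unicode whitespace, not just ASCII.
        let line = line.trim();
        if !line.is_empty() && line[0] != b'#' {
            if let Ok(osstr) = line.to_os_str() {
                args.push(osstr.to_os_string());
            }
        }
        Ok(true)
    })?;
    Ok(args)
}

fn main() -> io::Result<()> {
    let args = parse_config(&b"  --smart-case\n# comment\n-g=*.rs\n"[..])?;
    assert_eq!(args, vec![
        OsString::from("--smart-case"),
        OsString::from("-g=*.rs"),
    ]);
    Ok(())
}
```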
Andrew Gallant
5e50a3c43c printer: use bstr
This starts using bstr in the printer. We don't use it too much yet, but
it comes in handy for implementing PrinterPath and lets us push some
platform-specific code down into bstr.
2019-04-05 20:42:25 -04:00
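Roughly what PrinterPath buys, sketched under the same bstr 0.1 assumptions (`BString::from_path_lossy`, `bytes()` collecting back into a `BString`); the `printable_path` helper below is hypothetical, not the crate's actual API.

```rust
use std::path::Path;

use bstr::BString;

// Hypothetical helper: convert the path to printable bytes once (lossily
// where needed), then rewrite separators byte-by-byte, as the real
// PrinterPath::replace_separator does in place.
fn printable_path(path: &Path, new_sep: u8) -> BString {
    BString::from_path_lossy(path)
        .bytes()
        .map(|b| {
            if b == b'/' || (cfg!(windows) && b == b'\\') {
                new_sep
            } else {
                b
            }
        })
        .collect()
}

fn main() {
    let p = printable_path(Path::new("src/printer/standard.rs"), b':');
    assert_eq!(p.as_bytes(), b"src:printer:standard.rs");
}
```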
Andrew Gallant
85417e52e9 searcher: partially migrate to bstr
This commit causes grep-searcher to use byte strings internally for its
line buffer support. We manage to remove a use of `unsafe` by doing this
(by pushing it down into `bstr`).

We stop short of using byte strings everywhere else because we rely
heavily on grep_matcher's `impl ops::Index<Match> for [u8]`, which isn't
available for byte strings. (It is premature to make bstr a public dep of
a core crate like grep-matcher, but maybe some day.)
2019-04-05 20:41:26 -04:00
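The `unsafe` that disappears here is the buffer "roll" (moving unconsumed bytes to the front of the line buffer). Below is a safe sketch of the same operation using std's `copy_within` on a plain `Vec<u8>`; the diff achieves it by delegating to bstr instead.

```rust
// Move the unconsumed tail [pos, end) to the front of the buffer and return
// its length. Safe code: slice::copy_within handles the overlapping copy.
fn roll(buf: &mut Vec<u8>, pos: usize, end: usize) -> usize {
    assert!(pos <= end && end <= buf.len());
    buf.copy_within(pos..end, 0);
    end - pos
}

fn main() {
    let mut buf = b"consumed|tail data".to_vec();
    let end = buf.len();
    let len = roll(&mut buf, 9, end);
    assert_eq!(&buf[..len], b"tail data");
}
```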
26 changed files with 224 additions and 333 deletions

Cargo.lock generated
View File

@@ -36,6 +36,16 @@ name = "bitflags"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "bstr"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-automata 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "bytecount"
version = "0.5.1"
@@ -130,10 +140,10 @@ name = "globset"
version = "0.4.2"
dependencies = [
"aho-corasick 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)",
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
"glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -156,6 +166,7 @@ name = "grep-cli"
version = "0.1.1"
dependencies = [
"atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"globset 0.4.2",
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -186,6 +197,7 @@ name = "grep-printer"
version = "0.1.1"
dependencies = [
"base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"grep-matcher 0.1.1",
"grep-regex 0.1.2",
"grep-searcher 0.1.3",
@@ -211,13 +223,13 @@ dependencies = [
name = "grep-searcher"
version = "0.1.3"
dependencies = [
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs_io 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"grep-matcher 0.1.1",
"grep-regex 0.1.2",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -463,6 +475,14 @@ dependencies = [
"utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex-automata"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex-syntax"
version = "0.6.6"
@@ -483,6 +503,7 @@ dependencies = [
name = "ripgrep"
version = "0.10.0"
dependencies = [
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)",
"grep 0.2.3",
"ignore 0.4.6",
@@ -673,6 +694,7 @@ dependencies = [
"checksum autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a6d640bee2da49f60a4068a7fae53acde8982514ab7bae8b8cea9e88cbcfd799"
"checksum base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0b25d992356d2eb0ed82172f5248873db5560c4721f564b13cb5193bda5e668e"
"checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12"
"checksum bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6c8203ca06c502958719dae5f653a79e0cc6ba808ed02beffbf27d09610f2143"
"checksum bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "be0fdd54b507df8f22012890aadd099979befdba27713c767993f8380112ca7c"
"checksum byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a019b10a2a7cdeb292db131fc8113e57ea2a908f6e7894b0c3c671893b65dbeb"
"checksum cc 1.0.34 (registry+https://github.com/rust-lang/crates.io-index)" = "30f813bf45048a18eda9190fd3c6b78644146056740c43172a5a3699118588fd"
@@ -713,6 +735,7 @@ dependencies = [
"checksum redox_syscall 0.1.52 (registry+https://github.com/rust-lang/crates.io-index)" = "d32b3053e5ced86e4bc0411fec997389532bf56b000e66cb4884eeeb41413d69"
"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76"
"checksum regex 1.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "559008764a17de49a3146b234641644ed37d118d1ef641a0bb573d146edc6ce0"
"checksum regex-automata 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "a25a7daa2eea48550e9946133d6cc9621020d29cc7069089617234bf8b6a8693"
"checksum regex-syntax 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "dcfd8681eebe297b81d98498869d4aae052137651ad7b96822f09ceb690d0a96"
"checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5"
"checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7"

View File

@@ -46,6 +46,7 @@ members = [
]
[dependencies]
bstr = "0.1.2"
grep = { version = "0.2.3", path = "grep" }
ignore = { version = "0.4.4", path = "ignore" }
lazy_static = "1.1.0"

View File

@@ -525,9 +525,9 @@ config file. Once the environment variable is set, open the file and just type
in the flags you want set automatically. There are only two rules for
describing the format of the config file:
1. Every line is a shell argument, after trimming ASCII whitespace.
2. Lines starting with `#` (optionally preceded by any amount of
ASCII whitespace) are ignored.
1. Every line is a shell argument, after trimming whitespace.
2. Lines starting with `#` (optionally preceded by any amount of whitespace)
are ignored.
In particular, there is no escaping. Each line is given to ripgrep as a single
command line argument verbatim.

View File

@@ -107,9 +107,9 @@ ripgrep supports reading configuration files that change ripgrep's default
behavior. The format of the configuration file is an "rc" style and is very
simple. It is defined by two rules:
1. Every line is a shell argument, after trimming ASCII whitespace.
1. Every line is a shell argument, after trimming whitespace.
2. Lines starting with *#* (optionally preceded by any amount of
ASCII whitespace) are ignored.
whitespace) are ignored.
ripgrep will look for a single configuration file if and only if the
*RIPGREP_CONFIG_PATH* environment variable is set and is non-empty.

View File

@@ -20,10 +20,10 @@ bench = false
[dependencies]
aho-corasick = "0.7.3"
bstr = { version = "0.1.2", default-features = false, features = ["std"] }
fnv = "1.0.6"
log = "0.4.5"
memchr = "2.1.0"
regex = "1.1.0"
regex = "1.1.5"
[dev-dependencies]
glob = "0.2.11"

View File

@@ -120,7 +120,7 @@ impl GlobMatcher {
/// Tests whether the given path matches this pattern or not.
pub fn is_match_candidate(&self, path: &Candidate) -> bool {
self.re.is_match(&path.path)
self.re.is_match(path.path.as_bytes())
}
}
@@ -145,7 +145,7 @@ impl GlobStrategic {
/// Tests whether the given path matches this pattern or not.
fn is_match_candidate(&self, candidate: &Candidate) -> bool {
let byte_path = &*candidate.path;
let byte_path = candidate.path.as_bytes();
match self.strategy {
MatchStrategy::Literal(ref lit) => lit.as_bytes() == byte_path,

View File

@@ -104,27 +104,25 @@ or to enable case insensitive matching.
#![deny(missing_docs)]
extern crate aho_corasick;
extern crate bstr;
extern crate fnv;
#[macro_use]
extern crate log;
extern crate memchr;
extern crate regex;
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
use std::error::Error as StdError;
use std::ffi::OsStr;
use std::fmt;
use std::hash;
use std::path::Path;
use std::str;
use aho_corasick::AhoCorasick;
use bstr::{B, BStr, BString};
use regex::bytes::{Regex, RegexBuilder, RegexSet};
use pathutil::{
file_name, file_name_ext, normalize_path, os_str_bytes, path_bytes,
};
use pathutil::{file_name, file_name_ext, normalize_path};
use glob::MatchStrategy;
pub use glob::{Glob, GlobBuilder, GlobMatcher};
@@ -294,6 +292,7 @@ pub struct GlobSet {
impl GlobSet {
/// Create an empty `GlobSet`. An empty set matches nothing.
#[inline]
pub fn empty() -> GlobSet {
GlobSet {
len: 0,
@@ -302,11 +301,13 @@ impl GlobSet {
}
/// Returns true if this set is empty, and therefore matches nothing.
#[inline]
pub fn is_empty(&self) -> bool {
self.len == 0
}
/// Returns the number of globs in this set.
#[inline]
pub fn len(&self) -> usize {
self.len
}
@@ -489,24 +490,25 @@ impl GlobSetBuilder {
/// path against multiple globs or sets of globs.
#[derive(Clone, Debug)]
pub struct Candidate<'a> {
path: Cow<'a, [u8]>,
basename: Cow<'a, [u8]>,
ext: Cow<'a, [u8]>,
path: Cow<'a, BStr>,
basename: Cow<'a, BStr>,
ext: Cow<'a, BStr>,
}
impl<'a> Candidate<'a> {
/// Create a new candidate for matching from the given path.
pub fn new<P: AsRef<Path> + ?Sized>(path: &'a P) -> Candidate<'a> {
let path = path.as_ref();
let basename = file_name(path).unwrap_or(OsStr::new(""));
let path = normalize_path(BString::from_path_lossy(path.as_ref()));
let basename = file_name(&path).unwrap_or(Cow::Borrowed(B("")));
let ext = file_name_ext(&basename).unwrap_or(Cow::Borrowed(B("")));
Candidate {
path: normalize_path(path_bytes(path)),
basename: os_str_bytes(basename),
ext: file_name_ext(basename).unwrap_or(Cow::Borrowed(b"")),
path: path,
basename: basename,
ext: ext,
}
}
fn path_prefix(&self, max: usize) -> &[u8] {
fn path_prefix(&self, max: usize) -> &BStr {
if self.path.len() <= max {
&*self.path
} else {
@@ -514,7 +516,7 @@ impl<'a> Candidate<'a> {
}
}
fn path_suffix(&self, max: usize) -> &[u8] {
fn path_suffix(&self, max: usize) -> &BStr {
if self.path.len() <= max {
&*self.path
} else {
@@ -575,12 +577,12 @@ impl LiteralStrategy {
}
fn is_match(&self, candidate: &Candidate) -> bool {
self.0.contains_key(&*candidate.path)
self.0.contains_key(candidate.path.as_bytes())
}
#[inline(never)]
fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<usize>) {
if let Some(hits) = self.0.get(&*candidate.path) {
if let Some(hits) = self.0.get(candidate.path.as_bytes()) {
matches.extend(hits);
}
}
@@ -602,7 +604,7 @@ impl BasenameLiteralStrategy {
if candidate.basename.is_empty() {
return false;
}
self.0.contains_key(&*candidate.basename)
self.0.contains_key(candidate.basename.as_bytes())
}
#[inline(never)]
@@ -610,7 +612,7 @@ impl BasenameLiteralStrategy {
if candidate.basename.is_empty() {
return;
}
if let Some(hits) = self.0.get(&*candidate.basename) {
if let Some(hits) = self.0.get(candidate.basename.as_bytes()) {
matches.extend(hits);
}
}
@@ -632,7 +634,7 @@ impl ExtensionStrategy {
if candidate.ext.is_empty() {
return false;
}
self.0.contains_key(&*candidate.ext)
self.0.contains_key(candidate.ext.as_bytes())
}
#[inline(never)]
@@ -640,7 +642,7 @@ impl ExtensionStrategy {
if candidate.ext.is_empty() {
return;
}
if let Some(hits) = self.0.get(&*candidate.ext) {
if let Some(hits) = self.0.get(candidate.ext.as_bytes()) {
matches.extend(hits);
}
}
@@ -710,11 +712,11 @@ impl RequiredExtensionStrategy {
if candidate.ext.is_empty() {
return false;
}
match self.0.get(&*candidate.ext) {
match self.0.get(candidate.ext.as_bytes()) {
None => false,
Some(regexes) => {
for &(_, ref re) in regexes {
if re.is_match(&*candidate.path) {
if re.is_match(candidate.path.as_bytes()) {
return true;
}
}
@@ -728,9 +730,9 @@ impl RequiredExtensionStrategy {
if candidate.ext.is_empty() {
return;
}
if let Some(regexes) = self.0.get(&*candidate.ext) {
if let Some(regexes) = self.0.get(candidate.ext.as_bytes()) {
for &(global_index, ref re) in regexes {
if re.is_match(&*candidate.path) {
if re.is_match(candidate.path.as_bytes()) {
matches.push(global_index);
}
}
@@ -746,11 +748,11 @@ struct RegexSetStrategy {
impl RegexSetStrategy {
fn is_match(&self, candidate: &Candidate) -> bool {
self.matcher.is_match(&*candidate.path)
self.matcher.is_match(candidate.path.as_bytes())
}
fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<usize>) {
for i in self.matcher.matches(&*candidate.path) {
for i in self.matcher.matches(candidate.path.as_bytes()) {
matches.push(self.map[i]);
}
}

View File

@@ -1,41 +1,26 @@
use std::borrow::Cow;
use std::ffi::OsStr;
use std::path::Path;
use bstr::BStr;
/// The final component of the path, if it is a normal file.
///
/// If the path terminates in ., .., or consists solely of a root of prefix,
/// file_name will return None.
#[cfg(unix)]
pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
path: &'a P,
) -> Option<&'a OsStr> {
use std::os::unix::ffi::OsStrExt;
use memchr::memrchr;
let path = path.as_ref().as_os_str().as_bytes();
pub fn file_name<'a>(path: &Cow<'a, BStr>) -> Option<Cow<'a, BStr>> {
if path.is_empty() {
return None;
} else if path.len() == 1 && path[0] == b'.' {
return None;
} else if path.last() == Some(&b'.') {
return None;
} else if path.len() >= 2 && &path[path.len() - 2..] == &b".."[..] {
} else if path.last() == Some(b'.') {
return None;
}
let last_slash = memrchr(b'/', path).map(|i| i + 1).unwrap_or(0);
Some(OsStr::from_bytes(&path[last_slash..]))
}
/// The final component of the path, if it is a normal file.
///
/// If the path terminates in ., .., or consists solely of a root of prefix,
/// file_name will return None.
#[cfg(not(unix))]
pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
path: &'a P,
) -> Option<&'a OsStr> {
path.as_ref().file_name()
let last_slash = path.rfind_byte(b'/').map(|i| i + 1).unwrap_or(0);
Some(match *path {
Cow::Borrowed(path) => Cow::Borrowed(&path[last_slash..]),
Cow::Owned(ref path) => {
let mut path = path.clone();
path.drain_bytes(..last_slash);
Cow::Owned(path)
}
})
}
/// Return a file extension given a path's file name.
@@ -54,59 +39,28 @@ pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
/// a pattern like `*.rs` is obviously trying to match files with a `rs`
/// extension, but it also matches files like `.rs`, which doesn't have an
/// extension according to std::path::Path::extension.
pub fn file_name_ext(name: &OsStr) -> Option<Cow<[u8]>> {
pub fn file_name_ext<'a>(name: &Cow<'a, BStr>) -> Option<Cow<'a, BStr>> {
if name.is_empty() {
return None;
}
let name = os_str_bytes(name);
let last_dot_at = {
let result = name
.iter().enumerate().rev()
.find(|&(_, &b)| b == b'.')
.map(|(i, _)| i);
match result {
None => return None,
Some(i) => i,
}
let last_dot_at = match name.rfind_byte(b'.') {
None => return None,
Some(i) => i,
};
Some(match name {
Some(match *name {
Cow::Borrowed(name) => Cow::Borrowed(&name[last_dot_at..]),
Cow::Owned(mut name) => {
name.drain(..last_dot_at);
Cow::Owned(ref name) => {
let mut name = name.clone();
name.drain_bytes(..last_dot_at);
Cow::Owned(name)
}
})
}
/// Return raw bytes of a path, transcoded to UTF-8 if necessary.
pub fn path_bytes(path: &Path) -> Cow<[u8]> {
os_str_bytes(path.as_os_str())
}
/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8.
#[cfg(unix)]
pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
use std::os::unix::ffi::OsStrExt;
Cow::Borrowed(s.as_bytes())
}
/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8.
#[cfg(not(unix))]
pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
// TODO(burntsushi): On Windows, OS strings are WTF-8, which is a superset
// of UTF-8, so even if we could get at the raw bytes, they wouldn't
// be useful. We *must* convert to UTF-8 before doing path matching.
// Unfortunate, but necessary.
match s.to_string_lossy() {
Cow::Owned(s) => Cow::Owned(s.into_bytes()),
Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
}
}
/// Normalizes a path to use `/` as a separator everywhere, even on platforms
/// that recognize other characters as separators.
#[cfg(unix)]
pub fn normalize_path(path: Cow<[u8]>) -> Cow<[u8]> {
pub fn normalize_path(path: Cow<BStr>) -> Cow<BStr> {
// UNIX only uses /, so we're good.
path
}
@@ -114,7 +68,7 @@ pub fn normalize_path(path: Cow<[u8]>) -> Cow<[u8]> {
/// Normalizes a path to use `/` as a separator everywhere, even on platforms
/// that recognize other characters as separators.
#[cfg(not(unix))]
pub fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> {
pub fn normalize_path(mut path: Cow<BStr>) -> Cow<BStr> {
use std::path::is_separator;
for i in 0..path.len() {
@@ -129,7 +83,8 @@ pub fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> {
#[cfg(test)]
mod tests {
use std::borrow::Cow;
use std::ffi::OsStr;
use bstr::{B, BString};
use super::{file_name_ext, normalize_path};
@@ -137,8 +92,9 @@ mod tests {
($name:ident, $file_name:expr, $ext:expr) => {
#[test]
fn $name() {
let got = file_name_ext(OsStr::new($file_name));
assert_eq!($ext.map(|s| Cow::Borrowed(s.as_bytes())), got);
let bs = BString::from($file_name);
let got = file_name_ext(&Cow::Owned(bs));
assert_eq!($ext.map(|s| Cow::Borrowed(B(s))), got);
}
};
}
@@ -153,7 +109,8 @@ mod tests {
($name:ident, $path:expr, $expected:expr) => {
#[test]
fn $name() {
let got = normalize_path(Cow::Owned($path.to_vec()));
let bs = BString::from_slice($path);
let got = normalize_path(Cow::Owned(bs));
assert_eq!($expected.to_vec(), got.into_owned());
}
};

View File

@@ -14,6 +14,7 @@ license = "Unlicense/MIT"
[dependencies]
atty = "0.2.11"
bstr = "0.1.2"
globset = { version = "0.4.2", path = "../globset" }
lazy_static = "1.1.0"
log = "0.4.5"

View File

@@ -1,6 +1,8 @@
use std::ffi::OsStr;
use std::str;
use bstr::{BStr, BString};
/// A single state in the state machine used by `unescape`.
#[derive(Clone, Copy, Eq, PartialEq)]
enum State {
@@ -35,18 +37,16 @@ enum State {
///
/// assert_eq!(r"foo\nbar\xFFbaz", escape(b"foo\nbar\xFFbaz"));
/// ```
pub fn escape(mut bytes: &[u8]) -> String {
pub fn escape(bytes: &[u8]) -> String {
let bytes = BStr::new(bytes);
let mut escaped = String::new();
while let Some(result) = decode_utf8(bytes) {
match result {
Ok(cp) => {
escape_char(cp, &mut escaped);
bytes = &bytes[cp.len_utf8()..];
}
Err(byte) => {
escape_byte(byte, &mut escaped);
bytes = &bytes[1..];
for (s, e, ch) in bytes.char_indices() {
if ch == '\u{FFFD}' {
for b in bytes[s..e].bytes() {
escape_byte(b, &mut escaped);
}
} else {
escape_char(ch, &mut escaped);
}
}
escaped
@@ -56,19 +56,7 @@ pub fn escape(mut bytes: &[u8]) -> String {
///
/// This is like [`escape`](fn.escape.html), but accepts an OS string.
pub fn escape_os(string: &OsStr) -> String {
#[cfg(unix)]
fn imp(string: &OsStr) -> String {
use std::os::unix::ffi::OsStrExt;
escape(string.as_bytes())
}
#[cfg(not(unix))]
fn imp(string: &OsStr) -> String {
escape(string.to_string_lossy().as_bytes())
}
imp(string)
escape(BString::from_os_str_lossy(string).as_bytes())
}
/// Unescapes a string.
@@ -195,46 +183,6 @@ fn escape_byte(byte: u8, into: &mut String) {
}
}
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the beginning of the given
/// byte slice, then the first byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
if bytes.is_empty() {
return None;
}
let len = match utf8_len(bytes[0]) {
None => return Some(Err(bytes[0])),
Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
Some(len) => len,
};
match str::from_utf8(&bytes[..len]) {
Ok(s) => Some(Ok(s.chars().next().unwrap())),
Err(_) => Some(Err(bytes[0])),
}
}
/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`.
fn utf8_len(byte: u8) -> Option<usize> {
if byte <= 0x7F {
Some(1)
} else if byte <= 0b110_11111 {
Some(2)
} else if byte <= 0b1110_1111 {
Some(3)
} else if byte <= 0b1111_0111 {
Some(4)
} else {
None
}
}
#[cfg(test)]
mod tests {
use super::{escape, unescape};

View File

@@ -159,6 +159,7 @@ error message is crafted that typically tells the user how to fix the problem.
#![deny(missing_docs)]
extern crate atty;
extern crate bstr;
extern crate globset;
#[macro_use]
extern crate lazy_static;

View File

@@ -19,6 +19,7 @@ serde1 = ["base64", "serde", "serde_derive", "serde_json"]
[dependencies]
base64 = { version = "0.10.0", optional = true }
bstr = "0.1.2"
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
grep-searcher = { version = "0.1.1", path = "../grep-searcher" }
termcolor = "1.0.4"

View File

@@ -70,6 +70,7 @@ fn example() -> Result<(), Box<Error>> {
#[cfg(feature = "serde1")]
extern crate base64;
extern crate bstr;
extern crate grep_matcher;
#[cfg(test)]
extern crate grep_regex;

View File

@@ -4,6 +4,7 @@ use std::io;
use std::path::Path;
use std::time;
use bstr::{BStr, BString};
use grep_matcher::{Captures, LineTerminator, Match, Matcher};
use grep_searcher::{
LineIter,
@@ -262,26 +263,12 @@ impl<'a> Sunk<'a> {
/// portability with a small cost: on Windows, paths that are not valid UTF-16
/// will not roundtrip correctly.
#[derive(Clone, Debug)]
pub struct PrinterPath<'a>(Cow<'a, [u8]>);
pub struct PrinterPath<'a>(Cow<'a, BStr>);
impl<'a> PrinterPath<'a> {
/// Create a new path suitable for printing.
pub fn new(path: &'a Path) -> PrinterPath<'a> {
PrinterPath::new_impl(path)
}
#[cfg(unix)]
fn new_impl(path: &'a Path) -> PrinterPath<'a> {
use std::os::unix::ffi::OsStrExt;
PrinterPath(Cow::Borrowed(path.as_os_str().as_bytes()))
}
#[cfg(not(unix))]
fn new_impl(path: &'a Path) -> PrinterPath<'a> {
PrinterPath(match path.to_string_lossy() {
Cow::Owned(path) => Cow::Owned(path.into_bytes()),
Cow::Borrowed(path) => Cow::Borrowed(path.as_bytes()),
})
PrinterPath(BString::from_path_lossy(path))
}
/// Create a new printer path from the given path which can be efficiently
@@ -302,7 +289,7 @@ impl<'a> PrinterPath<'a> {
/// path separators that are both replaced by `new_sep`. In all other
/// environments, only `/` is treated as a path separator.
fn replace_separator(&mut self, new_sep: u8) {
let transformed_path: Vec<_> = self.as_bytes().iter().map(|&b| {
let transformed_path: BString = self.0.bytes().map(|b| {
if b == b'/' || (cfg!(windows) && b == b'\\') {
new_sep
} else {
@@ -314,7 +301,7 @@ impl<'a> PrinterPath<'a> {
/// Return the raw bytes for this path.
pub fn as_bytes(&self) -> &[u8] {
&*self.0
self.0.as_bytes()
}
}

View File

@@ -207,7 +207,7 @@ impl ConfiguredHIR {
if self.config.line_terminator.is_none() {
return Ok(None);
}
match LiteralSets::new(&self.expr).one_regex() {
match LiteralSets::new(&self.expr).one_regex(self.config.word) {
None => Ok(None),
Some(pattern) => self.pattern_to_regex(&pattern).map(Some),
}

View File

@@ -34,6 +34,11 @@ impl CRLFMatcher {
}
Ok(CRLFMatcher { regex, names })
}
/// Return the underlying regex used by this matcher.
pub fn regex(&self) -> &Regex {
&self.regex
}
}
impl Matcher for CRLFMatcher {

View File

@@ -47,18 +47,23 @@ impl LiteralSets {
/// generated these literal sets. The idea here is that the pattern
/// returned by this method is much cheaper to search for. i.e., It is
/// usually a single literal or an alternation of literals.
pub fn one_regex(&self) -> Option<String> {
pub fn one_regex(&self, word: bool) -> Option<String> {
// TODO: The logic in this function is basically inscrutable. It grew
// organically in the old grep 0.1 crate. Ideally, it would be
// re-worked. In fact, the entire inner literal extraction should be
// re-worked. Actually, most of regex-syntax's literal extraction
// should also be re-worked. Alas... only so much time in the day.
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
debug!("literal prefixes detected: {:?}", self.prefixes);
// When this is true, the regex engine will do a literal scan,
// so we don't need to return anything.
return None;
if !word {
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
debug!("literal prefixes detected: {:?}", self.prefixes);
// When this is true, the regex engine will do a literal scan,
// so we don't need to return anything. But we only do this
// if we aren't doing a word regex, since a word regex adds
// a `(?:\W|^)` to the beginning of the regex, thereby
// defeating the regex engine's literal detection.
return None;
}
}
// Out of inner required literals, prefixes and suffixes, which one
@@ -285,7 +290,7 @@ mod tests {
}
fn one_regex(pattern: &str) -> Option<String> {
sets(pattern).one_regex()
sets(pattern).one_regex(false)
}
// Put a pattern into the same format as the one returned by `one_regex`.

View File

@@ -50,9 +50,12 @@ impl RegexMatcherBuilder {
if let Some(ref re) = fast_line_regex {
trace!("extracted fast line regex: {:?}", re);
}
let matcher = RegexMatcherImpl::new(&chir)?;
trace!("final regex: {:?}", matcher.regex());
Ok(RegexMatcher {
config: self.config.clone(),
matcher: RegexMatcherImpl::new(&chir)?,
matcher: matcher,
fast_line_regex: fast_line_regex,
non_matching_bytes: non_matching_bytes,
})
@@ -370,6 +373,15 @@ impl RegexMatcherImpl {
Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?))
}
}
/// Return the underlying regex object used.
fn regex(&self) -> &Regex {
match *self {
RegexMatcherImpl::Word(ref x) => x.regex(),
RegexMatcherImpl::CRLF(ref x) => x.regex(),
RegexMatcherImpl::Standard(ref x) => &x.regex,
}
}
}
// This implementation just dispatches on the internal matcher impl except

View File

@@ -55,6 +55,11 @@ impl WordMatcher {
}
Ok(WordMatcher { regex, names, locs })
}
/// Return the underlying regex used by this matcher.
pub fn regex(&self) -> &Regex {
&self.regex
}
}
impl Matcher for WordMatcher {

View File

@@ -13,12 +13,12 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"]
license = "Unlicense/MIT"
[dependencies]
bstr = { version = "0.1.2", default-features = false, features = ["std"] }
bytecount = "0.5"
encoding_rs = "0.8.14"
encoding_rs_io = "0.1.4"
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
log = "0.4.5"
memchr = "2.1"
memmap = "0.7"
[dev-dependencies]

View File

@@ -99,13 +99,13 @@ searches stdin.
#![deny(missing_docs)]
extern crate bstr;
extern crate bytecount;
extern crate encoding_rs;
extern crate encoding_rs_io;
extern crate grep_matcher;
#[macro_use]
extern crate log;
extern crate memchr;
extern crate memmap;
#[cfg(test)]
extern crate regex;

View File

@@ -1,8 +1,7 @@
use std::cmp;
use std::io;
use std::ptr;
use memchr::{memchr, memrchr};
use bstr::{BStr, BString};
/// The default buffer capacity that we use for the line buffer.
pub(crate) const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1<<10); // 8 KB
@@ -123,7 +122,7 @@ impl LineBufferBuilder {
pub fn build(&self) -> LineBuffer {
LineBuffer {
config: self.config,
buf: vec![0; self.config.capacity],
buf: BString::from(vec![0; self.config.capacity]),
pos: 0,
last_lineterm: 0,
end: 0,
@@ -255,6 +254,12 @@ impl<'b, R: io::Read> LineBufferReader<'b, R> {
/// Return the contents of this buffer.
pub fn buffer(&self) -> &[u8] {
self.line_buffer.buffer().as_bytes()
}
/// Return the underlying buffer as a byte string. Used for tests only.
#[cfg(test)]
fn bstr(&self) -> &BStr {
self.line_buffer.buffer()
}
@@ -284,7 +289,7 @@ pub struct LineBuffer {
/// The configuration of this buffer.
config: Config,
/// The primary buffer with which to hold data.
buf: Vec<u8>,
buf: BString,
/// The current position of this buffer. This is always a valid sliceable
/// index into `buf`, and its maximum value is the length of `buf`.
pos: usize,
@@ -339,13 +344,13 @@ impl LineBuffer {
}
/// Return the contents of this buffer.
fn buffer(&self) -> &[u8] {
fn buffer(&self) -> &BStr {
&self.buf[self.pos..self.last_lineterm]
}
/// Return the contents of the free space beyond the end of the buffer as
/// a mutable slice.
fn free_buffer(&mut self) -> &mut [u8] {
fn free_buffer(&mut self) -> &mut BStr {
&mut self.buf[self.end..]
}
@@ -396,7 +401,7 @@ impl LineBuffer {
assert_eq!(self.pos, 0);
loop {
self.ensure_capacity()?;
let readlen = rdr.read(self.free_buffer())?;
let readlen = rdr.read(self.free_buffer().as_bytes_mut())?;
if readlen == 0 {
// We're only done reading for good once the caller has
// consumed everything.
@@ -416,7 +421,7 @@ impl LineBuffer {
match self.config.binary {
BinaryDetection::None => {} // nothing to do
BinaryDetection::Quit(byte) => {
if let Some(i) = memchr(byte, newbytes) {
if let Some(i) = newbytes.find_byte(byte) {
self.end = oldend + i;
self.last_lineterm = self.end;
self.binary_byte_offset =
@@ -444,7 +449,7 @@ impl LineBuffer {
}
// Update our `last_lineterm` positions if we read one.
if let Some(i) = memrchr(self.config.lineterm, newbytes) {
if let Some(i) = newbytes.rfind_byte(self.config.lineterm) {
self.last_lineterm = oldend + i + 1;
return Ok(true);
}
@@ -467,40 +472,8 @@ impl LineBuffer {
return;
}
assert!(self.pos < self.end && self.end <= self.buf.len());
let roll_len = self.end - self.pos;
unsafe {
// SAFETY: A buffer contains Copy data, so there's no problem
// moving it around. Safety also depends on our indices being
// in bounds, which they should always be, and we enforce with
// an assert above.
//
// It seems like it should be possible to do this in safe code that
// results in the same codegen. I tried the obvious:
//
// for (src, dst) in (self.pos..self.end).zip(0..) {
// self.buf[dst] = self.buf[src];
// }
//
// But the above does not work, and in fact compiles down to a slow
// byte-by-byte loop. I tried a few other minor variations, but
// alas, better minds might prevail.
//
// Overall, this doesn't save us *too* much. It mostly matters when
// the number of bytes we're copying is large, which can happen
// if the searcher is asked to produce a lot of context. We could
// decide this isn't worth it, but it does make an appreciable
// impact at or around the context=30 range on my machine.
//
// We could also use a temporary buffer that compiles down to two
// memcpys and is faster than the byte-at-a-time loop, but it
// complicates our options for limiting memory allocation a bit.
ptr::copy(
self.buf[self.pos..].as_ptr(),
self.buf.as_mut_ptr(),
roll_len,
);
}
self.buf.copy_within(self.pos.., 0);
self.pos = 0;
self.last_lineterm = roll_len;
self.end = roll_len;
@@ -536,14 +509,15 @@ impl LineBuffer {
}
}
/// Replaces `src` with `replacement` in bytes.
fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option<usize> {
/// Replaces `src` with `replacement` in bytes, and return the offset of the
/// first replacement, if one exists.
fn replace_bytes(bytes: &mut BStr, src: u8, replacement: u8) -> Option<usize> {
if src == replacement {
return None;
}
let mut first_pos = None;
let mut pos = 0;
while let Some(i) = memchr(src, &bytes[pos..]).map(|i| pos + i) {
while let Some(i) = bytes[pos..].find_byte(src).map(|i| pos + i) {
if first_pos.is_none() {
first_pos = Some(i);
}
@@ -560,6 +534,7 @@ fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option<usize> {
#[cfg(test)]
mod tests {
use std::str;
use bstr::BString;
use super::*;
const SHERLOCK: &'static str = "\
@@ -575,18 +550,14 @@ and exhibited clearly, with a label attached.\
slice.to_string()
}
fn btos(slice: &[u8]) -> &str {
str::from_utf8(slice).unwrap()
}
fn replace_str(
slice: &str,
src: u8,
replacement: u8,
) -> (String, Option<usize>) {
let mut dst = slice.to_string().into_bytes();
let mut dst = BString::from(slice);
let result = replace_bytes(&mut dst, src, replacement);
(String::from_utf8(dst).unwrap(), result)
(dst.into_string().unwrap(), result)
}
#[test]
@@ -607,7 +578,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\n");
assert_eq!(rdr.bstr(), "homer\nlisa\n");
assert_eq!(rdr.absolute_byte_offset(), 0);
rdr.consume(5);
assert_eq!(rdr.absolute_byte_offset(), 5);
@@ -615,7 +586,7 @@ and exhibited clearly, with a label attached.\
assert_eq!(rdr.absolute_byte_offset(), 11);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "maggie");
assert_eq!(rdr.bstr(), "maggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -630,7 +601,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -645,7 +616,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "\n");
assert_eq!(rdr.bstr(), "\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -660,7 +631,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "\n\n");
assert_eq!(rdr.bstr(), "\n\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -698,12 +669,12 @@ and exhibited clearly, with a label attached.\
let mut linebuf = LineBufferBuilder::new().capacity(1).build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
let mut got = vec![];
let mut got = BString::new();
while rdr.fill().unwrap() {
got.extend(rdr.buffer());
got.push(rdr.buffer());
rdr.consume_all();
}
assert_eq!(bytes, btos(&got));
assert_eq!(bytes, got);
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
assert_eq!(rdr.binary_byte_offset(), None);
}
@@ -718,11 +689,11 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\n");
assert_eq!(rdr.bstr(), "homer\n");
rdr.consume_all();
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "lisa\n");
assert_eq!(rdr.bstr(), "lisa\n");
rdr.consume_all();
// This returns an error because while we have just enough room to
@@ -732,11 +703,11 @@ and exhibited clearly, with a label attached.\
assert!(rdr.fill().is_err());
// We can mush on though!
assert_eq!(btos(rdr.buffer()), "m");
assert_eq!(rdr.bstr(), "m");
rdr.consume_all();
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "aggie");
assert_eq!(rdr.bstr(), "aggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -752,16 +723,16 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\n");
assert_eq!(rdr.bstr(), "homer\n");
rdr.consume_all();
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "lisa\n");
assert_eq!(rdr.bstr(), "lisa\n");
rdr.consume_all();
// We have just enough space.
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "maggie");
assert_eq!(rdr.bstr(), "maggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -777,7 +748,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().is_err());
assert_eq!(btos(rdr.buffer()), "");
assert_eq!(rdr.bstr(), "");
}
#[test]
@@ -789,7 +760,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nli\x00sa\nmaggie\n");
assert_eq!(rdr.bstr(), "homer\nli\x00sa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -808,7 +779,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nli");
assert_eq!(rdr.bstr(), "homer\nli");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -825,7 +796,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(!rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "");
assert_eq!(rdr.bstr(), "");
assert_eq!(rdr.absolute_byte_offset(), 0);
assert_eq!(rdr.binary_byte_offset(), Some(0));
}
@@ -841,7 +812,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -860,7 +831,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -878,7 +849,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "\
assert_eq!(rdr.bstr(), "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, s\
");
@@ -901,7 +872,7 @@ Holmeses, s\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nli\nsa\nmaggie\n");
assert_eq!(rdr.bstr(), "homer\nli\nsa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -920,7 +891,7 @@ Holmeses, s\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "\nhomer\nlisa\nmaggie\n");
assert_eq!(rdr.bstr(), "\nhomer\nlisa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -939,7 +910,7 @@ Holmeses, s\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -958,7 +929,7 @@ Holmeses, s\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());

View File

@@ -2,8 +2,8 @@
A collection of routines for performing operations on lines.
*/
use bstr::B;
use bytecount;
use memchr::{memchr, memrchr};
use grep_matcher::{LineTerminator, Match};
/// An iterator over lines in a particular slice of bytes.
@@ -85,7 +85,7 @@ impl LineStep {
#[inline(always)]
fn next_impl(&mut self, mut bytes: &[u8]) -> Option<(usize, usize)> {
bytes = &bytes[..self.end];
match memchr(self.line_term, &bytes[self.pos..]) {
match B(&bytes[self.pos..]).find_byte(self.line_term) {
None => {
if self.pos < bytes.len() {
let m = (self.pos, bytes.len());
@@ -135,14 +135,16 @@ pub fn locate(
line_term: u8,
range: Match,
) -> Match {
let line_start = memrchr(line_term, &bytes[0..range.start()])
let line_start = B(&bytes[..range.start()])
.rfind_byte(line_term)
.map_or(0, |i| i + 1);
let line_end =
if range.end() > line_start && bytes[range.end() - 1] == line_term {
range.end()
} else {
memchr(line_term, &bytes[range.end()..])
.map_or(bytes.len(), |i| range.end() + i + 1)
B(&bytes[range.end()..])
.find_byte(line_term)
.map_or(bytes.len(), |i| range.end() + i + 1)
};
Match::new(line_start, line_end)
}
@@ -180,7 +182,7 @@ fn preceding_by_pos(
pos -= 1;
}
loop {
match memrchr(line_term, &bytes[..pos]) {
match B(&bytes[..pos]).rfind_byte(line_term) {
None => {
return 0;
}

View File

@@ -1,6 +1,6 @@
use std::cmp;
use memchr::memchr;
use bstr::B;
use grep_matcher::{LineMatchKind, Matcher};
use lines::{self, LineStep};
@@ -149,7 +149,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
BinaryDetection::Quit(b) => b,
_ => return false,
};
if let Some(i) = memchr(binary_byte, &buf[*range]) {
if let Some(i) = B(&buf[*range]).find_byte(binary_byte) {
self.binary_byte_offset = Some(range.start() + i);
true
} else {

View File

@@ -1,10 +1,10 @@
use std::io::{self, Write};
use std::str;
use bstr::B;
use grep_matcher::{
LineMatchKind, LineTerminator, Match, Matcher, NoCaptures, NoError,
};
use memchr::memchr;
use regex::bytes::{Regex, RegexBuilder};
use searcher::{BinaryDetection, Searcher, SearcherBuilder};
@@ -94,7 +94,8 @@ impl Matcher for RegexMatcher {
}
// Make it interesting and return the last byte in the current
// line.
let i = memchr(self.line_term.unwrap().as_byte(), haystack)
let i = B(haystack)
.find_byte(self.line_term.unwrap().as_byte())
.map(|i| i)
.unwrap_or(haystack.len() - 1);
Ok(Some(LineMatchKind::Candidate(i)))

View File

@@ -5,10 +5,11 @@
use std::env;
use std::error::Error;
use std::fs::File;
use std::io::{self, BufRead};
use std::io;
use std::ffi::OsString;
use std::path::{Path, PathBuf};
use bstr::io::BufReadExt;
use log;
use crate::Result;
@@ -76,62 +77,29 @@ fn parse<P: AsRef<Path>>(
fn parse_reader<R: io::Read>(
rdr: R,
) -> Result<(Vec<OsString>, Vec<Box<Error>>)> {
let mut bufrdr = io::BufReader::new(rdr);
let bufrdr = io::BufReader::new(rdr);
let (mut args, mut errs) = (vec![], vec![]);
let mut line = vec![];
let mut line_number = 0;
while {
line.clear();
bufrdr.for_byte_line_with_terminator(|line| {
line_number += 1;
bufrdr.read_until(b'\n', &mut line)? > 0
} {
trim(&mut line);
let line = line.trim();
if line.is_empty() || line[0] == b'#' {
continue;
return Ok(true);
}
match bytes_to_os_string(&line) {
match line.to_os_str() {
Ok(osstr) => {
args.push(osstr);
args.push(osstr.to_os_string());
}
Err(err) => {
errs.push(format!("{}: {}", line_number, err).into());
}
}
}
Ok(true)
})?;
Ok((args, errs))
}
/// Trim the given bytes of whitespace according to the ASCII definition.
fn trim(x: &mut Vec<u8>) {
let upto = x.iter().take_while(|b| is_space(**b)).count();
x.drain(..upto);
let revto = x.len() - x.iter().rev().take_while(|b| is_space(**b)).count();
x.drain(revto..);
}
/// Returns true if and only if the given byte is an ASCII space character.
fn is_space(b: u8) -> bool {
b == b'\t'
|| b == b'\n'
|| b == b'\x0B'
|| b == b'\x0C'
|| b == b'\r'
|| b == b' '
}
/// On Unix, get an OsString from raw bytes.
#[cfg(unix)]
fn bytes_to_os_string(bytes: &[u8]) -> Result<OsString> {
use std::os::unix::ffi::OsStringExt;
Ok(OsString::from_vec(bytes.to_vec()))
}
/// On non-Unix (like Windows), require UTF-8.
#[cfg(not(unix))]
fn bytes_to_os_string(bytes: &[u8]) -> Result<OsString> {
String::from_utf8(bytes.to_vec()).map(OsString::from).map_err(From::from)
}
#[cfg(test)]
mod tests {
use std::ffi::OsString;