Compare commits

...

9 Commits

Author SHA1 Message Date
Andrew Gallant
a872d33714 deps: add bstr to Cargo.lock 2019-04-05 22:58:58 -04:00
Andrew Gallant
f08f274c5f regex: print out final regex in trace mode
This is useful for debugging to see what regex is actually being run.
We log this at the trace level since the regex can be quite gnarly. (It is
not pretty-printed.)
2019-04-05 22:58:58 -04:00
Andrew Gallant
db7e828989 regex: fix a perf bug when using -w flag
When looking for an inner literal to speed up searches, if only a prefix
is found, then we generally give up on inner literal optimizations, since
the regex engine will handle a prefix literal for us. Unfortunately, this
decision was being made *before* the regex is wrapped in (^|\W)...($|\W) for
the -w/--word-regexp flag, and that wrapping defeats the literal
optimizations inside the regex engine.

We fix this with a bit of a hack that says, "if we're doing a word regexp,
then give me back any literal you find, even if it's a prefix."
2019-04-05 22:58:58 -04:00
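For illustration, here is a minimal sketch (not ripgrep's actual code, which lives in grep-regex) of why the word wrapping hides a literal prefix from the regex engine, and how handing back the inner literal restores a cheap prefilter. It assumes the `regex` crate; the exact wrapping ripgrep uses differs slightly.

```rust
// A minimal sketch, assuming the `regex` crate; ripgrep's real wrapping and
// prefilter plumbing are more involved.
use regex::bytes::Regex;

fn main() {
    let pattern = "foobar";
    // Plain pattern: the engine can scan for the literal "foobar" directly.
    let plain = Regex::new(pattern).unwrap();
    // Word-wrapped pattern: the leading (?:^|\W) hides that literal from the
    // engine's prefix detection, so it falls back to a slower matching path.
    let wrapped =
        Regex::new(&format!(r"(?:^|\W)(?:{})(?:$|\W)", pattern)).unwrap();

    let haystack = b"xxxx foobar yyyy";
    assert!(plain.is_match(haystack));
    assert!(wrapped.is_match(haystack));

    // The fix: even when only a prefix literal is found, return it anyway so
    // a cheap byte search can locate candidate positions before running the
    // wrapped regex.
    let candidate = haystack
        .windows(pattern.len())
        .position(|w| w == pattern.as_bytes());
    assert_eq!(candidate, Some(5));
}
```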
Andrew Gallant
fb6cad7152 globset: small perf improvements
This tweaks the path handling functions slightly to make them a hair
faster. In particular, `file_name` is called on every path that ripgrep
visits, and it was possible to remove a few branches without changing
behavior.
2019-04-05 22:58:58 -04:00
Andrew Gallant
8e1d40ed7d globset: use bstr
This simplifies the various path-related functions and pushes more
platform-dependent code down into bstr. This likely also makes things a
bit more efficient on Windows, since we now only do a single UTF-8 check
for each file path.
2019-04-05 22:58:58 -04:00
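A rough sketch of the idea, assuming the bstr 0.1 API that appears in this diff (`BString::from_path_lossy`, `rfind_byte`): convert the path to a byte string once, then derive components from the bytes, so only one UTF-8 check/transcode happens per path. The `basename` helper below is hypothetical, not globset's actual `file_name`.

```rust
use std::path::Path;

use bstr::{BStr, BString};

// Hypothetical helper: grab everything after the last `/` in an
// already-converted byte string.
fn basename(path: &BStr) -> &BStr {
    let start = path.rfind_byte(b'/').map(|i| i + 1).unwrap_or(0);
    &path[start..]
}

fn main() {
    // One lossy conversion per path (it only costs anything on Windows);
    // from_path_lossy returns a Cow<BStr>, borrowed when no transcoding is
    // needed. All later operations work on plain bytes.
    let path = BString::from_path_lossy(Path::new("src/glob.rs"));
    assert_eq!(basename(&path), "glob.rs");
}
```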
Andrew Gallant
b1c064d5af cli: use bstr
This uses bstr in the unescaping logic. This lets us remove some
platform-specific code, and also lets us remove a hacked UTF-8 decoder on
raw bytes.
2019-04-05 20:42:33 -04:00
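The pattern this commit leans on, sketched below with bstr 0.1's `char_indices`: invalid UTF-8 decodes to U+FFFD along with the offending byte range, which replaces the hand-rolled `decode_utf8`/`utf8_len` helpers removed in this diff. The escaping here is simplified relative to grep-cli's actual `escape`.

```rust
use bstr::BStr;

fn main() {
    let bytes = BStr::new(b"foo\xFFbar");
    let mut out = String::new();
    for (start, end, ch) in bytes.char_indices() {
        if ch == '\u{FFFD}' {
            // Invalid UTF-8: the raw bytes live in bytes[start..end].
            for b in bytes[start..end].bytes() {
                out.push_str(&format!(r"\x{:02X}", b));
            }
        } else {
            out.push(ch);
        }
    }
    assert_eq!(out, r"foo\xFFbar");
}
```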
Andrew Gallant
26a83c6301 config: switch to using bstrs
This lets us implement correct Unicode trimming and also simplifies the
parsing logic a bit. This also removes the last platform-specific bits of
code in ripgrep core.
2019-04-05 20:42:32 -04:00
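A condensed sketch of the new parsing loop, assuming the bstr 0.1 `BufReadExt::for_byte_line_with_terminator` API used in this diff; the real `parse_reader` also tracks line numbers and collects per-line errors.

```rust
use std::ffi::OsString;
use std::io;

use bstr::io::BufReadExt;

fn parse_config<R: io::Read>(rdr: R) -> io::Result<Vec<OsString>> {
    let mut args = vec![];
    io::BufReader::new(rdr).for_byte_line_with_terminator(|line| {
        // trim() strips Unicode whitespace, not just ASCII.
        let line = line.trim();
        if !line.is_empty() && line[0] != b'#' {
            if let Ok(osstr) = line.to_os_str() {
                args.push(osstr.to_os_string());
            }
        }
        Ok(true)
    })?;
    Ok(args)
}

fn main() -> io::Result<()> {
    let args = parse_config(&b"  --smart-case\n# comment\n-g=*.rs\n"[..])?;
    assert_eq!(args, vec![
        OsString::from("--smart-case"),
        OsString::from("-g=*.rs"),
    ]);
    Ok(())
}
```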
Andrew Gallant
5e50a3c43c printer: use bstr
This starts using bstr in the printer. We don't use it too much yet, but
it comes in handy for implementing PrinterPath and lets us push some
platform-specific code down into bstr.
2019-04-05 20:42:25 -04:00
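Roughly what PrinterPath buys, sketched under the same bstr 0.1 assumptions (`BString::from_path_lossy`, `bytes()` collecting back into a `BString`); the `printable_path` helper below is hypothetical, not the crate's actual API.

```rust
use std::path::Path;

use bstr::BString;

// Hypothetical helper: convert the path to printable bytes once (lossily
// where needed), then rewrite separators byte-by-byte, as the real
// PrinterPath::replace_separator does in place.
fn printable_path(path: &Path, new_sep: u8) -> BString {
    BString::from_path_lossy(path)
        .bytes()
        .map(|b| {
            if b == b'/' || (cfg!(windows) && b == b'\\') {
                new_sep
            } else {
                b
            }
        })
        .collect()
}

fn main() {
    let p = printable_path(Path::new("src/printer/standard.rs"), b':');
    assert_eq!(p.as_bytes(), b"src:printer:standard.rs");
}
```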
Andrew Gallant
85417e52e9 searcher: partially migrate to bstr
This commit causes grep-searcher to use byte strings internally for its
line buffer support. We manage to remove a use of `unsafe` by doing this
(by pushing it down into `bstr`).

We stop short of using byte strings everywhere else because we rely
heavily on grep_matcher's `impl ops::Index<Match> for [u8]`, which isn't
available for byte strings. (It is premature to make bstr a public dep of
a core crate like grep-matcher, but maybe some day.)
2019-04-05 20:41:26 -04:00
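The `unsafe` that disappears here is the buffer "roll" (moving unconsumed bytes to the front of the line buffer). Below is a safe sketch of the same operation using std's `copy_within` on a plain `Vec<u8>`; the diff achieves it by delegating to bstr instead.

```rust
// Move the unconsumed tail [pos, end) to the front of the buffer and return
// its length. Safe code: slice::copy_within handles the overlapping copy.
fn roll(buf: &mut Vec<u8>, pos: usize, end: usize) -> usize {
    assert!(pos <= end && end <= buf.len());
    buf.copy_within(pos..end, 0);
    end - pos
}

fn main() {
    let mut buf = b"consumed|tail data".to_vec();
    let end = buf.len();
    let len = roll(&mut buf, 9, end);
    assert_eq!(&buf[..len], b"tail data");
}
```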
26 changed files with 224 additions and 333 deletions

Cargo.lock generated
View File

@@ -36,6 +36,16 @@ name = "bitflags"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "bstr"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-automata 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "bytecount"
version = "0.5.1"
@@ -130,10 +140,10 @@ name = "globset"
version = "0.4.2"
dependencies = [
"aho-corasick 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)",
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
"glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -156,6 +166,7 @@ name = "grep-cli"
version = "0.1.1"
dependencies = [
"atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"globset 0.4.2",
"lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -186,6 +197,7 @@ name = "grep-printer"
version = "0.1.1"
dependencies = [
"base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"grep-matcher 0.1.1",
"grep-regex 0.1.2",
"grep-searcher 0.1.3",
@@ -211,13 +223,13 @@ dependencies = [
name = "grep-searcher"
version = "0.1.3"
dependencies = [
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs 0.8.17 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding_rs_io 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"grep-matcher 0.1.1",
"grep-regex 0.1.2",
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -463,6 +475,14 @@ dependencies = [
"utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex-automata"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "regex-syntax"
version = "0.6.6"
@@ -483,6 +503,7 @@ dependencies = [
name = "ripgrep"
version = "0.10.0"
dependencies = [
"bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)",
"grep 0.2.3",
"ignore 0.4.6",
@@ -673,6 +694,7 @@ dependencies = [
"checksum autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a6d640bee2da49f60a4068a7fae53acde8982514ab7bae8b8cea9e88cbcfd799"
"checksum base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0b25d992356d2eb0ed82172f5248873db5560c4721f564b13cb5193bda5e668e"
"checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12"
"checksum bstr 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6c8203ca06c502958719dae5f653a79e0cc6ba808ed02beffbf27d09610f2143"
"checksum bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "be0fdd54b507df8f22012890aadd099979befdba27713c767993f8380112ca7c"
"checksum byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a019b10a2a7cdeb292db131fc8113e57ea2a908f6e7894b0c3c671893b65dbeb"
"checksum cc 1.0.34 (registry+https://github.com/rust-lang/crates.io-index)" = "30f813bf45048a18eda9190fd3c6b78644146056740c43172a5a3699118588fd"
@@ -713,6 +735,7 @@ dependencies = [
"checksum redox_syscall 0.1.52 (registry+https://github.com/rust-lang/crates.io-index)" = "d32b3053e5ced86e4bc0411fec997389532bf56b000e66cb4884eeeb41413d69"
"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76"
"checksum regex 1.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "559008764a17de49a3146b234641644ed37d118d1ef641a0bb573d146edc6ce0"
"checksum regex-automata 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "a25a7daa2eea48550e9946133d6cc9621020d29cc7069089617234bf8b6a8693"
"checksum regex-syntax 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "dcfd8681eebe297b81d98498869d4aae052137651ad7b96822f09ceb690d0a96"
"checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5"
"checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7"

View File

@@ -46,6 +46,7 @@ members = [
]
[dependencies]
bstr = "0.1.2"
grep = { version = "0.2.3", path = "grep" }
ignore = { version = "0.4.4", path = "ignore" }
lazy_static = "1.1.0"

View File

@@ -525,9 +525,9 @@ config file. Once the environment variable is set, open the file and just type
in the flags you want set automatically. There are only two rules for
describing the format of the config file:
1. Every line is a shell argument, after trimming ASCII whitespace.
2. Lines starting with `#` (optionally preceded by any amount of
ASCII whitespace) are ignored.
1. Every line is a shell argument, after trimming whitespace.
2. Lines starting with `#` (optionally preceded by any amount of whitespace)
are ignored.
In particular, there is no escaping. Each line is given to ripgrep as a single
command line argument verbatim.

View File

@@ -107,9 +107,9 @@ ripgrep supports reading configuration files that change ripgrep's default
behavior. The format of the configuration file is an "rc" style and is very
simple. It is defined by two rules:
1. Every line is a shell argument, after trimming ASCII whitespace.
1. Every line is a shell argument, after trimming whitespace.
2. Lines starting with *#* (optionally preceded by any amount of
ASCII whitespace) are ignored.
whitespace) are ignored.
ripgrep will look for a single configuration file if and only if the
*RIPGREP_CONFIG_PATH* environment variable is set and is non-empty.

View File

@@ -20,10 +20,10 @@ bench = false
[dependencies]
aho-corasick = "0.7.3"
bstr = { version = "0.1.2", default-features = false, features = ["std"] }
fnv = "1.0.6"
log = "0.4.5"
memchr = "2.1.0"
regex = "1.1.0"
regex = "1.1.5"
[dev-dependencies]
glob = "0.2.11"

View File

@@ -120,7 +120,7 @@ impl GlobMatcher {
/// Tests whether the given path matches this pattern or not.
pub fn is_match_candidate(&self, path: &Candidate) -> bool {
self.re.is_match(&path.path)
self.re.is_match(path.path.as_bytes())
}
}
@@ -145,7 +145,7 @@ impl GlobStrategic {
/// Tests whether the given path matches this pattern or not.
fn is_match_candidate(&self, candidate: &Candidate) -> bool {
let byte_path = &*candidate.path;
let byte_path = candidate.path.as_bytes();
match self.strategy {
MatchStrategy::Literal(ref lit) => lit.as_bytes() == byte_path,

View File

@@ -104,27 +104,25 @@ or to enable case insensitive matching.
#![deny(missing_docs)]
extern crate aho_corasick;
extern crate bstr;
extern crate fnv;
#[macro_use]
extern crate log;
extern crate memchr;
extern crate regex;
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
use std::error::Error as StdError;
use std::ffi::OsStr;
use std::fmt;
use std::hash;
use std::path::Path;
use std::str;
use aho_corasick::AhoCorasick;
use bstr::{B, BStr, BString};
use regex::bytes::{Regex, RegexBuilder, RegexSet};
use pathutil::{
file_name, file_name_ext, normalize_path, os_str_bytes, path_bytes,
};
use pathutil::{file_name, file_name_ext, normalize_path};
use glob::MatchStrategy;
pub use glob::{Glob, GlobBuilder, GlobMatcher};
@@ -294,6 +292,7 @@ pub struct GlobSet {
impl GlobSet {
/// Create an empty `GlobSet`. An empty set matches nothing.
#[inline]
pub fn empty() -> GlobSet {
GlobSet {
len: 0,
@@ -302,11 +301,13 @@ impl GlobSet {
}
/// Returns true if this set is empty, and therefore matches nothing.
#[inline]
pub fn is_empty(&self) -> bool {
self.len == 0
}
/// Returns the number of globs in this set.
#[inline]
pub fn len(&self) -> usize {
self.len
}
@@ -489,24 +490,25 @@ impl GlobSetBuilder {
/// path against multiple globs or sets of globs.
#[derive(Clone, Debug)]
pub struct Candidate<'a> {
path: Cow<'a, [u8]>,
basename: Cow<'a, [u8]>,
ext: Cow<'a, [u8]>,
path: Cow<'a, BStr>,
basename: Cow<'a, BStr>,
ext: Cow<'a, BStr>,
}
impl<'a> Candidate<'a> {
/// Create a new candidate for matching from the given path.
pub fn new<P: AsRef<Path> + ?Sized>(path: &'a P) -> Candidate<'a> {
let path = path.as_ref();
let basename = file_name(path).unwrap_or(OsStr::new(""));
let path = normalize_path(BString::from_path_lossy(path.as_ref()));
let basename = file_name(&path).unwrap_or(Cow::Borrowed(B("")));
let ext = file_name_ext(&basename).unwrap_or(Cow::Borrowed(B("")));
Candidate {
path: normalize_path(path_bytes(path)),
basename: os_str_bytes(basename),
ext: file_name_ext(basename).unwrap_or(Cow::Borrowed(b"")),
path: path,
basename: basename,
ext: ext,
}
}
fn path_prefix(&self, max: usize) -> &[u8] {
fn path_prefix(&self, max: usize) -> &BStr {
if self.path.len() <= max {
&*self.path
} else {
@@ -514,7 +516,7 @@ impl<'a> Candidate<'a> {
}
}
fn path_suffix(&self, max: usize) -> &[u8] {
fn path_suffix(&self, max: usize) -> &BStr {
if self.path.len() <= max {
&*self.path
} else {
@@ -575,12 +577,12 @@ impl LiteralStrategy {
}
fn is_match(&self, candidate: &Candidate) -> bool {
self.0.contains_key(&*candidate.path)
self.0.contains_key(candidate.path.as_bytes())
}
#[inline(never)]
fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<usize>) {
if let Some(hits) = self.0.get(&*candidate.path) {
if let Some(hits) = self.0.get(candidate.path.as_bytes()) {
matches.extend(hits);
}
}
@@ -602,7 +604,7 @@ impl BasenameLiteralStrategy {
if candidate.basename.is_empty() {
return false;
}
self.0.contains_key(&*candidate.basename)
self.0.contains_key(candidate.basename.as_bytes())
}
#[inline(never)]
@@ -610,7 +612,7 @@ impl BasenameLiteralStrategy {
if candidate.basename.is_empty() {
return;
}
if let Some(hits) = self.0.get(&*candidate.basename) {
if let Some(hits) = self.0.get(candidate.basename.as_bytes()) {
matches.extend(hits);
}
}
@@ -632,7 +634,7 @@ impl ExtensionStrategy {
if candidate.ext.is_empty() {
return false;
}
self.0.contains_key(&*candidate.ext)
self.0.contains_key(candidate.ext.as_bytes())
}
#[inline(never)]
@@ -640,7 +642,7 @@ impl ExtensionStrategy {
if candidate.ext.is_empty() {
return;
}
if let Some(hits) = self.0.get(&*candidate.ext) {
if let Some(hits) = self.0.get(candidate.ext.as_bytes()) {
matches.extend(hits);
}
}
@@ -710,11 +712,11 @@ impl RequiredExtensionStrategy {
if candidate.ext.is_empty() {
return false;
}
match self.0.get(&*candidate.ext) {
match self.0.get(candidate.ext.as_bytes()) {
None => false,
Some(regexes) => {
for &(_, ref re) in regexes {
if re.is_match(&*candidate.path) {
if re.is_match(candidate.path.as_bytes()) {
return true;
}
}
@@ -728,9 +730,9 @@ impl RequiredExtensionStrategy {
if candidate.ext.is_empty() {
return;
}
if let Some(regexes) = self.0.get(&*candidate.ext) {
if let Some(regexes) = self.0.get(candidate.ext.as_bytes()) {
for &(global_index, ref re) in regexes {
if re.is_match(&*candidate.path) {
if re.is_match(candidate.path.as_bytes()) {
matches.push(global_index);
}
}
@@ -746,11 +748,11 @@ struct RegexSetStrategy {
impl RegexSetStrategy {
fn is_match(&self, candidate: &Candidate) -> bool {
self.matcher.is_match(&*candidate.path)
self.matcher.is_match(candidate.path.as_bytes())
}
fn matches_into(&self, candidate: &Candidate, matches: &mut Vec<usize>) {
for i in self.matcher.matches(&*candidate.path) {
for i in self.matcher.matches(candidate.path.as_bytes()) {
matches.push(self.map[i]);
}
}

View File

@@ -1,41 +1,26 @@
use std::borrow::Cow;
use std::ffi::OsStr;
use std::path::Path;
use bstr::BStr;
/// The final component of the path, if it is a normal file.
///
/// If the path terminates in ., .., or consists solely of a root of prefix,
/// file_name will return None.
#[cfg(unix)]
pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
path: &'a P,
) -> Option<&'a OsStr> {
use std::os::unix::ffi::OsStrExt;
use memchr::memrchr;
let path = path.as_ref().as_os_str().as_bytes();
pub fn file_name<'a>(path: &Cow<'a, BStr>) -> Option<Cow<'a, BStr>> {
if path.is_empty() {
return None;
} else if path.len() == 1 && path[0] == b'.' {
return None;
} else if path.last() == Some(&b'.') {
return None;
} else if path.len() >= 2 && &path[path.len() - 2..] == &b".."[..] {
} else if path.last() == Some(b'.') {
return None;
}
let last_slash = memrchr(b'/', path).map(|i| i + 1).unwrap_or(0);
Some(OsStr::from_bytes(&path[last_slash..]))
}
/// The final component of the path, if it is a normal file.
///
/// If the path terminates in ., .., or consists solely of a root of prefix,
/// file_name will return None.
#[cfg(not(unix))]
pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
path: &'a P,
) -> Option<&'a OsStr> {
path.as_ref().file_name()
let last_slash = path.rfind_byte(b'/').map(|i| i + 1).unwrap_or(0);
Some(match *path {
Cow::Borrowed(path) => Cow::Borrowed(&path[last_slash..]),
Cow::Owned(ref path) => {
let mut path = path.clone();
path.drain_bytes(..last_slash);
Cow::Owned(path)
}
})
}
/// Return a file extension given a path's file name.
@@ -54,59 +39,28 @@ pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
/// a pattern like `*.rs` is obviously trying to match files with a `rs`
/// extension, but it also matches files like `.rs`, which doesn't have an
/// extension according to std::path::Path::extension.
pub fn file_name_ext(name: &OsStr) -> Option<Cow<[u8]>> {
pub fn file_name_ext<'a>(name: &Cow<'a, BStr>) -> Option<Cow<'a, BStr>> {
if name.is_empty() {
return None;
}
let name = os_str_bytes(name);
let last_dot_at = {
let result = name
.iter().enumerate().rev()
.find(|&(_, &b)| b == b'.')
.map(|(i, _)| i);
match result {
None => return None,
Some(i) => i,
}
let last_dot_at = match name.rfind_byte(b'.') {
None => return None,
Some(i) => i,
};
Some(match name {
Some(match *name {
Cow::Borrowed(name) => Cow::Borrowed(&name[last_dot_at..]),
Cow::Owned(mut name) => {
name.drain(..last_dot_at);
Cow::Owned(ref name) => {
let mut name = name.clone();
name.drain_bytes(..last_dot_at);
Cow::Owned(name)
}
})
}
/// Return raw bytes of a path, transcoded to UTF-8 if necessary.
pub fn path_bytes(path: &Path) -> Cow<[u8]> {
os_str_bytes(path.as_os_str())
}
/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8.
#[cfg(unix)]
pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
use std::os::unix::ffi::OsStrExt;
Cow::Borrowed(s.as_bytes())
}
/// Return the raw bytes of the given OS string, possibly transcoded to UTF-8.
#[cfg(not(unix))]
pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
// TODO(burntsushi): On Windows, OS strings are WTF-8, which is a superset
// of UTF-8, so even if we could get at the raw bytes, they wouldn't
// be useful. We *must* convert to UTF-8 before doing path matching.
// Unfortunate, but necessary.
match s.to_string_lossy() {
Cow::Owned(s) => Cow::Owned(s.into_bytes()),
Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
}
}
/// Normalizes a path to use `/` as a separator everywhere, even on platforms
/// that recognize other characters as separators.
#[cfg(unix)]
pub fn normalize_path(path: Cow<[u8]>) -> Cow<[u8]> {
pub fn normalize_path(path: Cow<BStr>) -> Cow<BStr> {
// UNIX only uses /, so we're good.
path
}
@@ -114,7 +68,7 @@ pub fn normalize_path(path: Cow<[u8]>) -> Cow<[u8]> {
/// Normalizes a path to use `/` as a separator everywhere, even on platforms
/// that recognize other characters as separators.
#[cfg(not(unix))]
pub fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> {
pub fn normalize_path(mut path: Cow<BStr>) -> Cow<BStr> {
use std::path::is_separator;
for i in 0..path.len() {
@@ -129,7 +83,8 @@ pub fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> {
#[cfg(test)]
mod tests {
use std::borrow::Cow;
use std::ffi::OsStr;
use bstr::{B, BString};
use super::{file_name_ext, normalize_path};
@@ -137,8 +92,9 @@ mod tests {
($name:ident, $file_name:expr, $ext:expr) => {
#[test]
fn $name() {
let got = file_name_ext(OsStr::new($file_name));
assert_eq!($ext.map(|s| Cow::Borrowed(s.as_bytes())), got);
let bs = BString::from($file_name);
let got = file_name_ext(&Cow::Owned(bs));
assert_eq!($ext.map(|s| Cow::Borrowed(B(s))), got);
}
};
}
@@ -153,7 +109,8 @@ mod tests {
($name:ident, $path:expr, $expected:expr) => {
#[test]
fn $name() {
let got = normalize_path(Cow::Owned($path.to_vec()));
let bs = BString::from_slice($path);
let got = normalize_path(Cow::Owned(bs));
assert_eq!($expected.to_vec(), got.into_owned());
}
};

View File

@@ -14,6 +14,7 @@ license = "Unlicense/MIT"
[dependencies]
atty = "0.2.11"
bstr = "0.1.2"
globset = { version = "0.4.2", path = "../globset" }
lazy_static = "1.1.0"
log = "0.4.5"

View File

@@ -1,6 +1,8 @@
use std::ffi::OsStr;
use std::str;
use bstr::{BStr, BString};
/// A single state in the state machine used by `unescape`.
#[derive(Clone, Copy, Eq, PartialEq)]
enum State {
@@ -35,18 +37,16 @@ enum State {
///
/// assert_eq!(r"foo\nbar\xFFbaz", escape(b"foo\nbar\xFFbaz"));
/// ```
pub fn escape(mut bytes: &[u8]) -> String {
pub fn escape(bytes: &[u8]) -> String {
let bytes = BStr::new(bytes);
let mut escaped = String::new();
while let Some(result) = decode_utf8(bytes) {
match result {
Ok(cp) => {
escape_char(cp, &mut escaped);
bytes = &bytes[cp.len_utf8()..];
}
Err(byte) => {
escape_byte(byte, &mut escaped);
bytes = &bytes[1..];
for (s, e, ch) in bytes.char_indices() {
if ch == '\u{FFFD}' {
for b in bytes[s..e].bytes() {
escape_byte(b, &mut escaped);
}
} else {
escape_char(ch, &mut escaped);
}
}
escaped
@@ -56,19 +56,7 @@ pub fn escape(mut bytes: &[u8]) -> String {
///
/// This is like [`escape`](fn.escape.html), but accepts an OS string.
pub fn escape_os(string: &OsStr) -> String {
#[cfg(unix)]
fn imp(string: &OsStr) -> String {
use std::os::unix::ffi::OsStrExt;
escape(string.as_bytes())
}
#[cfg(not(unix))]
fn imp(string: &OsStr) -> String {
escape(string.to_string_lossy().as_bytes())
}
imp(string)
escape(BString::from_os_str_lossy(string).as_bytes())
}
/// Unescapes a string.
@@ -195,46 +183,6 @@ fn escape_byte(byte: u8, into: &mut String) {
}
}
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the beginning of the given
/// byte slice, then the first byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
if bytes.is_empty() {
return None;
}
let len = match utf8_len(bytes[0]) {
None => return Some(Err(bytes[0])),
Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
Some(len) => len,
};
match str::from_utf8(&bytes[..len]) {
Ok(s) => Some(Ok(s.chars().next().unwrap())),
Err(_) => Some(Err(bytes[0])),
}
}
/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`.
fn utf8_len(byte: u8) -> Option<usize> {
if byte <= 0x7F {
Some(1)
} else if byte <= 0b110_11111 {
Some(2)
} else if byte <= 0b1110_1111 {
Some(3)
} else if byte <= 0b1111_0111 {
Some(4)
} else {
None
}
}
#[cfg(test)]
mod tests {
use super::{escape, unescape};

View File

@@ -159,6 +159,7 @@ error message is crafted that typically tells the user how to fix the problem.
#![deny(missing_docs)]
extern crate atty;
extern crate bstr;
extern crate globset;
#[macro_use]
extern crate lazy_static;

View File

@@ -19,6 +19,7 @@ serde1 = ["base64", "serde", "serde_derive", "serde_json"]
[dependencies]
base64 = { version = "0.10.0", optional = true }
bstr = "0.1.2"
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
grep-searcher = { version = "0.1.1", path = "../grep-searcher" }
termcolor = "1.0.4"

View File

@@ -70,6 +70,7 @@ fn example() -> Result<(), Box<Error>> {
#[cfg(feature = "serde1")]
extern crate base64;
extern crate bstr;
extern crate grep_matcher;
#[cfg(test)]
extern crate grep_regex;

View File

@@ -4,6 +4,7 @@ use std::io;
use std::path::Path;
use std::time;
use bstr::{BStr, BString};
use grep_matcher::{Captures, LineTerminator, Match, Matcher};
use grep_searcher::{
LineIter,
@@ -262,26 +263,12 @@ impl<'a> Sunk<'a> {
/// portability with a small cost: on Windows, paths that are not valid UTF-16
/// will not roundtrip correctly.
#[derive(Clone, Debug)]
pub struct PrinterPath<'a>(Cow<'a, [u8]>);
pub struct PrinterPath<'a>(Cow<'a, BStr>);
impl<'a> PrinterPath<'a> {
/// Create a new path suitable for printing.
pub fn new(path: &'a Path) -> PrinterPath<'a> {
PrinterPath::new_impl(path)
}
#[cfg(unix)]
fn new_impl(path: &'a Path) -> PrinterPath<'a> {
use std::os::unix::ffi::OsStrExt;
PrinterPath(Cow::Borrowed(path.as_os_str().as_bytes()))
}
#[cfg(not(unix))]
fn new_impl(path: &'a Path) -> PrinterPath<'a> {
PrinterPath(match path.to_string_lossy() {
Cow::Owned(path) => Cow::Owned(path.into_bytes()),
Cow::Borrowed(path) => Cow::Borrowed(path.as_bytes()),
})
PrinterPath(BString::from_path_lossy(path))
}
/// Create a new printer path from the given path which can be efficiently
@@ -302,7 +289,7 @@ impl<'a> PrinterPath<'a> {
/// path separators that are both replaced by `new_sep`. In all other
/// environments, only `/` is treated as a path separator.
fn replace_separator(&mut self, new_sep: u8) {
let transformed_path: Vec<_> = self.as_bytes().iter().map(|&b| {
let transformed_path: BString = self.0.bytes().map(|b| {
if b == b'/' || (cfg!(windows) && b == b'\\') {
new_sep
} else {
@@ -314,7 +301,7 @@ impl<'a> PrinterPath<'a> {
/// Return the raw bytes for this path.
pub fn as_bytes(&self) -> &[u8] {
&*self.0
self.0.as_bytes()
}
}

View File

@@ -207,7 +207,7 @@ impl ConfiguredHIR {
if self.config.line_terminator.is_none() {
return Ok(None);
}
match LiteralSets::new(&self.expr).one_regex() {
match LiteralSets::new(&self.expr).one_regex(self.config.word) {
None => Ok(None),
Some(pattern) => self.pattern_to_regex(&pattern).map(Some),
}

View File

@@ -34,6 +34,11 @@ impl CRLFMatcher {
}
Ok(CRLFMatcher { regex, names })
}
/// Return the underlying regex used by this matcher.
pub fn regex(&self) -> &Regex {
&self.regex
}
}
impl Matcher for CRLFMatcher {

View File

@@ -47,18 +47,23 @@ impl LiteralSets {
/// generated these literal sets. The idea here is that the pattern
/// returned by this method is much cheaper to search for. i.e., It is
/// usually a single literal or an alternation of literals.
pub fn one_regex(&self) -> Option<String> {
pub fn one_regex(&self, word: bool) -> Option<String> {
// TODO: The logic in this function is basically inscrutable. It grew
// organically in the old grep 0.1 crate. Ideally, it would be
// re-worked. In fact, the entire inner literal extraction should be
// re-worked. Actually, most of regex-syntax's literal extraction
// should also be re-worked. Alas... only so much time in the day.
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
debug!("literal prefixes detected: {:?}", self.prefixes);
// When this is true, the regex engine will do a literal scan,
// so we don't need to return anything.
return None;
if !word {
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
debug!("literal prefixes detected: {:?}", self.prefixes);
// When this is true, the regex engine will do a literal scan,
// so we don't need to return anything. But we only do this
// if we aren't doing a word regex, since a word regex adds
// a `(?:\W|^)` to the beginning of the regex, thereby
// defeating the regex engine's literal detection.
return None;
}
}
// Out of inner required literals, prefixes and suffixes, which one
@@ -285,7 +290,7 @@ mod tests {
}
fn one_regex(pattern: &str) -> Option<String> {
sets(pattern).one_regex()
sets(pattern).one_regex(false)
}
// Put a pattern into the same format as the one returned by `one_regex`.

View File

@@ -50,9 +50,12 @@ impl RegexMatcherBuilder {
if let Some(ref re) = fast_line_regex {
trace!("extracted fast line regex: {:?}", re);
}
let matcher = RegexMatcherImpl::new(&chir)?;
trace!("final regex: {:?}", matcher.regex());
Ok(RegexMatcher {
config: self.config.clone(),
matcher: RegexMatcherImpl::new(&chir)?,
matcher: matcher,
fast_line_regex: fast_line_regex,
non_matching_bytes: non_matching_bytes,
})
@@ -370,6 +373,15 @@ impl RegexMatcherImpl {
Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?))
}
}
/// Return the underlying regex object used.
fn regex(&self) -> &Regex {
match *self {
RegexMatcherImpl::Word(ref x) => x.regex(),
RegexMatcherImpl::CRLF(ref x) => x.regex(),
RegexMatcherImpl::Standard(ref x) => &x.regex,
}
}
}
// This implementation just dispatches on the internal matcher impl except

View File

@@ -55,6 +55,11 @@ impl WordMatcher {
}
Ok(WordMatcher { regex, names, locs })
}
/// Return the underlying regex used by this matcher.
pub fn regex(&self) -> &Regex {
&self.regex
}
}
impl Matcher for WordMatcher {

View File

@@ -13,12 +13,12 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"]
license = "Unlicense/MIT"
[dependencies]
bstr = { version = "0.1.2", default-features = false, features = ["std"] }
bytecount = "0.5"
encoding_rs = "0.8.14"
encoding_rs_io = "0.1.4"
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
log = "0.4.5"
memchr = "2.1"
memmap = "0.7"
[dev-dependencies]

View File

@@ -99,13 +99,13 @@ searches stdin.
#![deny(missing_docs)]
extern crate bstr;
extern crate bytecount;
extern crate encoding_rs;
extern crate encoding_rs_io;
extern crate grep_matcher;
#[macro_use]
extern crate log;
extern crate memchr;
extern crate memmap;
#[cfg(test)]
extern crate regex;

View File

@@ -1,8 +1,7 @@
use std::cmp;
use std::io;
use std::ptr;
use memchr::{memchr, memrchr};
use bstr::{BStr, BString};
/// The default buffer capacity that we use for the line buffer.
pub(crate) const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1<<10); // 8 KB
@@ -123,7 +122,7 @@ impl LineBufferBuilder {
pub fn build(&self) -> LineBuffer {
LineBuffer {
config: self.config,
buf: vec![0; self.config.capacity],
buf: BString::from(vec![0; self.config.capacity]),
pos: 0,
last_lineterm: 0,
end: 0,
@@ -255,6 +254,12 @@ impl<'b, R: io::Read> LineBufferReader<'b, R> {
/// Return the contents of this buffer.
pub fn buffer(&self) -> &[u8] {
self.line_buffer.buffer().as_bytes()
}
/// Return the underlying buffer as a byte string. Used for tests only.
#[cfg(test)]
fn bstr(&self) -> &BStr {
self.line_buffer.buffer()
}
@@ -284,7 +289,7 @@ pub struct LineBuffer {
/// The configuration of this buffer.
config: Config,
/// The primary buffer with which to hold data.
buf: Vec<u8>,
buf: BString,
/// The current position of this buffer. This is always a valid sliceable
/// index into `buf`, and its maximum value is the length of `buf`.
pos: usize,
@@ -339,13 +344,13 @@ impl LineBuffer {
}
/// Return the contents of this buffer.
fn buffer(&self) -> &[u8] {
fn buffer(&self) -> &BStr {
&self.buf[self.pos..self.last_lineterm]
}
/// Return the contents of the free space beyond the end of the buffer as
/// a mutable slice.
fn free_buffer(&mut self) -> &mut [u8] {
fn free_buffer(&mut self) -> &mut BStr {
&mut self.buf[self.end..]
}
@@ -396,7 +401,7 @@ impl LineBuffer {
assert_eq!(self.pos, 0);
loop {
self.ensure_capacity()?;
let readlen = rdr.read(self.free_buffer())?;
let readlen = rdr.read(self.free_buffer().as_bytes_mut())?;
if readlen == 0 {
// We're only done reading for good once the caller has
// consumed everything.
@@ -416,7 +421,7 @@ impl LineBuffer {
match self.config.binary {
BinaryDetection::None => {} // nothing to do
BinaryDetection::Quit(byte) => {
if let Some(i) = memchr(byte, newbytes) {
if let Some(i) = newbytes.find_byte(byte) {
self.end = oldend + i;
self.last_lineterm = self.end;
self.binary_byte_offset =
@@ -444,7 +449,7 @@ impl LineBuffer {
}
// Update our `last_lineterm` positions if we read one.
if let Some(i) = memrchr(self.config.lineterm, newbytes) {
if let Some(i) = newbytes.rfind_byte(self.config.lineterm) {
self.last_lineterm = oldend + i + 1;
return Ok(true);
}
@@ -467,40 +472,8 @@ impl LineBuffer {
return;
}
assert!(self.pos < self.end && self.end <= self.buf.len());
let roll_len = self.end - self.pos;
unsafe {
// SAFETY: A buffer contains Copy data, so there's no problem
// moving it around. Safety also depends on our indices being
// in bounds, which they should always be, and we enforce with
// an assert above.
//
// It seems like it should be possible to do this in safe code that
// results in the same codegen. I tried the obvious:
//
// for (src, dst) in (self.pos..self.end).zip(0..) {
// self.buf[dst] = self.buf[src];
// }
//
// But the above does not work, and in fact compiles down to a slow
// byte-by-byte loop. I tried a few other minor variations, but
// alas, better minds might prevail.
//
// Overall, this doesn't save us *too* much. It mostly matters when
// the number of bytes we're copying is large, which can happen
// if the searcher is asked to produce a lot of context. We could
// decide this isn't worth it, but it does make an appreciable
// impact at or around the context=30 range on my machine.
//
// We could also use a temporary buffer that compiles down to two
// memcpys and is faster than the byte-at-a-time loop, but it
// complicates our options for limiting memory allocation a bit.
ptr::copy(
self.buf[self.pos..].as_ptr(),
self.buf.as_mut_ptr(),
roll_len,
);
}
self.buf.copy_within(self.pos.., 0);
self.pos = 0;
self.last_lineterm = roll_len;
self.end = roll_len;
@@ -536,14 +509,15 @@ impl LineBuffer {
}
}
/// Replaces `src` with `replacement` in bytes.
fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option<usize> {
/// Replaces `src` with `replacement` in bytes, and return the offset of the
/// first replacement, if one exists.
fn replace_bytes(bytes: &mut BStr, src: u8, replacement: u8) -> Option<usize> {
if src == replacement {
return None;
}
let mut first_pos = None;
let mut pos = 0;
while let Some(i) = memchr(src, &bytes[pos..]).map(|i| pos + i) {
while let Some(i) = bytes[pos..].find_byte(src).map(|i| pos + i) {
if first_pos.is_none() {
first_pos = Some(i);
}
@@ -560,6 +534,7 @@ fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option<usize> {
#[cfg(test)]
mod tests {
use std::str;
use bstr::BString;
use super::*;
const SHERLOCK: &'static str = "\
@@ -575,18 +550,14 @@ and exhibited clearly, with a label attached.\
slice.to_string()
}
fn btos(slice: &[u8]) -> &str {
str::from_utf8(slice).unwrap()
}
fn replace_str(
slice: &str,
src: u8,
replacement: u8,
) -> (String, Option<usize>) {
let mut dst = slice.to_string().into_bytes();
let mut dst = BString::from(slice);
let result = replace_bytes(&mut dst, src, replacement);
(String::from_utf8(dst).unwrap(), result)
(dst.into_string().unwrap(), result)
}
#[test]
@@ -607,7 +578,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\n");
assert_eq!(rdr.bstr(), "homer\nlisa\n");
assert_eq!(rdr.absolute_byte_offset(), 0);
rdr.consume(5);
assert_eq!(rdr.absolute_byte_offset(), 5);
@@ -615,7 +586,7 @@ and exhibited clearly, with a label attached.\
assert_eq!(rdr.absolute_byte_offset(), 11);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "maggie");
assert_eq!(rdr.bstr(), "maggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -630,7 +601,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -645,7 +616,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "\n");
assert_eq!(rdr.bstr(), "\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -660,7 +631,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "\n\n");
assert_eq!(rdr.bstr(), "\n\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -698,12 +669,12 @@ and exhibited clearly, with a label attached.\
let mut linebuf = LineBufferBuilder::new().capacity(1).build();
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
let mut got = vec![];
let mut got = BString::new();
while rdr.fill().unwrap() {
got.extend(rdr.buffer());
got.push(rdr.buffer());
rdr.consume_all();
}
assert_eq!(bytes, btos(&got));
assert_eq!(bytes, got);
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
assert_eq!(rdr.binary_byte_offset(), None);
}
@@ -718,11 +689,11 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\n");
assert_eq!(rdr.bstr(), "homer\n");
rdr.consume_all();
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "lisa\n");
assert_eq!(rdr.bstr(), "lisa\n");
rdr.consume_all();
// This returns an error because while we have just enough room to
@@ -732,11 +703,11 @@ and exhibited clearly, with a label attached.\
assert!(rdr.fill().is_err());
// We can mush on though!
assert_eq!(btos(rdr.buffer()), "m");
assert_eq!(rdr.bstr(), "m");
rdr.consume_all();
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "aggie");
assert_eq!(rdr.bstr(), "aggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -752,16 +723,16 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\n");
assert_eq!(rdr.bstr(), "homer\n");
rdr.consume_all();
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "lisa\n");
assert_eq!(rdr.bstr(), "lisa\n");
rdr.consume_all();
// We have just enough space.
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "maggie");
assert_eq!(rdr.bstr(), "maggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -777,7 +748,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(rdr.fill().is_err());
assert_eq!(btos(rdr.buffer()), "");
assert_eq!(rdr.bstr(), "");
}
#[test]
@@ -789,7 +760,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nli\x00sa\nmaggie\n");
assert_eq!(rdr.bstr(), "homer\nli\x00sa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -808,7 +779,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nli");
assert_eq!(rdr.bstr(), "homer\nli");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -825,7 +796,7 @@ and exhibited clearly, with a label attached.\
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
assert!(!rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "");
assert_eq!(rdr.bstr(), "");
assert_eq!(rdr.absolute_byte_offset(), 0);
assert_eq!(rdr.binary_byte_offset(), Some(0));
}
@@ -841,7 +812,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -860,7 +831,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -878,7 +849,7 @@ and exhibited clearly, with a label attached.\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "\
assert_eq!(rdr.bstr(), "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, s\
");
@@ -901,7 +872,7 @@ Holmeses, s\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nli\nsa\nmaggie\n");
assert_eq!(rdr.bstr(), "homer\nli\nsa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -920,7 +891,7 @@ Holmeses, s\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "\nhomer\nlisa\nmaggie\n");
assert_eq!(rdr.bstr(), "\nhomer\nlisa\nmaggie\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -939,7 +910,7 @@ Holmeses, s\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());
@@ -958,7 +929,7 @@ Holmeses, s\
assert!(rdr.buffer().is_empty());
assert!(rdr.fill().unwrap());
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n");
assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n\n");
rdr.consume_all();
assert!(!rdr.fill().unwrap());

View File

@@ -2,8 +2,8 @@
A collection of routines for performing operations on lines.
*/
use bstr::B;
use bytecount;
use memchr::{memchr, memrchr};
use grep_matcher::{LineTerminator, Match};
/// An iterator over lines in a particular slice of bytes.
@@ -85,7 +85,7 @@ impl LineStep {
#[inline(always)]
fn next_impl(&mut self, mut bytes: &[u8]) -> Option<(usize, usize)> {
bytes = &bytes[..self.end];
match memchr(self.line_term, &bytes[self.pos..]) {
match B(&bytes[self.pos..]).find_byte(self.line_term) {
None => {
if self.pos < bytes.len() {
let m = (self.pos, bytes.len());
@@ -135,14 +135,16 @@ pub fn locate(
line_term: u8,
range: Match,
) -> Match {
let line_start = memrchr(line_term, &bytes[0..range.start()])
let line_start = B(&bytes[..range.start()])
.rfind_byte(line_term)
.map_or(0, |i| i + 1);
let line_end =
if range.end() > line_start && bytes[range.end() - 1] == line_term {
range.end()
} else {
memchr(line_term, &bytes[range.end()..])
.map_or(bytes.len(), |i| range.end() + i + 1)
B(&bytes[range.end()..])
.find_byte(line_term)
.map_or(bytes.len(), |i| range.end() + i + 1)
};
Match::new(line_start, line_end)
}
@@ -180,7 +182,7 @@ fn preceding_by_pos(
pos -= 1;
}
loop {
match memrchr(line_term, &bytes[..pos]) {
match B(&bytes[..pos]).rfind_byte(line_term) {
None => {
return 0;
}

View File

@@ -1,6 +1,6 @@
use std::cmp;
use memchr::memchr;
use bstr::B;
use grep_matcher::{LineMatchKind, Matcher};
use lines::{self, LineStep};
@@ -149,7 +149,7 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
BinaryDetection::Quit(b) => b,
_ => return false,
};
if let Some(i) = memchr(binary_byte, &buf[*range]) {
if let Some(i) = B(&buf[*range]).find_byte(binary_byte) {
self.binary_byte_offset = Some(range.start() + i);
true
} else {

View File

@@ -1,10 +1,10 @@
use std::io::{self, Write};
use std::str;
use bstr::B;
use grep_matcher::{
LineMatchKind, LineTerminator, Match, Matcher, NoCaptures, NoError,
};
use memchr::memchr;
use regex::bytes::{Regex, RegexBuilder};
use searcher::{BinaryDetection, Searcher, SearcherBuilder};
@@ -94,7 +94,8 @@ impl Matcher for RegexMatcher {
}
// Make it interesting and return the last byte in the current
// line.
let i = memchr(self.line_term.unwrap().as_byte(), haystack)
let i = B(haystack)
.find_byte(self.line_term.unwrap().as_byte())
.map(|i| i)
.unwrap_or(haystack.len() - 1);
Ok(Some(LineMatchKind::Candidate(i)))

View File

@@ -5,10 +5,11 @@
use std::env;
use std::error::Error;
use std::fs::File;
use std::io::{self, BufRead};
use std::io;
use std::ffi::OsString;
use std::path::{Path, PathBuf};
use bstr::io::BufReadExt;
use log;
use crate::Result;
@@ -76,62 +77,29 @@ fn parse<P: AsRef<Path>>(
fn parse_reader<R: io::Read>(
rdr: R,
) -> Result<(Vec<OsString>, Vec<Box<Error>>)> {
let mut bufrdr = io::BufReader::new(rdr);
let bufrdr = io::BufReader::new(rdr);
let (mut args, mut errs) = (vec![], vec![]);
let mut line = vec![];
let mut line_number = 0;
while {
line.clear();
bufrdr.for_byte_line_with_terminator(|line| {
line_number += 1;
bufrdr.read_until(b'\n', &mut line)? > 0
} {
trim(&mut line);
let line = line.trim();
if line.is_empty() || line[0] == b'#' {
continue;
return Ok(true);
}
match bytes_to_os_string(&line) {
match line.to_os_str() {
Ok(osstr) => {
args.push(osstr);
args.push(osstr.to_os_string());
}
Err(err) => {
errs.push(format!("{}: {}", line_number, err).into());
}
}
}
Ok(true)
})?;
Ok((args, errs))
}
/// Trim the given bytes of whitespace according to the ASCII definition.
fn trim(x: &mut Vec<u8>) {
let upto = x.iter().take_while(|b| is_space(**b)).count();
x.drain(..upto);
let revto = x.len() - x.iter().rev().take_while(|b| is_space(**b)).count();
x.drain(revto..);
}
/// Returns true if and only if the given byte is an ASCII space character.
fn is_space(b: u8) -> bool {
b == b'\t'
|| b == b'\n'
|| b == b'\x0B'
|| b == b'\x0C'
|| b == b'\r'
|| b == b' '
}
/// On Unix, get an OsString from raw bytes.
#[cfg(unix)]
fn bytes_to_os_string(bytes: &[u8]) -> Result<OsString> {
use std::os::unix::ffi::OsStringExt;
Ok(OsString::from_vec(bytes.to_vec()))
}
/// On non-Unix (like Windows), require UTF-8.
#[cfg(not(unix))]
fn bytes_to_os_string(bytes: &[u8]) -> Result<OsString> {
String::from_utf8(bytes.to_vec()).map(OsString::from).map_err(From::from)
}
#[cfg(test)]
mod tests {
use std::ffi::OsString;