Lots of progress:

- Refactored interaction between CLI args and rest of xrep.
  - Filling in a lot more options, including file type filtering.
  - Fixing some bugs in globbing/ignoring.
  - More documentation.
This commit is contained in:
Andrew Gallant 2016-09-05 00:52:23 -04:00
parent 0bf278e72f
commit 812cdb13c6
9 changed files with 1567 additions and 408 deletions

551
src/args.rs Normal file
View File

@ -0,0 +1,551 @@
use std::cmp;
use std::env;
use std::io;
use std::path::{Path, PathBuf};
use docopt::Docopt;
use env_logger;
use grep::{Grep, GrepBuilder};
use log;
use num_cpus;
use regex;
use walkdir::WalkDir;
use gitignore::{Gitignore, GitignoreBuilder};
use ignore::Ignore;
use out::Out;
use printer::Printer;
use search::{InputBuffer, Searcher};
use types::{FileTypeDef, Types, TypesBuilder};
use walk;
use Result;
/// The Docopt usage string that defines xrep's command line interface.
///
/// If you've never heard of Docopt before, see: http://docopt.org
/// (TL;DR: The CLI parser is generated from the usage string below.)
///
/// `Args::parse` passes this string to `Docopt::new` and decodes the result
/// into `RawArgs`. The positionals and options declared here are decoded into
/// the `arg_*` and `flag_*` fields of that struct (`--help` and `--version`
/// have no field in `RawArgs`).
///
/// This is an ordinary (non-raw) string literal, which is why backslashes
/// that should appear in the help text are written doubled (`\\b`, `\\n`).
const USAGE: &'static str = "
Usage: xrep [options] <pattern> [<path> ...]
xrep [options] --files [<path> ...]
xrep [options] --type-list
xrep --help
xrep --version
xrep is like the silver searcher and grep, but faster than both.
Common options:
-a, --text Search binary files as if they were text.
-c, --count Only show count of line matches for each file.
-g, --glob GLOB ... Include or exclude files for searching that
match the given glob. This always overrides any
other ignore logic. Multiple glob flags may be
used. Globbing rules match .gitignore globs.
Precede a glob with a '!' to exclude it.
-h, --help Show this usage message.
-i, --ignore-case Case insensitive search.
-n, --line-number Show line numbers (1-based).
-q, --quiet Do not print anything to stdout.
-t, --type TYPE ... Only search files matching TYPE. Multiple type
flags may be provided. Use the --type-list flag
to list all available types.
-T, --type-not TYPE ... Do not search files matching TYPE. Multiple
not-type flags may be provided.
-v, --invert-match Invert matching.
-w, --word-regexp Only show matches surrounded by word boundaries.
This is equivalent to putting \\b before and
after the search pattern.
Less common options:
-A, --after-context NUM
Show NUM lines after each match.
-B, --before-context NUM
Show NUM lines before each match.
-C, --context NUM
Show NUM lines before and after each match.
--context-separator ARG
The string to use when separating non-continuous context lines. Escape
sequences may be used. [default: --]
--debug
Show debug messages.
--files
Print each file that would be searched (but don't search).
-H, --with-filename
Prefix each match with the file name that contains it. This is the
default when more than one file is searched.
--hidden
Search hidden directories and files.
-L, --follow
Follow symlinks.
--line-terminator ARG
The byte to use for a line terminator. Escape sequences may be used.
[default: \\n]
--no-ignore
Don't respect ignore files (.gitignore, .xrepignore, etc.)
-Q, --literal
Treat the pattern as a literal string instead of a regular expression.
--threads ARG
The number of threads to use. Defaults to the number of logical CPUs
(capped at 6). [default: 0]
--version
Show the version number of xrep and exit.
File type management options:
--type-list
Show all supported file types and their associated globs.
--type-add ARG ...
Add a new glob for a particular file type.
Example: --type-add html:*.html,*.htm
--type-clear TYPE ...
Clear the file type globs for TYPE.
";
/// RawArgs are the args as they are parsed from Docopt. They aren't used
/// directly by the rest of xrep.
///
/// Each field mirrors one entry of the `USAGE` string: `arg_*` fields hold
/// positional arguments and `flag_*` fields hold options. `to_args` turns
/// these raw values into the normalized `Args` configuration.
#[derive(Debug, RustcDecodable)]
pub struct RawArgs {
/// `<pattern>`: the search pattern exactly as given on the command line.
arg_pattern: String,
/// `<path> ...`: the paths to search; may be empty.
arg_path: Vec<String>,
/// `-A/--after-context NUM`: lines to show after each match.
flag_after_context: usize,
/// `-B/--before-context NUM`: lines to show before each match.
flag_before_context: usize,
/// `-C/--context NUM`: lines to show before and after each match.
flag_context: usize,
/// `--context-separator ARG`: still escaped; `to_args` unescapes it.
flag_context_separator: String,
/// `-c/--count`: only show the count of matching lines per file.
flag_count: bool,
/// `--debug`: show debug messages.
flag_debug: bool,
/// `--files`: print the files that would be searched, without searching.
flag_files: bool,
/// `-L/--follow`: follow symlinks.
flag_follow: bool,
/// `-g/--glob GLOB ...`: include/exclude globs that override ignore logic.
flag_glob: Vec<String>,
/// `--hidden`: search hidden directories and files.
flag_hidden: bool,
/// `-i/--ignore-case`: case insensitive search.
flag_ignore_case: bool,
/// `-v/--invert-match`: invert matching.
flag_invert_match: bool,
/// `-n/--line-number`: show 1-based line numbers.
flag_line_number: bool,
/// `--line-terminator ARG`: still escaped; must unescape to one byte.
flag_line_terminator: String,
/// `-Q/--literal`: treat the pattern as a literal string.
flag_literal: bool,
/// `--no-ignore`: don't respect ignore files.
flag_no_ignore: bool,
/// `-q/--quiet`: do not print anything to stdout.
flag_quiet: bool,
/// `-a/--text`: search binary files as if they were text.
flag_text: bool,
/// `--threads ARG`: requested thread count; 0 means "pick a default".
flag_threads: usize,
/// `-t/--type TYPE ...`: only search files matching these types.
flag_type: Vec<String>,
/// `-T/--type-not TYPE ...`: do not search files matching these types.
flag_type_not: Vec<String>,
/// `--type-list`: show all supported file types and their globs.
flag_type_list: bool,
/// `--type-add ARG ...`: extra type definitions, e.g. `html:*.html,*.htm`.
flag_type_add: Vec<String>,
/// `--type-clear TYPE ...`: types whose globs should be cleared.
flag_type_clear: Vec<String>,
/// `-H/--with-filename`: prefix each match with its file name.
flag_with_filename: bool,
/// `-w/--word-regexp`: only show matches surrounded by word boundaries.
flag_word_regexp: bool,
}
/// Args are transformed/normalized from RawArgs.
///
/// Values of this type are produced by `RawArgs::to_args` (via
/// `Args::parse`). The fields are private; the rest of xrep reads them
/// through the accessor and factory methods on `Args`.
#[derive(Debug)]
pub struct Args {
/// The final regex pattern, after `--literal` quoting and `--word-regexp`
/// wrapping have been applied.
pattern: String,
/// The paths to search. Never empty: defaults to `./` when no path is given.
paths: Vec<PathBuf>,
/// Lines of context after a match (`--context` if non-zero, else `-A`).
after_context: usize,
/// Lines of context before a match (`--context` if non-zero, else `-B`).
before_context: usize,
/// The unescaped bytes of `--context-separator`.
context_separator: Vec<u8>,
/// Whether only per-file match counts are requested.
count: bool,
/// The line terminator: the single byte `--line-terminator` unescapes to.
eol: u8,
/// Whether to list the files that would be searched instead of searching.
files: bool,
/// Whether directory traversal follows symlinks.
follow: bool,
/// Matcher built from the `--glob` flags; `None` when no glob was given.
glob_overrides: Option<Gitignore>,
/// Whether hidden files and directories are searched.
hidden: bool,
/// Whether matching is case insensitive.
ignore_case: bool,
/// Whether matching is inverted.
invert_match: bool,
/// Whether line numbers are shown.
line_number: bool,
/// Whether ignore files are disregarded.
no_ignore: bool,
/// Whether output is suppressed.
quiet: bool,
/// Whether binary files are searched as text.
text: bool,
/// Number of worker threads: `--threads`, or `min(6, logical CPUs)` when
/// the flag is 0.
threads: usize,
/// The file type definitions reported by the types builder.
type_defs: Vec<FileTypeDef>,
/// Whether to print the type definitions and exit.
type_list: bool,
/// File type matcher built from the default types plus the `--type-clear`,
/// `--type-add`, `--type` and `--type-not` flags.
types: Types,
/// Whether matches are prefixed with their file name: set by `-H`, or when
/// more than one path is given, or when the first path is a directory.
with_filename: bool,
}
impl RawArgs {
    /// Convert arguments parsed into a configuration used by xrep.
    ///
    /// This normalizes the raw Docopt values: the pattern is quoted and/or
    /// wrapped in word boundaries, default paths are filled in, escape
    /// sequences are resolved, glob overrides and file types are compiled,
    /// and the thread count is chosen.
    ///
    /// # Errors
    ///
    /// Returns an error if the line terminator does not unescape to exactly
    /// one byte, if the current directory cannot be determined while
    /// building glob overrides, or if a glob or file type definition is
    /// rejected by its builder.
    fn to_args(&self) -> Result<Args> {
        let pattern = {
            let pattern =
                if self.flag_literal {
                    regex::quote(&self.arg_pattern)
                } else {
                    self.arg_pattern.clone()
                };
            if self.flag_word_regexp {
                // The pattern must be grouped before the boundaries are
                // attached. Without the group, an alternation such as
                // `foo|bar` would become `\bfoo|bar\b`, where each boundary
                // applies to only one of the alternatives.
                format!(r"\b(?:{})\b", pattern)
            } else {
                pattern
            }
        };
        let paths =
            if self.arg_path.is_empty() {
                vec![Path::new("./").to_path_buf()]
            } else {
                self.arg_path.iter().map(|p| {
                    Path::new(p).to_path_buf()
                }).collect()
            };
        // A non-zero --context takes precedence over -A and -B.
        let (after_context, before_context) =
            if self.flag_context > 0 {
                (self.flag_context, self.flag_context)
            } else {
                (self.flag_after_context, self.flag_before_context)
            };
        let eol = {
            let eol = unescape(&self.flag_line_terminator);
            if eol.is_empty() {
                errored!("Empty line terminator is not allowed.");
            } else if eol.len() > 1 {
                errored!("Line terminators are limited to exactly 1 byte.");
            }
            // Exactly one byte remains once both checks above have passed.
            eol[0]
        };
        let glob_overrides =
            if self.flag_glob.is_empty() {
                None
            } else {
                // Globs given on the command line are rooted at the current
                // working directory.
                let cwd = try!(env::current_dir());
                let mut bgi = GitignoreBuilder::new(cwd);
                for pat in &self.flag_glob {
                    try!(bgi.add("<argv>", pat));
                }
                Some(try!(bgi.build()))
            };
        let threads =
            if self.flag_threads == 0 {
                cmp::min(6, num_cpus::get())
            } else {
                self.flag_threads
            };
        // File names are shown when asked for explicitly, or when the search
        // can span more than one file. `paths` is never empty here, so
        // indexing the first element is safe.
        let mut with_filename = self.flag_with_filename;
        if !with_filename {
            with_filename = paths.len() > 1 || paths[0].is_dir();
        }
        let mut btypes = TypesBuilder::new();
        btypes.add_defaults();
        try!(self.add_types(&mut btypes));
        let types = try!(btypes.build());
        Ok(Args {
            pattern: pattern,
            paths: paths,
            after_context: after_context,
            before_context: before_context,
            context_separator: unescape(&self.flag_context_separator),
            count: self.flag_count,
            eol: eol,
            files: self.flag_files,
            follow: self.flag_follow,
            glob_overrides: glob_overrides,
            hidden: self.flag_hidden,
            ignore_case: self.flag_ignore_case,
            invert_match: self.flag_invert_match,
            line_number: self.flag_line_number,
            no_ignore: self.flag_no_ignore,
            quiet: self.flag_quiet,
            text: self.flag_text,
            threads: threads,
            type_defs: btypes.definitions(),
            type_list: self.flag_type_list,
            types: types,
            with_filename: with_filename,
        })
    }

    /// Apply the file type flags to the given builder.
    ///
    /// The flags are applied in a fixed order: clears first, then added
    /// definitions, then selections (`--type`), then negated selections
    /// (`--type-not`).
    ///
    /// # Errors
    ///
    /// Returns an error if a `--type-add` definition is rejected by the
    /// builder.
    fn add_types(&self, types: &mut TypesBuilder) -> Result<()> {
        for ty in &self.flag_type_clear {
            types.clear(ty);
        }
        for def in &self.flag_type_add {
            try!(types.add_def(def));
        }
        for ty in &self.flag_type {
            types.select(ty);
        }
        for ty in &self.flag_type_not {
            types.select_not(ty);
        }
        Ok(())
    }
}
impl Args {
    /// Build the configuration for this process from its command line.
    ///
    /// A CLI usage error terminates the process with a usage or error
    /// message, and a request for the version prints it and exits.
    ///
    /// As a side effect, the global logger is initialized; failing to do so
    /// is reported as an error.
    pub fn parse() -> Result<Args> {
        let decoded = Docopt::new(USAGE)
            .and_then(|d| d.version(Some(version())).decode());
        let raw: RawArgs = match decoded {
            Ok(raw) => raw,
            Err(err) => err.exit(),
        };

        let level = if raw.flag_debug {
            log::LogLevelFilter::Debug
        } else {
            log::LogLevelFilter::Warn
        };
        let mut log_builder = env_logger::LogBuilder::new();
        log_builder.filter(None, level);
        if let Err(err) = log_builder.init() {
            errored!("failed to initialize logger: {}", err);
        }

        raw.to_args().map_err(From::from)
    }

    /// Whether xrep should only list the files it would search, without
    /// searching them.
    pub fn files(&self) -> bool {
        self.files
    }

    /// Build the line based matcher described by the command line (pattern,
    /// case sensitivity and line terminator).
    ///
    /// The matcher returned can be used across multiple threads
    /// simultaneously. It only supports basic searching of regular
    /// expressions in a single buffer.
    pub fn grep(&self) -> Result<Grep> {
        let builder = GrepBuilder::new(&self.pattern)
            .case_insensitive(self.ignore_case)
            .line_terminator(self.eol);
        builder.build().map_err(From::from)
    }

    /// Build a fresh input buffer for searching, configured with the line
    /// terminator from the command line.
    pub fn input_buffer(&self) -> InputBuffer {
        let mut buffer = InputBuffer::new();
        buffer.eol(self.eol);
        buffer
    }

    /// Build a printer of individual search results on top of the given
    /// writer.
    pub fn printer<W: io::Write>(&self, wtr: W) -> Printer<W> {
        let separator = self.context_separator.clone();
        Printer::new(wtr)
            .context_separator(separator)
            .eol(self.eol)
            .quiet(self.quiet)
            .with_filename(self.with_filename)
    }

    /// Build a printer of whole-file search results on top of the given
    /// writer.
    ///
    /// A file separator (the context separator) is only configured when
    /// some context was requested.
    pub fn out<W: io::Write>(&self, wtr: W) -> Out<W> {
        let out = Out::new(wtr);
        if self.before_context == 0 && self.after_context == 0 {
            return out;
        }
        out.file_separator(self.context_separator.clone())
    }

    /// The paths that should be searched.
    pub fn paths(&self) -> &[PathBuf] {
        &self.paths
    }

    /// Build a line based searcher configured from the command line.
    ///
    /// The searcher is wired up with context sizes, counting, the line
    /// terminator, line numbers, inverted matching and the text flag.
    pub fn searcher<'a, R: io::Read, W: io::Write>(
        &self,
        inp: &'a mut InputBuffer,
        printer: &'a mut Printer<W>,
        grep: &'a Grep,
        path: &'a Path,
        rdr: R,
    ) -> Searcher<'a, R, W> {
        let searcher = Searcher::new(inp, printer, grep, path, rdr);
        searcher
            .after_context(self.after_context)
            .before_context(self.before_context)
            .count(self.count)
            .eol(self.eol)
            .line_number(self.line_number)
            .invert_match(self.invert_match)
            .text(self.text)
    }

    /// The number of worker search threads that should be used.
    pub fn threads(&self) -> usize {
        self.threads
    }

    /// The type definitions currently loaded.
    pub fn type_defs(&self) -> &[FileTypeDef] {
        &self.type_defs
    }

    /// Whether xrep should print the loaded type definitions and then exit.
    pub fn type_list(&self) -> bool {
        self.type_list
    }

    /// Build a recursive directory iterator rooted at the given path.
    ///
    /// The iterator is configured with the symlink, hidden-file, ignore-file,
    /// file type and glob override settings from the command line.
    pub fn walker(&self, path: &Path) -> walk::Iter {
        let walkdir = WalkDir::new(path).follow_links(self.follow);
        let mut ignore = Ignore::new();
        ignore.ignore_hidden(!self.hidden);
        ignore.no_ignore(self.no_ignore);
        ignore.add_types(self.types.clone());
        match self.glob_overrides {
            Some(ref overrides) => {
                ignore.add_override(overrides.clone());
            }
            None => {}
        }
        walk::Iter::new(ignore, walkdir)
    }
}
/// Returns the xrep version as `MAJOR.MINOR.PATCH`.
///
/// The components are read at compile time from the `CARGO_PKG_VERSION_*`
/// environment variables. If any of them was not set during compilation,
/// the empty string is returned instead.
fn version() -> String {
    let major = option_env!("CARGO_PKG_VERSION_MAJOR");
    let minor = option_env!("CARGO_PKG_VERSION_MINOR");
    let patch = option_env!("CARGO_PKG_VERSION_PATCH");
    if let (Some(major), Some(minor), Some(patch)) = (major, minor, patch) {
        [major, minor, patch].join(".")
    } else {
        String::new()
    }
}
/// A single state in the state machine used by `unescape`.
#[derive(Clone, Copy, Eq, PartialEq)]
enum State {
    /// A backslash was just seen.
    Escape,
    /// `\x` was seen; waiting for the first hex digit.
    HexFirst,
    /// `\x` and the first hex digit (carried here) were seen.
    HexSecond(char),
    /// Not inside any escape sequence.
    Literal,
}

/// Unescapes a string given on the command line. It supports a limited set
/// of escape sequences:
///
/// * \t, \r and \n are mapped to their corresponding ASCII bytes.
/// * \xZZ hexadecimal escapes are mapped to their byte.
///
/// Anything that is not a complete, recognized escape sequence is copied to
/// the output unchanged, including the backslash that introduced it.
fn unescape(s: &str) -> Vec<u8> {
    use self::State::*;

    /// Appends the UTF-8 encoding of `c` to `bytes`.
    fn push_char(bytes: &mut Vec<u8>, c: char) {
        bytes.extend_from_slice(c.to_string().as_bytes());
    }

    let mut out = Vec::new();
    let mut state = Literal;
    for ch in s.chars() {
        // Each arm emits whatever output the transition requires and
        // evaluates to the next state.
        state = match (state, ch) {
            (Literal, '\\') => Escape,
            (Literal, _) => {
                push_char(&mut out, ch);
                Literal
            }
            (Escape, 'n') => {
                out.push(b'\n');
                Literal
            }
            (Escape, 'r') => {
                out.push(b'\r');
                Literal
            }
            (Escape, 't') => {
                out.push(b'\t');
                Literal
            }
            (Escape, 'x') => HexFirst,
            (Escape, _) => {
                // Unknown escape: keep the backslash and the character.
                out.push(b'\\');
                push_char(&mut out, ch);
                Literal
            }
            (HexFirst, _) if ch.is_digit(16) => HexSecond(ch),
            (HexFirst, _) => {
                out.extend_from_slice(b"\\x");
                push_char(&mut out, ch);
                Literal
            }
            (HexSecond(first), _) => {
                match (first.to_digit(16), ch.to_digit(16)) {
                    (Some(hi), Some(lo)) => {
                        // Both nibbles are below 16, so the value fits in
                        // a byte and the cast cannot truncate.
                        out.push(((hi << 4) | lo) as u8);
                    }
                    _ => {
                        // Incomplete hex escape: emit it as written.
                        out.extend_from_slice(b"\\x");
                        push_char(&mut out, first);
                        push_char(&mut out, ch);
                    }
                }
                Literal
            }
        };
    }
    // The input ended in the middle of an escape sequence: emit the
    // characters consumed so far as written.
    match state {
        Literal => {}
        Escape => out.push(b'\\'),
        HexFirst => out.extend_from_slice(b"\\x"),
        HexSecond(first) => {
            out.extend_from_slice(b"\\x");
            push_char(&mut out, first);
        }
    }
    out
}
#[cfg(test)]
mod tests {
    use super::unescape;

    /// Asserts that unescaping `input` yields exactly the bytes `expected`.
    fn check(expected: &[u8], input: &str) {
        assert_eq!(expected.to_vec(), unescape(input));
    }

    // Recognized escape sequences.

    #[test]
    fn unescape_nul() {
        check(b"\x00", r"\x00");
    }

    #[test]
    fn unescape_nl() {
        check(b"\n", r"\n");
    }

    #[test]
    fn unescape_tab() {
        check(b"\t", r"\t");
    }

    #[test]
    fn unescape_carriage() {
        check(b"\r", r"\r");
    }

    // Unrecognized or incomplete sequences pass through unchanged.

    #[test]
    fn unescape_nothing_simple() {
        check(b"\\a", r"\a");
    }

    #[test]
    fn unescape_nothing_hex0() {
        check(b"\\x", r"\x");
    }

    #[test]
    fn unescape_nothing_hex1() {
        check(b"\\xz", r"\xz");
    }

    #[test]
    fn unescape_nothing_hex2() {
        check(b"\\xzz", r"\xzz");
    }
}

View File

@ -79,6 +79,7 @@ impl From<io::Error> for Error {
} }
/// Gitignore is a matcher for the glob patterns in a single gitignore file. /// Gitignore is a matcher for the glob patterns in a single gitignore file.
#[derive(Clone, Debug)]
pub struct Gitignore { pub struct Gitignore {
set: glob::Set, set: glob::Set,
root: PathBuf, root: PathBuf,
@ -136,23 +137,27 @@ impl Gitignore {
pub fn matched_utf8(&self, path: &str, is_dir: bool) -> Match { pub fn matched_utf8(&self, path: &str, is_dir: bool) -> Match {
// A single regex with a bunch of alternations of glob patterns is // A single regex with a bunch of alternations of glob patterns is
// unfortunately typically faster than a regex set, so we use it as a // unfortunately typically faster than a regex set, so we use it as a
// first pass filter. We still need to run the RegexSet to most // first pass filter. We still need to run the RegexSet to get the most
// recently defined glob that matched. // recently defined glob that matched.
if !self.set.is_match(path) { if !self.set.is_match(path) {
return Match::None; return Match::None;
} }
let pat = match self.set.matches(path).iter().last() { // The regex set can't actually pick the right glob that matched all
None => return Match::None, // on its own. In particular, some globs require that only directories
Some(i) => &self.patterns[i], // can match. Thus, only accept a match from the regex set if the given
}; // path satisfies the corresponding glob's directory criteria.
if pat.whitelist { for i in self.set.matches(path).iter().rev() {
Match::Whitelist(&pat) let pat = &self.patterns[i];
} else if !pat.only_dir || is_dir { if !pat.only_dir || is_dir {
Match::Ignored(&pat) return if pat.whitelist {
Match::Whitelist(pat)
} else { } else {
Match::None Match::Ignored(pat)
};
} }
} }
Match::None
}
} }
/// The result of a glob match. /// The result of a glob match.
@ -177,6 +182,24 @@ impl<'a> Match<'a> {
Match::None | Match::Whitelist(_) => false, Match::None | Match::Whitelist(_) => false,
} }
} }
/// Returns true if the match result didn't match any globs.
pub fn is_none(&self) -> bool {
match *self {
Match::None => true,
Match::Ignored(_) | Match::Whitelist(_) => false,
}
}
/// Inverts the match so that Ignored becomes Whitelisted and Whitelisted
/// becomes Ignored. A non-match remains the same.
pub fn invert(self) -> Match<'a> {
match self {
Match::None => Match::None,
Match::Ignored(pat) => Match::Whitelist(pat),
Match::Whitelist(pat) => Match::Ignored(pat),
}
}
} }
/// GitignoreBuilder constructs a matcher for a single set of globs from a /// GitignoreBuilder constructs a matcher for a single set of globs from a
@ -231,7 +254,6 @@ impl GitignoreBuilder {
/// Add each pattern line from the file path given. /// Add each pattern line from the file path given.
pub fn add_path<P: AsRef<Path>>(&mut self, path: P) -> Result<(), Error> { pub fn add_path<P: AsRef<Path>>(&mut self, path: P) -> Result<(), Error> {
let rdr = io::BufReader::new(try!(File::open(&path))); let rdr = io::BufReader::new(try!(File::open(&path)));
// println!("adding ignores from: {}", path.as_ref().display());
for line in rdr.lines() { for line in rdr.lines() {
try!(self.add(&path, &try!(line))); try!(self.add(&path, &try!(line)));
} }

View File

@ -77,6 +77,8 @@ impl Set {
/// Returns every glob pattern (by sequence number) that matches the given /// Returns every glob pattern (by sequence number) that matches the given
/// path. /// path.
pub fn matches<T: AsRef<[u8]>>(&self, path: T) -> SetMatches { pub fn matches<T: AsRef<[u8]>>(&self, path: T) -> SetMatches {
// TODO(burntsushi): If we split this out into a separate crate, don't
// expose the regex::SetMatches type in the public API.
self.set.matches(path.as_ref()) self.set.matches(path.as_ref())
} }

View File

@ -18,6 +18,7 @@ use std::fmt;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use gitignore::{self, Gitignore, GitignoreBuilder, Match}; use gitignore::{self, Gitignore, GitignoreBuilder, Match};
use types::Types;
/// Represents an error that can occur when parsing a gitignore file. /// Represents an error that can occur when parsing a gitignore file.
#[derive(Debug)] #[derive(Debug)]
@ -56,7 +57,13 @@ pub struct Ignore {
/// A stack of ignore patterns at each directory level of traversal. /// A stack of ignore patterns at each directory level of traversal.
/// A directory that contributes no ignore patterns is `None`. /// A directory that contributes no ignore patterns is `None`.
stack: Vec<Option<IgnoreDir>>, stack: Vec<Option<IgnoreDir>>,
/// A set of override globs that are always checked first. A match (whether
/// it's whitelist or blacklist) trumps anything in stack.
overrides: Option<Gitignore>,
/// A file type matcher.
types: Option<Types>,
ignore_hidden: bool, ignore_hidden: bool,
no_ignore: bool,
} }
impl Ignore { impl Ignore {
@ -64,7 +71,10 @@ impl Ignore {
pub fn new() -> Ignore { pub fn new() -> Ignore {
Ignore { Ignore {
stack: vec![], stack: vec![],
overrides: None,
types: None,
ignore_hidden: true, ignore_hidden: true,
no_ignore: false,
} }
} }
@ -74,11 +84,34 @@ impl Ignore {
self self
} }
/// When set, ignore files are ignored.
pub fn no_ignore(&mut self, yes: bool) -> &mut Ignore {
self.no_ignore = yes;
self
}
/// Add a set of globs that overrides all other match logic.
pub fn add_override(&mut self, gi: Gitignore) -> &mut Ignore {
self.overrides = Some(gi);
self
}
/// Add a file type matcher. The file type matcher has the lowest
/// precedence.
pub fn add_types(&mut self, types: Types) -> &mut Ignore {
self.types = Some(types);
self
}
/// Add a directory to the stack. /// Add a directory to the stack.
/// ///
/// Note that even if this returns an error, the directory is added to the /// Note that even if this returns an error, the directory is added to the
/// stack (and therefore should be popped). /// stack (and therefore should be popped).
pub fn push<P: AsRef<Path>>(&mut self, path: P) -> Result<(), Error> { pub fn push<P: AsRef<Path>>(&mut self, path: P) -> Result<(), Error> {
if self.no_ignore {
self.stack.push(None);
return Ok(());
}
match IgnoreDir::new(path) { match IgnoreDir::new(path) {
Ok(id) => { Ok(id) => {
self.stack.push(id); self.stack.push(id);
@ -102,24 +135,57 @@ impl Ignore {
/// Returns true if and only if the given file path should be ignored. /// Returns true if and only if the given file path should be ignored.
pub fn ignored<P: AsRef<Path>>(&self, path: P, is_dir: bool) -> bool { pub fn ignored<P: AsRef<Path>>(&self, path: P, is_dir: bool) -> bool {
let path = path.as_ref(); let path = path.as_ref();
if let Some(ref overrides) = self.overrides {
let mat = overrides.matched(path, is_dir).invert();
if let Some(is_ignored) = self.ignore_match(path, mat) {
return is_ignored;
}
}
if self.ignore_hidden && is_hidden(&path) { if self.ignore_hidden && is_hidden(&path) {
debug!("{} ignored because it is hidden", path.display());
return true; return true;
} }
for id in self.stack.iter().rev().filter_map(|id| id.as_ref()) { for id in self.stack.iter().rev().filter_map(|id| id.as_ref()) {
match id.matched(path, is_dir) { let mat = id.matched(path, is_dir);
Match::Whitelist(ref pat) => { if let Some(is_ignored) = self.ignore_match(path, mat) {
debug!("{} whitelisted by {:?}", path.display(), pat); if is_ignored {
return false;
}
Match::Ignored(ref pat) => {
debug!("{} ignored by {:?}", path.display(), pat);
return true; return true;
} }
Match::None => {} // If this path is whitelisted by an ignore, then fallthrough
// and let the file type matcher have a say.
break;
}
}
if let Some(ref types) = self.types {
let mat = types.matched(path, is_dir);
if let Some(is_ignored) = self.ignore_match(path, mat) {
return is_ignored;
} }
} }
false false
} }
/// Returns true if the given match says the given pattern should be
/// ignored or false if the given pattern should be explicitly whitelisted.
/// Returns None otherwise.
pub fn ignore_match<P: AsRef<Path>>(
&self,
path: P,
mat: Match,
) -> Option<bool> {
let path = path.as_ref();
match mat {
Match::Whitelist(ref pat) => {
debug!("{} whitelisted by {:?}", path.display(), pat);
Some(false)
}
Match::Ignored(ref pat) => {
debug!("{} ignored by {:?}", path.display(), pat);
Some(true)
}
Match::None => None,
}
}
} }
/// IgnoreDir represents a set of ignore patterns retrieved from a single /// IgnoreDir represents a set of ignore patterns retrieved from a single

View File

@ -19,7 +19,6 @@ extern crate rustc_serialize;
extern crate thread_local; extern crate thread_local;
extern crate walkdir; extern crate walkdir;
use std::cmp;
use std::error::Error; use std::error::Error;
use std::fs::File; use std::fs::File;
use std::io::{self, Write}; use std::io::{self, Write};
@ -30,14 +29,13 @@ use std::sync::Arc;
use std::thread; use std::thread;
use crossbeam::sync::chase_lev::{self, Steal, Stealer}; use crossbeam::sync::chase_lev::{self, Steal, Stealer};
use docopt::Docopt; use grep::Grep;
use grep::{Grep, GrepBuilder};
use parking_lot::Mutex; use parking_lot::Mutex;
use walkdir::WalkDir;
use ignore::Ignore; use args::Args;
use out::Out;
use printer::Printer; use printer::Printer;
use search::{InputBuffer, Searcher}; use search::InputBuffer;
macro_rules! errored { macro_rules! errored {
($($tt:tt)*) => { ($($tt:tt)*) => {
@ -52,64 +50,22 @@ macro_rules! eprintln {
}} }}
} }
mod args;
mod gitignore; mod gitignore;
mod glob; mod glob;
mod ignore; mod ignore;
mod out;
mod printer; mod printer;
mod search; mod search;
mod types;
mod walk; mod walk;
const USAGE: &'static str = "
Usage: xrep [options] <pattern> [<path> ...]
xrep --files [<path> ...]
xrep is like the silver searcher and grep, but faster than both.
WARNING: Searching stdin isn't yet supported.
Options:
-c, --count Suppress normal output and show count of line
matches.
-A, --after-context NUM Show NUM lines after each match.
-B, --before-context NUM Show NUM lines before each match.
-C, --context NUM Show NUM lines before and after each match.
--debug Show debug messages.
--files Print each file that would be searched
(but don't search).
--hidden Search hidden directories and files.
-i, --ignore-case Case insensitive search.
-L, --follow Follow symlinks.
-n, --line-number Show line numbers (1-based).
-t, --threads ARG The number of threads to use. Defaults to the
number of logical CPUs. [default: 0]
-v, --invert-match Invert matching.
";
#[derive(RustcDecodable)]
struct Args {
arg_pattern: String,
arg_path: Vec<String>,
flag_after_context: usize,
flag_before_context: usize,
flag_context: usize,
flag_count: bool,
flag_debug: bool,
flag_files: bool,
flag_follow: bool,
flag_hidden: bool,
flag_ignore_case: bool,
flag_invert_match: bool,
flag_line_number: bool,
flag_threads: usize,
}
pub type Result<T> = result::Result<T, Box<Error + Send + Sync>>; pub type Result<T> = result::Result<T, Box<Error + Send + Sync>>;
fn main() { fn main() {
let args: Args = Docopt::new(USAGE).and_then(|d| d.decode()) match Args::parse().and_then(run) {
.unwrap_or_else(|e| e.exit()); Ok(count) if count == 0 => process::exit(1),
match run(args) { Ok(count) => process::exit(0),
Ok(_) => process::exit(0),
Err(err) => { Err(err) => {
let _ = writeln!(&mut io::stderr(), "{}", err); let _ = writeln!(&mut io::stderr(), "{}", err);
process::exit(1); process::exit(1);
@ -117,194 +73,158 @@ fn main() {
} }
} }
fn run(mut args: Args) -> Result<()> { fn run(args: Args) -> Result<u64> {
let mut logb = env_logger::LogBuilder::new(); if args.files() {
if args.flag_debug {
logb.filter(None, log::LogLevelFilter::Debug);
} else {
logb.filter(None, log::LogLevelFilter::Warn);
}
if let Err(err) = logb.init() {
errored!("failed to initialize logger: {}", err);
}
if args.arg_path.is_empty() {
args.arg_path.push("./".to_string());
}
if args.arg_path.iter().any(|p| p == "-") {
errored!("searching <stdin> isn't yet supported");
}
if args.flag_files {
return run_files(args); return run_files(args);
} }
if args.type_list() {
return run_types(args);
}
let args = Arc::new(args); let args = Arc::new(args);
let out = Arc::new(Mutex::new(args.out(io::stdout())));
let mut workers = vec![]; let mut workers = vec![];
let out = Arc::new(Mutex::new(Out::new(args.clone(), io::stdout())));
let mut chan_work_send = { let mut workq = {
let (worker, stealer) = chase_lev::deque(); let (workq, stealer) = chase_lev::deque();
for _ in 0..args.num_workers() { for _ in 0..args.threads() {
let grepb =
GrepBuilder::new(&args.arg_pattern)
.case_insensitive(args.flag_ignore_case);
let worker = Worker { let worker = Worker {
args: args.clone(), args: args.clone(),
out: out.clone(), out: out.clone(),
chan_work: stealer.clone(), chan_work: stealer.clone(),
inpbuf: InputBuffer::new(), inpbuf: args.input_buffer(),
outbuf: Some(vec![]), outbuf: Some(vec![]),
grep: try!(grepb.build()), grep: try!(args.grep()),
}; };
workers.push(thread::spawn(move || worker.run())); workers.push(thread::spawn(move || worker.run()));
} }
worker workq
}; };
for p in args.paths() {
for p in &args.arg_path { if p == Path::new("-") {
workq.push(Work::Stdin)
} else {
for path in args.walker(p) { for path in args.walker(p) {
chan_work_send.push(Message::Some(path)); workq.push(Work::File(path));
}
} }
} }
for _ in 0..workers.len() { for _ in 0..workers.len() {
chan_work_send.push(Message::Quit); workq.push(Work::Quit);
} }
let mut match_count = 0;
for worker in workers { for worker in workers {
worker.join().unwrap(); match_count += worker.join().unwrap();
} }
Ok(()) Ok(match_count)
} }
fn run_files(args: Args) -> Result<()> { fn run_files(args: Args) -> Result<u64> {
let mut printer = Printer::new(io::BufWriter::new(io::stdout())); let mut printer = Printer::new(io::BufWriter::new(io::stdout()));
for p in &args.arg_path { let mut file_count = 0;
for p in args.paths() {
if p == Path::new("-") {
printer.path(&Path::new("<stdin>"));
file_count += 1;
} else {
for path in args.walker(p) { for path in args.walker(p) {
printer.path(path); printer.path(path);
file_count += 1;
} }
} }
Ok(()) }
Ok(file_count)
} }
impl Args { fn run_types(args: Args) -> Result<u64> {
fn printer<W: io::Write>(&self, wtr: W) -> Printer<W> { let mut printer = Printer::new(io::BufWriter::new(io::stdout()));
Printer::new(wtr) let mut ty_count = 0;
} for def in args.type_defs() {
printer.type_def(def);
fn num_workers(&self) -> usize { ty_count += 1;
let mut num = self.flag_threads;
if num == 0 {
num = cmp::min(8, num_cpus::get());
}
num
}
fn walker<P: AsRef<Path>>(&self, path: P) -> walk::Iter {
let wd = WalkDir::new(path).follow_links(self.flag_follow);
let mut ig = Ignore::new();
ig.ignore_hidden(!self.flag_hidden);
walk::Iter::new(ig, wd)
}
fn before_context(&self) -> usize {
if self.flag_context > 0 {
self.flag_context
} else {
self.flag_before_context
}
}
fn after_context(&self) -> usize {
if self.flag_context > 0 {
self.flag_context
} else {
self.flag_after_context
}
}
fn has_context(&self) -> bool {
self.before_context() > 0 || self.after_context() > 0
} }
Ok(ty_count)
} }
enum Message<T> { enum Work {
Some(T), File(PathBuf),
Stdin,
Quit, Quit,
} }
struct Worker { struct Worker {
args: Arc<Args>, args: Arc<Args>,
out: Arc<Mutex<Out<io::Stdout>>>, out: Arc<Mutex<Out<io::Stdout>>>,
chan_work: Stealer<Message<PathBuf>>, chan_work: Stealer<Work>,
inpbuf: InputBuffer, inpbuf: InputBuffer,
outbuf: Option<Vec<u8>>, outbuf: Option<Vec<u8>>,
grep: Grep, grep: Grep,
} }
impl Worker { impl Worker {
fn run(mut self) { fn run(mut self) -> u64 {
let mut match_count = 0;
loop { loop {
let path = match self.chan_work.steal() { let (path, file) = match self.chan_work.steal() {
Steal::Empty | Steal::Abort => continue, Steal::Empty | Steal::Abort => continue,
Steal::Data(Message::Quit) => break, Steal::Data(Work::Quit) => break,
Steal::Data(Message::Some(path)) => path, Steal::Data(Work::File(path)) => {
}; match File::open(&path) {
let file = match File::open(&path) { Ok(file) => (path, Some(file)),
Ok(file) => file,
Err(err) => { Err(err) => {
eprintln!("{}: {}", path.display(), err); eprintln!("{}: {}", path.display(), err);
continue; continue;
} }
}
}
Steal::Data(Work::Stdin) => {
(Path::new("<stdin>").to_path_buf(), None)
}
}; };
let mut outbuf = self.outbuf.take().unwrap(); let mut outbuf = self.outbuf.take().unwrap();
outbuf.clear(); outbuf.clear();
let mut printer = self.args.printer(outbuf); let mut printer = self.args.printer(outbuf);
{ {
let mut searcher = Searcher::new( let result = match file {
&mut self.inpbuf, None => {
&mut printer, let stdin = io::stdin();
&self.grep, let stdin = stdin.lock();
&path, self.search(&mut printer, &path, stdin)
file, }
); Some(file) => {
searcher = searcher.count(self.args.flag_count); self.search(&mut printer, &path, file)
searcher = searcher.line_number(self.args.flag_line_number); }
searcher = searcher.invert_match(self.args.flag_invert_match); };
searcher = searcher.after_context(self.args.after_context()); match result {
searcher = searcher.before_context(self.args.before_context()); Ok(count) => {
if let Err(err) = searcher.run() { match_count += count;
}
Err(err) => {
eprintln!("{}", err); eprintln!("{}", err);
} }
} }
}
let outbuf = printer.into_inner(); let outbuf = printer.into_inner();
if !outbuf.is_empty() { if !outbuf.is_empty() {
let mut out = self.out.lock(); let mut out = self.out.lock();
out.write_file_matches(&outbuf); out.write(&outbuf);
} }
self.outbuf = Some(outbuf); self.outbuf = Some(outbuf);
} }
} match_count
}
struct Out<W: io::Write> {
args: Arc<Args>,
wtr: io::BufWriter<W>,
printed: bool,
}
impl<W: io::Write> Out<W> {
fn new(args: Arc<Args>, wtr: W) -> Out<W> {
Out {
args: args,
wtr: io::BufWriter::new(wtr),
printed: false,
}
} }
fn write_file_matches(&mut self, buf: &[u8]) { fn search<R: io::Read, W: io::Write>(
if self.printed && self.args.has_context() { &mut self,
let _ = self.wtr.write_all(b"--\n"); printer: &mut Printer<W>,
} path: &Path,
let _ = self.wtr.write_all(buf); rdr: R,
let _ = self.wtr.flush(); ) -> Result<u64> {
self.printed = true; self.args.searcher(
&mut self.inpbuf,
printer,
&self.grep,
path,
rdr,
).run().map_err(From::from)
} }
} }

45
src/out.rs Normal file
View File

@ -0,0 +1,45 @@
use std::io::{self, Write};
/// Out is the final stage of output: it receives the complete search results
/// of one file at a time and hands them to the end user.
///
/// (A Printer deals with individual search results, whereas Out deals with
/// the results of a file as a whole. That is why Out, not Printer, decides
/// when a file separator is printed.)
pub struct Out<W: io::Write> {
    /// Buffered handle to the underlying writer.
    wtr: io::BufWriter<W>,
    /// Whether any results have been written yet.
    printed: bool,
    /// Separator printed between results of different files; empty means
    /// no separator.
    file_separator: Vec<u8>,
}

impl<W: io::Write> Out<W> {
    /// Create a new Out on top of the given writer, with no file separator.
    pub fn new(wtr: W) -> Out<W> {
        let wtr = io::BufWriter::new(wtr);
        Out {
            wtr: wtr,
            printed: false,
            file_separator: Vec::new(),
        }
    }

    /// Set the separator printed between matches from different files.
    ///
    /// No separator is printed by default, nor when sep is empty.
    pub fn file_separator(self, sep: Vec<u8>) -> Out<W> {
        Out { file_separator: sep, ..self }
    }

    /// Write the search results of a single file to the underlying writer
    /// and flush it.
    ///
    /// From the second call onwards, a non-empty file separator followed by
    /// a newline is written first. Write errors are ignored.
    pub fn write(&mut self, buf: &[u8]) {
        let needs_separator = self.printed && !self.file_separator.is_empty();
        if needs_separator {
            let _ = self.wtr.write_all(&self.file_separator);
            let _ = self.wtr.write_all(b"\n");
        }
        let _ = self.wtr.write_all(buf);
        let _ = self.wtr.flush();
        self.printed = true;
    }
}

View File

@ -1,53 +1,121 @@
use std::io; use std::io;
use std::path::Path; use std::path::Path;
macro_rules! wln { use types::FileTypeDef;
($($tt:tt)*) => {
let _ = writeln!($($tt)*);
}
}
macro_rules! w {
($($tt:tt)*) => {
let _ = write!($($tt)*);
}
}
/// Printer encapsulates all output logic for searching.
///
/// Note that we currently ignore all write errors. It's probably worthwhile
/// to fix this, but printers are only ever used for writes to stdout or
/// writes to memory, neither of which commonly fail.
pub struct Printer<W> { pub struct Printer<W> {
/// The underlying writer.
wtr: W, wtr: W,
/// Whether anything has been printed to wtr yet.
has_printed: bool, has_printed: bool,
/// The string to use to separate non-contiguous runs of context lines.
context_separator: Vec<u8>,
/// The end-of-line terminator used by the printer. In general, eols are
/// printed via the match directly, but occasionally we need to insert them
/// ourselves (for example, to print a context separator).
eol: u8,
/// Whether to suppress all output.
quiet: bool,
/// Whether to prefix each match with the corresponding file name.
with_filename: bool,
} }
impl<W: io::Write> Printer<W> { impl<W: io::Write> Printer<W> {
/// Create a new printer that writes to wtr.
pub fn new(wtr: W) -> Printer<W> { pub fn new(wtr: W) -> Printer<W> {
Printer { Printer {
wtr: wtr, wtr: wtr,
has_printed: false, has_printed: false,
context_separator: "--".to_string().into_bytes(),
eol: b'\n',
quiet: false,
with_filename: false,
} }
} }
/// Set the context separator. The default is `--`.
pub fn context_separator(mut self, sep: Vec<u8>) -> Printer<W> {
self.context_separator = sep;
self
}
/// Set the end-of-line terminator. The default is `\n`.
pub fn eol(mut self, eol: u8) -> Printer<W> {
self.eol = eol;
self
}
/// When set, all output is suppressed.
pub fn quiet(mut self, yes: bool) -> Printer<W> {
self.quiet = yes;
self
}
/// When set, each match is prefixed with the file name that it came from.
pub fn with_filename(mut self, yes: bool) -> Printer<W> {
self.with_filename = yes;
self
}
/// Returns true if and only if something has been printed.
pub fn has_printed(&self) -> bool { pub fn has_printed(&self) -> bool {
self.has_printed self.has_printed
} }
pub fn into_inner(self) -> W { /// Flushes the underlying writer and returns it.
pub fn into_inner(mut self) -> W {
let _ = self.wtr.flush();
self.wtr self.wtr
} }
/// Prints a file type definition on a single line: the type's name, then
/// `: `, then every pattern of the type separated by `, `, then the
/// end-of-line terminator.
pub fn type_def(&mut self, def: &FileTypeDef) {
    self.write(def.name().as_bytes());
    self.write(b": ");
    for (idx, pat) in def.patterns().into_iter().enumerate() {
        // Only the patterns after the first are preceded by a comma.
        if idx > 0 {
            self.write(b", ");
        }
        self.write(pat.as_bytes());
    }
    self.write_eol();
}
/// Prints the given path.
pub fn path<P: AsRef<Path>>(&mut self, path: P) { pub fn path<P: AsRef<Path>>(&mut self, path: P) {
wln!(&mut self.wtr, "{}", path.as_ref().display()); self.write(path.as_ref().to_string_lossy().as_bytes());
self.write_eol();
} }
/// Prints the given path and a count of the number of matches found.
pub fn path_count<P: AsRef<Path>>(&mut self, path: P, count: u64) { pub fn path_count<P: AsRef<Path>>(&mut self, path: P, count: u64) {
wln!(&mut self.wtr, "{}:{}", path.as_ref().display(), count); if self.with_filename {
self.write(path.as_ref().to_string_lossy().as_bytes());
self.write(b":");
}
self.write(count.to_string().as_bytes());
self.write_eol();
} }
pub fn count(&mut self, count: u64) { /// Prints the context separator.
wln!(&mut self.wtr, "{}", count); pub fn context_separate(&mut self) {
// N.B. We can't use `write` here because of borrowing restrictions.
if self.quiet {
return;
} }
if self.context_separator.is_empty() {
pub fn context_separator(&mut self) { return;
wln!(&mut self.wtr, "--"); }
self.has_printed = true;
let _ = self.wtr.write_all(&self.context_separator);
let _ = self.wtr.write_all(&[self.eol]);
} }
pub fn matched<P: AsRef<Path>>( pub fn matched<P: AsRef<Path>>(
@ -58,15 +126,17 @@ impl<W: io::Write> Printer<W> {
end: usize, end: usize,
line_number: Option<u64>, line_number: Option<u64>,
) { ) {
if self.with_filename {
self.write(path.as_ref().to_string_lossy().as_bytes()); self.write(path.as_ref().to_string_lossy().as_bytes());
self.write(b":"); self.write(b":");
}
if let Some(line_number) = line_number { if let Some(line_number) = line_number {
self.write(line_number.to_string().as_bytes()); self.write(line_number.to_string().as_bytes());
self.write(b":"); self.write(b":");
} }
self.write(&buf[start..end]); self.write(&buf[start..end]);
if buf[start..end].last() != Some(&b'\n') { if buf[start..end].last() != Some(&self.eol) {
self.write(b"\n"); self.write_eol();
} }
} }
@ -78,24 +148,30 @@ impl<W: io::Write> Printer<W> {
end: usize, end: usize,
line_number: Option<u64>, line_number: Option<u64>,
) { ) {
if self.with_filename {
self.write(path.as_ref().to_string_lossy().as_bytes()); self.write(path.as_ref().to_string_lossy().as_bytes());
self.write(b"-"); self.write(b"-");
}
if let Some(line_number) = line_number { if let Some(line_number) = line_number {
self.write(line_number.to_string().as_bytes()); self.write(line_number.to_string().as_bytes());
self.write(b"-"); self.write(b"-");
} }
self.write(&buf[start..end]); self.write(&buf[start..end]);
if buf[start..end].last() != Some(&b'\n') { if buf[start..end].last() != Some(&self.eol) {
self.write(b"\n"); self.write_eol();
} }
} }
pub fn binary_matched<P: AsRef<Path>>(&mut self, path: P) {
wln!(&mut self.wtr, "Binary file {} matches", path.as_ref().display());
}
fn write(&mut self, buf: &[u8]) { fn write(&mut self, buf: &[u8]) {
if self.quiet {
return;
}
self.has_printed = true; self.has_printed = true;
let _ = self.wtr.write_all(buf); let _ = self.wtr.write_all(buf);
} }
/// Writes the configured end-of-line byte through `write`.
fn write_eol(&mut self) {
    // Copy the byte out first so `self` is not borrowed while `write`
    // takes it mutably.
    let terminator = [self.eol];
    self.write(&terminator);
}
} }

View File

@ -20,6 +20,7 @@ const READ_SIZE: usize = 8 * (1<<10);
/// Error describes errors that can occur while searching. /// Error describes errors that can occur while searching.
#[derive(Debug)] #[derive(Debug)]
pub enum Error { pub enum Error {
/// A standard I/O error attached to a particular file path.
Io { Io {
err: io::Error, err: io::Error,
path: PathBuf, path: PathBuf,
@ -57,6 +58,7 @@ impl fmt::Display for Error {
} }
pub struct Searcher<'a, R, W: 'a> { pub struct Searcher<'a, R, W: 'a> {
opts: Options,
inp: &'a mut InputBuffer, inp: &'a mut InputBuffer,
printer: &'a mut Printer<W>, printer: &'a mut Printer<W>,
grep: &'a Grep, grep: &'a Grep,
@ -68,11 +70,32 @@ pub struct Searcher<'a, R, W: 'a> {
last_printed: usize, last_printed: usize,
last_line: usize, last_line: usize,
after_context_remaining: usize, after_context_remaining: usize,
}
/// Options for configuring search.
#[derive(Clone)]
struct Options {
after_context: usize,
before_context: usize,
count: bool, count: bool,
eol: u8,
invert_match: bool, invert_match: bool,
line_number: bool, line_number: bool,
before_context: usize, text: bool,
after_context: usize, }
impl Default for Options {
fn default() -> Options {
Options {
after_context: 0,
before_context: 0,
count: false,
eol: b'\n',
invert_match: false,
line_number: false,
text: false,
}
}
} }
impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> { impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
@ -96,6 +119,7 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
haystack: R, haystack: R,
) -> Searcher<'a, R, W> { ) -> Searcher<'a, R, W> {
Searcher { Searcher {
opts: Options::default(),
inp: inp, inp: inp,
printer: printer, printer: printer,
grep: grep, grep: grep,
@ -107,47 +131,54 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
last_printed: 0, last_printed: 0,
last_line: 0, last_line: 0,
after_context_remaining: 0, after_context_remaining: 0,
count: false,
invert_match: false,
line_number: false,
before_context: 0,
after_context: 0,
} }
} }
/// If enabled, searching will print a count instead of each match. /// The number of contextual lines to show after each match. The default
/// /// is zero.
/// Disabled by default. pub fn after_context(mut self, count: usize) -> Self {
pub fn count(mut self, yes: bool) -> Self { self.opts.after_context = count;
self.count = yes;
self
}
/// If enabled, matching is inverted so that lines that *don't* match the
/// given pattern are treated as matches.
pub fn invert_match(mut self, yes: bool) -> Self {
self.invert_match = yes;
self
}
/// If enabled, compute line numbers and prefix each line of output with
/// them.
pub fn line_number(mut self, yes: bool) -> Self {
self.line_number = yes;
self self
} }
/// The number of contextual lines to show before each match. The default /// The number of contextual lines to show before each match. The default
/// is zero. /// is zero.
pub fn before_context(mut self, count: usize) -> Self { pub fn before_context(mut self, count: usize) -> Self {
self.before_context = count; self.opts.before_context = count;
self self
} }
/// The number of contextual lines to show after each match. The default /// If enabled, searching will print a count instead of each match.
/// is zero. ///
pub fn after_context(mut self, count: usize) -> Self { /// Disabled by default.
self.after_context = count; pub fn count(mut self, yes: bool) -> Self {
self.opts.count = yes;
self
}
/// Set the end-of-line byte used by this searcher.
///
/// The searcher uses this byte as the line terminator when it iterates over
/// lines, when it locates the start of preceding lines and when it counts
/// lines for line numbers. The default is `\n`.
///
/// NOTE(review): the InputBuffer carries its own terminator, set through
/// its own `eol` method; presumably callers keep the two in sync — confirm.
pub fn eol(mut self, eol: u8) -> Self {
self.opts.eol = eol;
self
}
/// If enabled, matching is inverted so that lines that *don't* match the
/// given pattern are treated as matches.
pub fn invert_match(mut self, yes: bool) -> Self {
self.opts.invert_match = yes;
self
}
/// If enabled, compute line numbers and prefix each line of output with
/// them.
pub fn line_number(mut self, yes: bool) -> Self {
self.opts.line_number = yes;
self
}
/// If enabled, search binary files as if they were text.
pub fn text(mut self, yes: bool) -> Self {
self.opts.text = yes;
self self
} }
@ -157,16 +188,16 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
pub fn run(mut self) -> Result<u64, Error> { pub fn run(mut self) -> Result<u64, Error> {
self.inp.reset(); self.inp.reset();
self.match_count = 0; self.match_count = 0;
self.line_count = if self.line_number { Some(0) } else { None }; self.line_count = if self.opts.line_number { Some(0) } else { None };
self.last_match = Match::default(); self.last_match = Match::default();
self.after_context_remaining = 0; self.after_context_remaining = 0;
loop { loop {
let upto = self.inp.lastnl; let upto = self.inp.lastnl;
self.print_after_context(upto); self.print_after_context(upto);
if !try!(self.fill()) { if !try!(self.fill()) {
if self.inp.is_binary { break;
self.printer.binary_matched(self.path);
} }
if !self.opts.text && self.inp.is_binary {
break; break;
} }
while self.inp.pos < self.inp.lastnl { while self.inp.pos < self.inp.lastnl {
@ -174,7 +205,7 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
&mut self.last_match, &mut self.last_match,
&mut self.inp.buf[..self.inp.lastnl], &mut self.inp.buf[..self.inp.lastnl],
self.inp.pos); self.inp.pos);
if self.invert_match { if self.opts.invert_match {
let upto = let upto =
if matched { if matched {
self.last_match.start() self.last_match.start()
@ -189,7 +220,7 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
} }
} else if matched { } else if matched {
self.match_count += 1; self.match_count += 1;
if !self.count { if !self.opts.count {
let start = self.last_match.start(); let start = self.last_match.start();
let end = self.last_match.end(); let end = self.last_match.end();
self.print_after_context(start); self.print_after_context(start);
@ -204,32 +235,36 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
} }
} }
} }
if self.count && self.match_count > 0 { if self.opts.count && self.match_count > 0 {
self.printer.path_count(self.path, self.match_count); self.printer.path_count(self.path, self.match_count);
} }
Ok(self.match_count) Ok(self.match_count)
} }
#[inline(always)]
fn fill(&mut self) -> Result<bool, Error> { fn fill(&mut self) -> Result<bool, Error> {
let mut keep_from = self.inp.lastnl; let mut keep = self.inp.lastnl;
if self.before_context > 0 || self.after_context > 0 { if self.opts.before_context > 0 || self.opts.after_context > 0 {
keep_from = start_of_previous_lines( let lines = 1 + cmp::max(
self.opts.before_context, self.opts.after_context);
keep = start_of_previous_lines(
self.opts.eol,
&self.inp.buf, &self.inp.buf,
self.inp.lastnl.saturating_sub(1), self.inp.lastnl.saturating_sub(1),
cmp::max(self.before_context, self.after_context) + 1); lines);
} }
if keep_from < self.last_printed { if keep < self.last_printed {
self.last_printed = self.last_printed - keep_from; self.last_printed = self.last_printed - keep;
} else { } else {
self.last_printed = 0; self.last_printed = 0;
} }
if keep_from <= self.last_line { if keep <= self.last_line {
self.last_line = self.last_line - keep_from; self.last_line = self.last_line - keep;
} else { } else {
self.count_lines(keep_from); self.count_lines(keep);
self.last_line = 0; self.last_line = 0;
} }
let ok = try!(self.inp.fill(&mut self.haystack, keep_from).map_err(|err| { let ok = try!(self.inp.fill(&mut self.haystack, keep).map_err(|err| {
Error::from_io(err, &self.path) Error::from_io(err, &self.path)
})); }));
Ok(ok) Ok(ok)
@ -237,10 +272,10 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
#[inline(always)] #[inline(always)]
fn print_inverted_matches(&mut self, upto: usize) { fn print_inverted_matches(&mut self, upto: usize) {
debug_assert!(self.invert_match); debug_assert!(self.opts.invert_match);
let mut it = IterLines::new(self.inp.pos); let mut it = IterLines::new(self.opts.eol, self.inp.pos);
while let Some((start, end)) = it.next(&self.inp.buf[..upto]) { while let Some((start, end)) = it.next(&self.inp.buf[..upto]) {
if !self.count { if !self.opts.count {
self.print_match(start, end); self.print_match(start, end);
} }
self.inp.pos = end; self.inp.pos = end;
@ -250,7 +285,7 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
#[inline(always)] #[inline(always)]
fn print_before_context(&mut self, upto: usize) { fn print_before_context(&mut self, upto: usize) {
if self.count || self.before_context == 0 { if self.opts.count || self.opts.before_context == 0 {
return; return;
} }
let start = self.last_printed; let start = self.last_printed;
@ -260,10 +295,11 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
} }
let before_context_start = let before_context_start =
start + start_of_previous_lines( start + start_of_previous_lines(
self.opts.eol,
&self.inp.buf[start..], &self.inp.buf[start..],
end - start - 1, end - start - 1,
self.before_context); self.opts.before_context);
let mut it = IterLines::new(before_context_start); let mut it = IterLines::new(self.opts.eol, before_context_start);
while let Some((s, e)) = it.next(&self.inp.buf[..end]) { while let Some((s, e)) = it.next(&self.inp.buf[..end]) {
self.print_separator(s); self.print_separator(s);
self.print_context(s, e); self.print_context(s, e);
@ -272,12 +308,12 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
#[inline(always)] #[inline(always)]
fn print_after_context(&mut self, upto: usize) { fn print_after_context(&mut self, upto: usize) {
if self.count || self.after_context_remaining == 0 { if self.opts.count || self.after_context_remaining == 0 {
return; return;
} }
let start = self.last_printed; let start = self.last_printed;
let end = upto; let end = upto;
let mut it = IterLines::new(start); let mut it = IterLines::new(self.opts.eol, start);
while let Some((s, e)) = it.next(&self.inp.buf[..end]) { while let Some((s, e)) = it.next(&self.inp.buf[..end]) {
self.print_context(s, e); self.print_context(s, e);
self.after_context_remaining -= 1; self.after_context_remaining -= 1;
@ -295,7 +331,7 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
self.printer.matched( self.printer.matched(
&self.path, &self.inp.buf, start, end, self.line_count); &self.path, &self.inp.buf, start, end, self.line_count);
self.last_printed = end; self.last_printed = end;
self.after_context_remaining = self.after_context; self.after_context_remaining = self.opts.after_context;
} }
#[inline(always)] #[inline(always)]
@ -309,21 +345,23 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
#[inline(always)] #[inline(always)]
fn print_separator(&mut self, before: usize) { fn print_separator(&mut self, before: usize) {
if self.before_context == 0 && self.after_context == 0 { if self.opts.before_context == 0 && self.opts.after_context == 0 {
return; return;
} }
if !self.printer.has_printed() { if !self.printer.has_printed() {
return; return;
} }
if (self.last_printed == 0 && before > 0) || self.last_printed < before { if (self.last_printed == 0 && before > 0)
self.printer.context_separator(); || self.last_printed < before {
self.printer.context_separate();
} }
} }
#[inline(always)] #[inline(always)]
fn count_lines(&mut self, upto: usize) { fn count_lines(&mut self, upto: usize) {
if let Some(ref mut line_count) = self.line_count { if let Some(ref mut line_count) = self.line_count {
*line_count += count_lines(&self.inp.buf[self.last_line..upto]); *line_count += count_lines(
&self.inp.buf[self.last_line..upto], self.opts.eol);
self.last_line = upto; self.last_line = upto;
} }
} }
@ -337,15 +375,53 @@ impl<'a, R: io::Read, W: io::Write> Searcher<'a, R, W> {
} }
} }
/// InputBuffer encapsulates the logic of maintaining a ~fixed sized buffer
/// on which to search. There are three key pieces of complexity:
///
/// 1. We must be able to handle lines that are longer than the size of the
/// buffer. For this reason, the buffer is allowed to expand (and is
/// therefore not technically fixed). Note that once a buffer expands, it
/// will never contract.
/// 2. The contents of the buffer may end with a partial line, so we must keep
/// track of where the last complete line ends. Namely, the partial line
/// is only completed on subsequent reads *after* searching up through
/// the last complete line is done.
/// 3. When printing the context of a match, the last N lines of the buffer
/// may need to be rolled over into the next buffer. For example, a match
/// may occur at the beginning of a buffer, in which case, lines at the end
/// of the previous contents of the buffer need to be printed.
///
/// An InputBuffer is designed to be reused and isn't tied to any particular
/// reader.
pub struct InputBuffer { pub struct InputBuffer {
/// The number of bytes to attempt to read at a time. Once set, this is
/// never changed.
read_size: usize, read_size: usize,
/// The end-of-line terminator used in this buffer.
eol: u8,
/// A scratch buffer.
tmp: Vec<u8>,
/// A buffer to read bytes into. All searches are executed directly against
/// this buffer and pos/lastnl/end point into it.
buf: Vec<u8>, buf: Vec<u8>,
tmp1: Vec<u8>, /// The current position in buf. The current position represents where the
tmp2: Vec<u8>, /// next search should start.
pos: usize, pos: usize,
/// The position immediately following the last line terminator in buf.
/// This may be equal to end.
///
/// Searching should never cross this boundary. In particular, the contents
/// of the buffer following this position may correspond to a *partial*
/// line. All contents before this position are complete lines.
lastnl: usize, lastnl: usize,
/// The end position of the buffer. Data after this position is not
/// specified.
end: usize, end: usize,
/// Set to true if and only if no reads have occurred yet.
first: bool, first: bool,
/// Set to true if and only if the contents of buf are determined to be
/// "binary" (i.e., not searchable text). Note that its value may be
/// falsely negative *or* falsely positive. It is only a heuristic.
is_binary: bool, is_binary: bool,
} }
@ -367,9 +443,9 @@ impl InputBuffer {
} }
InputBuffer { InputBuffer {
read_size: cap, read_size: cap,
eol: b'\n',
buf: vec![0; cap], buf: vec![0; cap],
tmp1: vec![], tmp: vec![],
tmp2: vec![],
pos: 0, pos: 0,
lastnl: 0, lastnl: 0,
end: 0, end: 0,
@ -378,6 +454,12 @@ impl InputBuffer {
} }
} }
/// Set the end-of-line terminator used by this input buffer.
///
/// Unlike the builder-style setters elsewhere in this file, this updates
/// the buffer in place. `fill` uses the terminator to find the end of the
/// last complete line in freshly read data and, once the contents have been
/// flagged as binary, as the replacement for NUL bytes. A new buffer starts
/// out with `\n`.
pub fn eol(&mut self, eol: u8) {
self.eol = eol;
}
/// Resets this buffer so that it may be reused with a new reader.
fn reset(&mut self) { fn reset(&mut self) {
self.pos = 0; self.pos = 0;
self.lastnl = 0; self.lastnl = 0;
@ -386,36 +468,30 @@ impl InputBuffer {
self.is_binary = false; self.is_binary = false;
} }
/// Fill the contents of this buffer with the reader given. The reader
/// given should be the same in every call to fill unless reset has been
/// called.
///
/// The bytes in buf[keep_from..end] are rolled over into the beginning
/// of the buffer.
fn fill<R: io::Read>( fn fill<R: io::Read>(
&mut self, &mut self,
rdr: &mut R, rdr: &mut R,
keep_from: usize, keep_from: usize,
) -> Result<bool, io::Error> { ) -> Result<bool, io::Error> {
self.pos = 0; // Rollover bytes from buf[keep_from..end] and update our various
self.tmp1.clear(); // pointers. N.B. This could be done with the unsafe ptr::copy, but
self.tmp2.clear(); // I haven't been able to produce a benchmark that notices a difference
// in performance. (Invariably, ptr::copy is also clearer IMO.)
// Save the leftovers from the previous fill before anything else. self.tmp.clear();
if self.lastnl < self.end { self.tmp.extend_from_slice(&self.buf[keep_from..self.end]);
self.tmp1.extend_from_slice(&self.buf[self.lastnl..self.end]); self.buf[0..self.tmp.len()].copy_from_slice(&self.tmp);
} self.pos = self.lastnl - keep_from;
// If we need to save lines to account for context, do that here.
// These context lines have already been searched, but make up the
// first bytes of this buffer.
if keep_from < self.lastnl {
self.tmp2.extend_from_slice(&self.buf[keep_from..self.lastnl]);
self.buf[0..self.tmp2.len()].copy_from_slice(&self.tmp2);
self.pos = self.tmp2.len();
}
if !self.tmp1.is_empty() {
let (start, end) = (self.pos, self.pos + self.tmp1.len());
self.buf[start..end].copy_from_slice(&self.tmp1);
self.end = end;
} else {
self.end = self.pos;
}
self.lastnl = 0; self.lastnl = 0;
self.end = self.tmp.len();
while self.lastnl == 0 { while self.lastnl == 0 {
// If our buffer isn't big enough to hold the contents of a full
// read, expand it.
if self.buf.len() - self.end < self.read_size { if self.buf.len() - self.end < self.read_size {
let min_len = self.read_size + self.buf.len() - self.end; let min_len = self.read_size + self.buf.len() - self.end;
let new_len = cmp::max(min_len, self.buf.len() * 2); let new_len = cmp::max(min_len, self.buf.len() * 2);
@ -423,22 +499,28 @@ impl InputBuffer {
} }
let n = try!(rdr.read( let n = try!(rdr.read(
&mut self.buf[self.end..self.end + self.read_size])); &mut self.buf[self.end..self.end + self.read_size]));
if self.first { if self.first && is_binary(&self.buf[self.end..self.end + n]) {
if is_binary(&self.buf[self.end..self.end + n]) {
self.is_binary = true; self.is_binary = true;
return Ok(false);
} }
if self.is_binary {
replace_buf(
&mut self.buf[self.end..self.end + n], b'\x00', self.eol);
} }
self.first = false; self.first = false;
// We assume that reading 0 bytes means we've hit EOF.
if n == 0 { if n == 0 {
// If we've searched everything up to the end of the buffer,
// then there's nothing left to do.
if self.end - self.pos == 0 { if self.end - self.pos == 0 {
return Ok(false); return Ok(false);
} }
// Even if we hit EOF, we might still have to search the
// last line if it didn't contain a trailing terminator.
self.lastnl = self.end; self.lastnl = self.end;
break; break;
} }
self.lastnl = self.lastnl =
memrchr(b'\n', &self.buf[self.end..self.end + n]) memrchr(self.eol, &self.buf[self.end..self.end + n])
.map(|i| self.end + i + 1) .map(|i| self.end + i + 1)
.unwrap_or(0); .unwrap_or(0);
self.end += n; self.end += n;
@ -450,7 +532,7 @@ impl InputBuffer {
/// Returns true if and only if the given buffer is determined to be "binary" /// Returns true if and only if the given buffer is determined to be "binary"
/// or otherwise not contain text data that is usefully searchable. /// or otherwise not contain text data that is usefully searchable.
/// ///
/// Note that this may return both false positives and false negatives! /// Note that this may return both false positives and false negatives.
#[inline(always)] #[inline(always)]
fn is_binary(buf: &[u8]) -> bool { fn is_binary(buf: &[u8]) -> bool {
if buf.len() >= 4 && &buf[0..4] == b"%PDF" { if buf.len() >= 4 && &buf[0..4] == b"%PDF" {
@ -461,15 +543,31 @@ fn is_binary(buf: &[u8]) -> bool {
/// Count the number of lines in the given buffer. /// Count the number of lines in the given buffer.
#[inline(always)] #[inline(always)]
fn count_lines(mut buf: &[u8]) -> u64 { fn count_lines(mut buf: &[u8], eol: u8) -> u64 {
let mut count = 0; let mut count = 0;
while let Some(pos) = memchr(b'\n', buf) { while let Some(pos) = memchr(eol, buf) {
count += 1; count += 1;
buf = &buf[pos + 1..]; buf = &buf[pos + 1..];
} }
count count
} }
/// Overwrites every occurrence of the byte `a` in `buf` with the byte `b`,
/// in place.
///
/// When `a` and `b` are the same byte there is nothing to do and the buffer
/// is left untouched.
fn replace_buf(buf: &mut [u8], a: u8, b: u8) {
    if a == b {
        return;
    }
    let mut cursor = 0;
    loop {
        // Jump to the next occurrence of `a` at or after the cursor.
        let hit = match memchr(a, &buf[cursor..]) {
            Some(offset) => cursor + offset,
            None => break,
        };
        // Occurrences tend to come in runs, so overwrite the whole run
        // before searching again.
        let run = buf[hit..].iter().take_while(|&&byte| byte == a).count();
        for byte in &mut buf[hit..hit + run] {
            *byte = b;
        }
        cursor = hit + run;
    }
}
/// An "iterator" over lines in a particular buffer. /// An "iterator" over lines in a particular buffer.
/// ///
/// Idiomatic Rust would borrow the buffer and use it as internal state to /// Idiomatic Rust would borrow the buffer and use it as internal state to
@ -477,6 +575,7 @@ fn count_lines(mut buf: &[u8]) -> u64 {
/// the borrow in the search code. (Because the borrow prevents composition /// the borrow in the search code. (Because the borrow prevents composition
/// through other mutable methods.) /// through other mutable methods.)
struct IterLines { struct IterLines {
eol: u8,
pos: usize, pos: usize,
} }
@ -485,8 +584,9 @@ impl IterLines {
/// ///
/// The buffer is passed to the `next` method. /// The buffer is passed to the `next` method.
#[inline(always)] #[inline(always)]
fn new(start: usize) -> IterLines { fn new(eol: u8, start: usize) -> IterLines {
IterLines { IterLines {
eol: eol,
pos: start, pos: start,
} }
} }
@ -497,7 +597,7 @@ impl IterLines {
/// The range returned includes the new line. /// The range returned includes the new line.
#[inline(always)] #[inline(always)]
fn next(&mut self, buf: &[u8]) -> Option<(usize, usize)> { fn next(&mut self, buf: &[u8]) -> Option<(usize, usize)> {
match memchr(b'\n', &buf[self.pos..]) { match memchr(self.eol, &buf[self.pos..]) {
None => { None => {
if self.pos < buf.len() { if self.pos < buf.len() {
let start = self.pos; let start = self.pos;
@ -528,10 +628,13 @@ impl IterLines {
/// The position returned corresponds to the first byte in the given line. /// The position returned corresponds to the first byte in the given line.
#[inline(always)] #[inline(always)]
fn start_of_previous_lines( fn start_of_previous_lines(
eol: u8,
buf: &[u8], buf: &[u8],
mut end: usize, mut end: usize,
mut count: usize, mut count: usize,
) -> usize { ) -> usize {
// TODO(burntsushi): This function needs to be badly simplified. The case
// analysis is impossible to follow.
if buf[..end].is_empty() { if buf[..end].is_empty() {
return 0; return 0;
} }
@ -541,14 +644,14 @@ fn start_of_previous_lines(
if end == buf.len() { if end == buf.len() {
end -= 1; end -= 1;
} }
if buf[end] == b'\n' { if buf[end] == eol {
if end == 0 { if end == 0 {
return end + 1; return end + 1;
} }
end -= 1; end -= 1;
} }
while count > 0 { while count > 0 {
if buf[end] == b'\n' { if buf[end] == eol {
count -= 1; count -= 1;
if count == 0 { if count == 0 {
return end + 1; return end + 1;
@ -559,7 +662,7 @@ fn start_of_previous_lines(
end -= 1; end -= 1;
continue; continue;
} }
match memrchr(b'\n', &buf[..end]) { match memrchr(eol, &buf[..end]) {
None => { None => {
return 0; return 0;
} }
@ -567,7 +670,7 @@ fn start_of_previous_lines(
count -= 1; count -= 1;
end = i; end = i;
if end == 0 { if end == 0 {
if buf[end] == b'\n' && count == 0 { if buf[end] == eol && count == 0 {
end += 1; end += 1;
} }
return end; return end;
@ -579,10 +682,6 @@ fn start_of_previous_lines(
end + 2 end + 2
} }
fn show(bytes: &[u8]) -> &str {
::std::str::from_utf8(bytes).unwrap()
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::io; use std::io;
@ -668,102 +767,105 @@ fn main() {
#[test] #[test]
fn previous_lines() { fn previous_lines() {
let eol = b'\n';
let text = SHERLOCK.as_bytes(); let text = SHERLOCK.as_bytes();
assert_eq!(366, text.len()); assert_eq!(366, text.len());
assert_eq!(0, start_of_previous_lines(text, 366, 100)); assert_eq!(0, start_of_previous_lines(eol, text, 366, 100));
assert_eq!(366, start_of_previous_lines(text, 366, 0)); assert_eq!(366, start_of_previous_lines(eol, text, 366, 0));
assert_eq!(321, start_of_previous_lines(text, 366, 1)); assert_eq!(321, start_of_previous_lines(eol, text, 366, 1));
assert_eq!(321, start_of_previous_lines(text, 365, 1)); assert_eq!(321, start_of_previous_lines(eol, text, 365, 1));
assert_eq!(321, start_of_previous_lines(text, 364, 1)); assert_eq!(321, start_of_previous_lines(eol, text, 364, 1));
assert_eq!(321, start_of_previous_lines(text, 322, 1)); assert_eq!(321, start_of_previous_lines(eol, text, 322, 1));
assert_eq!(321, start_of_previous_lines(text, 321, 1)); assert_eq!(321, start_of_previous_lines(eol, text, 321, 1));
assert_eq!(258, start_of_previous_lines(text, 320, 1)); assert_eq!(258, start_of_previous_lines(eol, text, 320, 1));
assert_eq!(258, start_of_previous_lines(text, 366, 2)); assert_eq!(258, start_of_previous_lines(eol, text, 366, 2));
assert_eq!(258, start_of_previous_lines(text, 365, 2)); assert_eq!(258, start_of_previous_lines(eol, text, 365, 2));
assert_eq!(258, start_of_previous_lines(text, 364, 2)); assert_eq!(258, start_of_previous_lines(eol, text, 364, 2));
assert_eq!(258, start_of_previous_lines(text, 322, 2)); assert_eq!(258, start_of_previous_lines(eol, text, 322, 2));
assert_eq!(258, start_of_previous_lines(text, 321, 2)); assert_eq!(258, start_of_previous_lines(eol, text, 321, 2));
assert_eq!(193, start_of_previous_lines(text, 320, 2)); assert_eq!(193, start_of_previous_lines(eol, text, 320, 2));
assert_eq!(65, start_of_previous_lines(text, 66, 1)); assert_eq!(65, start_of_previous_lines(eol, text, 66, 1));
assert_eq!(0, start_of_previous_lines(text, 66, 2)); assert_eq!(0, start_of_previous_lines(eol, text, 66, 2));
assert_eq!(64, start_of_previous_lines(text, 64, 0)); assert_eq!(64, start_of_previous_lines(eol, text, 64, 0));
assert_eq!(0, start_of_previous_lines(text, 64, 1)); assert_eq!(0, start_of_previous_lines(eol, text, 64, 1));
assert_eq!(0, start_of_previous_lines(text, 64, 2)); assert_eq!(0, start_of_previous_lines(eol, text, 64, 2));
assert_eq!(0, start_of_previous_lines(text, 0, 2)); assert_eq!(0, start_of_previous_lines(eol, text, 0, 2));
assert_eq!(0, start_of_previous_lines(text, 0, 1)); assert_eq!(0, start_of_previous_lines(eol, text, 0, 1));
} }
#[test] #[test]
fn previous_lines_short() { fn previous_lines_short() {
let eol = b'\n';
let text = &b"a\nb\nc\nd\ne\nf\n"[..]; let text = &b"a\nb\nc\nd\ne\nf\n"[..];
assert_eq!(12, text.len()); assert_eq!(12, text.len());
assert_eq!(10, start_of_previous_lines(text, 12, 1)); assert_eq!(10, start_of_previous_lines(eol, text, 12, 1));
assert_eq!(8, start_of_previous_lines(text, 12, 2)); assert_eq!(8, start_of_previous_lines(eol, text, 12, 2));
assert_eq!(6, start_of_previous_lines(text, 12, 3)); assert_eq!(6, start_of_previous_lines(eol, text, 12, 3));
assert_eq!(4, start_of_previous_lines(text, 12, 4)); assert_eq!(4, start_of_previous_lines(eol, text, 12, 4));
assert_eq!(2, start_of_previous_lines(text, 12, 5)); assert_eq!(2, start_of_previous_lines(eol, text, 12, 5));
assert_eq!(0, start_of_previous_lines(text, 12, 6)); assert_eq!(0, start_of_previous_lines(eol, text, 12, 6));
assert_eq!(0, start_of_previous_lines(text, 12, 7)); assert_eq!(0, start_of_previous_lines(eol, text, 12, 7));
assert_eq!(10, start_of_previous_lines(text, 11, 1)); assert_eq!(10, start_of_previous_lines(eol, text, 11, 1));
assert_eq!(8, start_of_previous_lines(text, 11, 2)); assert_eq!(8, start_of_previous_lines(eol, text, 11, 2));
assert_eq!(6, start_of_previous_lines(text, 11, 3)); assert_eq!(6, start_of_previous_lines(eol, text, 11, 3));
assert_eq!(4, start_of_previous_lines(text, 11, 4)); assert_eq!(4, start_of_previous_lines(eol, text, 11, 4));
assert_eq!(2, start_of_previous_lines(text, 11, 5)); assert_eq!(2, start_of_previous_lines(eol, text, 11, 5));
assert_eq!(0, start_of_previous_lines(text, 11, 6)); assert_eq!(0, start_of_previous_lines(eol, text, 11, 6));
assert_eq!(0, start_of_previous_lines(text, 11, 7)); assert_eq!(0, start_of_previous_lines(eol, text, 11, 7));
assert_eq!(10, start_of_previous_lines(text, 10, 1)); assert_eq!(10, start_of_previous_lines(eol, text, 10, 1));
assert_eq!(8, start_of_previous_lines(text, 10, 2)); assert_eq!(8, start_of_previous_lines(eol, text, 10, 2));
assert_eq!(6, start_of_previous_lines(text, 10, 3)); assert_eq!(6, start_of_previous_lines(eol, text, 10, 3));
assert_eq!(4, start_of_previous_lines(text, 10, 4)); assert_eq!(4, start_of_previous_lines(eol, text, 10, 4));
assert_eq!(2, start_of_previous_lines(text, 10, 5)); assert_eq!(2, start_of_previous_lines(eol, text, 10, 5));
assert_eq!(0, start_of_previous_lines(text, 10, 6)); assert_eq!(0, start_of_previous_lines(eol, text, 10, 6));
assert_eq!(0, start_of_previous_lines(text, 10, 7)); assert_eq!(0, start_of_previous_lines(eol, text, 10, 7));
assert_eq!(8, start_of_previous_lines(text, 9, 1)); assert_eq!(8, start_of_previous_lines(eol, text, 9, 1));
assert_eq!(8, start_of_previous_lines(text, 8, 1)); assert_eq!(8, start_of_previous_lines(eol, text, 8, 1));
assert_eq!(6, start_of_previous_lines(text, 7, 1)); assert_eq!(6, start_of_previous_lines(eol, text, 7, 1));
assert_eq!(6, start_of_previous_lines(text, 6, 1)); assert_eq!(6, start_of_previous_lines(eol, text, 6, 1));
assert_eq!(4, start_of_previous_lines(text, 5, 1)); assert_eq!(4, start_of_previous_lines(eol, text, 5, 1));
assert_eq!(4, start_of_previous_lines(text, 4, 1)); assert_eq!(4, start_of_previous_lines(eol, text, 4, 1));
assert_eq!(2, start_of_previous_lines(text, 3, 1)); assert_eq!(2, start_of_previous_lines(eol, text, 3, 1));
assert_eq!(2, start_of_previous_lines(text, 2, 1)); assert_eq!(2, start_of_previous_lines(eol, text, 2, 1));
assert_eq!(0, start_of_previous_lines(text, 1, 1)); assert_eq!(0, start_of_previous_lines(eol, text, 1, 1));
assert_eq!(0, start_of_previous_lines(text, 0, 1)); assert_eq!(0, start_of_previous_lines(eol, text, 0, 1));
} }
#[test] #[test]
fn previous_lines_empty() { fn previous_lines_empty() {
let eol = b'\n';
let text = &b"\n\n\nd\ne\nf\n"[..]; let text = &b"\n\n\nd\ne\nf\n"[..];
assert_eq!(9, text.len()); assert_eq!(9, text.len());
assert_eq!(7, start_of_previous_lines(text, 9, 1)); assert_eq!(7, start_of_previous_lines(eol, text, 9, 1));
assert_eq!(5, start_of_previous_lines(text, 9, 2)); assert_eq!(5, start_of_previous_lines(eol, text, 9, 2));
assert_eq!(3, start_of_previous_lines(text, 9, 3)); assert_eq!(3, start_of_previous_lines(eol, text, 9, 3));
assert_eq!(2, start_of_previous_lines(text, 9, 4)); assert_eq!(2, start_of_previous_lines(eol, text, 9, 4));
assert_eq!(1, start_of_previous_lines(text, 9, 5)); assert_eq!(1, start_of_previous_lines(eol, text, 9, 5));
assert_eq!(0, start_of_previous_lines(text, 9, 6)); assert_eq!(0, start_of_previous_lines(eol, text, 9, 6));
assert_eq!(0, start_of_previous_lines(text, 9, 7)); assert_eq!(0, start_of_previous_lines(eol, text, 9, 7));
let text = &b"a\n\n\nd\ne\nf\n"[..]; let text = &b"a\n\n\nd\ne\nf\n"[..];
assert_eq!(10, text.len()); assert_eq!(10, text.len());
assert_eq!(8, start_of_previous_lines(text, 10, 1)); assert_eq!(8, start_of_previous_lines(eol, text, 10, 1));
assert_eq!(6, start_of_previous_lines(text, 10, 2)); assert_eq!(6, start_of_previous_lines(eol, text, 10, 2));
assert_eq!(4, start_of_previous_lines(text, 10, 3)); assert_eq!(4, start_of_previous_lines(eol, text, 10, 3));
assert_eq!(3, start_of_previous_lines(text, 10, 4)); assert_eq!(3, start_of_previous_lines(eol, text, 10, 4));
assert_eq!(2, start_of_previous_lines(text, 10, 5)); assert_eq!(2, start_of_previous_lines(eol, text, 10, 5));
assert_eq!(0, start_of_previous_lines(text, 10, 6)); assert_eq!(0, start_of_previous_lines(eol, text, 10, 6));
assert_eq!(0, start_of_previous_lines(text, 10, 7)); assert_eq!(0, start_of_previous_lines(eol, text, 10, 7));
} }
#[test] #[test]
@ -776,6 +878,23 @@ fn main() {
"); ");
} }
#[test]
fn binary() {
    // The input contains a NUL byte. With default searcher settings the
    // search is expected to report no matches and print nothing.
    let haystack = "Sherlock\n\x00Holmes\n";
    let (matches, printed) = search("Sherlock|Holmes", haystack, |s| s);
    assert_eq!(0, matches);
    assert_eq!(printed, "");
}
#[test]
fn binary_text() {
    // Same NUL-containing input as the `binary` test, but with the searcher's
    // `text(true)` option enabled both lines are expected to be reported.
    let haystack = "Sherlock\n\x00Holmes\n";
    let (matches, printed) =
        search("Sherlock|Holmes", haystack, |s| s.text(true));
    assert_eq!(2, matches);
    assert_eq!(printed, "/baz.rs:Sherlock\n/baz.rs:Holmes\n");
}
#[test] #[test]
fn line_numbers() { fn line_numbers() {
let (count, out) = search_smallcap( let (count, out) = search_smallcap(

358
src/types.rs Normal file
View File

@ -0,0 +1,358 @@
/*!
The types module provides a way of associating glob patterns on file names to
file types.
*/
use std::collections::HashMap;
use std::error::Error as StdError;
use std::fmt;
use std::path::Path;
use gitignore::{self, Gitignore, GitignoreBuilder, Match, Pattern};
/// The built-in file type definitions.
///
/// Each entry pairs a file type name with the globs that recognize files of
/// that type. `TypesBuilder::add_defaults` feeds every (name, glob) pair in
/// this table to `TypesBuilder::add`.
///
/// A glob may be listed under more than one type (for example, `*.h` appears
/// under `c`, `cpp`, `objc` and `objcpp`).
const TYPE_EXTENSIONS: &'static [(&'static str, &'static [&'static str])] = &[
("asm", &["*.asm", "*.s", "*.S"]),
("awk", &["*.awk"]),
("c", &["*.c", "*.h", "*.H"]),
("cbor", &["*.cbor"]),
("clojure", &["*.clj", "*.cljs"]),
("cmake", &["CMakeLists.txt"]),
("coffeescript", &["*.coffee"]),
("cpp", &[
"*.C", "*.cc", "*.cpp", "*.cxx",
"*.h", "*.H", "*.hh", "*.hpp",
]),
("csharp", &["*.cs"]),
("css", &["*.css"]),
("cython", &["*.pyx"]),
("dart", &["*.dart"]),
("d", &["*.d"]),
("elisp", &["*.el"]),
("erlang", &["*.erl", "*.hrl"]),
("fortran", &[
"*.f", "*.F", "*.f77", "*.F77", "*.pfo",
"*.f90", "*.F90", "*.f95", "*.F95",
]),
("go", &["*.go"]),
("groovy", &["*.groovy"]),
("haskell", &["*.hs", "*.lhs"]),
("html", &["*.htm", "*.html"]),
("java", &["*.java"]),
("js", &["*.js"]),
("json", &["*.json"]),
("jsonl", &["*.jsonl"]),
("lisp", &["*.el", "*.jl", "*.lisp", "*.lsp", "*.sc", "*.scm"]),
("lua", &["*.lua"]),
("m4", &["*.ac", "*.m4"]),
("make", &["gnumakefile", "Gnumakefile", "makefile", "Makefile", "*.mk"]),
("markdown", &["*.md"]),
("matlab", &["*.m"]),
("mk", &["mkfile"]),
("ml", &["*.ml"]),
("objc", &["*.h", "*.m"]),
("objcpp", &["*.h", "*.mm"]),
("ocaml", &["*.ml", "*.mli", "*.mll", "*.mly"]),
("perl", &["*.perl", "*.pl", "*.PL", "*.plh", "*.plx", "*.pm"]),
("php", &["*.php", "*.php3", "*.php4", "*.php5", "*.phtml"]),
("py", &["*.py"]),
("rr", &["*.R"]),
("rst", &["*.rst"]),
("ruby", &["*.rb"]),
("rust", &["*.rs"]),
("scala", &["*.scala"]),
("sh", &["*.bash", "*.csh", "*.ksh", "*.sh", "*.tcsh"]),
("sql", &["*.sql"]),
("tex", &["*.tex", "*.cls", "*.sty"]),
("txt", &["*.txt"]),
("toml", &["*.toml", "Cargo.lock"]),
("vala", &["*.vala"]),
("vimscript", &["*.vim"]),
("xml", &["*.xml"]),
("yacc", &["*.y"]),
("yaml", &["*.yaml", "*.yml"]),
];
/// Describes all the possible failure conditions for building a file type
/// matcher.
///
/// Values of this type are returned by `TypesBuilder::build` and
/// `TypesBuilder::add_def`.
#[derive(Debug)]
pub enum Error {
/// We tried to select (or negate) a file type that is not defined.
///
/// The payload is the name of the type that could not be found.
UnrecognizedFileType(String),
/// A user specified file type definition could not be parsed.
///
/// Produced by `TypesBuilder::add_def` when either the name or the glob
/// part of a `name:glob` definition is empty.
InvalidDefinition,
/// There was an error building the matcher (probably a bad glob).
///
/// Wraps the underlying `gitignore::Error` unchanged.
Gitignore(gitignore::Error),
}
impl StdError for Error {
fn description(&self) -> &str {
match *self {
Error::UnrecognizedFileType(_) => "unrecognized file type",
Error::InvalidDefinition => "invalid definition",
Error::Gitignore(ref err) => err.description(),
}
}
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match *self {
Error::UnrecognizedFileType(ref ty) => {
write!(f, "unrecognized file type: {}", ty)
}
Error::InvalidDefinition => {
write!(f, "invalid definition (format is type:glob, e.g., \
html:*.html)")
}
Error::Gitignore(ref err) => err.fmt(f),
}
}
}
impl From<gitignore::Error> for Error {
fn from(err: gitignore::Error) -> Error {
Error::Gitignore(err)
}
}
/// A single file type definition: a type name together with the globs that
/// recognize files of that type.
#[derive(Clone, Debug)]
pub struct FileTypeDef {
    /// The name of the file type.
    name: String,
    /// The glob patterns associated with `name`.
    pats: Vec<String>,
}

impl FileTypeDef {
    /// Return the name of this file type.
    pub fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Return the glob patterns used to recognize this file type.
    pub fn patterns(&self) -> &[String] {
        self.pats.as_slice()
    }
}
/// Types is a file type matcher.
///
/// It decides, for a given file path, whether the path belongs to a selected
/// file type, a negated file type, or neither.
#[derive(Clone, Debug)]
pub struct Types {
    /// The glob matcher built from the selected and negated file types, or
    /// `None` when the matcher should have no effect.
    gi: Option<Gitignore>,
    /// True when at least one file type was explicitly selected.
    has_selected: bool,
    /// Placeholder pattern reported when a path is ignored only because it
    /// matched none of the selected file types.
    unmatched_pat: Pattern,
}

impl Types {
    /// Creates a new file type matcher from the given Gitignore matcher. If
    /// no Gitignore matcher is provided, then the file type matcher has no
    /// effect.
    ///
    /// If has_selected is true, then at least one file type was selected.
    /// Therefore, any non-matches should be ignored.
    fn new(gi: Option<Gitignore>, has_selected: bool) -> Types {
        let unmatched_pat = Pattern {
            from: Path::new("<filetype>").to_path_buf(),
            original: "<none>".to_string(),
            pat: "<none>".to_string(),
            whitelist: false,
            only_dir: false,
        };
        Types {
            gi: gi,
            has_selected: has_selected,
            unmatched_pat: unmatched_pat,
        }
    }

    /// Returns a match for the given path against this file type matcher.
    ///
    /// The path is considered whitelisted if it matches a selected file type.
    /// The path is considered ignored if it matched a negated file type.
    /// If at least one file type is selected and path doesn't match, then
    /// the path is also considered ignored.
    ///
    /// Directories, and matchers built without any selection, always yield
    /// `Match::None`.
    pub fn matched<P: AsRef<Path>>(&self, path: P, is_dir: bool) -> Match {
        // File types don't apply to directories.
        if is_dir {
            return Match::None;
        }
        let gi = match self.gi {
            Some(ref gi) => gi,
            None => return Match::None,
        };
        let lossy = path.as_ref().to_string_lossy();
        // The glob matcher's verdict is inverted before use.
        // NOTE(review): presumably `invert` swaps ignore and whitelist
        // results, since selected types are added as plain globs; confirm
        // against `gitignore::Match`.
        let mat = gi.matched_utf8(&*lossy, is_dir).invert();
        if self.has_selected && mat.is_none() {
            Match::Ignored(&self.unmatched_pat)
        } else {
            mat
        }
    }
}
/// TypesBuilder builds a type matcher from a set of file type definitions and
/// a set of file type selections.
pub struct TypesBuilder {
types: HashMap<String, Vec<String>>,
select: Vec<String>,
select_not: Vec<String>,
}
impl TypesBuilder {
/// Create a new builder for a file type matcher.
pub fn new() -> TypesBuilder {
TypesBuilder {
types: HashMap::new(),
select: vec![],
select_not: vec![],
}
}
/// Build the current set of file type definitions *and* selections into
/// a file type matcher.
pub fn build(&self) -> Result<Types, Error> {
if self.select.is_empty() && self.select_not.is_empty() {
return Ok(Types::new(None, false));
}
let mut bgi = GitignoreBuilder::new("/");
for name in &self.select {
let globs = match self.types.get(name) {
Some(globs) => globs,
None => {
return Err(Error::UnrecognizedFileType(name.to_string()));
}
};
for glob in globs {
try!(bgi.add("<filetype>", glob));
}
}
for name in &self.select_not {
let globs = match self.types.get(name) {
Some(globs) => globs,
None => {
return Err(Error::UnrecognizedFileType(name.to_string()));
}
};
for glob in globs {
try!(bgi.add("<filetype>", &format!("!{}", glob)));
}
}
Ok(Types::new(Some(try!(bgi.build())), !self.select.is_empty()))
}
/// Return the set of current file type definitions.
pub fn definitions(&self) -> Vec<FileTypeDef> {
let mut defs = vec![];
for (ref name, ref pats) in &self.types {
let mut pats = pats.to_vec();
pats.sort();
defs.push(FileTypeDef {
name: name.to_string(),
pats: pats,
});
}
defs.sort_by(|def1, def2| def1.name().cmp(def2.name()));
defs
}
/// Select the file type given by `name`.
pub fn select(&mut self, name: &str) -> &mut TypesBuilder {
self.select.push(name.to_string());
self
}
/// Ignore the file type given by `name`.
pub fn select_not(&mut self, name: &str) -> &mut TypesBuilder {
self.select_not.push(name.to_string());
self
}
/// Clear any file type definitions for the type given.
pub fn clear(&mut self, name: &str) -> &mut TypesBuilder {
self.types.remove(name);
self
}
/// Add a new file type definition. `name` can be arbitrary and `pat`
/// should be a glob recognizing file paths belonging to the `name` type.
pub fn add(&mut self, name: &str, pat: &str) -> &mut TypesBuilder {
self.types.entry(name.to_string())
.or_insert(vec![]).push(pat.to_string());
self
}
/// Add a new file type definition specified in string form. The format
/// is `name:glob`. Names may not include a colon.
pub fn add_def(&mut self, def: &str) -> Result<(), Error> {
let name: String = def.chars().take_while(|&c| c != ':').collect();
let pat: String = def.chars().skip(name.chars().count() + 1).collect();
if name.is_empty() || pat.is_empty() {
return Err(Error::InvalidDefinition);
}
self.add(&name, &pat);
Ok(())
}
/// Add a set of default file type definitions.
pub fn add_defaults(&mut self) -> &mut TypesBuilder {
for &(name, exts) in TYPE_EXTENSIONS {
for ext in exts {
self.add(name, ext);
}
}
self
}
}
// Unit tests for the file type matcher. Each `matched!` invocation below
// expands to one `#[test]` function.
#[cfg(test)]
mod tests {
use super::TypesBuilder;
// Generates a test that builds a matcher and checks one path against it.
//
// Arguments: test name, list of `name:glob` definitions, list of selected
// type names, list of negated type names, and the path to check. A leading
// `not` asserts that the path is ignored; without it the test asserts that
// the path is not ignored.
//
// The first two arms only forward to the third with the expected outcome
// appended, so the order of the arms matters.
macro_rules! matched {
($name:ident, $types:expr, $sel:expr, $selnot:expr,
$path:expr) => {
matched!($name, $types, $sel, $selnot, $path, true);
};
(not, $name:ident, $types:expr, $sel:expr, $selnot:expr,
$path:expr) => {
matched!($name, $types, $sel, $selnot, $path, false);
};
($name:ident, $types:expr, $sel:expr, $selnot:expr,
$path:expr, $matched:expr) => {
#[test]
fn $name() {
let mut btypes = TypesBuilder::new();
for tydef in $types {
btypes.add_def(tydef).unwrap();
}
for sel in $sel {
btypes.select(sel);
}
for selnot in $selnot {
btypes.select_not(selnot);
}
let types = btypes.build().unwrap();
// The path is always checked as a file (is_dir = false).
let mat = types.matched($path, false);
// "Matched" here means "not ignored": both whitelisted and no-match
// results count as matched.
assert_eq!($matched, !mat.is_ignored());
}
};
}
// The small set of `name:glob` definitions shared by every test below.
fn types() -> Vec<&'static str> {
vec![
"html:*.html",
"html:*.htm",
"rust:*.rs",
"js:*.js",
]
}
matched!(match1, types(), vec!["rust"], vec![], "lib.rs");
matched!(match2, types(), vec!["html"], vec![], "index.html");
matched!(match3, types(), vec!["html"], vec![], "index.htm");
matched!(match4, types(), vec!["html", "rust"], vec![], "main.rs");
// With no selections at all, nothing is ignored.
matched!(match5, types(), vec![], vec![], "index.html");
// Negating a type does not ignore files of other types.
matched!(match6, types(), vec![], vec!["rust"], "index.html");
matched!(not, matchnot1, types(), vec!["rust"], vec![], "index.html");
matched!(not, matchnot2, types(), vec![], vec!["rust"], "main.rs");
}