Move all gitignore matching to separate crate.

This PR introduces a new sub-crate, `ignore`, which primarily provides a
fast recursive directory iterator that respects ignore files like
gitignore and other configurable filtering rules based on globs or even
file types.

This results in a substantial source of complexity moved out of ripgrep's
core and into a reusable component that others can now (hopefully)
benefit from.

While much of the ignore code carried over from ripgrep's core, a
substantial portion of it was rewritten with the following goals in
mind:

1. Reuse matchers built from gitignore files across directory iteration.
2. Design the matcher data structure to be amenable for parallelizing
   directory iteration. (Indeed, writing the parallel iterator is the
   next step.)

Fixes #9, #44, #45
This commit is contained in:
Andrew Gallant
2016-10-11 19:57:09 -04:00
parent 12b2b1f624
commit d79add341b
30 changed files with 3765 additions and 1760 deletions

View File

@@ -14,19 +14,17 @@ use term::Terminal;
use term;
#[cfg(windows)]
use term::WinConsole;
use walkdir::WalkDir;
use atty;
use gitignore::{Gitignore, GitignoreBuilder};
use ignore::Ignore;
use ignore::overrides::{Override, OverrideBuilder};
use ignore::types::{FileTypeDef, Types, TypesBuilder};
use ignore;
use out::{Out, ColoredTerminal};
use printer::Printer;
use search_buffer::BufferSearcher;
use search_stream::{InputBuffer, Searcher};
#[cfg(windows)]
use terminal_win::WindowsBuffer;
use types::{FileTypeDef, Types, TypesBuilder};
use walk;
use Result;
@@ -131,6 +129,13 @@ Less common options:
Search hidden directories and files. (Hidden directories and files are
skipped by default.)
--ignore-file FILE ...
Specify additional ignore files for filtering file paths. Ignore files
should be in the gitignore format and are matched relative to the
current working directory. These ignore files have lower precedence
than all other ignore file types. When specifying multiple ignore
files, earlier files have lower precedence than later files.
-L, --follow
Follow symlinks.
@@ -234,6 +239,7 @@ pub struct RawArgs {
flag_heading: bool,
flag_hidden: bool,
flag_ignore_case: bool,
flag_ignore_file: Vec<String>,
flag_invert_match: bool,
flag_line_number: bool,
flag_fixed_strings: bool,
@@ -279,11 +285,12 @@ pub struct Args {
eol: u8,
files: bool,
follow: bool,
glob_overrides: Option<Gitignore>,
glob_overrides: Override,
grep: Grep,
heading: bool,
hidden: bool,
ignore_case: bool,
ignore_files: Vec<PathBuf>,
invert_match: bool,
line_number: bool,
line_per_match: bool,
@@ -347,14 +354,13 @@ impl RawArgs {
}
let glob_overrides =
if self.flag_glob.is_empty() {
None
Override::empty()
} else {
let cwd = try!(env::current_dir());
let mut bgi = GitignoreBuilder::new(cwd);
let mut ovr = OverrideBuilder::new(try!(env::current_dir()));
for pat in &self.flag_glob {
try!(bgi.add("<argv>", pat));
try!(ovr.add(pat));
}
Some(try!(bgi.build()))
try!(ovr.build())
};
let threads =
if self.flag_threads == 0 {
@@ -382,6 +388,9 @@ impl RawArgs {
let no_ignore = self.flag_no_ignore || self.flag_unrestricted >= 1;
let hidden = self.flag_hidden || self.flag_unrestricted >= 2;
let text = self.flag_text || self.flag_unrestricted >= 3;
let ignore_files: Vec<_> = self.flag_ignore_file.iter().map(|p| {
Path::new(p).to_path_buf()
}).collect();
let mut args = Args {
paths: paths,
after_context: after_context,
@@ -399,6 +408,7 @@ impl RawArgs {
heading: !self.flag_no_heading && self.flag_heading,
hidden: hidden,
ignore_case: self.flag_ignore_case,
ignore_files: ignore_files,
invert_match: self.flag_invert_match,
line_number: !self.flag_no_line_number && self.flag_line_number,
line_per_match: self.flag_vimgrep,
@@ -711,31 +721,30 @@ impl Args {
self.type_list
}
/// Create a new recursive directory iterator at the path given.
pub fn walker(&self, path: &Path) -> Result<walk::Iter> {
// Always follow symlinks for explicitly specified files.
let mut wd = WalkDir::new(path).follow_links(
self.follow || path.is_file());
if let Some(maxdepth) = self.maxdepth {
wd = wd.max_depth(maxdepth);
/// Create a new recursive directory iterator over the paths in argv.
pub fn walker(&self) -> Walk {
let paths = self.paths();
let mut wd = ignore::WalkBuilder::new(&paths[0]);
for path in &paths[1..] {
wd.add(path);
}
let mut ig = Ignore::new();
// Only register ignore rules if this is a directory. If it's a file,
// then it was explicitly given by the end user, so we always search
// it.
if path.is_dir() {
ig.ignore_hidden(!self.hidden);
ig.no_ignore(self.no_ignore);
ig.no_ignore_vcs(self.no_ignore_vcs);
ig.add_types(self.types.clone());
if !self.no_ignore_parent {
try!(ig.push_parents(path));
}
if let Some(ref overrides) = self.glob_overrides {
ig.add_override(overrides.clone());
for path in &self.ignore_files {
if let Some(err) = wd.add_ignore(path) {
eprintln!("{}", err);
}
}
Ok(walk::Iter::new(ig, wd))
wd.follow_links(self.follow);
wd.hidden(!self.hidden);
wd.max_depth(self.maxdepth);
wd.overrides(self.glob_overrides.clone());
wd.types(self.types.clone());
wd.git_global(!self.no_ignore && !self.no_ignore_vcs);
wd.git_ignore(!self.no_ignore && !self.no_ignore_vcs);
wd.git_exclude(!self.no_ignore && !self.no_ignore_vcs);
wd.ignore(!self.no_ignore);
wd.parents(!self.no_ignore_parent);
Walk(wd.build())
}
}
@@ -752,6 +761,34 @@ fn version() -> String {
}
}
/// A simple wrapper around the ignore::Walk iterator. This will
/// automatically emit error messages to stderr and will skip directories.
pub struct Walk(ignore::Walk);
impl Iterator for Walk {
type Item = ignore::DirEntry;
fn next(&mut self) -> Option<ignore::DirEntry> {
while let Some(result) = self.0.next() {
match result {
Ok(dent) => {
if let Some(err) = dent.error() {
eprintln!("{}", err);
}
if dent.file_type().map_or(false, |x| x.is_dir()) {
continue;
}
return Some(dent);
}
Err(err) => {
eprintln!("{}", err);
}
}
}
None
}
}
/// A single state in the state machine used by `unescape`.
#[derive(Clone, Copy, Eq, PartialEq)]
enum State {
@@ -761,7 +798,7 @@ enum State {
Literal,
}
/// Unescapes a string given on the command line. It supports a limit set of
/// Unescapes a string given on the command line. It supports a limited set of
/// escape sequences:
///
/// * \t, \r and \n are mapped to their corresponding ASCII bytes.