Add file size exclusion to walker

A maximum filesize can be configured on a `WalkBuilder` via its
`max_filesize` method. Any file whose size exceeds the specified limit
is excluded from the resulting file/directory set.

The filesize limit never applies to directories.
This commit is contained in:
tiehuis 2017-02-26 11:36:59 +13:00 committed by Andrew Gallant
parent 066f97d855
commit 49fd668712

View File

@ -392,7 +392,9 @@ impl DirEntryRaw {
/// continues. /// continues.
/// * Fifth, if the path hasn't been whitelisted and it is hidden, then the /// * Fifth, if the path hasn't been whitelisted and it is hidden, then the
/// path is skipped. /// path is skipped.
/// * Sixth, if the path has made it this far then it is yielded in the /// * Sixth, unless the path is a directory, the size of the file is compared
/// against the max filesize limit. If it exceeds the limit, it is skipped.
/// * Seventh, if the path has made it this far then it is yielded in the
/// iterator. /// iterator.
#[derive(Clone)] #[derive(Clone)]
pub struct WalkBuilder { pub struct WalkBuilder {
@ -400,6 +402,7 @@ pub struct WalkBuilder {
ig_builder: IgnoreBuilder, ig_builder: IgnoreBuilder,
parents: bool, parents: bool,
max_depth: Option<usize>, max_depth: Option<usize>,
max_filesize: Option<u64>,
follow_links: bool, follow_links: bool,
sorter: Option<Arc<Fn(&OsString, &OsString) -> cmp::Ordering + 'static>>, sorter: Option<Arc<Fn(&OsString, &OsString) -> cmp::Ordering + 'static>>,
threads: usize, threads: usize,
@ -412,6 +415,7 @@ impl fmt::Debug for WalkBuilder {
.field("ig_builder", &self.ig_builder) .field("ig_builder", &self.ig_builder)
.field("parents", &self.parents) .field("parents", &self.parents)
.field("max_depth", &self.max_depth) .field("max_depth", &self.max_depth)
.field("max_filesize", &self.max_filesize)
.field("follow_links", &self.follow_links) .field("follow_links", &self.follow_links)
.field("threads", &self.threads) .field("threads", &self.threads)
.finish() .finish()
@ -431,6 +435,7 @@ impl WalkBuilder {
ig_builder: IgnoreBuilder::new(), ig_builder: IgnoreBuilder::new(),
parents: true, parents: true,
max_depth: None, max_depth: None,
max_filesize: None,
follow_links: false, follow_links: false,
sorter: None, sorter: None,
threads: 0, threads: 0,
@ -464,6 +469,7 @@ impl WalkBuilder {
it: None, it: None,
ig_root: ig_root.clone(), ig_root: ig_root.clone(),
ig: ig_root.clone(), ig: ig_root.clone(),
max_filesize: self.max_filesize,
parents: self.parents, parents: self.parents,
} }
} }
@ -478,6 +484,7 @@ impl WalkBuilder {
paths: self.paths.clone().into_iter(), paths: self.paths.clone().into_iter(),
ig_root: self.ig_builder.build(), ig_root: self.ig_builder.build(),
max_depth: self.max_depth, max_depth: self.max_depth,
max_filesize: self.max_filesize,
follow_links: self.follow_links, follow_links: self.follow_links,
parents: self.parents, parents: self.parents,
threads: self.threads, threads: self.threads,
@ -508,6 +515,12 @@ impl WalkBuilder {
self self
} }
/// Set the maximum size, in bytes, of files yielded by the walker.
///
/// A file whose size is strictly greater than the limit is skipped; a file
/// whose size is exactly equal to the limit is still yielded. Directories
/// are never subject to the limit.
///
/// Passing `None` (the default) disables the size check entirely.
pub fn max_filesize(&mut self, filesize: Option<u64>) -> &mut WalkBuilder {
self.max_filesize = filesize;
self
}
/// The number of threads to use for traversal. /// The number of threads to use for traversal.
/// ///
/// Note that this only has an effect when using `build_parallel`. /// Note that this only has an effect when using `build_parallel`.
@ -650,6 +663,7 @@ pub struct Walk {
it: Option<WalkEventIter>, it: Option<WalkEventIter>,
ig_root: Ignore, ig_root: Ignore,
ig: Ignore, ig: Ignore,
max_filesize: Option<u64>,
parents: bool, parents: bool,
} }
@ -667,7 +681,10 @@ impl Walk {
if ent.depth() == 0 { if ent.depth() == 0 {
return false; return false;
} }
skip_path(&self.ig, ent.path(), ent.file_type().is_dir())
let ft = ent.file_type().is_dir();
skip_path(&self.ig, ent.path(), ft) ||
skip_filesize(self.max_filesize, ent.path(), &ent.metadata().ok(), ft)
} }
} }
@ -824,6 +841,7 @@ pub struct WalkParallel {
paths: vec::IntoIter<PathBuf>, paths: vec::IntoIter<PathBuf>,
ig_root: Ignore, ig_root: Ignore,
parents: bool, parents: bool,
max_filesize: Option<u64>,
max_depth: Option<usize>, max_depth: Option<usize>,
follow_links: bool, follow_links: bool,
threads: usize, threads: usize,
@ -886,6 +904,7 @@ impl WalkParallel {
threads: threads, threads: threads,
parents: self.parents, parents: self.parents,
max_depth: self.max_depth, max_depth: self.max_depth,
max_filesize: self.max_filesize,
follow_links: self.follow_links, follow_links: self.follow_links,
}; };
handles.push(thread::spawn(|| worker.run())); handles.push(thread::spawn(|| worker.run()));
@ -1000,6 +1019,9 @@ struct Worker {
/// The maximum depth of directories to descend. A value of `0` means no /// The maximum depth of directories to descend. A value of `0` means no
/// descension at all. /// descension at all.
max_depth: Option<usize>, max_depth: Option<usize>,
/// The maximum size a searched file can be (in bytes). If a file exceeds
/// this size it will be skipped.
max_filesize: Option<u64>,
/// Whether to follow symbolic links or not. When this is enabled, loop /// Whether to follow symbolic links or not. When this is enabled, loop
/// detection is performed. /// detection is performed.
follow_links: bool, follow_links: bool,
@ -1106,7 +1128,10 @@ impl Worker {
} }
} }
let is_dir = dent.file_type().map_or(false, |ft| ft.is_dir()); let is_dir = dent.file_type().map_or(false, |ft| ft.is_dir());
if !skip_path(ig, dent.path(), is_dir) { if !skip_path(ig, dent.path(), is_dir) &&
!skip_filesize(self.max_filesize, dent.path(),
&dent.metadata().ok(), is_dir)
{
self.queue.push(Message::Work(Work { self.queue.push(Message::Work(Work {
dent: dent, dent: dent,
ignore: ig.clone(), ignore: ig.clone(),
@ -1253,6 +1278,34 @@ fn check_symlink_loop(
Ok(()) Ok(())
} }
/// Returns true if and only if `path` should be skipped because the file it
/// names is larger than `max_filesize`.
///
/// Directories are never skipped. An entry is also kept when no limit is
/// configured (`max_filesize` is `None`) or when its metadata is unavailable
/// (`ent` is `None`). A file whose size equals the limit is kept; only
/// strictly larger files are skipped, and each skip is reported via `debug!`.
fn skip_filesize(
    max_filesize: Option<u64>,
    path: &Path,
    ent: &Option<Metadata>,
    is_dir: bool
) -> bool {
    // The limit only ever applies to files.
    if is_dir {
        return false;
    }
    // Without a configured limit there is nothing to compare against.
    let limit = match max_filesize {
        Some(limit) => limit,
        None => return false,
    };
    // Without metadata the size is unknown, so the entry is kept.
    let size = match ent.as_ref().map(|md| md.len()) {
        Some(size) => size,
        None => return false,
    };
    let too_large = size > limit;
    if too_large {
        debug!("ignoring {}: {} bytes", path.display(), size);
    }
    too_large
}
fn skip_path(ig: &Ignore, path: &Path, is_dir: bool) -> bool { fn skip_path(ig: &Ignore, path: &Path, is_dir: bool) -> bool {
let m = ig.matched(path, is_dir); let m = ig.matched(path, is_dir);
if m.is_ignore() { if m.is_ignore() {
@ -1282,6 +1335,11 @@ mod tests {
file.write_all(contents.as_bytes()).unwrap(); file.write_all(contents.as_bytes()).unwrap();
} }
/// Creates (or truncates) the file at `path` and sets its length to exactly
/// `size` bytes, panicking if either step fails.
fn wfile_size<P: AsRef<Path>>(path: P, size: u64) {
    File::create(path)
        .and_then(|handle| handle.set_len(size))
        .unwrap();
}
#[cfg(unix)] #[cfg(unix)]
fn symlink<P: AsRef<Path>, Q: AsRef<Path>>(src: P, dst: Q) { fn symlink<P: AsRef<Path>, Q: AsRef<Path>>(src: P, dst: Q) {
use std::os::unix::fs::symlink; use std::os::unix::fs::symlink;
@ -1438,6 +1496,32 @@ mod tests {
]); ]);
} }
#[test]
fn max_filesize() {
    // Builds a small tree of files with known sizes, then checks which
    // paths are yielded under several size limits. The expectations below
    // require that directories are always yielded and that a file exactly
    // at the limit is kept.
    let td = TempDir::new("walk-test-").unwrap();
    mkdirp(td.path().join("a/b"));

    // (path relative to the temp dir, size in bytes)
    let fixtures: [(&str, u64); 6] = [
        ("foo", 0),
        ("bar", 400),
        ("baz", 600),
        ("a/foo", 600),
        ("a/bar", 500),
        ("a/baz", 200),
    ];
    for &(rel, len) in fixtures.iter() {
        wfile_size(td.path().join(rel), len);
    }

    let mut builder = WalkBuilder::new(td.path());

    // No limit configured: every entry is yielded.
    assert_paths(td.path(), &builder, &[
        "a", "a/b", "foo", "bar", "baz", "a/foo", "a/bar", "a/baz",
    ]);

    // Limit of zero: only the empty file and the directories remain.
    assert_paths(td.path(), builder.max_filesize(Some(0)), &[
        "a", "a/b", "foo"
    ]);

    // Limit of 500: the 500-byte file is kept, the 600-byte files are not.
    assert_paths(td.path(), builder.max_filesize(Some(500)), &[
        "a", "a/b", "foo", "bar", "a/bar", "a/baz"
    ]);

    // Limit above every file size: nothing is filtered out.
    assert_paths(td.path(), builder.max_filesize(Some(50000)), &[
        "a", "a/b", "foo", "bar", "baz", "a/foo", "a/bar", "a/baz",
    ]);
}
#[cfg(unix)] // because symlinks on windows are weird #[cfg(unix)] // because symlinks on windows are weird
#[test] #[test]
fn symlinks() { fn symlinks() {