search: add support for searching compressed files

This commit adds opt-in support for searching compressed files during
recursive search. This behavior is only enabled when the
`-z/--search-zip` flag is passed to ripgrep. When enabled, a limited set
of common compression formats are recognized via file extension, and a
new process is spawned to perform the decompression. ripgrep then
searches the stdout of that spawned process.

Closes #539
This commit is contained in:
Balaji Sivaraman
2018-01-07 21:35:58 +05:30
committed by Andrew Gallant
parent a8543f798d
commit f007f940c5
18 changed files with 373 additions and 24 deletions

View File

@@ -191,6 +191,7 @@ pub fn app() -> App<'static, 'static> {
.arg(flag("type-clear")
.value_name("TYPE").takes_value(true)
.multiple(true).number_of_values(1))
.arg(flag("search-zip").short("z"))
}
struct Usage {
@@ -450,7 +451,8 @@ lazy_static! {
can be specified by using the --ignore-file flag several times. \
When specifying multiple ignore files, earlier files have lower \
precedence than later files. If you are looking for a way to \
include or exclude files and directories directly used -g instead.");
include or exclude files and directories directly used -g \
instead.");
doc!(h, "follow",
"Follow symbolic links.");
doc!(h, "max-count",
@@ -592,6 +594,11 @@ lazy_static! {
only clears the default type definitions that are found inside \
of ripgrep.\n\nNote that this MUST be passed to every \
invocation of ripgrep. Type settings are NOT persisted.");
doc!(h, "search-zip",
"Search in compressed files.",
"Search in compressed files. Currently gz, bz2, xz, and \
lzma files are supported. This option expects the decompression \
binaries to be available in the system PATH.");
h
};
@@ -599,8 +606,9 @@ lazy_static! {
fn validate_line_number_width(s: String) -> Result<(), String> {
if s.starts_with("0") {
Err(String::from("Custom padding characters are currently not supported. \
Please enter only a numeric value."))
Err(String::from(
"Custom padding characters are currently not supported. \
Please enter only a numeric value."))
} else {
validate_number(s)
}

View File

@@ -77,6 +77,7 @@ pub struct Args {
type_list: bool,
types: Types,
with_filename: bool,
search_zip_files: bool
}
impl Args {
@@ -229,6 +230,7 @@ impl Args {
.no_messages(self.no_messages)
.quiet(self.quiet)
.text(self.text)
.search_zip_files(self.search_zip_files)
.build()
}
@@ -365,6 +367,7 @@ impl<'a> ArgMatches<'a> {
type_list: self.is_present("type-list"),
types: self.types()?,
with_filename: with_filename,
search_zip_files: self.is_present("search-zip")
};
if args.mmap {
debug!("will try to use memory maps");

191
src/decompressor.rs Normal file
View File

@@ -0,0 +1,191 @@
use std::collections::HashMap;
use std::ffi::OsStr;
use std::fmt;
use std::io::{self, Read};
use std::path::Path;
use std::process::{self, Stdio};
use globset::{Glob, GlobSet, GlobSetBuilder};
/// A decompression command, contains the command to be spawned as well as any
/// necessary CLI args.
#[derive(Clone, Copy, Debug)]
struct DecompressionCommand {
cmd: &'static str,
args: &'static [&'static str],
}
impl DecompressionCommand {
/// Create a new decompress command
fn new(
cmd: &'static str,
args: &'static [&'static str],
) -> DecompressionCommand {
DecompressionCommand {
cmd, args
}
}
}
impl fmt::Display for DecompressionCommand {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{} {}", self.cmd, self.args.join(" "))
}
}
lazy_static! {
static ref DECOMPRESSION_COMMANDS: HashMap<
&'static str,
DecompressionCommand,
> = {
let mut m = HashMap::new();
const ARGS: &[&str] = &["-d", "-c"];
m.insert("gz", DecompressionCommand::new("gzip", ARGS));
m.insert("bz2", DecompressionCommand::new("bzip2", ARGS));
m.insert("xz", DecompressionCommand::new("xz", ARGS));
const LZMA_ARGS: &[&str] = &["--format=lzma", "-d", "-c"];
m.insert("lzma", DecompressionCommand::new("xz", LZMA_ARGS));
m
};
static ref SUPPORTED_COMPRESSION_FORMATS: GlobSet = {
let mut builder = GlobSetBuilder::new();
builder.add(Glob::new("*.gz").unwrap());
builder.add(Glob::new("*.bz2").unwrap());
builder.add(Glob::new("*.xz").unwrap());
builder.add(Glob::new("*.lzma").unwrap());
builder.build().unwrap()
};
static ref TAR_ARCHIVE_FORMATS: GlobSet = {
let mut builder = GlobSetBuilder::new();
builder.add(Glob::new("*.tar.gz").unwrap());
builder.add(Glob::new("*.tar.xz").unwrap());
builder.add(Glob::new("*.tar.bz2").unwrap());
builder.add(Glob::new("*.tgz").unwrap());
builder.add(Glob::new("*.txz").unwrap());
builder.add(Glob::new("*.tbz2").unwrap());
builder.build().unwrap()
};
}
/// DecompressionReader provides an `io::Read` implementation for a limited
/// set of compression formats.
#[derive(Debug)]
pub struct DecompressionReader {
cmd: DecompressionCommand,
child: process::Child,
done: bool,
}
impl DecompressionReader {
/// Returns a handle to the stdout of the spawned decompression process for
/// `path`, which can be directly searched in the worker. When the returned
/// value is exhausted, the underlying process is reaped. If the underlying
/// process fails, then its stderr is read and converted into a normal
/// io::Error.
///
/// If there is any error in spawning the decompression command, then
/// return `None`, after outputting any necessary debug or error messages.
pub fn from_path(path: &Path) -> Option<DecompressionReader> {
if is_tar_archive(path) {
debug!("{}: skipping tar archive", path.display());
return None;
}
let extension = match path.extension().and_then(OsStr::to_str) {
Some(extension) => extension,
None => {
debug!(
"{}: failed to get compresson extension", path.display());
return None;
}
};
let decompression_cmd = match DECOMPRESSION_COMMANDS.get(extension) {
Some(cmd) => cmd,
None => {
debug!(
"{}: failed to get decompression command", path.display());
return None;
}
};
let cmd = process::Command::new(decompression_cmd.cmd)
.args(decompression_cmd.args)
.arg(path)
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn();
let child = match cmd {
Ok(process) => process,
Err(_) => {
debug!(
"{}: decompression command '{}' not found",
path.display(), decompression_cmd.cmd);
return None;
}
};
Some(DecompressionReader::new(*decompression_cmd, child))
}
fn new(
cmd: DecompressionCommand,
child: process::Child,
) -> DecompressionReader {
DecompressionReader {
cmd: cmd,
child: child,
done: false,
}
}
fn read_error(&mut self) -> io::Result<io::Error> {
let mut errbytes = vec![];
self.child.stderr.as_mut().unwrap().read_to_end(&mut errbytes)?;
let errstr = String::from_utf8_lossy(&errbytes);
let errstr = errstr.trim();
Ok(if errstr.is_empty() {
let msg = format!("decompression command failed: '{}'", self.cmd);
io::Error::new(io::ErrorKind::Other, msg)
} else {
let msg = format!(
"decompression command '{}' failed: {}", self.cmd, errstr);
io::Error::new(io::ErrorKind::Other, msg)
})
}
}
impl io::Read for DecompressionReader {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
if self.done {
return Ok(0);
}
let nread = self.child.stdout.as_mut().unwrap().read(buf)?;
if nread == 0 {
self.done = true;
// Reap the child now that we're done reading.
// If the command failed, report stderr as an error.
if !self.child.wait()?.success() {
return Err(self.read_error()?);
}
}
Ok(nread)
}
}
/// Returns true if the given path contains a supported compression format or
/// is a TAR archive.
pub fn is_compressed(path: &Path) -> bool {
is_supported_compression_format(path) || is_tar_archive(path)
}
/// Returns true if the given path matches any one of the supported compression
/// formats
fn is_supported_compression_format(path: &Path) -> bool {
SUPPORTED_COMPRESSION_FORMATS.is_match(path)
}
/// Returns true if the given path matches any of the known TAR file formats.
fn is_tar_archive(path: &Path) -> bool {
TAR_ARCHIVE_FORMATS.is_match(path)
}

View File

@@ -4,6 +4,7 @@ extern crate bytecount;
extern crate clap;
extern crate encoding_rs;
extern crate env_logger;
extern crate globset;
extern crate grep;
extern crate ignore;
#[macro_use]
@@ -44,6 +45,7 @@ macro_rules! eprintln {
mod app;
mod args;
mod decoder;
mod decompressor;
mod pathutil;
mod printer;
mod search_buffer;

View File

@@ -9,6 +9,7 @@ use memmap::Mmap;
use termcolor::WriteColor;
use decoder::DecodeReader;
use decompressor::{self, DecompressionReader};
use pathutil::strip_prefix;
use printer::Printer;
use search_buffer::BufferSearcher;
@@ -42,6 +43,7 @@ struct Options {
no_messages: bool,
quiet: bool,
text: bool,
search_zip_files: bool
}
impl Default for Options {
@@ -61,6 +63,7 @@ impl Default for Options {
no_messages: false,
quiet: false,
text: false,
search_zip_files: false,
}
}
}
@@ -190,6 +193,12 @@ impl WorkerBuilder {
self.opts.text = yes;
self
}
/// If enabled, search through compressed files as well
pub fn search_zip_files(mut self, yes: bool) -> Self {
self.opts.search_zip_files = yes;
self
}
}
/// Worker is responsible for executing searches on file paths, while choosing
@@ -218,22 +227,33 @@ impl Worker {
}
Work::DirEntry(dent) => {
let mut path = dent.path();
let file = match File::open(path) {
Ok(file) => file,
Err(err) => {
if !self.opts.no_messages {
eprintln!("{}: {}", path.display(), err);
if self.opts.search_zip_files
&& decompressor::is_compressed(path)
{
match DecompressionReader::from_path(path) {
Some(reader) => self.search(printer, path, reader),
None => {
return 0;
}
return 0;
}
};
if let Some(p) = strip_prefix("./", path) {
path = p;
}
if self.opts.mmap {
self.search_mmap(printer, path, &file)
} else {
self.search(printer, path, file)
let file = match File::open(path) {
Ok(file) => file,
Err(err) => {
if !self.opts.no_messages {
eprintln!("{}: {}", path.display(), err);
}
return 0;
}
};
if let Some(p) = strip_prefix("./", path) {
path = p;
}
if self.opts.mmap {
self.search_mmap(printer, path, &file)
} else {
self.search(printer, path, file)
}
}
}
};