ripgrep: add --pre flag

The --pre flag accepts a command and executes it once for every input
file that is searched. Rather than searching the file directly, ripgrep
searches whatever the command writes to its stdout.

Closes #978, Closes #981
This commit is contained in:
Charles Blake 2018-07-13 09:54:51 -04:00 committed by Andrew Gallant
parent 1d09d4d31b
commit 231456c409
9 changed files with 211 additions and 5 deletions

View File

@ -50,6 +50,8 @@ Feature enhancements:
* [FEATURE #967](https://github.com/BurntSushi/ripgrep/issues/967):
Rename `--maxdepth` to `--max-depth` for consistency. We retain `--maxdepth`
as a synonym for backwards compatibility.
* [FEATURE #978](https://github.com/BurntSushi/ripgrep/issues/978):
Add a `--pre` option to filter inputs with an arbitrary program.
* [FEATURE fca9709d](https://github.com/BurntSushi/ripgrep/commit/fca9709d):
Improve zsh completion.

View File

@ -107,6 +107,9 @@ increases the times to `2.640s` for ripgrep and `10.277s` for GNU grep.
specifically specified with the `-E/--encoding` flag.)
* ripgrep supports searching files compressed in a common format (gzip, xz,
lzma, bzip2 or lz4) with the `-z/--search-zip` flag.
* ripgrep supports arbitrary input preprocessing filters (via the `--pre`
flag), which can be used for PDF text extraction, decompression of less
common formats, decryption, automatic encoding detection and so on.
In other words, use ripgrep if you like speed, filtering by default, fewer
bugs, and Unicode support.

View File

@ -170,7 +170,8 @@ _rg() {
{-w,--word-regexp}'[only show matches surrounded by word boundaries]'
{-x,--line-regexp}'[only show matches surrounded by line boundaries]'
+ '(zip)' # Compressed-file options
+ '(input-decoding)' # Input decoding options
'--pre=[specify preprocessor utility]:preprocessor utility:_command_names -e'
{-z,--search-zip}'[search in compressed files]'
$no"--no-search-zip[don't search in compressed files]"

View File

@ -534,6 +534,7 @@ pub fn all_args_and_flags() -> Vec<RGArg> {
flag_only_matching(&mut args);
flag_path_separator(&mut args);
flag_passthru(&mut args);
flag_pre(&mut args);
flag_pretty(&mut args);
flag_quiet(&mut args);
flag_regex_size_limit(&mut args);
@ -1453,12 +1454,62 @@ This flag can be disabled with --no-search-zip.
");
let arg = RGArg::switch("search-zip").short("z")
.help(SHORT).long_help(LONG)
.overrides("no-search-zip");
.overrides("no-search-zip")
.overrides("pre");
args.push(arg);
let arg = RGArg::switch("no-search-zip")
.hidden()
.overrides("search-zip");
.overrides("search-zip")
.overrides("pre");
args.push(arg);
}
/// Registers the `pre` flag, which takes a single `COMMAND` value.
///
/// The short and long help texts are defined inline below. The flag is
/// declared to override both `search-zip` and `no-search-zip`, and the
/// finished argument is appended to `args`.
fn flag_pre(args: &mut Vec<RGArg>) {
const SHORT: &str = "search outputs of COMMAND FILE for each FILE";
const LONG: &str = long!("\
For each input FILE, search the standard output of COMMAND FILE rather than the
contents of FILE. This option expects the COMMAND program to either be an
absolute path or to be available in your PATH. An empty string COMMAND
deactivates this feature.
A preprocessor is not run when ripgrep is searching stdin.
When searching over sets of files that may require one of several decoders
as preprocessors, COMMAND should be a wrapper program or script which first
classifies FILE based on magic numbers/content or based on the FILE name and
then dispatches to an appropriate preprocessor. Each COMMAND also has its
standard input connected to FILE for convenience.
For example, a shell script for COMMAND might look like:
case \"$1\" in
*.pdf)
exec pdftotext \"$1\" -
;;
*)
case $(file \"$1\") in
*Zstandard*)
exec pzstd -cdq
;;
*)
exec cat
;;
esac
;;
esac
The above script uses `pdftotext` to convert a PDF file to plain text. For
all other files, the script uses the `file` utility to sniff the type of the
file based on its contents. If it is a compressed file in the Zstandard format,
then `pzstd` is used to decompress the contents to stdout.
This overrides the -z/--search-zip flag.
");
// `pre` and the zip flags are mutually overriding: the zip flags declare
// `.overrides("pre")` in their own definitions.
let arg = RGArg::flag("pre", "COMMAND")
.help(SHORT).long_help(LONG)
.overrides("search-zip")
.overrides("no-search-zip");
args.push(arg);
}

View File

@ -80,6 +80,7 @@ pub struct Args {
types: Types,
with_filename: bool,
search_zip_files: bool,
preprocessor: Option<PathBuf>,
stats: bool
}
@ -288,6 +289,7 @@ impl Args {
.quiet(self.quiet)
.text(self.text)
.search_zip_files(self.search_zip_files)
.preprocessor(self.preprocessor.clone())
.build()
}
@ -429,6 +431,7 @@ impl<'a> ArgMatches<'a> {
types: self.types()?,
with_filename: with_filename,
search_zip_files: self.is_present("search-zip"),
preprocessor: self.preprocessor(),
stats: self.stats()
};
if args.mmap {
@ -722,6 +725,19 @@ impl<'a> ArgMatches<'a> {
}
}
/// Returns the preprocessor command supplied via the `pre` flag, if any.
///
/// Both a missing flag and an empty value produce `None`; the empty value is
/// how a user switches preprocessing back off.
fn preprocessor(&self) -> Option<PathBuf> {
    match self.value_of_os("pre") {
        Some(cmd) if !cmd.is_empty() => Some(Path::new(cmd).to_path_buf()),
        _ => None,
    }
}
/// Returns the unescaped path separator in UTF-8 bytes.
fn path_separator(&self) -> Result<Option<u8>> {
match self.value_of_lossy("path-separator") {

View File

@ -43,6 +43,7 @@ mod args;
mod config;
mod decoder;
mod decompressor;
mod preprocessor;
mod logger;
mod pathutil;
mod printer;

92
src/preprocessor.rs Normal file
View File

@ -0,0 +1,92 @@
use std::fs::File;
use std::io::{self, Read};
use std::path::{Path, PathBuf};
use std::process::{self, Stdio};
use std::thread;
use Result;
/// PreprocessorReader provides an `io::Read` impl to read kids output.
#[derive(Debug)]
pub struct PreprocessorReader {
cmd: PathBuf,
path: PathBuf,
child: process::Child,
done: bool,
}
impl PreprocessorReader {
/// Returns a handle to the stdout of the spawned preprocessor process for
/// `path`, which can be directly searched in the worker. When the returned
/// value is exhausted, the underlying process is reaped. If the underlying
/// process fails, then its stderr is read and converted into a normal
/// io::Error.
///
/// If there is any error in spawning the preprocessor command, then
/// return the corresponding error.
pub fn from_cmd_path(
cmd: PathBuf,
path: &Path,
) -> Result<PreprocessorReader> {
let child = process::Command::new(&cmd)
.arg(path)
.stdin(Stdio::from(File::open(path)?))
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.map_err(|err| {
format!(
"error running preprocessor command '{}': {}",
cmd.display(),
err,
)
})?;
Ok(PreprocessorReader {
cmd: cmd,
path: path.to_path_buf(),
child: child,
done: false,
})
}
fn read_error(&mut self) -> io::Result<io::Error> {
let mut errbytes = vec![];
self.child.stderr.as_mut().unwrap().read_to_end(&mut errbytes)?;
let errstr = String::from_utf8_lossy(&errbytes);
let errstr = errstr.trim();
Ok(if errstr.is_empty() {
let msg = format!(
"preprocessor command failed: '{} {}'",
self.cmd.display(),
self.path.display(),
);
io::Error::new(io::ErrorKind::Other, msg)
} else {
let msg = format!(
"preprocessor command failed: '{} {}': {}",
self.cmd.display(),
self.path.display(),
errstr,
);
io::Error::new(io::ErrorKind::Other, msg)
})
}
}
impl io::Read for PreprocessorReader {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
if self.done {
return Ok(0);
}
let nread = self.child.stdout.as_mut().unwrap().read(buf)?;
if nread == 0 {
self.done = true;
// Reap the child now that we're done reading.
// If the command failed, report stderr as an error.
if !self.child.wait()?.success() {
return Err(self.read_error()?);
}
}
Ok(nread)
}
}

View File

@ -1,6 +1,6 @@
use std::fs::File;
use std::io;
use std::path::Path;
use std::path::{Path, PathBuf};
use encoding_rs::Encoding;
use grep::Grep;
@ -10,6 +10,7 @@ use termcolor::WriteColor;
use decoder::DecodeReader;
use decompressor::{self, DecompressionReader};
use preprocessor::PreprocessorReader;
use pathutil::strip_prefix;
use printer::Printer;
use search_buffer::BufferSearcher;
@ -45,6 +46,7 @@ struct Options {
no_messages: bool,
quiet: bool,
text: bool,
preprocessor: Option<PathBuf>,
search_zip_files: bool
}
@ -68,6 +70,7 @@ impl Default for Options {
quiet: false,
text: false,
search_zip_files: false,
preprocessor: None,
}
}
}
@ -222,6 +225,12 @@ impl WorkerBuilder {
self.opts.search_zip_files = yes;
self
}
/// Sets the preprocessor command stored in this builder's options.
///
/// `Some(command)` records the program to use as a preprocessor; `None`
/// records that there is none. Returns the builder so calls can be chained.
pub fn preprocessor(mut self, command: Option<PathBuf>) -> Self {
self.opts.preprocessor = command;
self
}
}
/// Worker is responsible for executing searches on file paths, while choosing
@ -250,7 +259,18 @@ impl Worker {
}
Work::DirEntry(dent) => {
let mut path = dent.path();
if self.opts.search_zip_files
if self.opts.preprocessor.is_some() {
let cmd = self.opts.preprocessor.clone().unwrap();
match PreprocessorReader::from_cmd_path(cmd, path) {
Ok(reader) => self.search(printer, path, reader),
Err(err) => {
if !self.opts.no_messages {
eprintln!("{}", err);
}
return 0;
}
}
} else if self.opts.search_zip_files
&& decompressor::is_compressed(path)
{
match DecompressionReader::from_path(path) {

View File

@ -1732,6 +1732,26 @@ sherlock!(feature_419_zero_as_shortcut_for_null, "Sherlock", ".",
assert_eq!(lines, "sherlock\x002\n");
});
/// Searching an xz-compressed file with `--pre xzcat` must match against the
/// decompressed text.
#[test]
fn preprocessing() {
    // Nothing to verify on machines without `xzcat`.
    if !cmd_exists("xzcat") {
        return;
    }

    let wd = WorkDir::new("feature_preprocessing");
    let compressed = include_bytes!("./data/sherlock.xz");
    wd.create_bytes("sherlock.xz", compressed);

    let mut cmd = wd.command();
    cmd.arg("--pre");
    cmd.arg("xzcat");
    cmd.arg("Sherlock");
    cmd.arg("sherlock.xz");
    let stdout: String = wd.stdout(&mut cmd);

    let want = "\
For the Doctor Watsons of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Sherlock Holmes
";
    assert_eq!(stdout, want);
}
#[test]
fn compressed_gzip() {
if !cmd_exists("gzip") {