mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-05-19 09:40:22 -07:00
ripgrep: add --pre flag
The preprocessor flag accepts a command program and executes this program for every input file that is searched. Instead of searching the file directly, ripgrep will instead search the stdout contents of the program. Closes #978, Closes #981
This commit is contained in:
parent
1d09d4d31b
commit
231456c409
@ -50,6 +50,8 @@ Feature enhancements:
|
||||
* [FEATURE #967](https://github.com/BurntSushi/ripgrep/issues/967):
|
||||
Rename `--maxdepth` to `--max-depth` for consistency. We retain `--maxdepth`
|
||||
as a synonym for backwards compatibility.
|
||||
* [FEATURE #978](https://github.com/BurntSushi/ripgrep/issues/978):
|
||||
Add a `--pre` option to filter inputs with an arbitrary program.
|
||||
* [FEATURE fca9709d](https://github.com/BurntSushi/ripgrep/commit/fca9709d):
|
||||
Improve zsh completion.
|
||||
|
||||
|
@ -107,6 +107,9 @@ increases the times to `2.640s` for ripgrep and `10.277s` for GNU grep.
|
||||
specifically specified with the `-E/--encoding` flag.)
|
||||
* ripgrep supports searching files compressed in a common format (gzip, xz,
|
||||
lzma, bzip2 or lz4) with the `-z/--search-zip` flag.
|
||||
* ripgrep supports arbitrary input preprocessing filters which could be PDF
|
||||
text extraction, less commonly supported decompression, decrypting, automatic encoding
|
||||
detection and so on.
|
||||
|
||||
In other words, use ripgrep if you like speed, filtering by default, fewer
|
||||
bugs, and Unicode support.
|
||||
|
@ -170,7 +170,8 @@ _rg() {
|
||||
{-w,--word-regexp}'[only show matches surrounded by word boundaries]'
|
||||
{-x,--line-regexp}'[only show matches surrounded by line boundaries]'
|
||||
|
||||
+ '(zip)' # Compressed-file options
|
||||
+ '(input-decoding)' # Input decoding options
|
||||
'--pre=[specify preprocessor utility]:preprocessor utility:_command_names -e'
|
||||
{-z,--search-zip}'[search in compressed files]'
|
||||
$no"--no-search-zip[don't search in compressed files]"
|
||||
|
||||
|
55
src/app.rs
55
src/app.rs
@ -534,6 +534,7 @@ pub fn all_args_and_flags() -> Vec<RGArg> {
|
||||
flag_only_matching(&mut args);
|
||||
flag_path_separator(&mut args);
|
||||
flag_passthru(&mut args);
|
||||
flag_pre(&mut args);
|
||||
flag_pretty(&mut args);
|
||||
flag_quiet(&mut args);
|
||||
flag_regex_size_limit(&mut args);
|
||||
@ -1453,12 +1454,62 @@ This flag can be disabled with --no-search-zip.
|
||||
");
|
||||
let arg = RGArg::switch("search-zip").short("z")
|
||||
.help(SHORT).long_help(LONG)
|
||||
.overrides("no-search-zip");
|
||||
.overrides("no-search-zip")
|
||||
.overrides("pre");
|
||||
args.push(arg);
|
||||
|
||||
let arg = RGArg::switch("no-search-zip")
|
||||
.hidden()
|
||||
.overrides("search-zip");
|
||||
.overrides("search-zip")
|
||||
.overrides("pre");
|
||||
args.push(arg);
|
||||
}
|
||||
|
||||
fn flag_pre(args: &mut Vec<RGArg>) {
|
||||
const SHORT: &str = "search outputs of COMMAND FILE for each FILE";
|
||||
const LONG: &str = long!("\
|
||||
For each input FILE, search the standard output of COMMAND FILE rather than the
|
||||
contents of FILE. This option expects the COMMAND program to either be an
|
||||
absolute path or to be available in your PATH. An empty string COMMAND
|
||||
deactivates this feature.
|
||||
|
||||
A preprocessor is not run when ripgrep is searching stdin.
|
||||
|
||||
When searching over sets of files that may require one of several decoders
|
||||
as preprocessors, COMMAND should be a wrapper program or script which first
|
||||
classifies FILE based on magic numbers/content or based on the FILE name and
|
||||
then dispatches to an appropriate preprocessor. Each COMMAND also has its
|
||||
standard input connected to FILE for convenience.
|
||||
|
||||
For example, a shell script for COMMAND might look like:
|
||||
|
||||
case \"$1\" in
|
||||
*.pdf)
|
||||
exec pdftotext \"$1\" -
|
||||
;;
|
||||
*)
|
||||
case $(file \"$1\") in
|
||||
*Zstandard*)
|
||||
exec pzstd -cdq
|
||||
;;
|
||||
*)
|
||||
exec cat
|
||||
;;
|
||||
esac
|
||||
;;
|
||||
esac
|
||||
|
||||
The above script uses `pdftotext` to convert a PDF file to plain text. For
|
||||
all other files, the script uses the `file` utility to sniff the type of the
|
||||
file based on its contents. If it is a compressed file in the Zstandard format,
|
||||
then `pzstd` is used to decompress the contents to stdout.
|
||||
|
||||
This overrides the -z/--search-zip flag.
|
||||
");
|
||||
let arg = RGArg::flag("pre", "COMMAND")
|
||||
.help(SHORT).long_help(LONG)
|
||||
.overrides("search-zip")
|
||||
.overrides("no-search-zip");
|
||||
args.push(arg);
|
||||
}
|
||||
|
||||
|
16
src/args.rs
16
src/args.rs
@ -80,6 +80,7 @@ pub struct Args {
|
||||
types: Types,
|
||||
with_filename: bool,
|
||||
search_zip_files: bool,
|
||||
preprocessor: Option<PathBuf>,
|
||||
stats: bool
|
||||
}
|
||||
|
||||
@ -288,6 +289,7 @@ impl Args {
|
||||
.quiet(self.quiet)
|
||||
.text(self.text)
|
||||
.search_zip_files(self.search_zip_files)
|
||||
.preprocessor(self.preprocessor.clone())
|
||||
.build()
|
||||
}
|
||||
|
||||
@ -429,6 +431,7 @@ impl<'a> ArgMatches<'a> {
|
||||
types: self.types()?,
|
||||
with_filename: with_filename,
|
||||
search_zip_files: self.is_present("search-zip"),
|
||||
preprocessor: self.preprocessor(),
|
||||
stats: self.stats()
|
||||
};
|
||||
if args.mmap {
|
||||
@ -722,6 +725,19 @@ impl<'a> ArgMatches<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the preprocessor command
|
||||
fn preprocessor(&self) -> Option<PathBuf> {
|
||||
if let Some(path) = self.value_of_os("pre") {
|
||||
if path.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(Path::new(path).to_path_buf())
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the unescaped path separator in UTF-8 bytes.
|
||||
fn path_separator(&self) -> Result<Option<u8>> {
|
||||
match self.value_of_lossy("path-separator") {
|
||||
|
@ -43,6 +43,7 @@ mod args;
|
||||
mod config;
|
||||
mod decoder;
|
||||
mod decompressor;
|
||||
mod preprocessor;
|
||||
mod logger;
|
||||
mod pathutil;
|
||||
mod printer;
|
||||
|
92
src/preprocessor.rs
Normal file
92
src/preprocessor.rs
Normal file
@ -0,0 +1,92 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, Read};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{self, Stdio};
|
||||
|
||||
use Result;
|
||||
|
||||
/// PreprocessorReader provides an `io::Read` impl to read kids output.
|
||||
#[derive(Debug)]
|
||||
pub struct PreprocessorReader {
|
||||
cmd: PathBuf,
|
||||
path: PathBuf,
|
||||
child: process::Child,
|
||||
done: bool,
|
||||
}
|
||||
|
||||
impl PreprocessorReader {
|
||||
/// Returns a handle to the stdout of the spawned preprocessor process for
|
||||
/// `path`, which can be directly searched in the worker. When the returned
|
||||
/// value is exhausted, the underlying process is reaped. If the underlying
|
||||
/// process fails, then its stderr is read and converted into a normal
|
||||
/// io::Error.
|
||||
///
|
||||
/// If there is any error in spawning the preprocessor command, then
|
||||
/// return the corresponding error.
|
||||
pub fn from_cmd_path(
|
||||
cmd: PathBuf,
|
||||
path: &Path,
|
||||
) -> Result<PreprocessorReader> {
|
||||
let child = process::Command::new(&cmd)
|
||||
.arg(path)
|
||||
.stdin(Stdio::from(File::open(path)?))
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.spawn()
|
||||
.map_err(|err| {
|
||||
format!(
|
||||
"error running preprocessor command '{}': {}",
|
||||
cmd.display(),
|
||||
err,
|
||||
)
|
||||
})?;
|
||||
Ok(PreprocessorReader {
|
||||
cmd: cmd,
|
||||
path: path.to_path_buf(),
|
||||
child: child,
|
||||
done: false,
|
||||
})
|
||||
}
|
||||
|
||||
fn read_error(&mut self) -> io::Result<io::Error> {
|
||||
let mut errbytes = vec![];
|
||||
self.child.stderr.as_mut().unwrap().read_to_end(&mut errbytes)?;
|
||||
let errstr = String::from_utf8_lossy(&errbytes);
|
||||
let errstr = errstr.trim();
|
||||
|
||||
Ok(if errstr.is_empty() {
|
||||
let msg = format!(
|
||||
"preprocessor command failed: '{} {}'",
|
||||
self.cmd.display(),
|
||||
self.path.display(),
|
||||
);
|
||||
io::Error::new(io::ErrorKind::Other, msg)
|
||||
} else {
|
||||
let msg = format!(
|
||||
"preprocessor command failed: '{} {}': {}",
|
||||
self.cmd.display(),
|
||||
self.path.display(),
|
||||
errstr,
|
||||
);
|
||||
io::Error::new(io::ErrorKind::Other, msg)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl io::Read for PreprocessorReader {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
if self.done {
|
||||
return Ok(0);
|
||||
}
|
||||
let nread = self.child.stdout.as_mut().unwrap().read(buf)?;
|
||||
if nread == 0 {
|
||||
self.done = true;
|
||||
// Reap the child now that we're done reading.
|
||||
// If the command failed, report stderr as an error.
|
||||
if !self.child.wait()?.success() {
|
||||
return Err(self.read_error()?);
|
||||
}
|
||||
}
|
||||
Ok(nread)
|
||||
}
|
||||
}
|
@ -1,6 +1,6 @@
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use encoding_rs::Encoding;
|
||||
use grep::Grep;
|
||||
@ -10,6 +10,7 @@ use termcolor::WriteColor;
|
||||
|
||||
use decoder::DecodeReader;
|
||||
use decompressor::{self, DecompressionReader};
|
||||
use preprocessor::PreprocessorReader;
|
||||
use pathutil::strip_prefix;
|
||||
use printer::Printer;
|
||||
use search_buffer::BufferSearcher;
|
||||
@ -45,6 +46,7 @@ struct Options {
|
||||
no_messages: bool,
|
||||
quiet: bool,
|
||||
text: bool,
|
||||
preprocessor: Option<PathBuf>,
|
||||
search_zip_files: bool
|
||||
}
|
||||
|
||||
@ -68,6 +70,7 @@ impl Default for Options {
|
||||
quiet: false,
|
||||
text: false,
|
||||
search_zip_files: false,
|
||||
preprocessor: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -222,6 +225,12 @@ impl WorkerBuilder {
|
||||
self.opts.search_zip_files = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// If non-empty, search output of preprocessor run on each file
|
||||
pub fn preprocessor(mut self, command: Option<PathBuf>) -> Self {
|
||||
self.opts.preprocessor = command;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Worker is responsible for executing searches on file paths, while choosing
|
||||
@ -250,7 +259,18 @@ impl Worker {
|
||||
}
|
||||
Work::DirEntry(dent) => {
|
||||
let mut path = dent.path();
|
||||
if self.opts.search_zip_files
|
||||
if self.opts.preprocessor.is_some() {
|
||||
let cmd = self.opts.preprocessor.clone().unwrap();
|
||||
match PreprocessorReader::from_cmd_path(cmd, path) {
|
||||
Ok(reader) => self.search(printer, path, reader),
|
||||
Err(err) => {
|
||||
if !self.opts.no_messages {
|
||||
eprintln!("{}", err);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
} else if self.opts.search_zip_files
|
||||
&& decompressor::is_compressed(path)
|
||||
{
|
||||
match DecompressionReader::from_path(path) {
|
||||
|
@ -1732,6 +1732,26 @@ sherlock!(feature_419_zero_as_shortcut_for_null, "Sherlock", ".",
|
||||
assert_eq!(lines, "sherlock\x002\n");
|
||||
});
|
||||
|
||||
/// Checks that `--pre xzcat` searches the decompressed contents of an xz
/// fixture rather than the raw file.
#[test]
fn preprocessing() {
    // Nothing to verify on machines without xzcat.
    if !cmd_exists("xzcat") {
        return;
    }

    let workdir = WorkDir::new("feature_preprocessing");
    workdir.create_bytes("sherlock.xz", include_bytes!("./data/sherlock.xz"));

    let mut rg = workdir.command();
    rg.arg("--pre");
    rg.arg("xzcat");
    rg.arg("Sherlock");
    rg.arg("sherlock.xz");

    let got: String = workdir.stdout(&mut rg);
    assert_eq!(got, "\
For the Doctor Watsons of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Sherlock Holmes
");
}
|
||||
|
||||
#[test]
|
||||
fn compressed_gzip() {
|
||||
if !cmd_exists("gzip") {
|
||||
|
Loading…
x
Reference in New Issue
Block a user