mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-05-19 09:40:22 -07:00
ripgrep: add --pre flag
The preprocessor flag accepts a command and executes it for every input file that is searched. Rather than searching the file directly, ripgrep searches the stdout contents of the program. Closes #978, Closes #981
This commit is contained in:
parent
1d09d4d31b
commit
231456c409
@ -50,6 +50,8 @@ Feature enhancements:
|
|||||||
* [FEATURE #967](https://github.com/BurntSushi/ripgrep/issues/967):
|
* [FEATURE #967](https://github.com/BurntSushi/ripgrep/issues/967):
|
||||||
Rename `--maxdepth` to `--max-depth` for consistency. We retain `--maxdepth`
|
Rename `--maxdepth` to `--max-depth` for consistency. We retain `--maxdepth`
|
||||||
as a synonym for backwards compatibility.
|
as a synonym for backwards compatibility.
|
||||||
|
* [FEATURE #978](https://github.com/BurntSushi/ripgrep/issues/978):
|
||||||
|
Add a `--pre` option to filter inputs with an arbitrary program.
|
||||||
* [FEATURE fca9709d](https://github.com/BurntSushi/ripgrep/commit/fca9709d):
|
* [FEATURE fca9709d](https://github.com/BurntSushi/ripgrep/commit/fca9709d):
|
||||||
Improve zsh completion.
|
Improve zsh completion.
|
||||||
|
|
||||||
|
@ -107,6 +107,9 @@ increases the times to `2.640s` for ripgrep and `10.277s` for GNU grep.
|
|||||||
specifically specified with the `-E/--encoding` flag.)
|
specifically specified with the `-E/--encoding` flag.)
|
||||||
* ripgrep supports searching files compressed in a common format (gzip, xz,
|
* ripgrep supports searching files compressed in a common format (gzip, xz,
|
||||||
lzma, bzip2 or lz4) with the `-z/--search-zip` flag.
|
lzma, bzip2 or lz4) with the `-z/--search-zip` flag.
|
||||||
|
* ripgrep supports arbitrary input preprocessing filters which could be PDF
|
||||||
|
text extraction, less commonly supported decompression formats, decryption, automatic encoding
|
||||||
|
detection and so on.
|
||||||
|
|
||||||
In other words, use ripgrep if you like speed, filtering by default, fewer
|
In other words, use ripgrep if you like speed, filtering by default, fewer
|
||||||
bugs, and Unicode support.
|
bugs, and Unicode support.
|
||||||
|
@ -170,7 +170,8 @@ _rg() {
|
|||||||
{-w,--word-regexp}'[only show matches surrounded by word boundaries]'
|
{-w,--word-regexp}'[only show matches surrounded by word boundaries]'
|
||||||
{-x,--line-regexp}'[only show matches surrounded by line boundaries]'
|
{-x,--line-regexp}'[only show matches surrounded by line boundaries]'
|
||||||
|
|
||||||
+ '(zip)' # Compressed-file options
|
+ '(input-decoding)' # Input decoding options
|
||||||
|
'--pre=[specify preprocessor utility]:preprocessor utility:_command_names -e'
|
||||||
{-z,--search-zip}'[search in compressed files]'
|
{-z,--search-zip}'[search in compressed files]'
|
||||||
$no"--no-search-zip[don't search in compressed files]"
|
$no"--no-search-zip[don't search in compressed files]"
|
||||||
|
|
||||||
|
55
src/app.rs
55
src/app.rs
@ -534,6 +534,7 @@ pub fn all_args_and_flags() -> Vec<RGArg> {
|
|||||||
flag_only_matching(&mut args);
|
flag_only_matching(&mut args);
|
||||||
flag_path_separator(&mut args);
|
flag_path_separator(&mut args);
|
||||||
flag_passthru(&mut args);
|
flag_passthru(&mut args);
|
||||||
|
flag_pre(&mut args);
|
||||||
flag_pretty(&mut args);
|
flag_pretty(&mut args);
|
||||||
flag_quiet(&mut args);
|
flag_quiet(&mut args);
|
||||||
flag_regex_size_limit(&mut args);
|
flag_regex_size_limit(&mut args);
|
||||||
@ -1453,12 +1454,62 @@ This flag can be disabled with --no-search-zip.
|
|||||||
");
|
");
|
||||||
let arg = RGArg::switch("search-zip").short("z")
|
let arg = RGArg::switch("search-zip").short("z")
|
||||||
.help(SHORT).long_help(LONG)
|
.help(SHORT).long_help(LONG)
|
||||||
.overrides("no-search-zip");
|
.overrides("no-search-zip")
|
||||||
|
.overrides("pre");
|
||||||
args.push(arg);
|
args.push(arg);
|
||||||
|
|
||||||
let arg = RGArg::switch("no-search-zip")
|
let arg = RGArg::switch("no-search-zip")
|
||||||
.hidden()
|
.hidden()
|
||||||
.overrides("search-zip");
|
.overrides("search-zip")
|
||||||
|
.overrides("pre");
|
||||||
|
args.push(arg);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn flag_pre(args: &mut Vec<RGArg>) {
|
||||||
|
const SHORT: &str = "search outputs of COMMAND FILE for each FILE";
|
||||||
|
const LONG: &str = long!("\
|
||||||
|
For each input FILE, search the standard output of COMMAND FILE rather than the
|
||||||
|
contents of FILE. This option expects the COMMAND program to either be an
|
||||||
|
absolute path or to be available in your PATH. An empty string COMMAND
|
||||||
|
deactivates this feature.
|
||||||
|
|
||||||
|
A preprocessor is not run when ripgrep is searching stdin.
|
||||||
|
|
||||||
|
When searching over sets of files that may require one of several decoders
|
||||||
|
as preprocessors, COMMAND should be a wrapper program or script which first
|
||||||
|
classifies FILE based on magic numbers/content or based on the FILE name and
|
||||||
|
then dispatches to an appropriate preprocessor. Each COMMAND also has its
|
||||||
|
standard input connected to FILE for convenience.
|
||||||
|
|
||||||
|
For example, a shell script for COMMAND might look like:
|
||||||
|
|
||||||
|
case \"$1\" in
|
||||||
|
*.pdf)
|
||||||
|
exec pdftotext \"$1\" -
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
case $(file \"$1\") in
|
||||||
|
*Zstandard*)
|
||||||
|
exec pzstd -cdq
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
exec cat
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
The above script uses `pdftotext` to convert a PDF file to plain text. For
|
||||||
|
all other files, the script uses the `file` utility to sniff the type of the
|
||||||
|
file based on its contents. If it is a compressed file in the Zstandard format,
|
||||||
|
then `pzstd` is used to decompress the contents to stdout.
|
||||||
|
|
||||||
|
This overrides the -z/--search-zip flag.
|
||||||
|
");
|
||||||
|
let arg = RGArg::flag("pre", "COMMAND")
|
||||||
|
.help(SHORT).long_help(LONG)
|
||||||
|
.overrides("search-zip")
|
||||||
|
.overrides("no-search-zip");
|
||||||
args.push(arg);
|
args.push(arg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
16
src/args.rs
16
src/args.rs
@ -80,6 +80,7 @@ pub struct Args {
|
|||||||
types: Types,
|
types: Types,
|
||||||
with_filename: bool,
|
with_filename: bool,
|
||||||
search_zip_files: bool,
|
search_zip_files: bool,
|
||||||
|
preprocessor: Option<PathBuf>,
|
||||||
stats: bool
|
stats: bool
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -288,6 +289,7 @@ impl Args {
|
|||||||
.quiet(self.quiet)
|
.quiet(self.quiet)
|
||||||
.text(self.text)
|
.text(self.text)
|
||||||
.search_zip_files(self.search_zip_files)
|
.search_zip_files(self.search_zip_files)
|
||||||
|
.preprocessor(self.preprocessor.clone())
|
||||||
.build()
|
.build()
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -429,6 +431,7 @@ impl<'a> ArgMatches<'a> {
|
|||||||
types: self.types()?,
|
types: self.types()?,
|
||||||
with_filename: with_filename,
|
with_filename: with_filename,
|
||||||
search_zip_files: self.is_present("search-zip"),
|
search_zip_files: self.is_present("search-zip"),
|
||||||
|
preprocessor: self.preprocessor(),
|
||||||
stats: self.stats()
|
stats: self.stats()
|
||||||
};
|
};
|
||||||
if args.mmap {
|
if args.mmap {
|
||||||
@ -722,6 +725,19 @@ impl<'a> ArgMatches<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the preprocessor command
|
||||||
|
fn preprocessor(&self) -> Option<PathBuf> {
|
||||||
|
if let Some(path) = self.value_of_os("pre") {
|
||||||
|
if path.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(Path::new(path).to_path_buf())
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns the unescaped path separator in UTF-8 bytes.
|
/// Returns the unescaped path separator in UTF-8 bytes.
|
||||||
fn path_separator(&self) -> Result<Option<u8>> {
|
fn path_separator(&self) -> Result<Option<u8>> {
|
||||||
match self.value_of_lossy("path-separator") {
|
match self.value_of_lossy("path-separator") {
|
||||||
|
@ -43,6 +43,7 @@ mod args;
|
|||||||
mod config;
|
mod config;
|
||||||
mod decoder;
|
mod decoder;
|
||||||
mod decompressor;
|
mod decompressor;
|
||||||
|
mod preprocessor;
|
||||||
mod logger;
|
mod logger;
|
||||||
mod pathutil;
|
mod pathutil;
|
||||||
mod printer;
|
mod printer;
|
||||||
|
92
src/preprocessor.rs
Normal file
92
src/preprocessor.rs
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
use std::fs::File;
|
||||||
|
use std::io::{self, Read};
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::process::{self, Stdio};
|
||||||
|
|
||||||
|
use Result;
|
||||||
|
|
||||||
|
/// PreprocessorReader provides an `io::Read` impl to read kids output.
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct PreprocessorReader {
|
||||||
|
cmd: PathBuf,
|
||||||
|
path: PathBuf,
|
||||||
|
child: process::Child,
|
||||||
|
done: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PreprocessorReader {
|
||||||
|
/// Returns a handle to the stdout of the spawned preprocessor process for
|
||||||
|
/// `path`, which can be directly searched in the worker. When the returned
|
||||||
|
/// value is exhausted, the underlying process is reaped. If the underlying
|
||||||
|
/// process fails, then its stderr is read and converted into a normal
|
||||||
|
/// io::Error.
|
||||||
|
///
|
||||||
|
/// If there is any error in spawning the preprocessor command, then
|
||||||
|
/// return the corresponding error.
|
||||||
|
pub fn from_cmd_path(
|
||||||
|
cmd: PathBuf,
|
||||||
|
path: &Path,
|
||||||
|
) -> Result<PreprocessorReader> {
|
||||||
|
let child = process::Command::new(&cmd)
|
||||||
|
.arg(path)
|
||||||
|
.stdin(Stdio::from(File::open(path)?))
|
||||||
|
.stdout(Stdio::piped())
|
||||||
|
.stderr(Stdio::piped())
|
||||||
|
.spawn()
|
||||||
|
.map_err(|err| {
|
||||||
|
format!(
|
||||||
|
"error running preprocessor command '{}': {}",
|
||||||
|
cmd.display(),
|
||||||
|
err,
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
Ok(PreprocessorReader {
|
||||||
|
cmd: cmd,
|
||||||
|
path: path.to_path_buf(),
|
||||||
|
child: child,
|
||||||
|
done: false,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn read_error(&mut self) -> io::Result<io::Error> {
|
||||||
|
let mut errbytes = vec![];
|
||||||
|
self.child.stderr.as_mut().unwrap().read_to_end(&mut errbytes)?;
|
||||||
|
let errstr = String::from_utf8_lossy(&errbytes);
|
||||||
|
let errstr = errstr.trim();
|
||||||
|
|
||||||
|
Ok(if errstr.is_empty() {
|
||||||
|
let msg = format!(
|
||||||
|
"preprocessor command failed: '{} {}'",
|
||||||
|
self.cmd.display(),
|
||||||
|
self.path.display(),
|
||||||
|
);
|
||||||
|
io::Error::new(io::ErrorKind::Other, msg)
|
||||||
|
} else {
|
||||||
|
let msg = format!(
|
||||||
|
"preprocessor command failed: '{} {}': {}",
|
||||||
|
self.cmd.display(),
|
||||||
|
self.path.display(),
|
||||||
|
errstr,
|
||||||
|
);
|
||||||
|
io::Error::new(io::ErrorKind::Other, msg)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl io::Read for PreprocessorReader {
|
||||||
|
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||||
|
if self.done {
|
||||||
|
return Ok(0);
|
||||||
|
}
|
||||||
|
let nread = self.child.stdout.as_mut().unwrap().read(buf)?;
|
||||||
|
if nread == 0 {
|
||||||
|
self.done = true;
|
||||||
|
// Reap the child now that we're done reading.
|
||||||
|
// If the command failed, report stderr as an error.
|
||||||
|
if !self.child.wait()?.success() {
|
||||||
|
return Err(self.read_error()?);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(nread)
|
||||||
|
}
|
||||||
|
}
|
@ -1,6 +1,6 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::path::Path;
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use encoding_rs::Encoding;
|
use encoding_rs::Encoding;
|
||||||
use grep::Grep;
|
use grep::Grep;
|
||||||
@ -10,6 +10,7 @@ use termcolor::WriteColor;
|
|||||||
|
|
||||||
use decoder::DecodeReader;
|
use decoder::DecodeReader;
|
||||||
use decompressor::{self, DecompressionReader};
|
use decompressor::{self, DecompressionReader};
|
||||||
|
use preprocessor::PreprocessorReader;
|
||||||
use pathutil::strip_prefix;
|
use pathutil::strip_prefix;
|
||||||
use printer::Printer;
|
use printer::Printer;
|
||||||
use search_buffer::BufferSearcher;
|
use search_buffer::BufferSearcher;
|
||||||
@ -45,6 +46,7 @@ struct Options {
|
|||||||
no_messages: bool,
|
no_messages: bool,
|
||||||
quiet: bool,
|
quiet: bool,
|
||||||
text: bool,
|
text: bool,
|
||||||
|
preprocessor: Option<PathBuf>,
|
||||||
search_zip_files: bool
|
search_zip_files: bool
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -68,6 +70,7 @@ impl Default for Options {
|
|||||||
quiet: false,
|
quiet: false,
|
||||||
text: false,
|
text: false,
|
||||||
search_zip_files: false,
|
search_zip_files: false,
|
||||||
|
preprocessor: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -222,6 +225,12 @@ impl WorkerBuilder {
|
|||||||
self.opts.search_zip_files = yes;
|
self.opts.search_zip_files = yes;
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// If `Some`, search the output of the preprocessor command run on each file
|
||||||
|
pub fn preprocessor(mut self, command: Option<PathBuf>) -> Self {
|
||||||
|
self.opts.preprocessor = command;
|
||||||
|
self
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Worker is responsible for executing searches on file paths, while choosing
|
/// Worker is responsible for executing searches on file paths, while choosing
|
||||||
@ -250,7 +259,18 @@ impl Worker {
|
|||||||
}
|
}
|
||||||
Work::DirEntry(dent) => {
|
Work::DirEntry(dent) => {
|
||||||
let mut path = dent.path();
|
let mut path = dent.path();
|
||||||
if self.opts.search_zip_files
|
if self.opts.preprocessor.is_some() {
|
||||||
|
let cmd = self.opts.preprocessor.clone().unwrap();
|
||||||
|
match PreprocessorReader::from_cmd_path(cmd, path) {
|
||||||
|
Ok(reader) => self.search(printer, path, reader),
|
||||||
|
Err(err) => {
|
||||||
|
if !self.opts.no_messages {
|
||||||
|
eprintln!("{}", err);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if self.opts.search_zip_files
|
||||||
&& decompressor::is_compressed(path)
|
&& decompressor::is_compressed(path)
|
||||||
{
|
{
|
||||||
match DecompressionReader::from_path(path) {
|
match DecompressionReader::from_path(path) {
|
||||||
|
@ -1732,6 +1732,26 @@ sherlock!(feature_419_zero_as_shortcut_for_null, "Sherlock", ".",
|
|||||||
assert_eq!(lines, "sherlock\x002\n");
|
assert_eq!(lines, "sherlock\x002\n");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/// Checks that `--pre xzcat` makes ripgrep search the output of the
/// preprocessor for `sherlock.xz` and print the matching lines.
#[test]
fn preprocessing() {
    // Bail out early when `cmd_exists` reports that xzcat is unavailable.
    if !cmd_exists("xzcat") {
        return;
    }

    let workdir = WorkDir::new("feature_preprocessing");
    workdir.create_bytes("sherlock.xz", include_bytes!("./data/sherlock.xz"));

    let mut rg = workdir.command();
    rg.arg("--pre")
        .arg("xzcat")
        .arg("Sherlock")
        .arg("sherlock.xz");

    let got: String = workdir.stdout(&mut rg);
    let want = "\
For the Doctor Watsons of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Sherlock Holmes
";
    assert_eq!(got, want);
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn compressed_gzip() {
|
fn compressed_gzip() {
|
||||||
if !cmd_exists("gzip") {
|
if !cmd_exists("gzip") {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user