Lots of progress:

- Refactored interaction between CLI args and rest of xrep.
- Filling in a lot more options, including file type filtering.
- Fixing some bugs in globbing/ignoring.
- More documentation.
2025-08-16 12:43:49 -07:00 · 2016-09-05 00:52:23 -04:00
parent 0bf278e72f
commit 812cdb13c6
9 changed files with 1567 additions and 408 deletions
--- a/src/args.rs
+++ b/src/args.rs
@@ -0,0 +1,551 @@
+use std::cmp;
+use std::env;
+use std::io;
+use std::path::{Path, PathBuf};
+
+use docopt::Docopt;
+use env_logger;
+use grep::{Grep, GrepBuilder};
+use log;
+use num_cpus;
+use regex;
+use walkdir::WalkDir;
+
+use gitignore::{Gitignore, GitignoreBuilder};
+use ignore::Ignore;
+use out::Out;
+use printer::Printer;
+use search::{InputBuffer, Searcher};
+use types::{FileTypeDef, Types, TypesBuilder};
+use walk;
+
+use Result;
+
+/// The Docopt usage string.
+///
+/// If you've never heard of Docopt before, see: http://docopt.org
+/// (TL;DR: The CLI parser is generated from the usage string below.)
+///
+/// `Args::parse` hands this string to `Docopt::new` and decodes the result
+/// into `RawArgs`. Apart from `--help` and `--version`, which have no field
+/// in `RawArgs`, each option and positional argument named here corresponds
+/// to a `flag_*` or `arg_*` field of that struct, so the two must be kept in
+/// sync when options are added or renamed.
+///
+/// Backslashes are doubled in this Rust literal so that the help text shows
+/// a single backslash (e.g. `\b`, `\n`). The values of `--line-terminator`
+/// and `--context-separator` (including their defaults) are passed through
+/// `unescape` before use. `--threads` defaults to `0`, which `to_args`
+/// replaces with the CPU-based default.
+const USAGE: &'static str = "
+Usage: xrep [options] <pattern> [<path> ...]
+       xrep [options] --files [<path> ...]
+       xrep [options] --type-list
+       xrep --help
+       xrep --version
+
+xrep is like the silver searcher and grep, but faster than both.
+
+Common options:
+    -a, --text                 Search binary files as if they were text.
+    -c, --count                Only show count of line matches for each file.
+    -g, --glob GLOB ...        Include or exclude files for searching that
+                               match the given glob. This always overrides any
+                               other ignore logic. Multiple glob flags may be
+                               used. Globbing rules match .gitignore globs.
+                               Precede a glob with a '!' to exclude it.
+    -h, --help                 Show this usage message.
+    -i, --ignore-case          Case insensitive search.
+    -n, --line-number          Show line numbers (1-based).
+    -q, --quiet                Do not print anything to stdout.
+    -t, --type TYPE ...        Only search files matching TYPE. Multiple type
+                               flags may be provided. Use the --type-list flag
+                               to list all available types.
+    -T, --type-not TYPE ...    Do not search files matching TYPE. Multiple
+                               not-type flags may be provided.
+    -v, --invert-match         Invert matching.
+    -w, --word-regexp          Only show matches surrounded by word boundaries.
+                               This is equivalent to putting \\b before and
+                               after the search pattern.
+
+Less common options:
+    -A, --after-context NUM
+        Show NUM lines after each match.
+
+    -B, --before-context NUM
+        Show NUM lines before each match.
+
+    -C, --context NUM
+        Show NUM lines before and after each match.
+
+    --context-separator ARG
+        The string to use when separating non-continuous context lines. Escape
+        sequences may be used. [default: --]
+
+    --debug
+        Show debug messages.
+
+    --files
+        Print each file that would be searched (but don't search).
+
+    -H, --with-filename
+        Prefix each match with the file name that contains it. This is the
+        default when more than one file is searched.
+
+    --hidden
+        Search hidden directories and files.
+
+    -L, --follow
+        Follow symlinks.
+
+    --line-terminator ARG
+        The byte to use for a line terminator. Escape sequences may be used.
+        [default: \\n]
+
+    --no-ignore
+        Don't respect ignore files (.gitignore, .xrepignore, etc.)
+
+    -Q, --literal
+        Treat the pattern as a literal string instead of a regular expression.
+
+    --threads ARG
+        The number of threads to use. Defaults to the number of logical CPUs
+        (capped at 6). [default: 0]
+
+    --version
+        Show the version number of xrep and exit.
+
+File type management options:
+    --type-list
+        Show all supported file types and their associated globs.
+
+    --type-add ARG ...
+        Add a new glob for a particular file type.
+        Example: --type-add html:*.html,*.htm
+
+    --type-clear TYPE ...
+        Clear the file type globs for TYPE.
+";
+
+/// RawArgs are the args as they are parsed from Docopt. They aren't used
+/// directly by the rest of xrep.
+///
+/// The field names follow the names used in `USAGE`: `arg_*` fields hold
+/// positional arguments and `flag_*` fields hold options, with dashes
+/// replaced by underscores. `to_args` converts a `RawArgs` into the
+/// normalized `Args` that the rest of the program consumes.
+#[derive(Debug, RustcDecodable)]
+pub struct RawArgs {
+    // Positional arguments: the search pattern and the paths to search.
+    arg_pattern: String,
+    arg_path: Vec<String>,
+    // Context line counts. A non-zero `flag_context` takes precedence over
+    // the other two in `to_args`.
+    flag_after_context: usize,
+    flag_before_context: usize,
+    flag_context: usize,
+    // Still escaped here; `to_args` runs it through `unescape`.
+    flag_context_separator: String,
+    flag_count: bool,
+    flag_debug: bool,
+    flag_files: bool,
+    flag_follow: bool,
+    // Glob overrides given with -g/--glob.
+    flag_glob: Vec<String>,
+    flag_hidden: bool,
+    flag_ignore_case: bool,
+    flag_invert_match: bool,
+    flag_line_number: bool,
+    // Still escaped here; must unescape to exactly one byte (see `to_args`).
+    flag_line_terminator: String,
+    flag_literal: bool,
+    flag_no_ignore: bool,
+    flag_quiet: bool,
+    flag_text: bool,
+    // 0 means "choose automatically" (see `to_args`).
+    flag_threads: usize,
+    // File type selection and management (see `add_types`).
+    flag_type: Vec<String>,
+    flag_type_not: Vec<String>,
+    flag_type_list: bool,
+    flag_type_add: Vec<String>,
+    flag_type_clear: Vec<String>,
+    flag_with_filename: bool,
+    flag_word_regexp: bool,
+}
+
+/// Args are transformed/normalized from RawArgs.
+///
+/// Values of this type are produced by `RawArgs::to_args` (via
+/// `Args::parse`). The fields are private; the rest of xrep reads them
+/// through the accessor and factory methods on `Args`.
+#[derive(Debug)]
+pub struct Args {
+    // The regex pattern after the --literal and --word-regexp flags have
+    // been applied.
+    pattern: String,
+    // Paths to search. Never empty: `./` is used when none were given.
+    paths: Vec<PathBuf>,
+    // Number of context lines after/before each match, with --context
+    // already folded in.
+    after_context: usize,
+    before_context: usize,
+    // The unescaped bytes of --context-separator.
+    context_separator: Vec<u8>,
+    count: bool,
+    // The single line terminator byte (unescaped --line-terminator).
+    eol: u8,
+    files: bool,
+    follow: bool,
+    // Globs given with --glob, or `None` when no --glob flag was used.
+    glob_overrides: Option<Gitignore>,
+    hidden: bool,
+    ignore_case: bool,
+    invert_match: bool,
+    line_number: bool,
+    no_ignore: bool,
+    quiet: bool,
+    text: bool,
+    // Number of worker threads; --threads 0 has already been replaced by
+    // the CPU-based default.
+    threads: usize,
+    // The definitions reported by the `TypesBuilder` once the defaults and
+    // the --type-clear/--type-add flags have been applied.
+    type_defs: Vec<FileTypeDef>,
+    type_list: bool,
+    // The file type matcher built from the same `TypesBuilder`.
+    types: Types,
+    // True if --with-filename was given, more than one path is searched, or
+    // the only path is a directory.
+    with_filename: bool,
+}
+
+impl RawArgs {
+    /// Convert arguments parsed into a configuration used by xrep.
+    ///
+    /// The raw Docopt values are normalized as follows:
+    ///
+    /// * The pattern is quoted when `--literal` is given and surrounded by
+    ///   word boundaries when `--word-regexp` is given (see `pattern`).
+    /// * An empty path list is replaced by `./`.
+    /// * A non-zero `--context` overrides both `--after-context` and
+    ///   `--before-context`.
+    /// * `--line-terminator` and `--context-separator` are unescaped.
+    /// * `--threads 0` is replaced by `num_cpus::get()`, capped at 6.
+    /// * File names are shown if `--with-filename` was given, if more than
+    ///   one path is searched, or if the only path is a directory.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the line terminator does not unescape to exactly one byte,
+    /// if the current directory cannot be determined while building glob
+    /// overrides, or if a glob or file type definition is rejected by its
+    /// builder.
+    fn to_args(&self) -> Result<Args> {
+        let pattern = self.pattern();
+        let paths: Vec<PathBuf> =
+            if self.arg_path.is_empty() {
+                vec![PathBuf::from("./")]
+            } else {
+                self.arg_path.iter().map(PathBuf::from).collect()
+            };
+        let (after_context, before_context) =
+            if self.flag_context > 0 {
+                (self.flag_context, self.flag_context)
+            } else {
+                (self.flag_after_context, self.flag_before_context)
+            };
+        let eol = {
+            let eol = unescape(&self.flag_line_terminator);
+            if eol.is_empty() {
+                errored!("Empty line terminator is not allowed.");
+            } else if eol.len() > 1 {
+                errored!("Line terminators are limited to exactly 1 byte.");
+            }
+            eol[0]
+        };
+        let glob_overrides =
+            if self.flag_glob.is_empty() {
+                None
+            } else {
+                // Globs given on the command line are interpreted relative
+                // to the current working directory.
+                let cwd = try!(env::current_dir());
+                let mut bgi = GitignoreBuilder::new(cwd);
+                for pat in &self.flag_glob {
+                    try!(bgi.add("<argv>", pat));
+                }
+                Some(try!(bgi.build()))
+            };
+        let threads =
+            if self.flag_threads == 0 {
+                cmp::min(6, num_cpus::get())
+            } else {
+                self.flag_threads
+            };
+        // `paths` is never empty at this point, so indexing is safe.
+        let with_filename =
+            self.flag_with_filename
+            || paths.len() > 1
+            || paths[0].is_dir();
+        let mut btypes = TypesBuilder::new();
+        btypes.add_defaults();
+        try!(self.add_types(&mut btypes));
+        let types = try!(btypes.build());
+        Ok(Args {
+            pattern: pattern,
+            paths: paths,
+            after_context: after_context,
+            before_context: before_context,
+            context_separator: unescape(&self.flag_context_separator),
+            count: self.flag_count,
+            eol: eol,
+            files: self.flag_files,
+            follow: self.flag_follow,
+            glob_overrides: glob_overrides,
+            hidden: self.flag_hidden,
+            ignore_case: self.flag_ignore_case,
+            invert_match: self.flag_invert_match,
+            line_number: self.flag_line_number,
+            no_ignore: self.flag_no_ignore,
+            quiet: self.flag_quiet,
+            text: self.flag_text,
+            threads: threads,
+            type_defs: btypes.definitions(),
+            type_list: self.flag_type_list,
+            types: types,
+            with_filename: with_filename,
+        })
+    }
+
+    /// Builds the final regex pattern from the positional pattern and the
+    /// `--literal` and `--word-regexp` flags.
+    fn pattern(&self) -> String {
+        let pattern =
+            if self.flag_literal {
+                regex::quote(&self.arg_pattern)
+            } else {
+                self.arg_pattern.clone()
+            };
+        if self.flag_word_regexp {
+            // Wrap the pattern in a non-capturing group so that the word
+            // boundaries apply to the pattern as a whole. Without the group,
+            // a top-level alternation such as `foo|bar` would become
+            // `\bfoo|bar\b`, which only anchors one side of each branch.
+            format!(r"\b(?:{})\b", pattern)
+        } else {
+            pattern
+        }
+    }
+
+    /// Applies the file type flags to the given builder.
+    ///
+    /// Types are cleared first, then new definitions are added, and only
+    /// then are types selected or negated, so that `--type` and `--type-not`
+    /// see the definitions given with `--type-add`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if a `--type-add` definition is rejected by the builder.
+    fn add_types(&self, types: &mut TypesBuilder) -> Result<()> {
+        for ty in &self.flag_type_clear {
+            types.clear(ty);
+        }
+        for def in &self.flag_type_add {
+            try!(types.add_def(def));
+        }
+        for ty in &self.flag_type {
+            types.select(ty);
+        }
+        for ty in &self.flag_type_not {
+            types.select_not(ty);
+        }
+        Ok(())
+    }
+}
+
+impl Args {
+    /// Parses the command line of the current process into an `Args`.
+    ///
+    /// On a CLI usage error, or when the usage or version text was
+    /// requested, Docopt prints the appropriate message and the process
+    /// exits instead of returning.
+    ///
+    /// As a side effect, the global logger is initialized: at debug level
+    /// when `--debug` was given and at warn level otherwise.
+    pub fn parse() -> Result<Args> {
+        let raw: RawArgs =
+            Docopt::new(USAGE)
+                .and_then(|d| d.version(Some(version())).decode())
+                .unwrap_or_else(|e| e.exit());
+
+        let level =
+            if raw.flag_debug {
+                log::LogLevelFilter::Debug
+            } else {
+                log::LogLevelFilter::Warn
+            };
+        let mut logb = env_logger::LogBuilder::new();
+        logb.filter(None, level);
+        if let Err(err) = logb.init() {
+            errored!("failed to initialize logger: {}", err);
+        }
+
+        raw.to_args().map_err(From::from)
+    }
+
+    /// Whether xrep should only list the files it would search, without
+    /// searching them.
+    pub fn files(&self) -> bool {
+        self.files
+    }
+
+    /// Builds the line based matcher for the pattern given on the command
+    /// line, honoring the case sensitivity and line terminator flags.
+    ///
+    /// The matcher can be shared across threads and only supports basic
+    /// searching of regular expressions in a single buffer.
+    pub fn grep(&self) -> Result<Grep> {
+        let built =
+            GrepBuilder::new(&self.pattern)
+                .case_insensitive(self.ignore_case)
+                .line_terminator(self.eol)
+                .build();
+        built.map_err(From::from)
+    }
+
+    /// Builds an input buffer for searching, configured with the line
+    /// terminator given on the command line.
+    pub fn input_buffer(&self) -> InputBuffer {
+        let mut buf = InputBuffer::new();
+        buf.eol(self.eol);
+        buf
+    }
+
+    /// Builds a printer of individual search results that writes to `wtr`.
+    pub fn printer<W: io::Write>(&self, wtr: W) -> Printer<W> {
+        let separator = self.context_separator.clone();
+        Printer::new(wtr)
+            .context_separator(separator)
+            .eol(self.eol)
+            .quiet(self.quiet)
+            .with_filename(self.with_filename)
+    }
+
+    /// Builds a printer of the search results of an entire file that writes
+    /// to `wtr`.
+    ///
+    /// The context separator is used as the file separator, but only when
+    /// context lines were requested.
+    pub fn out<W: io::Write>(&self, wtr: W) -> Out<W> {
+        let out = Out::new(wtr);
+        let wants_context = self.before_context > 0 || self.after_context > 0;
+        if wants_context {
+            out.file_separator(self.context_separator.clone())
+        } else {
+            out
+        }
+    }
+
+    /// The paths that should be searched.
+    pub fn paths(&self) -> &[PathBuf] {
+        &self.paths
+    }
+
+    /// Builds a line based searcher configured from the command line flags:
+    /// context lines, counting, line terminator, line numbers, inverted
+    /// matching and binary-as-text handling.
+    pub fn searcher<'a, R: io::Read, W: io::Write>(
+        &self,
+        inp: &'a mut InputBuffer,
+        printer: &'a mut Printer<W>,
+        grep: &'a Grep,
+        path: &'a Path,
+        rdr: R,
+    ) -> Searcher<'a, R, W> {
+        let searcher = Searcher::new(inp, printer, grep, path, rdr);
+        searcher
+            .after_context(self.after_context)
+            .before_context(self.before_context)
+            .count(self.count)
+            .eol(self.eol)
+            .line_number(self.line_number)
+            .invert_match(self.invert_match)
+            .text(self.text)
+    }
+
+    /// The number of worker search threads to use.
+    pub fn threads(&self) -> usize {
+        self.threads
+    }
+
+    /// The file type definitions that are currently loaded.
+    pub fn type_defs(&self) -> &[FileTypeDef] {
+        &self.type_defs
+    }
+
+    /// Whether xrep should print the loaded file type definitions and then
+    /// exit.
+    pub fn type_list(&self) -> bool {
+        self.type_list
+    }
+
+    /// Builds a recursive directory iterator rooted at `path`.
+    ///
+    /// The iterator follows symlinks if requested and filters entries
+    /// according to the hidden/ignore flags, the selected file types and any
+    /// glob overrides.
+    pub fn walker(&self, path: &Path) -> walk::Iter {
+        let mut ignore = Ignore::new();
+        ignore.ignore_hidden(!self.hidden);
+        ignore.no_ignore(self.no_ignore);
+        ignore.add_types(self.types.clone());
+        if let Some(overrides) = self.glob_overrides.as_ref() {
+            ignore.add_override(overrides.clone());
+        }
+        let walkdir = WalkDir::new(path).follow_links(self.follow);
+        walk::Iter::new(ignore, walkdir)
+    }
+}
+
+/// Returns the crate version as `MAJOR.MINOR.PATCH`.
+///
+/// The components are read from Cargo's environment variables at compile
+/// time. If any of them is missing, an empty string is returned.
+fn version() -> String {
+    let major = option_env!("CARGO_PKG_VERSION_MAJOR");
+    let minor = option_env!("CARGO_PKG_VERSION_MINOR");
+    let patch = option_env!("CARGO_PKG_VERSION_PATCH");
+    if let (Some(major), Some(minor), Some(patch)) = (major, minor, patch) {
+        return format!("{}.{}.{}", major, minor, patch);
+    }
+    String::new()
+}
+
+/// The states of the small state machine that drives `unescape`.
+#[derive(Clone, Copy, Eq, PartialEq)]
+enum State {
+    /// A backslash was just read.
+    Escape,
+    /// `\x` was read; the first hex digit is expected next.
+    HexFirst,
+    /// `\x` and one hex digit (carried in the variant) were read.
+    HexSecond(char),
+    /// Not inside an escape sequence.
+    Literal,
+}
+
+/// Unescapes a string given on the command line. It supports a limited set
+/// of escape sequences:
+///
+/// * \t, \r and \n are mapped to their corresponding ASCII bytes.
+/// * \xZZ hexadecimal escapes are mapped to their byte.
+///
+/// Anything that is not a complete, recognized escape sequence (including a
+/// trailing backslash or an unfinished `\x` escape) is copied to the output
+/// unchanged.
+fn unescape(s: &str) -> Vec<u8> {
+    use self::State::*;
+
+    // Appends the UTF-8 encoding of `c` to `out`.
+    fn push_char(out: &mut Vec<u8>, c: char) {
+        out.extend_from_slice(c.to_string().as_bytes());
+    }
+
+    let mut out = Vec::new();
+    let mut state = Literal;
+    for ch in s.chars() {
+        // Each arm emits whatever bytes are now known and yields the state
+        // to use for the next character.
+        state = match (state, ch) {
+            (Literal, '\\') => Escape,
+            (Literal, _) => {
+                push_char(&mut out, ch);
+                Literal
+            }
+            (Escape, 'n') => {
+                out.push(b'\n');
+                Literal
+            }
+            (Escape, 'r') => {
+                out.push(b'\r');
+                Literal
+            }
+            (Escape, 't') => {
+                out.push(b'\t');
+                Literal
+            }
+            (Escape, 'x') => HexFirst,
+            (Escape, _) => {
+                // Unknown escape: keep the backslash and the character.
+                out.push(b'\\');
+                push_char(&mut out, ch);
+                Literal
+            }
+            (HexFirst, _) if ch.is_digit(16) => HexSecond(ch),
+            (HexFirst, _) => {
+                out.extend_from_slice(b"\\x");
+                push_char(&mut out, ch);
+                Literal
+            }
+            (HexSecond(first), _) => {
+                match (first.to_digit(16), ch.to_digit(16)) {
+                    (Some(high), Some(low)) => {
+                        // Two hex digits are at most 0xFF, so the cast
+                        // cannot truncate.
+                        out.push((high * 16 + low) as u8);
+                    }
+                    _ => {
+                        out.extend_from_slice(b"\\x");
+                        push_char(&mut out, first);
+                        push_char(&mut out, ch);
+                    }
+                }
+                Literal
+            }
+        };
+    }
+    // Flush an escape sequence that was cut short by the end of the input.
+    match state {
+        Literal => {}
+        Escape => out.push(b'\\'),
+        HexFirst => out.extend_from_slice(b"\\x"),
+        HexSecond(first) => {
+            out.extend_from_slice(b"\\x");
+            push_char(&mut out, first);
+        }
+    }
+    out
+}
+
+#[cfg(test)]
+mod tests {
+    use super::unescape;
+
+    // Recognized escapes are turned into their byte.
+
+    #[test]
+    fn unescape_nul() {
+        assert_eq!(b"\x00".to_vec(), unescape(r"\x00"));
+    }
+
+    #[test]
+    fn unescape_nl() {
+        assert_eq!(b"\n".to_vec(), unescape(r"\n"));
+    }
+
+    #[test]
+    fn unescape_tab() {
+        assert_eq!(b"\t".to_vec(), unescape(r"\t"));
+    }
+
+    #[test]
+    fn unescape_carriage() {
+        assert_eq!(b"\r".to_vec(), unescape(r"\r"));
+    }
+
+    // Unknown or incomplete escapes are passed through unchanged.
+
+    #[test]
+    fn unescape_nothing_simple() {
+        assert_eq!(b"\\a".to_vec(), unescape(r"\a"));
+    }
+
+    #[test]
+    fn unescape_nothing_hex0() {
+        assert_eq!(b"\\x".to_vec(), unescape(r"\x"));
+    }
+
+    #[test]
+    fn unescape_nothing_hex1() {
+        assert_eq!(b"\\xz".to_vec(), unescape(r"\xz"));
+    }
+
+    #[test]
+    fn unescape_nothing_hex2() {
+        assert_eq!(b"\\xzz".to_vec(), unescape(r"\xzz"));
+    }
+}