diff --git a/Cargo.lock b/Cargo.lock index 21640763..cce72a7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -82,8 +82,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" name = "globset" version = "0.1.0" dependencies = [ + "aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", "fnv 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", ] diff --git a/globset/Cargo.toml b/globset/Cargo.toml index 48e375fb..cf63f397 100644 --- a/globset/Cargo.toml +++ b/globset/Cargo.toml @@ -4,7 +4,9 @@ version = "0.1.0" authors = ["Andrew Gallant "] [dependencies] +aho-corasick = "0.5.3" fnv = "1.0" lazy_static = "0.2" +log = "0.3" memchr = "0.1" regex = "0.1.77" diff --git a/globset/src/lib.rs b/globset/src/lib.rs index b5cbb5be..f608a74a 100644 --- a/globset/src/lib.rs +++ b/globset/src/lib.rs @@ -13,40 +13,42 @@ that rigamorole when I wrote this. In particular, it could be fast/good enough to make its way into `glob` proper. */ -// TODO(burntsushi): I'm pretty dismayed by the performance of regex sets -// here. For example, we do a first pass single-regex-of-all-globs filter -// before actually running the regex set. This turns out to be faster, -// especially in fresh checkouts of repos that don't have a lot of ignored -// files. It's not clear how hard it is to make the regex set faster. -// -// An alternative avenue is to stop doing "regex all the things." (Which, to -// be fair, is pretty fast---I just expected it to be faster.) We could do -// something clever using assumptions along the lines of "oh, most ignore -// patterns are either literals or are for ignoring file extensions." (Look -// at the .gitignore for the chromium repo---just about every pattern satisfies -// that assumption.) +#![deny(missing_docs)] +extern crate aho_corasick; extern crate fnv; #[macro_use] extern crate lazy_static; +#[macro_use] +extern crate log; extern crate memchr; extern crate regex; use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::error::Error as StdError; use std::ffi::{OsStr, OsString}; use std::fmt; use std::hash; -use std::iter; use std::path::Path; use std::str; -use regex::bytes::Regex; +use aho_corasick::{Automaton, AcAutomaton, FullAcAutomaton}; +use regex::bytes::{Regex, RegexBuilder, RegexSet}; -use pathutil::file_name; +use pathutil::{file_name, file_name_ext, os_str_bytes, path_bytes}; +use pattern::MatchStrategy; +pub use pattern::{Pattern, PatternBuilder, PatternMatcher}; mod pathutil; +mod pattern; + +macro_rules! eprintln { + ($($tt:tt)*) => {{ + use std::io::Write; + let _ = writeln!(&mut ::std::io::stderr(), $($tt)*); + }} +} lazy_static! { static ref FILE_SEPARATORS: String = regex::quote(r"/\"); @@ -55,12 +57,24 @@ lazy_static! { /// Represents an error that can occur when parsing a glob pattern. #[derive(Clone, Debug, Eq, PartialEq)] pub enum Error { + /// Occurs when a use of `**` is invalid. Namely, `**` can only appear + /// adjacent to a path separator, or the beginning/end of a glob. InvalidRecursive, + /// Occurs when a character class (e.g., `[abc]`) is not closed. UnclosedClass, + /// Occurs when a range in a character (e.g., `[a-z]`) is invalid. 
For + /// example, if the range starts with a lexicographically larger character + /// than it ends with. InvalidRange(char, char), + /// Occurs when a `}` is found without a matching `{`. UnopenedAlternates, + /// Occurs when a `{` is found without a matching `}`. UnclosedAlternates, + /// Occurs when an alternating group is nested inside another alternating + /// group, e.g., `{{a,b},{c,d}}`. NestedAlternates, + /// An error associated with parsing or compiling a regex. + Regex(String), } impl StdError for Error { @@ -86,6 +100,7 @@ impl StdError for Error { Error::NestedAlternates => { "nested alternate groups are not allowed" } + Error::Regex(ref err) => err, } } } @@ -97,7 +112,8 @@ impl fmt::Display for Error { | Error::UnclosedClass | Error::UnopenedAlternates | Error::UnclosedAlternates - | Error::NestedAlternates => { + | Error::NestedAlternates + | Error::Regex(_) => { write!(f, "{}", self.description()) } Error::InvalidRange(s, e) => { @@ -107,34 +123,18 @@ impl fmt::Display for Error { } } -/// SetYesNo represents a group of globs that can be matched together in a -/// single pass. SetYesNo can only determine whether a particular path matched -/// any pattern in the set. -#[derive(Clone, Debug)] -pub struct SetYesNo { - re: Regex, +fn new_regex(pat: &str) -> Result { + RegexBuilder::new(pat) + .dot_matches_new_line(true) + .size_limit(10 * (1 << 20)) + .dfa_size_limit(10 * (1 << 20)) + .compile() + .map_err(|err| Error::Regex(err.to_string())) } -impl SetYesNo { - /// Returns true if and only if the given path matches at least one glob - /// in this set. - pub fn is_match>(&self, path: T) -> bool { - self.re.is_match(&*path_bytes(path.as_ref())) - } - - fn new( - pats: &[(Pattern, MatchOptions)], - ) -> Result { - let mut joined = String::new(); - for &(ref p, ref o) in pats { - let part = format!("(?:{})", p.to_regex_with(o)); - if !joined.is_empty() { - joined.push('|'); - } - joined.push_str(&part); - } - Ok(SetYesNo { re: try!(Regex::new(&joined)) }) - } +fn new_regex_set(pats: I) -> Result + where S: AsRef, I: IntoIterator { + RegexSet::new(pats).map_err(|err| Error::Regex(err.to_string())) } type Fnv = hash::BuildHasherDefault; @@ -143,20 +143,21 @@ type Fnv = hash::BuildHasherDefault; /// pass. #[derive(Clone, Debug)] pub struct Set { - exts: HashMap, Fnv>, - literals: HashMap, Vec, Fnv>, - base_literals: HashMap, Vec, Fnv>, - base_prefixes: Vec>, - base_prefixes_map: Vec, - base_suffixes: Vec>, - base_suffixes_map: Vec, - base_regexes: Vec, - base_regexes_map: Vec, - regexes: Vec, - regexes_map: Vec, + strats: Vec, } impl Set { + /// Returns true if any glob in this set matches the path given. + pub fn is_match>(&self, path: T) -> bool { + let candidate = Candidate::new(path.as_ref()); + for strat in &self.strats { + if strat.is_match(&candidate) { + return true; + } + } + false + } + /// Returns the sequence number of every glob pattern that matches the /// given path. 
#[allow(dead_code)] @@ -174,110 +175,67 @@ impl Set { into: &mut Vec, ) { into.clear(); - let path = path.as_ref(); - let path_bytes = &*path_bytes(path); - let basename = file_name(path).map(|b| os_str_bytes(b)); - if !self.exts.is_empty() { - if let Some(ext) = path.extension() { - if let Some(matches) = self.exts.get(ext) { - into.extend(matches.as_slice()); - } - } - } - if !self.literals.is_empty() { - if let Some(matches) = self.literals.get(path_bytes) { - into.extend(matches.as_slice()); - } - } - if !self.base_literals.is_empty() { - if let Some(ref basename) = basename { - if let Some(matches) = self.base_literals.get(&**basename) { - into.extend(matches.as_slice()); - } - } - } - if !self.base_prefixes.is_empty() { - if let Some(ref basename) = basename { - let basename = &**basename; - for (i, pre) in self.base_prefixes.iter().enumerate() { - if pre.len() <= basename.len() && &**pre == &basename[0..pre.len()] { - into.push(self.base_prefixes_map[i]); - } - } - } - } - if !self.base_suffixes.is_empty() { - if let Some(ref basename) = basename { - let basename = &**basename; - for (i, suf) in self.base_suffixes.iter().enumerate() { - if suf.len() > basename.len() { - continue; - } - let (s, e) = (basename.len() - suf.len(), basename.len()); - if &**suf == &basename[s..e] { - into.push(self.base_suffixes_map[i]); - } - } - } - } - if let Some(ref basename) = basename { - for (i, re) in self.base_regexes.iter().enumerate() { - if re.is_match(&**basename) { - into.push(self.base_regexes_map[i]); - } - } - } - for (i, re) in self.regexes.iter().enumerate() { - if re.is_match(path_bytes) { - into.push(self.regexes_map[i]); - } + let candidate = Candidate::new(path.as_ref()); + for strat in &self.strats { + strat.matches_into(&candidate, into); } into.sort(); + into.dedup(); } - fn new(pats: &[(Pattern, MatchOptions)]) -> Result { - let fnv = Fnv::default(); - let mut exts = HashMap::with_hasher(fnv.clone()); - let mut literals = HashMap::with_hasher(fnv.clone()); - let mut base_literals = HashMap::with_hasher(fnv.clone()); - let (mut base_prefixes, mut base_prefixes_map) = (vec![], vec![]); - let (mut base_suffixes, mut base_suffixes_map) = (vec![], vec![]); - let (mut regexes, mut regexes_map) = (vec![], vec![]); - let (mut base_regexes, mut base_regexes_map) = (vec![], vec![]); - for (i, &(ref p, ref o)) in pats.iter().enumerate() { - if let Some(ext) = p.ext() { - exts.entry(ext).or_insert(vec![]).push(i); - } else if let Some(literal) = p.literal() { - literals.entry(literal.into_bytes()).or_insert(vec![]).push(i); - } else if let Some(literal) = p.base_literal() { - base_literals - .entry(literal.into_bytes()).or_insert(vec![]).push(i); - } else if let Some(literal) = p.base_literal_prefix() { - base_prefixes.push(literal.into_bytes()); - base_prefixes_map.push(i); - } else if let Some(literal) = p.base_literal_suffix() { - base_suffixes.push(literal.into_bytes()); - base_suffixes_map.push(i); - } else if p.is_only_basename() { - base_regexes.push(try!(Regex::new(&p.to_regex_with(o)))); - base_regexes_map.push(i); - } else { - regexes.push(try!(Regex::new(&p.to_regex_with(o)))); - regexes_map.push(i); + fn new(pats: &[Pattern]) -> Result { + let mut lits = LiteralStrategy::new(); + let mut base_lits = BasenameLiteralStrategy::new(); + let mut exts = ExtensionStrategy::new(); + let mut prefixes = MultiStrategyBuilder::new(); + let mut suffixes = MultiStrategyBuilder::new(); + let mut required_exts = RequiredExtensionStrategyBuilder::new(); + let mut regexes = 
MultiStrategyBuilder::new(); + for (i, p) in pats.iter().enumerate() { + match MatchStrategy::new(p) { + MatchStrategy::Literal(lit) => { + lits.add(i, lit); + } + MatchStrategy::BasenameLiteral(lit) => { + base_lits.add(i, lit); + } + MatchStrategy::Extension(ext) => { + exts.add(i, ext); + } + MatchStrategy::Prefix(prefix) => { + prefixes.add(i, prefix); + } + MatchStrategy::Suffix { suffix, component } => { + if component { + lits.add(i, suffix[1..].to_string()); + } + suffixes.add(i, suffix); + } + MatchStrategy::RequiredExtension(ext) => { + required_exts.add(i, ext, p.regex().to_owned()); + } + MatchStrategy::Regex => { + debug!("glob converted to regex: {:?}", p); + regexes.add(i, p.regex().to_owned()); + } } } + debug!("built glob set; {} literals, {} basenames, {} extensions, \ + {} prefixes, {} suffixes, {} required extensions, {} regexes", + lits.0.len(), base_lits.0.len(), exts.0.len(), + prefixes.literals.len(), suffixes.literals.len(), + required_exts.0.len(), regexes.literals.len()); Ok(Set { - exts: exts, - literals: literals, - base_literals: base_literals, - base_prefixes: base_prefixes, - base_prefixes_map: base_prefixes_map, - base_suffixes: base_suffixes, - base_suffixes_map: base_suffixes_map, - base_regexes: base_regexes, - base_regexes_map: base_regexes_map, - regexes: regexes, - regexes_map: regexes_map, + strats: vec![ + SetMatchStrategy::Extension(exts), + SetMatchStrategy::BasenameLiteral(base_lits), + SetMatchStrategy::Literal(lits), + SetMatchStrategy::Suffix(suffixes.suffix()), + SetMatchStrategy::Prefix(prefixes.prefix()), + SetMatchStrategy::RequiredExtension( + try!(required_exts.build())), + SetMatchStrategy::Regex(try!(regexes.regex_set())), + ], }) } } @@ -285,7 +243,7 @@ impl Set { /// SetBuilder builds a group of patterns that can be used to simultaneously /// match a file path. pub struct SetBuilder { - pats: Vec<(Pattern, MatchOptions)>, + pats: Vec, } impl SetBuilder { @@ -299,858 +257,374 @@ impl SetBuilder { /// Builds a new matcher from all of the glob patterns added so far. /// /// Once a matcher is built, no new patterns can be added to it. - pub fn build(&self) -> Result { + pub fn build(&self) -> Result { Set::new(&self.pats) } - /// Like `build`, but returns a matcher that can only answer yes/no. - pub fn build_yesno(&self) -> Result { - SetYesNo::new(&self.pats) - } - /// Add a new pattern to this set. - /// - /// If the pattern could not be parsed as a glob, then an error is - /// returned. #[allow(dead_code)] - pub fn add(&mut self, pat: &str) -> Result<(), Error> { - self.add_with(pat, &MatchOptions::default()) - } - - /// Like add, but sets the match options for this particular pattern. 
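// Editor's note: a minimal sketch (not part of this patch) of the reworked
// builder API. `SetBuilder::add` now takes a parsed `Pattern` instead of a
// `&str` plus `MatchOptions`, and the resulting `Set` answers queries through
// the strategies assembled in `Set::new` above. The globs and paths are
// illustrative only; compare the `set_works` test near the end of this file.
fn _set_builder_sketch() {
    let mut builder = SetBuilder::new();
    builder.add(Pattern::new("src/**/*.rs").unwrap());
    builder.add(Pattern::new("*.c").unwrap());
    let set = builder.build().unwrap();
    assert!(set.is_match("src/lib.rs")); // RequiredExtension(".rs") + regex
    assert!(set.is_match("foo.c"));      // Extension(".c") hash lookup
    assert!(!set.is_match("foo.rs"));    // no strategy claims it
}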
- pub fn add_with( - &mut self, - pat: &str, - opts: &MatchOptions, - ) -> Result<(), Error> { - let parsed = try!(Pattern::new(pat)); - // if let Some(ext) = parsed.ext() { - // eprintln!("ext :: {:?} :: {:?}", ext, pat); - // } else if let Some(lit) = parsed.literal() { - // eprintln!("literal :: {:?} :: {:?}", lit, pat); - // } else if let Some(lit) = parsed.base_literal() { - // eprintln!("base_literal :: {:?} :: {:?}", lit, pat); - // } else if let Some(lit) = parsed.base_literal_prefix() { - // eprintln!("base_literal_prefix :: {:?} :: {:?}", lit, pat); - // } else if let Some(lit) = parsed.base_literal_suffix() { - // eprintln!("base_literal_suffix :: {:?} :: {:?}", lit, pat); - // } else if parsed.is_only_basename() { - // eprintln!("basename-regex :: {:?} :: {:?}", pat, parsed); - // } else { - // eprintln!("regex :: {:?} :: {:?}", pat, parsed); - // } - self.pats.push((parsed, opts.clone())); - Ok(()) + pub fn add(&mut self, pat: Pattern) -> &mut SetBuilder { + self.pats.push(pat); + self } } -/// Pattern represents a successfully parsed shell glob pattern. -/// -/// It cannot be used directly to match file paths, but it can be converted -/// to a regular expression string. -#[derive(Clone, Debug, Default, Eq, PartialEq)] -pub struct Pattern { - tokens: Vec, +#[derive(Clone, Debug)] +struct Candidate<'a> { + path: Cow<'a, [u8]>, + basename: Cow<'a, [u8]>, + ext: &'a OsStr, } -/// Options to control the matching semantics of a glob. The default value -/// has all options disabled. -#[derive(Clone, Debug, Default)] -pub struct MatchOptions { - /// When true, matching is done case insensitively. - pub case_insensitive: bool, - /// When true, neither `*` nor `?` match the current system's path - /// separator. - pub require_literal_separator: bool, -} +impl<'a> Candidate<'a> { + fn new + ?Sized>(path: &'a P) -> Candidate<'a> { + let path = path.as_ref(); + let basename = file_name(path).unwrap_or(OsStr::new("")); + Candidate { + path: path_bytes(path), + basename: os_str_bytes(basename), + ext: file_name_ext(basename).unwrap_or(OsStr::new("")), + } + } -#[derive(Clone, Debug, Eq, PartialEq)] -enum Token { - Literal(char), - Any, - ZeroOrMore, - RecursivePrefix, - RecursiveSuffix, - RecursiveZeroOrMore, - Class { - negated: bool, - ranges: Vec<(char, char)>, - }, - Alternates(Vec), -} - -impl Pattern { - /// Parse a shell glob pattern. - /// - /// If the pattern is not a valid glob, then an error is returned. - pub fn new(pat: &str) -> Result { - let mut p = Parser { - stack: vec![Pattern::default()], - chars: pat.chars().peekable(), - prev: None, - cur: None, - }; - try!(p.parse()); - if p.stack.is_empty() { - Err(Error::UnopenedAlternates) - } else if p.stack.len() > 1 { - Err(Error::UnclosedAlternates) + fn path_prefix(&self, max: usize) -> &[u8] { + if self.path.len() <= max { + &*self.path } else { - Ok(p.stack.pop().unwrap()) + &self.path[..max] } } - /// Returns an extension if this pattern exclusively matches it. - pub fn ext(&self) -> Option { - if self.tokens.len() <= 3 { - return None; - } - match self.tokens.get(0) { - Some(&Token::RecursivePrefix) => {} - _ => return None, - } - match self.tokens.get(1) { - Some(&Token::ZeroOrMore) => {} - _ => return None, - } - match self.tokens.get(2) { - Some(&Token::Literal(c)) if c == '.' => {} - _ => return None, - } - let mut lit = OsString::new(); - for t in self.tokens[3..].iter() { - match *t { - Token::Literal(c) if c == '/' || c == '\\' || c == '.' 
=> { - return None; - } - Token::Literal(c) => lit.push(c.to_string()), - _ => return None, - } - } - Some(lit) - } - - /// Returns the pattern as a literal if and only if the pattern exclusiely - /// matches the basename of a file path *and* is a literal. - /// - /// The basic format of these patterns is `**/{literal}`, where `{literal}` - /// does not contain a path separator. - pub fn base_literal(&self) -> Option { - match self.tokens.get(0) { - Some(&Token::RecursivePrefix) => {} - _ => return None, - } - let mut lit = String::new(); - for t in &self.tokens[1..] { - match *t { - Token::Literal(c) if c == '/' || c == '\\' => return None, - Token::Literal(c) => lit.push(c), - _ => return None, - } - } - Some(lit) - } - - /// Returns true if and only if this pattern only inspects the basename - /// of a path. - pub fn is_only_basename(&self) -> bool { - match self.tokens.get(0) { - Some(&Token::RecursivePrefix) => {} - _ => return false, - } - for t in &self.tokens[1..] { - match *t { - Token::Literal(c) if c == '/' || c == '\\' => return false, - Token::RecursivePrefix - | Token::RecursiveSuffix - | Token::RecursiveZeroOrMore => return false, - _ => {} - } - } - true - } - - /// Returns the pattern as a literal if and only if the pattern must match - /// an entire path exactly. - /// - /// The basic format of these patterns is `{literal}`. - pub fn literal(&self) -> Option { - let mut lit = String::new(); - for t in &self.tokens { - match *t { - Token::Literal(c) => lit.push(c), - _ => return None, - } - } - Some(lit) - } - - /// Returns a basename literal prefix of this pattern. - pub fn base_literal_prefix(&self) -> Option { - match self.tokens.get(0) { - Some(&Token::RecursivePrefix) => {} - _ => return None, - } - match self.tokens.last() { - Some(&Token::ZeroOrMore) => {} - _ => return None, - } - let mut lit = String::new(); - for t in &self.tokens[1..self.tokens.len()-1] { - match *t { - Token::Literal(c) if c == '/' || c == '\\' => return None, - Token::Literal(c) => lit.push(c), - _ => return None, - } - } - Some(lit) - } - - /// Returns a basename literal suffix of this pattern. - pub fn base_literal_suffix(&self) -> Option { - match self.tokens.get(0) { - Some(&Token::RecursivePrefix) => {} - _ => return None, - } - match self.tokens.get(1) { - Some(&Token::ZeroOrMore) => {} - _ => return None, - } - let mut lit = String::new(); - for t in &self.tokens[2..] { - match *t { - Token::Literal(c) if c == '/' || c == '\\' => return None, - Token::Literal(c) => lit.push(c), - _ => return None, - } - } - Some(lit) - } - - /// Convert this pattern to a string that is guaranteed to be a valid - /// regular expression and will represent the matching semantics of this - /// glob pattern. This uses a default set of options. - #[allow(dead_code)] - pub fn to_regex(&self) -> String { - self.to_regex_with(&MatchOptions::default()) - } - - /// Convert this pattern to a string that is guaranteed to be a valid - /// regular expression and will represent the matching semantics of this - /// glob pattern and the options given. - pub fn to_regex_with(&self, options: &MatchOptions) -> String { - let mut re = String::new(); - re.push_str("(?-u)"); - if options.case_insensitive { - re.push_str("(?i)"); - } - re.push('^'); - // Special case. If the entire glob is just `**`, then it should match - // everything. 
- if self.tokens.len() == 1 && self.tokens[0] == Token::RecursivePrefix { - re.push_str(".*"); - re.push('$'); - return re; - } - self.tokens_to_regex(options, &self.tokens, &mut re); - re.push('$'); - re - } - - fn tokens_to_regex( - &self, - options: &MatchOptions, - tokens: &[Token], - re: &mut String, - ) { - let seps = &*FILE_SEPARATORS; - - for tok in tokens { - match *tok { - Token::Literal(c) => { - re.push_str(®ex::quote(&c.to_string())); - } - Token::Any => { - if options.require_literal_separator { - re.push_str(&format!("[^{}]", seps)); - } else { - re.push_str("."); - } - } - Token::ZeroOrMore => { - if options.require_literal_separator { - re.push_str(&format!("[^{}]*", seps)); - } else { - re.push_str(".*"); - } - } - Token::RecursivePrefix => { - re.push_str(&format!("(?:[{sep}]?|.*[{sep}])", sep=seps)); - } - Token::RecursiveSuffix => { - re.push_str(&format!("(?:[{sep}]?|[{sep}].*)", sep=seps)); - } - Token::RecursiveZeroOrMore => { - re.push_str(&format!("(?:[{sep}]|[{sep}].*[{sep}])", - sep=seps)); - } - Token::Class { negated, ref ranges } => { - re.push('['); - if negated { - re.push('^'); - } - for r in ranges { - if r.0 == r.1 { - // Not strictly necessary, but nicer to look at. - re.push_str(®ex::quote(&r.0.to_string())); - } else { - re.push_str(®ex::quote(&r.0.to_string())); - re.push('-'); - re.push_str(®ex::quote(&r.1.to_string())); - } - } - re.push(']'); - } - Token::Alternates(ref patterns) => { - let mut parts = vec![]; - for pat in patterns { - let mut altre = String::new(); - self.tokens_to_regex(options, &pat.tokens, &mut altre); - parts.push(altre); - } - re.push_str(&parts.join("|")); - } - } - } - } -} - -struct Parser<'a> { - stack: Vec, - chars: iter::Peekable>, - prev: Option, - cur: Option, -} - -impl<'a> Parser<'a> { - fn parse(&mut self) -> Result<(), Error> { - while let Some(c) = self.bump() { - match c { - '?' => try!(self.push_token(Token::Any)), - '*' => try!(self.parse_star()), - '[' => try!(self.parse_class()), - '{' => try!(self.push_alternate()), - '}' => try!(self.pop_alternate()), - ',' => try!(self.parse_comma()), - c => try!(self.push_token(Token::Literal(c))), - } - } - Ok(()) - } - - fn push_alternate(&mut self) -> Result<(), Error> { - if self.stack.len() > 1 { - return Err(Error::NestedAlternates); - } - Ok(self.stack.push(Pattern::default())) - } - - fn pop_alternate(&mut self) -> Result<(), Error> { - let mut alts = vec![]; - while self.stack.len() >= 2 { - alts.push(self.stack.pop().unwrap()); - } - self.push_token(Token::Alternates(alts)) - } - - fn push_token(&mut self, tok: Token) -> Result<(), Error> { - match self.stack.last_mut() { - None => Err(Error::UnopenedAlternates), - Some(ref mut pat) => Ok(pat.tokens.push(tok)), - } - } - - fn pop_token(&mut self) -> Result { - match self.stack.last_mut() { - None => Err(Error::UnopenedAlternates), - Some(ref mut pat) => Ok(pat.tokens.pop().unwrap()), - } - } - - fn have_tokens(&self) -> Result { - match self.stack.last() { - None => Err(Error::UnopenedAlternates), - Some(ref pat) => Ok(!pat.tokens.is_empty()), - } - } - - fn parse_comma(&mut self) -> Result<(), Error> { - // If we aren't inside a group alternation, then don't - // treat commas specially. Otherwise, we need to start - // a new alternate. - if self.stack.len() <= 1 { - self.push_token(Token::Literal(',')) + fn path_suffix(&self, max: usize) -> &[u8] { + if self.path.len() <= max { + &*self.path } else { - Ok(self.stack.push(Pattern::default())) + &self.path[self.path.len() - max..] 
+ } + } +} + +#[derive(Clone, Debug)] +enum SetMatchStrategy { + Literal(LiteralStrategy), + BasenameLiteral(BasenameLiteralStrategy), + Extension(ExtensionStrategy), + Prefix(PrefixStrategy), + Suffix(SuffixStrategy), + RequiredExtension(RequiredExtensionStrategy), + Regex(RegexSetStrategy), +} + +impl SetMatchStrategy { + fn is_match(&self, candidate: &Candidate) -> bool { + use self::SetMatchStrategy::*; + match *self { + Literal(ref s) => s.is_match(candidate), + BasenameLiteral(ref s) => s.is_match(candidate), + Extension(ref s) => s.is_match(candidate), + Prefix(ref s) => s.is_match(candidate), + Suffix(ref s) => s.is_match(candidate), + RequiredExtension(ref s) => s.is_match(candidate), + Regex(ref s) => s.is_match(candidate), } } - fn parse_star(&mut self) -> Result<(), Error> { - let prev = self.prev; - if self.chars.peek() != Some(&'*') { - try!(self.push_token(Token::ZeroOrMore)); - return Ok(()); + fn matches_into(&self, candidate: &Candidate, matches: &mut Vec) { + use self::SetMatchStrategy::*; + match *self { + Literal(ref s) => s.matches_into(candidate, matches), + BasenameLiteral(ref s) => s.matches_into(candidate, matches), + Extension(ref s) => s.matches_into(candidate, matches), + Prefix(ref s) => s.matches_into(candidate, matches), + Suffix(ref s) => s.matches_into(candidate, matches), + RequiredExtension(ref s) => s.matches_into(candidate, matches), + Regex(ref s) => s.matches_into(candidate, matches), } - assert!(self.bump() == Some('*')); - if !try!(self.have_tokens()) { - try!(self.push_token(Token::RecursivePrefix)); - let next = self.bump(); - if !next.is_none() && next != Some('/') { - return Err(Error::InvalidRecursive); - } - return Ok(()); + } +} + +#[derive(Clone, Debug)] +struct LiteralStrategy(BTreeMap, Vec>); + +impl LiteralStrategy { + fn new() -> LiteralStrategy { + LiteralStrategy(BTreeMap::new()) + } + + fn add(&mut self, global_index: usize, lit: String) { + self.0.entry(lit.into_bytes()).or_insert(vec![]).push(global_index); + } + + fn is_match(&self, candidate: &Candidate) -> bool { + self.0.contains_key(&*candidate.path) + } + + #[inline(never)] + fn matches_into(&self, candidate: &Candidate, matches: &mut Vec) { + if let Some(hits) = self.0.get(&*candidate.path) { + matches.extend(hits); } - try!(self.pop_token()); - if prev != Some('/') { - if self.stack.len() <= 1 - || (prev != Some(',') && prev != Some('{')) { - return Err(Error::InvalidRecursive); + } +} + +#[derive(Clone, Debug)] +struct BasenameLiteralStrategy(BTreeMap, Vec>); + +impl BasenameLiteralStrategy { + fn new() -> BasenameLiteralStrategy { + BasenameLiteralStrategy(BTreeMap::new()) + } + + fn add(&mut self, global_index: usize, lit: String) { + self.0.entry(lit.into_bytes()).or_insert(vec![]).push(global_index); + } + + fn is_match(&self, candidate: &Candidate) -> bool { + if candidate.basename.is_empty() { + return false; + } + self.0.contains_key(&*candidate.basename) + } + + #[inline(never)] + fn matches_into(&self, candidate: &Candidate, matches: &mut Vec) { + if candidate.basename.is_empty() { + return; + } + if let Some(hits) = self.0.get(&*candidate.basename) { + matches.extend(hits); + } + } +} + +#[derive(Clone, Debug)] +struct ExtensionStrategy(HashMap, Fnv>); + +impl ExtensionStrategy { + fn new() -> ExtensionStrategy { + ExtensionStrategy(HashMap::with_hasher(Fnv::default())) + } + + fn add(&mut self, global_index: usize, ext: OsString) { + self.0.entry(ext).or_insert(vec![]).push(global_index); + } + + fn is_match(&self, candidate: &Candidate) -> bool { + if 
candidate.ext.is_empty() { + return false; + } + self.0.contains_key(candidate.ext) + } + + #[inline(never)] + fn matches_into(&self, candidate: &Candidate, matches: &mut Vec) { + if candidate.ext.is_empty() { + return; + } + if let Some(hits) = self.0.get(candidate.ext) { + matches.extend(hits); + } + } +} + +#[derive(Clone, Debug)] +struct PrefixStrategy { + matcher: FullAcAutomaton>, + map: Vec, + longest: usize, +} + +impl PrefixStrategy { + fn is_match(&self, candidate: &Candidate) -> bool { + let path = candidate.path_prefix(self.longest); + for m in self.matcher.find_overlapping(path) { + if m.start == 0 { + return true; } } - match self.chars.peek() { - None => { - assert!(self.bump().is_none()); - self.push_token(Token::RecursiveSuffix) + false + } + + fn matches_into(&self, candidate: &Candidate, matches: &mut Vec) { + let path = candidate.path_prefix(self.longest); + for m in self.matcher.find_overlapping(path) { + if m.start == 0 { + matches.push(self.map[m.pati]); } - Some(&',') | Some(&'}') if self.stack.len() >= 2 => { - self.push_token(Token::RecursiveSuffix) + } + } +} + +#[derive(Clone, Debug)] +struct SuffixStrategy { + matcher: FullAcAutomaton>, + map: Vec, + longest: usize, +} + +impl SuffixStrategy { + fn is_match(&self, candidate: &Candidate) -> bool { + let path = candidate.path_suffix(self.longest); + for m in self.matcher.find_overlapping(path) { + if m.end == path.len() { + return true; } - Some(&'/') => { - assert!(self.bump() == Some('/')); - self.push_token(Token::RecursiveZeroOrMore) + } + false + } + + fn matches_into(&self, candidate: &Candidate, matches: &mut Vec) { + let path = candidate.path_suffix(self.longest); + for m in self.matcher.find_overlapping(path) { + if m.end == path.len() { + matches.push(self.map[m.pati]); + } + } + } +} + +#[derive(Clone, Debug)] +struct RequiredExtensionStrategy(HashMap, Fnv>); + +impl RequiredExtensionStrategy { + fn is_match(&self, candidate: &Candidate) -> bool { + if candidate.ext.is_empty() { + return false; + } + match self.0.get(candidate.ext) { + None => false, + Some(regexes) => { + for &(_, ref re) in regexes { + if re.is_match(&*candidate.path) { + return true; + } + } + false } - _ => Err(Error::InvalidRecursive), } } - fn parse_class(&mut self) -> Result<(), Error> { - fn add_to_last_range( - r: &mut (char, char), - add: char, - ) -> Result<(), Error> { - r.1 = add; - if r.1 < r.0 { - Err(Error::InvalidRange(r.0, r.1)) - } else { - Ok(()) - } + #[inline(never)] + fn matches_into(&self, candidate: &Candidate, matches: &mut Vec) { + if candidate.ext.is_empty() { + return; } - let mut negated = false; - let mut ranges = vec![]; - if self.chars.peek() == Some(&'!') { - assert!(self.bump() == Some('!')); - negated = true; - } - let mut first = true; - let mut in_range = false; - loop { - let c = match self.bump() { - Some(c) => c, - // The only way to successfully break this loop is to observe - // a ']'. - None => return Err(Error::UnclosedClass), - }; - match c { - ']' => { - if first { - ranges.push((']', ']')); - } else { - break; - } - } - '-' => { - if first { - ranges.push(('-', '-')); - } else if in_range { - // invariant: in_range is only set when there is - // already at least one character seen. - let r = ranges.last_mut().unwrap(); - try!(add_to_last_range(r, '-')); - in_range = false; - } else { - assert!(!ranges.is_empty()); - in_range = true; - } - } - c => { - if in_range { - // invariant: in_range is only set when there is - // already at least one character seen. 
- try!(add_to_last_range(ranges.last_mut().unwrap(), c)); - } else { - ranges.push((c, c)); - } - in_range = false; + if let Some(regexes) = self.0.get(candidate.ext) { + for &(global_index, ref re) in regexes { + if re.is_match(&*candidate.path) { + matches.push(global_index); } } - first = false; } - if in_range { - // Means that the last character in the class was a '-', so add - // it as a literal. - ranges.push(('-', '-')); + } +} + +#[derive(Clone, Debug)] +struct RegexSetStrategy { + matcher: RegexSet, + map: Vec, +} + +impl RegexSetStrategy { + fn is_match(&self, candidate: &Candidate) -> bool { + self.matcher.is_match(&*candidate.path) + } + + fn matches_into(&self, candidate: &Candidate, matches: &mut Vec) { + for i in self.matcher.matches(&*candidate.path) { + matches.push(self.map[i]); } - self.push_token(Token::Class { - negated: negated, - ranges: ranges, + } +} + +#[derive(Clone, Debug)] +struct MultiStrategyBuilder { + literals: Vec, + map: Vec, + longest: usize, +} + +impl MultiStrategyBuilder { + fn new() -> MultiStrategyBuilder { + MultiStrategyBuilder { + literals: vec![], + map: vec![], + longest: 0, + } + } + + fn add(&mut self, global_index: usize, literal: String) { + if literal.len() > self.longest { + self.longest = literal.len(); + } + self.map.push(global_index); + self.literals.push(literal); + } + + fn prefix(self) -> PrefixStrategy { + let it = self.literals.into_iter().map(|s| s.into_bytes()); + PrefixStrategy { + matcher: AcAutomaton::new(it).into_full(), + map: self.map, + longest: self.longest, + } + } + + fn suffix(self) -> SuffixStrategy { + let it = self.literals.into_iter().map(|s| s.into_bytes()); + SuffixStrategy { + matcher: AcAutomaton::new(it).into_full(), + map: self.map, + longest: self.longest, + } + } + + fn regex_set(self) -> Result { + Ok(RegexSetStrategy { + matcher: try!(new_regex_set(self.literals)), + map: self.map, }) } +} - fn bump(&mut self) -> Option { - self.prev = self.cur; - self.cur = self.chars.next(); - self.cur +#[derive(Clone, Debug)] +struct RequiredExtensionStrategyBuilder( + HashMap>, +); + +impl RequiredExtensionStrategyBuilder { + fn new() -> RequiredExtensionStrategyBuilder { + RequiredExtensionStrategyBuilder(HashMap::new()) } -} -fn path_bytes(path: &Path) -> Cow<[u8]> { - os_str_bytes(path.as_os_str()) -} + fn add(&mut self, global_index: usize, ext: OsString, regex: String) { + self.0.entry(ext).or_insert(vec![]).push((global_index, regex)); + } -#[cfg(unix)] -fn os_str_bytes(s: &OsStr) -> Cow<[u8]> { - use std::os::unix::ffi::OsStrExt; - Cow::Borrowed(s.as_bytes()) -} - -#[cfg(not(unix))] -fn os_str_bytes(s: &OsStr) -> Cow<[u8]> { - // TODO(burntsushi): On Windows, OS strings are probably UTF-16, so even - // if we could get at the raw bytes, they wouldn't be useful. We *must* - // convert to UTF-8 before doing path matching. Unfortunate, but necessary. - match s.to_string_lossy() { - Cow::Owned(s) => Cow::Owned(s.into_bytes()), - Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()), + fn build(self) -> Result { + let mut exts = HashMap::with_hasher(Fnv::default()); + for (ext, regexes) in self.0.into_iter() { + exts.insert(ext.clone(), vec![]); + for (global_index, regex) in regexes { + let compiled = try!(new_regex(®ex)); + exts.get_mut(&ext).unwrap().push((global_index, compiled)); + } + } + Ok(RequiredExtensionStrategy(exts)) } } #[cfg(test)] mod tests { - use std::path::Path; - - use regex::bytes::Regex; - - use super::{Error, Pattern, MatchOptions, Set, SetBuilder, Token}; - use super::Token::*; - - macro_rules! 
syntax { - ($name:ident, $pat:expr, $tokens:expr) => { - #[test] - fn $name() { - let pat = Pattern::new($pat).unwrap(); - assert_eq!($tokens, pat.tokens); - } - } - } - - macro_rules! syntaxerr { - ($name:ident, $pat:expr, $err:expr) => { - #[test] - fn $name() { - let err = Pattern::new($pat).unwrap_err(); - assert_eq!($err, err); - } - } - } - - macro_rules! toregex { - ($name:ident, $pat:expr, $re:expr) => { - toregex!($name, $pat, $re, MatchOptions::default()); - }; - ($name:ident, $pat:expr, $re:expr, $options:expr) => { - #[test] - fn $name() { - let pat = Pattern::new($pat).unwrap(); - assert_eq!( - format!("(?-u){}", $re), pat.to_regex_with(&$options)); - } - }; - } - - macro_rules! matches { - ($name:ident, $pat:expr, $path:expr) => { - matches!($name, $pat, $path, MatchOptions::default()); - }; - ($name:ident, $pat:expr, $path:expr, $options:expr) => { - #[test] - fn $name() { - let pat = Pattern::new($pat).unwrap(); - let path = &Path::new($path).to_str().unwrap(); - let re = Regex::new(&pat.to_regex_with(&$options)).unwrap(); - assert!(re.is_match(path.as_bytes())); - } - }; - } - - macro_rules! nmatches { - ($name:ident, $pat:expr, $path:expr) => { - nmatches!($name, $pat, $path, MatchOptions::default()); - }; - ($name:ident, $pat:expr, $path:expr, $options:expr) => { - #[test] - fn $name() { - let pat = Pattern::new($pat).unwrap(); - let path = &Path::new($path).to_str().unwrap(); - let re = Regex::new(&pat.to_regex_with(&$options)).unwrap(); - assert!(!re.is_match(path.as_bytes())); - } - }; - } - - macro_rules! ext { - ($name:ident, $pat:expr, $ext:expr) => { - #[test] - fn $name() { - let pat = Pattern::new($pat).unwrap(); - let ext = pat.ext().map(|e| e.to_string_lossy().into_owned()); - assert_eq!($ext, ext.as_ref().map(|s| &**s)); - } - }; - } - - macro_rules! baseliteral { - ($name:ident, $pat:expr, $yes:expr) => { - #[test] - fn $name() { - let pat = Pattern::new($pat).unwrap(); - assert_eq!($yes, pat.base_literal().is_some()); - } - }; - } - - macro_rules! 
basesuffix { - ($name:ident, $pat:expr, $yes:expr) => { - #[test] - fn $name() { - let pat = Pattern::new($pat).unwrap(); - assert_eq!($yes, pat.is_literal_suffix()); - } - }; - } - - fn class(s: char, e: char) -> Token { - Class { negated: false, ranges: vec![(s, e)] } - } - - fn classn(s: char, e: char) -> Token { - Class { negated: true, ranges: vec![(s, e)] } - } - - fn rclass(ranges: &[(char, char)]) -> Token { - Class { negated: false, ranges: ranges.to_vec() } - } - - fn rclassn(ranges: &[(char, char)]) -> Token { - Class { negated: true, ranges: ranges.to_vec() } - } - - syntax!(literal1, "a", vec![Literal('a')]); - syntax!(literal2, "ab", vec![Literal('a'), Literal('b')]); - syntax!(any1, "?", vec![Any]); - syntax!(any2, "a?b", vec![Literal('a'), Any, Literal('b')]); - syntax!(seq1, "*", vec![ZeroOrMore]); - syntax!(seq2, "a*b", vec![Literal('a'), ZeroOrMore, Literal('b')]); - syntax!(seq3, "*a*b*", vec![ - ZeroOrMore, Literal('a'), ZeroOrMore, Literal('b'), ZeroOrMore, - ]); - syntax!(rseq1, "**", vec![RecursivePrefix]); - syntax!(rseq2, "**/", vec![RecursivePrefix]); - syntax!(rseq3, "/**", vec![RecursiveSuffix]); - syntax!(rseq4, "/**/", vec![RecursiveZeroOrMore]); - syntax!(rseq5, "a/**/b", vec![ - Literal('a'), RecursiveZeroOrMore, Literal('b'), - ]); - syntax!(cls1, "[a]", vec![class('a', 'a')]); - syntax!(cls2, "[!a]", vec![classn('a', 'a')]); - syntax!(cls3, "[a-z]", vec![class('a', 'z')]); - syntax!(cls4, "[!a-z]", vec![classn('a', 'z')]); - syntax!(cls5, "[-]", vec![class('-', '-')]); - syntax!(cls6, "[]]", vec![class(']', ']')]); - syntax!(cls7, "[*]", vec![class('*', '*')]); - syntax!(cls8, "[!!]", vec![classn('!', '!')]); - syntax!(cls9, "[a-]", vec![rclass(&[('a', 'a'), ('-', '-')])]); - syntax!(cls10, "[-a-z]", vec![rclass(&[('-', '-'), ('a', 'z')])]); - syntax!(cls11, "[a-z-]", vec![rclass(&[('a', 'z'), ('-', '-')])]); - syntax!(cls12, "[-a-z-]", vec![ - rclass(&[('-', '-'), ('a', 'z'), ('-', '-')]), - ]); - syntax!(cls13, "[]-z]", vec![class(']', 'z')]); - syntax!(cls14, "[--z]", vec![class('-', 'z')]); - syntax!(cls15, "[ --]", vec![class(' ', '-')]); - syntax!(cls16, "[0-9a-z]", vec![rclass(&[('0', '9'), ('a', 'z')])]); - syntax!(cls17, "[a-z0-9]", vec![rclass(&[('a', 'z'), ('0', '9')])]); - syntax!(cls18, "[!0-9a-z]", vec![rclassn(&[('0', '9'), ('a', 'z')])]); - syntax!(cls19, "[!a-z0-9]", vec![rclassn(&[('a', 'z'), ('0', '9')])]); - - syntaxerr!(err_rseq1, "a**", Error::InvalidRecursive); - syntaxerr!(err_rseq2, "**a", Error::InvalidRecursive); - syntaxerr!(err_rseq3, "a**b", Error::InvalidRecursive); - syntaxerr!(err_rseq4, "***", Error::InvalidRecursive); - syntaxerr!(err_rseq5, "/a**", Error::InvalidRecursive); - syntaxerr!(err_rseq6, "/**a", Error::InvalidRecursive); - syntaxerr!(err_rseq7, "/a**b", Error::InvalidRecursive); - syntaxerr!(err_unclosed1, "[", Error::UnclosedClass); - syntaxerr!(err_unclosed2, "[]", Error::UnclosedClass); - syntaxerr!(err_unclosed3, "[!", Error::UnclosedClass); - syntaxerr!(err_unclosed4, "[!]", Error::UnclosedClass); - syntaxerr!(err_range1, "[z-a]", Error::InvalidRange('z', 'a')); - syntaxerr!(err_range2, "[z--]", Error::InvalidRange('z', '-')); - - const SLASHLIT: MatchOptions = MatchOptions { - case_insensitive: false, - require_literal_separator: true, - }; - const CASEI: MatchOptions = MatchOptions { - case_insensitive: true, - require_literal_separator: false, - }; - - toregex!(re_casei, "a", "(?i)^a$", &CASEI); - - toregex!(re_slash1, "?", r"^[^/\\]$", SLASHLIT); - toregex!(re_slash2, "*", r"^[^/\\]*$", SLASHLIT); - - 
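// Editor's note: a standalone sketch (not part of this patch) of what the
// SLASHLIT cases in this (removed) test module exercise. With
// `require_literal_separator` enabled, `*` compiles to `[^/\\]*`, so a glob
// cannot match across a path separator. The glob and paths are illustrative;
// the `nmatches!` SLASHLIT cases further down cover the same behavior.
fn _slashlit_sketch() {
    let opts = MatchOptions {
        case_insensitive: false,
        require_literal_separator: true,
    };
    let pat = Pattern::new("abc*def").unwrap();
    assert_eq!(pat.to_regex_with(&opts), r"(?-u)^abc[^/\\]*def$");
    let re = Regex::new(&pat.to_regex_with(&opts)).unwrap();
    assert!(re.is_match(b"abcxyzdef"));
    assert!(!re.is_match(b"abc/def")); // `*` stops at the separator
}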
toregex!(re1, "a", "^a$"); - toregex!(re2, "?", "^.$"); - toregex!(re3, "*", "^.*$"); - toregex!(re4, "a?", "^a.$"); - toregex!(re5, "?a", "^.a$"); - toregex!(re6, "a*", "^a.*$"); - toregex!(re7, "*a", "^.*a$"); - toregex!(re8, "[*]", r"^[\*]$"); - toregex!(re9, "[+]", r"^[\+]$"); - toregex!(re10, "+", r"^\+$"); - toregex!(re11, "**", r"^.*$"); - - ext!(ext1, "**/*.rs", Some("rs")); - - baseliteral!(lit1, "**", true); - baseliteral!(lit2, "**/a", true); - baseliteral!(lit3, "**/ab", true); - baseliteral!(lit4, "**/a*b", false); - baseliteral!(lit5, "z/**/a*b", false); - baseliteral!(lit6, "[ab]", false); - baseliteral!(lit7, "?", false); - - matches!(match1, "a", "a"); - matches!(match2, "a*b", "a_b"); - matches!(match3, "a*b*c", "abc"); - matches!(match4, "a*b*c", "a_b_c"); - matches!(match5, "a*b*c", "a___b___c"); - matches!(match6, "abc*abc*abc", "abcabcabcabcabcabcabc"); - matches!(match7, "a*a*a*a*a*a*a*a*a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - matches!(match8, "a*b[xyz]c*d", "abxcdbxcddd"); - - matches!(matchrec1, "some/**/needle.txt", "some/needle.txt"); - matches!(matchrec2, "some/**/needle.txt", "some/one/needle.txt"); - matches!(matchrec3, "some/**/needle.txt", "some/one/two/needle.txt"); - matches!(matchrec4, "some/**/needle.txt", "some/other/needle.txt"); - matches!(matchrec5, "**", "abcde"); - matches!(matchrec6, "**", ""); - matches!(matchrec7, "**", ".asdf"); - matches!(matchrec8, "**", "/x/.asdf"); - matches!(matchrec9, "some/**/**/needle.txt", "some/needle.txt"); - matches!(matchrec10, "some/**/**/needle.txt", "some/one/needle.txt"); - matches!(matchrec11, "some/**/**/needle.txt", "some/one/two/needle.txt"); - matches!(matchrec12, "some/**/**/needle.txt", "some/other/needle.txt"); - matches!(matchrec13, "**/test", "one/two/test"); - matches!(matchrec14, "**/test", "one/test"); - matches!(matchrec15, "**/test", "test"); - matches!(matchrec16, "/**/test", "/one/two/test"); - matches!(matchrec17, "/**/test", "/one/test"); - matches!(matchrec18, "/**/test", "/test"); - matches!(matchrec19, "**/.*", ".abc"); - matches!(matchrec20, "**/.*", "abc/.abc"); - matches!(matchrec21, ".*/**", ".abc"); - matches!(matchrec22, ".*/**", ".abc/abc"); - matches!(matchnot23, "foo/**", "foo"); - - matches!(matchrange1, "a[0-9]b", "a0b"); - matches!(matchrange2, "a[0-9]b", "a9b"); - matches!(matchrange3, "a[!0-9]b", "a_b"); - matches!(matchrange4, "[a-z123]", "1"); - matches!(matchrange5, "[1a-z23]", "1"); - matches!(matchrange6, "[123a-z]", "1"); - matches!(matchrange7, "[abc-]", "-"); - matches!(matchrange8, "[-abc]", "-"); - matches!(matchrange9, "[-a-c]", "b"); - matches!(matchrange10, "[a-c-]", "b"); - matches!(matchrange11, "[-]", "-"); - - matches!(matchpat1, "*hello.txt", "hello.txt"); - matches!(matchpat2, "*hello.txt", "gareth_says_hello.txt"); - matches!(matchpat3, "*hello.txt", "some/path/to/hello.txt"); - matches!(matchpat4, "*hello.txt", "some\\path\\to\\hello.txt"); - matches!(matchpat5, "*hello.txt", "/an/absolute/path/to/hello.txt"); - matches!(matchpat6, "*some/path/to/hello.txt", "some/path/to/hello.txt"); - matches!(matchpat7, "*some/path/to/hello.txt", - "a/bigger/some/path/to/hello.txt"); - - matches!(matchescape, "_[[]_[]]_[?]_[*]_!_", "_[_]_?_*_!_"); - - matches!(matchcasei1, "aBcDeFg", "aBcDeFg", CASEI); - matches!(matchcasei2, "aBcDeFg", "abcdefg", CASEI); - matches!(matchcasei3, "aBcDeFg", "ABCDEFG", CASEI); - matches!(matchcasei4, "aBcDeFg", "AbCdEfG", CASEI); - - matches!(matchalt1, "a,b", "a,b"); - matches!(matchalt2, ",", ","); - matches!(matchalt3, "{a,b}", 
"a"); - matches!(matchalt4, "{a,b}", "b"); - matches!(matchalt5, "{**/src/**,foo}", "abc/src/bar"); - matches!(matchalt6, "{**/src/**,foo}", "foo"); - matches!(matchalt7, "{[}],foo}", "}"); - matches!(matchalt8, "{foo}", "foo"); - matches!(matchalt9, "{}", ""); - matches!(matchalt10, "{,}", ""); - matches!(matchalt11, "{*.foo,*.bar,*.wat}", "test.foo"); - matches!(matchalt12, "{*.foo,*.bar,*.wat}", "test.bar"); - matches!(matchalt13, "{*.foo,*.bar,*.wat}", "test.wat"); - - matches!(matchslash1, "abc/def", "abc/def", SLASHLIT); - nmatches!(matchslash2, "abc?def", "abc/def", SLASHLIT); - nmatches!(matchslash2_win, "abc?def", "abc\\def", SLASHLIT); - nmatches!(matchslash3, "abc*def", "abc/def", SLASHLIT); - matches!(matchslash4, "abc[/]def", "abc/def", SLASHLIT); // differs - - nmatches!(matchnot1, "a*b*c", "abcd"); - nmatches!(matchnot2, "abc*abc*abc", "abcabcabcabcabcabcabca"); - nmatches!(matchnot3, "some/**/needle.txt", "some/other/notthis.txt"); - nmatches!(matchnot4, "some/**/**/needle.txt", "some/other/notthis.txt"); - nmatches!(matchnot5, "/**/test", "test"); - nmatches!(matchnot6, "/**/test", "/one/notthis"); - nmatches!(matchnot7, "/**/test", "/notthis"); - nmatches!(matchnot8, "**/.*", "ab.c"); - nmatches!(matchnot9, "**/.*", "abc/ab.c"); - nmatches!(matchnot10, ".*/**", "a.bc"); - nmatches!(matchnot11, ".*/**", "abc/a.bc"); - nmatches!(matchnot12, "a[0-9]b", "a_b"); - nmatches!(matchnot13, "a[!0-9]b", "a0b"); - nmatches!(matchnot14, "a[!0-9]b", "a9b"); - nmatches!(matchnot15, "[!-]", "-"); - nmatches!(matchnot16, "*hello.txt", "hello.txt-and-then-some"); - nmatches!(matchnot17, "*hello.txt", "goodbye.txt"); - nmatches!(matchnot18, "*some/path/to/hello.txt", - "some/path/to/hello.txt-and-then-some"); - nmatches!(matchnot19, "*some/path/to/hello.txt", - "some/other/path/to/hello.txt"); + use super::{Set, SetBuilder}; + use pattern::Pattern; #[test] fn set_works() { let mut builder = SetBuilder::new(); - builder.add("src/**/*.rs").unwrap(); - builder.add("*.c").unwrap(); - builder.add("src/lib.rs").unwrap(); + builder.add(Pattern::new("src/**/*.rs").unwrap()); + builder.add(Pattern::new("*.c").unwrap()); + builder.add(Pattern::new("src/lib.rs").unwrap()); let set = builder.build().unwrap(); fn is_match(set: &Set, s: &str) -> bool { diff --git a/globset/src/pathutil.rs b/globset/src/pathutil.rs index 73caf0e5..3e89f7bb 100644 --- a/globset/src/pathutil.rs +++ b/globset/src/pathutil.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::ffi::OsStr; use std::path::Path; @@ -36,3 +37,98 @@ pub fn file_name<'a, P: AsRef + ?Sized>( ) -> Option<&'a OsStr> { path.as_ref().file_name() } + +/// Return a file extension given a path's file name. +/// +/// Note that this does NOT match the semantics of std::path::Path::extension. +/// Namely, the extension includes the `.` and matching is otherwise more +/// liberal. Specifically, the extenion is: +/// +/// * None, if the file name given is empty; +/// * None, if there is no embedded `.`; +/// * Otherwise, the portion of the file name starting with the final `.`. +/// +/// e.g., A file name of `.rs` has an extension `.rs`. +/// +/// N.B. This is done to make certain glob match optimizations easier. Namely, +/// a pattern like `*.rs` is obviously trying to match files with a `rs` +/// extension, but it also matches files like `.rs`, which doesn't have an +/// extension according to std::path::Path::extension. 
+pub fn file_name_ext(name: &OsStr) -> Option<&OsStr> { + // Yes, these functions are awful, and yes, we are completely violating + // the abstraction barrier of std::ffi. The barrier we're violating is + // that an OsStr's encoding is *ASCII compatible*. While this is obviously + // true on Unix systems, it's also true on Windows because an OsStr uses + // WTF-8 internally: https://simonsapin.github.io/wtf-8/ + // + // We should consider doing the same for the other path utility functions. + // Right now, we don't break any barriers, but Windows users are paying + // for it. + // + // Got any better ideas that don't cost anything? Hit me up. ---AG + unsafe fn os_str_as_u8_slice(s: &OsStr) -> &[u8] { + ::std::mem::transmute(s) + } + unsafe fn u8_slice_as_os_str(s: &[u8]) -> &OsStr { + ::std::mem::transmute(s) + } + if name.is_empty() { + return None; + } + let name = unsafe { os_str_as_u8_slice(name) }; + for (i, &b) in name.iter().enumerate().rev() { + if b == b'.' { + return Some(unsafe { u8_slice_as_os_str(&name[i..]) }); + } + } + None +} + +/// Return raw bytes of a path, transcoded to UTF-8 if necessary. +pub fn path_bytes(path: &Path) -> Cow<[u8]> { + os_str_bytes(path.as_os_str()) +} + +/// Return the raw bytes of the given OS string, transcoded to UTF-8 if +/// necessary. +#[cfg(unix)] +pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> { + use std::os::unix::ffi::OsStrExt; + Cow::Borrowed(s.as_bytes()) +} + +/// Return the raw bytes of the given OS string, transcoded to UTF-8 if +/// necessary. +#[cfg(not(unix))] +pub fn os_str_bytes(s: &OsStr) -> Cow<[u8]> { + // TODO(burntsushi): On Windows, OS strings are probably UTF-16, so even + // if we could get at the raw bytes, they wouldn't be useful. We *must* + // convert to UTF-8 before doing path matching. Unfortunate, but necessary. + match s.to_string_lossy() { + Cow::Owned(s) => Cow::Owned(s.into_bytes()), + Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()), + } +} + +#[cfg(test)] +mod tests { + use std::ffi::OsStr; + + use super::file_name_ext; + + macro_rules! ext { + ($name:ident, $file_name:expr, $ext:expr) => { + #[test] + fn $name() { + let got = file_name_ext(OsStr::new($file_name)); + assert_eq!($ext.map(OsStr::new), got); + } + }; + } + + ext!(ext1, "foo.rs", Some(".rs")); + ext!(ext2, ".rs", Some(".rs")); + ext!(ext3, "..rs", Some(".rs")); + ext!(ext4, "", None::<&str>); + ext!(ext5, "foo", None::<&str>); +} diff --git a/globset/src/pattern.rs b/globset/src/pattern.rs new file mode 100644 index 00000000..1eff726a --- /dev/null +++ b/globset/src/pattern.rs @@ -0,0 +1,1379 @@ +use std::ffi::{OsStr, OsString}; +use std::fmt; +use std::iter; +use std::ops::{Deref, DerefMut}; +use std::path::Path; +use std::str; + +use regex; +use regex::bytes::Regex; + +use {Error, FILE_SEPARATORS, new_regex}; +use pathutil::path_bytes; + +/// Describes a matching strategy for a particular pattern. +/// +/// This provides a way to more quickly determine whether a pattern matches +/// a particular file path in a way that scales with a large number of +/// patterns. For example, if many patterns are of the form `*.ext`, then it's +/// possible to test whether any of those patterns matches by looking up a +/// file path's extension in a hash table. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum MatchStrategy { + /// A pattern matches if and only if the entire file path matches this + /// literal string. + Literal(String), + /// A pattern matches if and only if the file path's basename matches this + /// literal string. 
+ BasenameLiteral(String), + /// A pattern matches if and only if the file path's extension matches this + /// literal string. + Extension(OsString), + /// A pattern matches if and only if this prefix literal is a prefix of the + /// candidate file path. + Prefix(String), + /// A pattern matches if and only if this prefix literal is a prefix of the + /// candidate file path. + /// + /// An exception: if `component` is true, then `suffix` must appear at the + /// beginning of a file path or immediately following a `/`. + Suffix { + /// The actual suffix. + suffix: String, + /// Whether this must start at the beginning of a path component. + component: bool, + }, + /// A pattern matches only if the given extension matches the file path's + /// extension. Note that this is a necessary but NOT sufficient criterion. + /// Namely, if the extension matches, then a full regex search is still + /// required. + RequiredExtension(OsString), + /// A regex needs to be used for matching. + Regex, +} + +impl MatchStrategy { + /// Returns a matching strategy for the given pattern. + pub fn new(pat: &Pattern) -> MatchStrategy { + if let Some(lit) = pat.basename_literal() { + MatchStrategy::BasenameLiteral(lit) + } else if let Some(lit) = pat.literal() { + MatchStrategy::Literal(lit) + } else if let Some(ext) = pat.ext() { + MatchStrategy::Extension(ext) + } else if let Some(prefix) = pat.prefix() { + MatchStrategy::Prefix(prefix) + } else if let Some((suffix, component)) = pat.suffix() { + MatchStrategy::Suffix { suffix: suffix, component: component } + } else if let Some(ext) = pat.required_ext() { + MatchStrategy::RequiredExtension(ext) + } else { + MatchStrategy::Regex + } + } +} + +/// Pattern represents a successfully parsed shell glob pattern. +/// +/// It cannot be used directly to match file paths, but it can be converted +/// to a regular expression string. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct Pattern { + glob: String, + re: String, + opts: PatternOptions, + tokens: Tokens, +} + +impl fmt::Display for Pattern { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.glob.fmt(f) + } +} + +/// A matcher for a single pattern. +#[derive(Clone, Debug)] +pub struct PatternMatcher { + /// The underlying pattern. + pat: Pattern, + /// The pattern, as a compiled regex. + re: Regex, +} + +impl PatternMatcher { + /// Tests whether the given path matches this pattern or not. + pub fn is_match>(&self, path: P) -> bool { + self.re.is_match(&*path_bytes(path.as_ref())) + } +} + +/// A strategic matcher for a single pattern. +#[cfg(test)] +#[derive(Clone, Debug)] +struct PatternStrategic { + /// The match strategy to use. + strategy: MatchStrategy, + /// The underlying pattern. + pat: Pattern, + /// The pattern, as a compiled regex. + re: Regex, +} + +#[cfg(test)] +impl PatternStrategic { + /// Tests whether the given path matches this pattern or not. 
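// Editor's note: a sketch (not part of this patch) of how `MatchStrategy::new`
// above classifies a few globs built with default options. Each result follows
// from the extraction methods on `Pattern` defined below (`ext`, `suffix`,
// `required_ext`, `basename_literal`, ...); the globs are illustrative only.
fn _match_strategy_sketch() {
    let strat = |glob: &str| MatchStrategy::new(&Pattern::new(glob).unwrap());
    // `*.rs` matches iff the path's extension (per `pathutil::file_name_ext`)
    // is `.rs`, so a hash lookup suffices.
    assert_eq!(strat("*.rs"), MatchStrategy::Extension(OsString::from(".rs")));
    // `**/README` only ever inspects the basename.
    assert_eq!(strat("**/README"),
               MatchStrategy::BasenameLiteral("README".to_string()));
    // `**/foo/bar` matches `foo/bar` exactly or any path ending in `/foo/bar`.
    assert_eq!(strat("**/foo/bar"),
               MatchStrategy::Suffix {
                   suffix: "/foo/bar".to_string(),
                   component: true,
               });
    // `src/**/*.rs` requires a `.rs` extension, but a full regex match is
    // still needed to confirm.
    assert_eq!(strat("src/**/*.rs"),
               MatchStrategy::RequiredExtension(OsString::from(".rs")));
}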
+ pub fn is_match>(&self, path: P) -> bool { + use pathutil::file_name_ext; + + let cow_path = path_bytes(path.as_ref()); + let byte_path = &*cow_path; + + match self.strategy { + MatchStrategy::Literal(ref lit) => lit.as_bytes() == byte_path, + MatchStrategy::BasenameLiteral(ref lit) => { + let lit = OsStr::new(lit); + path.as_ref().file_name().map(|n| n == lit).unwrap_or(false) + } + MatchStrategy::Extension(ref ext) => { + path.as_ref().file_name() + .and_then(file_name_ext) + .map(|got| got == ext) + .unwrap_or(false) + } + MatchStrategy::Prefix(ref pre) => { + starts_with(pre.as_bytes(), byte_path) + } + MatchStrategy::Suffix { ref suffix, component } => { + if component && byte_path == &suffix.as_bytes()[1..] { + return true; + } + ends_with(suffix.as_bytes(), byte_path) + } + MatchStrategy::RequiredExtension(ref ext) => { + path.as_ref().file_name() + .and_then(file_name_ext) + .map(|got| got == ext && self.re.is_match(byte_path)) + .unwrap_or(false) + } + MatchStrategy::Regex => self.re.is_match(byte_path), + } + } +} + +/// A builder for a pattern. +/// +/// This builder enables configuring the match semantics of a pattern. For +/// example, one can make matching case insensitive. +/// +/// The lifetime `'a` refers to the lifetime of the pattern string. +#[derive(Clone, Debug)] +pub struct PatternBuilder<'a> { + /// The glob pattern to compile. + glob: &'a str, + /// Options for the pattern. + opts: PatternOptions, +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +struct PatternOptions { + /// Whether to match case insensitively. + case_insensitive: bool, + /// Whether to require a literal separator to match a separator in a file + /// path. e.g., when enabled, `*` won't match `/`. + literal_separator: bool, +} + +#[derive(Clone, Debug, Default, Eq, PartialEq)] +struct Tokens(Vec); + +impl Deref for Tokens { + type Target = Vec; + fn deref(&self) -> &Vec { &self.0 } +} + +impl DerefMut for Tokens { + fn deref_mut(&mut self) -> &mut Vec { &mut self.0 } +} + +#[derive(Clone, Debug, Eq, PartialEq)] +enum Token { + Literal(char), + Any, + ZeroOrMore, + RecursivePrefix, + RecursiveSuffix, + RecursiveZeroOrMore, + Class { + negated: bool, + ranges: Vec<(char, char)>, + }, + Alternates(Vec), +} + +impl Pattern { + /// Builds a new pattern with default options. + pub fn new(glob: &str) -> Result { + PatternBuilder::new(glob).build() + } + + /// Returns a matcher for this pattern. + pub fn compile_matcher(&self) -> PatternMatcher { + let re = new_regex(&self.re) + .expect("regex compilation shouldn't fail"); + PatternMatcher { + pat: self.clone(), + re: re, + } + } + + /// Returns a strategic matcher. + /// + /// This isn't exposed because it's not clear whether it's actually + /// faster than just running a regex for a *single* pattern. If it + /// is faster, then PatternMatcher should do it automatically. + #[cfg(test)] + fn compile_strategic_matcher(&self) -> PatternStrategic { + let strategy = MatchStrategy::new(self); + let re = new_regex(&self.re) + .expect("regex compilation shouldn't fail"); + PatternStrategic { + strategy: strategy, + pat: self.clone(), + re: re, + } + } + + /// Returns the original glob pattern used to build this pattern. + pub fn glob(&self) -> &str { + &self.glob + } + + /// Returns the regular expression string for this glob. + pub fn regex(&self) -> &str { + &self.re + } + + /// Returns true if and only if this pattern only inspects the basename + /// of a path. 
+ pub fn is_only_basename(&self) -> bool { + match self.tokens.get(0) { + Some(&Token::RecursivePrefix) => {} + _ => return false, + } + for t in &self.tokens[1..] { + match *t { + Token::Literal(c) if c == '/' || c == '\\' => return false, + Token::RecursivePrefix + | Token::RecursiveSuffix + | Token::RecursiveZeroOrMore => return false, + _ => {} + } + } + true + } + + /// Returns the pattern as a literal if and only if the pattern must match + /// an entire path exactly. + /// + /// The basic format of these patterns is `{literal}`. + pub fn literal(&self) -> Option { + if self.opts.case_insensitive { + return None; + } + let mut lit = String::new(); + for t in &*self.tokens { + match *t { + Token::Literal(c) => lit.push(c), + _ => return None, + } + } + if lit.is_empty() { + None + } else { + Some(lit) + } + } + + /// Returns an extension if this pattern matches a file path if and only + /// if the file path has the extension returned. + /// + /// Note that this extension returned differs from the extension that + /// std::path::Path::extension returns. Namely, this extension includes + /// the '.'. Also, paths like `.rs` are considered to have an extension + /// of `.rs`. + pub fn ext(&self) -> Option { + if self.opts.case_insensitive { + return None; + } + let start = match self.tokens.get(0) { + Some(&Token::RecursivePrefix) => 1, + Some(_) => 0, + _ => return None, + }; + match self.tokens.get(start) { + Some(&Token::ZeroOrMore) => { + // If there was no recursive prefix, then we only permit + // `*` if `*` can match a `/`. For example, if `*` can't + // match `/`, then `*.c` doesn't match `foo/bar.c`. + if start == 0 && self.opts.literal_separator { + return None; + } + } + _ => return None, + } + match self.tokens.get(start + 1) { + Some(&Token::Literal('.')) => {} + _ => return None, + } + let mut lit = OsStr::new(".").to_os_string(); + for t in self.tokens[start + 2..].iter() { + match *t { + Token::Literal('.') | Token::Literal('/') => return None, + Token::Literal(c) => lit.push(c.to_string()), + _ => return None, + } + } + if lit.is_empty() { + None + } else { + Some(lit) + } + } + + /// This is like `ext`, but returns an extension even if it isn't sufficent + /// to imply a match. Namely, if an extension is returned, then it is + /// necessary but not sufficient for a match. + pub fn required_ext(&self) -> Option { + if self.opts.case_insensitive { + return None; + } + // We don't care at all about the beginning of this pattern. All we + // need to check for is if it ends with a literal of the form `.ext`. + let mut ext: Vec = vec![]; // built in reverse + for t in self.tokens.iter().rev() { + match *t { + Token::Literal('/') => return None, + Token::Literal(c) => { + ext.push(c); + if c == '.' { + break; + } + } + _ => return None, + } + } + if ext.last() != Some(&'.') { + None + } else { + ext.reverse(); + Some(OsString::from(ext.into_iter().collect::())) + } + } + + /// Returns a literal prefix of this pattern if the entire pattern matches + /// if the literal prefix matches. + pub fn prefix(&self) -> Option { + if self.opts.case_insensitive { + return None; + } + let end = match self.tokens.last() { + Some(&Token::ZeroOrMore) => { + if self.opts.literal_separator { + // If a trailing `*` can't match a `/`, then we can't + // assume a match of the prefix corresponds to a match + // of the overall pattern. e.g., `foo/*` with + // `literal_separator` enabled matches `foo/bar` but not + // `foo/bar/baz`, even though `foo/bar/baz` has a `foo/` + // literal prefix. 
+ return None; + } + self.tokens.len() - 1 + } + _ => self.tokens.len(), + }; + let mut lit = String::new(); + for t in &self.tokens[0..end] { + match *t { + Token::Literal(c) => lit.push(c), + _ => return None, + } + } + if lit.is_empty() { + None + } else { + Some(lit) + } + } + + /// Returns a literal suffix of this pattern if the entire pattern matches + /// if the literal suffix matches. + /// + /// If a literal suffix is returned and it must match either the entire + /// file path or be preceded by a `/`, then also return true. This happens + /// with a pattern like `**/foo/bar`. Namely, this pattern matches + /// `foo/bar` and `baz/foo/bar`, but not `foofoo/bar`. In this case, the + /// suffix returned is `/foo/bar` (but should match the entire path + /// `foo/bar`). + /// + /// When this returns true, the suffix literal is guaranteed to start with + /// a `/`. + pub fn suffix(&self) -> Option<(String, bool)> { + if self.opts.case_insensitive { + return None; + } + let mut lit = String::new(); + let (start, entire) = match self.tokens.get(0) { + Some(&Token::RecursivePrefix) => { + // We only care if this follows a path component if the next + // token is a literal. + if let Some(&Token::Literal(_)) = self.tokens.get(1) { + lit.push('/'); + (1, true) + } else { + (1, false) + } + } + _ => (0, false), + }; + let start = match self.tokens.get(start) { + Some(&Token::ZeroOrMore) => { + // If literal_separator is enabled, then a `*` can't + // necessarily match everything, so reporting a suffix match + // as a match of the pattern would be a false positive. + if self.opts.literal_separator { + return None; + } + start + 1 + } + _ => start, + }; + for t in &self.tokens[start..] { + match *t { + Token::Literal(c) => lit.push(c), + _ => return None, + } + } + if lit.is_empty() || lit == "/" { + None + } else { + Some((lit, entire)) + } + } + + /// If this pattern only needs to inspect the basename of a file path, + /// then the tokens corresponding to only the basename match are returned. + /// + /// For example, given a pattern of `**/*.foo`, only the tokens + /// corresponding to `*.foo` are returned. + /// + /// Note that this will return None if any match of the basename tokens + /// doesn't correspond to a match of the entire pattern. For example, the + /// glob `foo` only matches when a file path has a basename of `foo`, but + /// doesn't *always* match when a file path has a basename of `foo`. e.g., + /// `foo` doesn't match `abc/foo`. + fn basename_tokens(&self) -> Option<&[Token]> { + if self.opts.case_insensitive { + return None; + } + let start = match self.tokens.get(0) { + Some(&Token::RecursivePrefix) => 1, + _ => { + // With nothing to gobble up the parent portion of a path, + // we can't assume that matching on only the basename is + // correct. + return None; + } + }; + if self.tokens[start..].is_empty() { + return None; + } + for t in &self.tokens[start..] { + match *t { + Token::Literal('/') => return None, + Token::Literal(_) => {} // OK + Token::Any | Token::ZeroOrMore => { + if !self.opts.literal_separator { + // In this case, `*` and `?` can match a path + // separator, which means this could reach outside + // the basename. + return None; + } + } + Token::RecursivePrefix + | Token::RecursiveSuffix + | Token::RecursiveZeroOrMore => { + return None; + } + Token::Class{..} | Token::Alternates(..) => { + // We *could* be a little smarter here, but either one + // of these is going to prevent our literal optimizations + // anyway, so give up. 
+ return None; + } + } + } + Some(&self.tokens[start..]) + } + + /// Returns the pattern as a literal if and only if the pattern exclusively + /// matches the basename of a file path *and* is a literal. + /// + /// The basic format of these patterns is `**/{literal}`, where `{literal}` + /// does not contain a path separator. + pub fn basename_literal(&self) -> Option<String> { + self.base_literal() + } + + /// Returns the pattern as a literal if and only if the pattern exclusively + /// matches the basename of a file path *and* is a literal. + /// + /// The basic format of these patterns is `**/{literal}`, where `{literal}` + /// does not contain a path separator. + pub fn base_literal(&self) -> Option<String> { + let tokens = match self.basename_tokens() { + None => return None, + Some(tokens) => tokens, + }; + let mut lit = String::new(); + for t in tokens { + match *t { + Token::Literal(c) => lit.push(c), + _ => return None, + } + } + Some(lit) + } + + /// Returns a literal prefix of this pattern if and only if the entire + /// pattern matches if the literal prefix matches. + pub fn literal_prefix(&self) -> Option<String> { + match self.tokens.last() { + Some(&Token::ZeroOrMore) => {} + _ => return None, + } + let mut lit = String::new(); + for t in &self.tokens[0..self.tokens.len()-1] { + match *t { + Token::Literal(c) => lit.push(c), + _ => return None, + } + } + Some(lit) + } + + /// Returns a literal suffix of this pattern if and only if the entire + /// pattern matches if the literal suffix matches. + pub fn literal_suffix(&self) -> Option<String> { + match self.tokens.get(0) { + Some(&Token::RecursivePrefix) => {} + _ => return None, + } + let start = + match self.tokens.get(1) { + Some(&Token::ZeroOrMore) => 2, + _ => 1, + }; + let mut lit = String::new(); + for t in &self.tokens[start..] { + match *t { + Token::Literal(c) => lit.push(c), + _ => return None, + } + } + Some(lit) + } + + /// Returns a basename literal prefix of this pattern. + pub fn base_literal_prefix(&self) -> Option<String> { + match self.tokens.get(0) { + Some(&Token::RecursivePrefix) => {} + _ => return None, + } + match self.tokens.last() { + Some(&Token::ZeroOrMore) => {} + _ => return None, + } + let mut lit = String::new(); + for t in &self.tokens[1..self.tokens.len()-1] { + match *t { + Token::Literal(c) if c == '/' || c == '\\' => return None, + Token::Literal(c) => lit.push(c), + _ => return None, + } + } + Some(lit) + } + + /// Returns a basename literal suffix of this pattern. + pub fn base_literal_suffix(&self) -> Option<String> { + match self.tokens.get(0) { + Some(&Token::RecursivePrefix) => {} + _ => return None, + } + match self.tokens.get(1) { + Some(&Token::ZeroOrMore) => {} + _ => return None, + } + let mut lit = String::new(); + for t in &self.tokens[2..] { + match *t { + Token::Literal(c) if c == '/' || c == '\\' => return None, + Token::Literal(c) => lit.push(c), + _ => return None, + } + } + Some(lit) + } +} + +impl<'a> PatternBuilder<'a> { + /// Create a new builder for the pattern given. + /// + /// The pattern is not compiled until `build` is called. + pub fn new(glob: &'a str) -> PatternBuilder<'a> { + PatternBuilder { + glob: glob, + opts: PatternOptions::default(), + } + } + + /// Parses and builds the pattern.
+ pub fn build(&self) -> Result<Pattern, Error> { + let mut p = Parser { + stack: vec![Tokens::default()], + chars: self.glob.chars().peekable(), + prev: None, + cur: None, + }; + try!(p.parse()); + if p.stack.is_empty() { + Err(Error::UnopenedAlternates) + } else if p.stack.len() > 1 { + Err(Error::UnclosedAlternates) + } else { + let tokens = p.stack.pop().unwrap(); + Ok(Pattern { + glob: self.glob.to_string(), + re: tokens.to_regex_with(&self.opts), + opts: self.opts, + tokens: tokens, + }) + } + } + + /// Toggle whether the pattern matches case insensitively or not. + /// + /// This is disabled by default. + pub fn case_insensitive(&mut self, yes: bool) -> &mut PatternBuilder<'a> { + self.opts.case_insensitive = yes; + self + } + + /// Toggle whether a literal `/` is required to match a path separator. + pub fn literal_separator(&mut self, yes: bool) -> &mut PatternBuilder<'a> { + self.opts.literal_separator = yes; + self + } +} + +impl Tokens { + /// Convert this pattern to a string that is guaranteed to be a valid + /// regular expression and will represent the matching semantics of this + /// glob pattern and the options given. + fn to_regex_with(&self, options: &PatternOptions) -> String { + let mut re = String::new(); + re.push_str("(?-u)"); + if options.case_insensitive { + re.push_str("(?i)"); + } + re.push('^'); + // Special case. If the entire glob is just `**`, then it should match + // everything. + if self.len() == 1 && self[0] == Token::RecursivePrefix { + re.push_str(".*"); + re.push('$'); + return re; + } + self.tokens_to_regex(options, &self, &mut re); + re.push('$'); + re + } + + + fn tokens_to_regex( + &self, + options: &PatternOptions, + tokens: &[Token], + re: &mut String, + ) { + let seps = &*FILE_SEPARATORS; + + for tok in tokens { + match *tok { + Token::Literal(c) => { + re.push_str(&regex::quote(&c.to_string())); + } + Token::Any => { + if options.literal_separator { + re.push_str(&format!("[^{}]", seps)); + } else { + re.push_str("."); + } + } + Token::ZeroOrMore => { + if options.literal_separator { + re.push_str(&format!("[^{}]*", seps)); + } else { + re.push_str(".*"); + } + } + Token::RecursivePrefix => { + re.push_str(&format!("(?:[{sep}]?|.*[{sep}])", sep=seps)); + } + Token::RecursiveSuffix => { + re.push_str(&format!("(?:[{sep}]?|[{sep}].*)", sep=seps)); + } + Token::RecursiveZeroOrMore => { + re.push_str(&format!("(?:[{sep}]|[{sep}].*[{sep}])", + sep=seps)); + } + Token::Class { negated, ref ranges } => { + re.push('['); + if negated { + re.push('^'); + } + for r in ranges { + if r.0 == r.1 { + // Not strictly necessary, but nicer to look at. + re.push_str(&regex::quote(&r.0.to_string())); + } else { + re.push_str(&regex::quote(&r.0.to_string())); + re.push('-'); + re.push_str(&regex::quote(&r.1.to_string())); + } + } + re.push(']'); + } + Token::Alternates(ref patterns) => { + let mut parts = vec![]; + for pat in patterns { + let mut altre = String::new(); + self.tokens_to_regex(options, &pat, &mut altre); + parts.push(altre); + } + re.push_str(&parts.join("|")); + } + } + } + } +} + +struct Parser<'a> { + stack: Vec<Tokens>, + chars: iter::Peekable<str::Chars<'a>>, + prev: Option<char>, + cur: Option<char>, +} + +impl<'a> Parser<'a> { + fn parse(&mut self) -> Result<(), Error> { + while let Some(c) = self.bump() { + match c { + '?'
=> try!(self.push_token(Token::Any)), + '*' => try!(self.parse_star()), + '[' => try!(self.parse_class()), + '{' => try!(self.push_alternate()), + '}' => try!(self.pop_alternate()), + ',' => try!(self.parse_comma()), + c => try!(self.push_token(Token::Literal(c))), + } + } + Ok(()) + } + + fn push_alternate(&mut self) -> Result<(), Error> { + if self.stack.len() > 1 { + return Err(Error::NestedAlternates); + } + Ok(self.stack.push(Tokens::default())) + } + + fn pop_alternate(&mut self) -> Result<(), Error> { + let mut alts = vec![]; + while self.stack.len() >= 2 { + alts.push(self.stack.pop().unwrap()); + } + self.push_token(Token::Alternates(alts)) + } + + fn push_token(&mut self, tok: Token) -> Result<(), Error> { + match self.stack.last_mut() { + None => Err(Error::UnopenedAlternates), + Some(ref mut pat) => Ok(pat.push(tok)), + } + } + + fn pop_token(&mut self) -> Result { + match self.stack.last_mut() { + None => Err(Error::UnopenedAlternates), + Some(ref mut pat) => Ok(pat.pop().unwrap()), + } + } + + fn have_tokens(&self) -> Result { + match self.stack.last() { + None => Err(Error::UnopenedAlternates), + Some(ref pat) => Ok(!pat.is_empty()), + } + } + + fn parse_comma(&mut self) -> Result<(), Error> { + // If we aren't inside a group alternation, then don't + // treat commas specially. Otherwise, we need to start + // a new alternate. + if self.stack.len() <= 1 { + self.push_token(Token::Literal(',')) + } else { + Ok(self.stack.push(Tokens::default())) + } + } + + fn parse_star(&mut self) -> Result<(), Error> { + let prev = self.prev; + if self.chars.peek() != Some(&'*') { + try!(self.push_token(Token::ZeroOrMore)); + return Ok(()); + } + assert!(self.bump() == Some('*')); + if !try!(self.have_tokens()) { + try!(self.push_token(Token::RecursivePrefix)); + let next = self.bump(); + if !next.is_none() && next != Some('/') { + return Err(Error::InvalidRecursive); + } + return Ok(()); + } + try!(self.pop_token()); + if prev != Some('/') { + if self.stack.len() <= 1 + || (prev != Some(',') && prev != Some('{')) { + return Err(Error::InvalidRecursive); + } + } + match self.chars.peek() { + None => { + assert!(self.bump().is_none()); + self.push_token(Token::RecursiveSuffix) + } + Some(&',') | Some(&'}') if self.stack.len() >= 2 => { + self.push_token(Token::RecursiveSuffix) + } + Some(&'/') => { + assert!(self.bump() == Some('/')); + self.push_token(Token::RecursiveZeroOrMore) + } + _ => Err(Error::InvalidRecursive), + } + } + + fn parse_class(&mut self) -> Result<(), Error> { + fn add_to_last_range( + r: &mut (char, char), + add: char, + ) -> Result<(), Error> { + r.1 = add; + if r.1 < r.0 { + Err(Error::InvalidRange(r.0, r.1)) + } else { + Ok(()) + } + } + let mut negated = false; + let mut ranges = vec![]; + if self.chars.peek() == Some(&'!') { + assert!(self.bump() == Some('!')); + negated = true; + } + let mut first = true; + let mut in_range = false; + loop { + let c = match self.bump() { + Some(c) => c, + // The only way to successfully break this loop is to observe + // a ']'. + None => return Err(Error::UnclosedClass), + }; + match c { + ']' => { + if first { + ranges.push((']', ']')); + } else { + break; + } + } + '-' => { + if first { + ranges.push(('-', '-')); + } else if in_range { + // invariant: in_range is only set when there is + // already at least one character seen. 
+ let r = ranges.last_mut().unwrap(); + try!(add_to_last_range(r, '-')); + in_range = false; + } else { + assert!(!ranges.is_empty()); + in_range = true; + } + } + c => { + if in_range { + // invariant: in_range is only set when there is + // already at least one character seen. + try!(add_to_last_range(ranges.last_mut().unwrap(), c)); + } else { + ranges.push((c, c)); + } + in_range = false; + } + } + first = false; + } + if in_range { + // Means that the last character in the class was a '-', so add + // it as a literal. + ranges.push(('-', '-')); + } + self.push_token(Token::Class { + negated: negated, + ranges: ranges, + }) + } + + fn bump(&mut self) -> Option { + self.prev = self.cur; + self.cur = self.chars.next(); + self.cur + } +} + +#[cfg(test)] +fn starts_with(needle: &[u8], haystack: &[u8]) -> bool { + needle.len() <= haystack.len() && needle == &haystack[..needle.len()] +} + +#[cfg(test)] +fn ends_with(needle: &[u8], haystack: &[u8]) -> bool { + if needle.len() > haystack.len() { + return false; + } + needle == &haystack[haystack.len() - needle.len()..] +} + +#[cfg(test)] +mod tests { + use std::ffi::{OsStr, OsString}; + + use {SetBuilder, Error}; + use super::{Pattern, PatternBuilder, Token}; + use super::Token::*; + + #[derive(Clone, Copy, Debug, Default)] + struct Options { + casei: bool, + litsep: bool, + } + + macro_rules! syntax { + ($name:ident, $pat:expr, $tokens:expr) => { + #[test] + fn $name() { + let pat = Pattern::new($pat).unwrap(); + assert_eq!($tokens, pat.tokens.0); + } + } + } + + macro_rules! syntaxerr { + ($name:ident, $pat:expr, $err:expr) => { + #[test] + fn $name() { + let err = Pattern::new($pat).unwrap_err(); + assert_eq!($err, err); + } + } + } + + macro_rules! toregex { + ($name:ident, $pat:expr, $re:expr) => { + toregex!($name, $pat, $re, Options::default()); + }; + ($name:ident, $pat:expr, $re:expr, $options:expr) => { + #[test] + fn $name() { + let pat = PatternBuilder::new($pat) + .case_insensitive($options.casei) + .literal_separator($options.litsep) + .build() + .unwrap(); + assert_eq!(format!("(?-u){}", $re), pat.regex()); + } + }; + } + + macro_rules! matches { + ($name:ident, $pat:expr, $path:expr) => { + matches!($name, $pat, $path, Options::default()); + }; + ($name:ident, $pat:expr, $path:expr, $options:expr) => { + #[test] + fn $name() { + let pat = PatternBuilder::new($pat) + .case_insensitive($options.casei) + .literal_separator($options.litsep) + .build() + .unwrap(); + let matcher = pat.compile_matcher(); + let strategic = pat.compile_strategic_matcher(); + let set = SetBuilder::new().add(pat).build().unwrap(); + assert!(matcher.is_match($path)); + assert!(strategic.is_match($path)); + assert!(set.is_match($path)); + } + }; + } + + macro_rules! 
nmatches { + ($name:ident, $pat:expr, $path:expr) => { + nmatches!($name, $pat, $path, Options::default()); + }; + ($name:ident, $pat:expr, $path:expr, $options:expr) => { + #[test] + fn $name() { + let pat = PatternBuilder::new($pat) + .case_insensitive($options.casei) + .literal_separator($options.litsep) + .build() + .unwrap(); + let matcher = pat.compile_matcher(); + let strategic = pat.compile_strategic_matcher(); + let set = SetBuilder::new().add(pat).build().unwrap(); + assert!(!matcher.is_match($path)); + assert!(!strategic.is_match($path)); + assert!(!set.is_match($path)); + } + }; + } + + fn s(string: &str) -> String { string.to_string() } + fn os(string: &str) -> OsString { OsStr::new(string).to_os_string() } + + fn class(s: char, e: char) -> Token { + Class { negated: false, ranges: vec![(s, e)] } + } + + fn classn(s: char, e: char) -> Token { + Class { negated: true, ranges: vec![(s, e)] } + } + + fn rclass(ranges: &[(char, char)]) -> Token { + Class { negated: false, ranges: ranges.to_vec() } + } + + fn rclassn(ranges: &[(char, char)]) -> Token { + Class { negated: true, ranges: ranges.to_vec() } + } + + syntax!(literal1, "a", vec![Literal('a')]); + syntax!(literal2, "ab", vec![Literal('a'), Literal('b')]); + syntax!(any1, "?", vec![Any]); + syntax!(any2, "a?b", vec![Literal('a'), Any, Literal('b')]); + syntax!(seq1, "*", vec![ZeroOrMore]); + syntax!(seq2, "a*b", vec![Literal('a'), ZeroOrMore, Literal('b')]); + syntax!(seq3, "*a*b*", vec![ + ZeroOrMore, Literal('a'), ZeroOrMore, Literal('b'), ZeroOrMore, + ]); + syntax!(rseq1, "**", vec![RecursivePrefix]); + syntax!(rseq2, "**/", vec![RecursivePrefix]); + syntax!(rseq3, "/**", vec![RecursiveSuffix]); + syntax!(rseq4, "/**/", vec![RecursiveZeroOrMore]); + syntax!(rseq5, "a/**/b", vec![ + Literal('a'), RecursiveZeroOrMore, Literal('b'), + ]); + syntax!(cls1, "[a]", vec![class('a', 'a')]); + syntax!(cls2, "[!a]", vec![classn('a', 'a')]); + syntax!(cls3, "[a-z]", vec![class('a', 'z')]); + syntax!(cls4, "[!a-z]", vec![classn('a', 'z')]); + syntax!(cls5, "[-]", vec![class('-', '-')]); + syntax!(cls6, "[]]", vec![class(']', ']')]); + syntax!(cls7, "[*]", vec![class('*', '*')]); + syntax!(cls8, "[!!]", vec![classn('!', '!')]); + syntax!(cls9, "[a-]", vec![rclass(&[('a', 'a'), ('-', '-')])]); + syntax!(cls10, "[-a-z]", vec![rclass(&[('-', '-'), ('a', 'z')])]); + syntax!(cls11, "[a-z-]", vec![rclass(&[('a', 'z'), ('-', '-')])]); + syntax!(cls12, "[-a-z-]", vec![ + rclass(&[('-', '-'), ('a', 'z'), ('-', '-')]), + ]); + syntax!(cls13, "[]-z]", vec![class(']', 'z')]); + syntax!(cls14, "[--z]", vec![class('-', 'z')]); + syntax!(cls15, "[ --]", vec![class(' ', '-')]); + syntax!(cls16, "[0-9a-z]", vec![rclass(&[('0', '9'), ('a', 'z')])]); + syntax!(cls17, "[a-z0-9]", vec![rclass(&[('a', 'z'), ('0', '9')])]); + syntax!(cls18, "[!0-9a-z]", vec![rclassn(&[('0', '9'), ('a', 'z')])]); + syntax!(cls19, "[!a-z0-9]", vec![rclassn(&[('a', 'z'), ('0', '9')])]); + + syntaxerr!(err_rseq1, "a**", Error::InvalidRecursive); + syntaxerr!(err_rseq2, "**a", Error::InvalidRecursive); + syntaxerr!(err_rseq3, "a**b", Error::InvalidRecursive); + syntaxerr!(err_rseq4, "***", Error::InvalidRecursive); + syntaxerr!(err_rseq5, "/a**", Error::InvalidRecursive); + syntaxerr!(err_rseq6, "/**a", Error::InvalidRecursive); + syntaxerr!(err_rseq7, "/a**b", Error::InvalidRecursive); + syntaxerr!(err_unclosed1, "[", Error::UnclosedClass); + syntaxerr!(err_unclosed2, "[]", Error::UnclosedClass); + syntaxerr!(err_unclosed3, "[!", Error::UnclosedClass); + 
syntaxerr!(err_unclosed4, "[!]", Error::UnclosedClass); + syntaxerr!(err_range1, "[z-a]", Error::InvalidRange('z', 'a')); + syntaxerr!(err_range2, "[z--]", Error::InvalidRange('z', '-')); + + const CASEI: Options = Options { + casei: true, + litsep: false, + }; + const SLASHLIT: Options = Options { + casei: false, + litsep: true, + }; + + toregex!(re_casei, "a", "(?i)^a$", &CASEI); + + toregex!(re_slash1, "?", r"^[^/\\]$", SLASHLIT); + toregex!(re_slash2, "*", r"^[^/\\]*$", SLASHLIT); + + toregex!(re1, "a", "^a$"); + toregex!(re2, "?", "^.$"); + toregex!(re3, "*", "^.*$"); + toregex!(re4, "a?", "^a.$"); + toregex!(re5, "?a", "^.a$"); + toregex!(re6, "a*", "^a.*$"); + toregex!(re7, "*a", "^.*a$"); + toregex!(re8, "[*]", r"^[\*]$"); + toregex!(re9, "[+]", r"^[\+]$"); + toregex!(re10, "+", r"^\+$"); + toregex!(re11, "**", r"^.*$"); + + matches!(match1, "a", "a"); + matches!(match2, "a*b", "a_b"); + matches!(match3, "a*b*c", "abc"); + matches!(match4, "a*b*c", "a_b_c"); + matches!(match5, "a*b*c", "a___b___c"); + matches!(match6, "abc*abc*abc", "abcabcabcabcabcabcabc"); + matches!(match7, "a*a*a*a*a*a*a*a*a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); + matches!(match8, "a*b[xyz]c*d", "abxcdbxcddd"); + matches!(match9, "*.rs", ".rs"); + + matches!(matchrec1, "some/**/needle.txt", "some/needle.txt"); + matches!(matchrec2, "some/**/needle.txt", "some/one/needle.txt"); + matches!(matchrec3, "some/**/needle.txt", "some/one/two/needle.txt"); + matches!(matchrec4, "some/**/needle.txt", "some/other/needle.txt"); + matches!(matchrec5, "**", "abcde"); + matches!(matchrec6, "**", ""); + matches!(matchrec7, "**", ".asdf"); + matches!(matchrec8, "**", "/x/.asdf"); + matches!(matchrec9, "some/**/**/needle.txt", "some/needle.txt"); + matches!(matchrec10, "some/**/**/needle.txt", "some/one/needle.txt"); + matches!(matchrec11, "some/**/**/needle.txt", "some/one/two/needle.txt"); + matches!(matchrec12, "some/**/**/needle.txt", "some/other/needle.txt"); + matches!(matchrec13, "**/test", "one/two/test"); + matches!(matchrec14, "**/test", "one/test"); + matches!(matchrec15, "**/test", "test"); + matches!(matchrec16, "/**/test", "/one/two/test"); + matches!(matchrec17, "/**/test", "/one/test"); + matches!(matchrec18, "/**/test", "/test"); + matches!(matchrec19, "**/.*", ".abc"); + matches!(matchrec20, "**/.*", "abc/.abc"); + matches!(matchrec21, ".*/**", ".abc"); + matches!(matchrec22, ".*/**", ".abc/abc"); + matches!(matchrec23, "foo/**", "foo"); + matches!(matchrec24, "**/foo/bar", "foo/bar"); + + matches!(matchrange1, "a[0-9]b", "a0b"); + matches!(matchrange2, "a[0-9]b", "a9b"); + matches!(matchrange3, "a[!0-9]b", "a_b"); + matches!(matchrange4, "[a-z123]", "1"); + matches!(matchrange5, "[1a-z23]", "1"); + matches!(matchrange6, "[123a-z]", "1"); + matches!(matchrange7, "[abc-]", "-"); + matches!(matchrange8, "[-abc]", "-"); + matches!(matchrange9, "[-a-c]", "b"); + matches!(matchrange10, "[a-c-]", "b"); + matches!(matchrange11, "[-]", "-"); + + matches!(matchpat1, "*hello.txt", "hello.txt"); + matches!(matchpat2, "*hello.txt", "gareth_says_hello.txt"); + matches!(matchpat3, "*hello.txt", "some/path/to/hello.txt"); + matches!(matchpat4, "*hello.txt", "some\\path\\to\\hello.txt"); + matches!(matchpat5, "*hello.txt", "/an/absolute/path/to/hello.txt"); + matches!(matchpat6, "*some/path/to/hello.txt", "some/path/to/hello.txt"); + matches!(matchpat7, "*some/path/to/hello.txt", + "a/bigger/some/path/to/hello.txt"); + + matches!(matchescape, "_[[]_[]]_[?]_[*]_!_", "_[_]_?_*_!_"); + + matches!(matchcasei1, "aBcDeFg", 
"aBcDeFg", CASEI); + matches!(matchcasei2, "aBcDeFg", "abcdefg", CASEI); + matches!(matchcasei3, "aBcDeFg", "ABCDEFG", CASEI); + matches!(matchcasei4, "aBcDeFg", "AbCdEfG", CASEI); + + matches!(matchalt1, "a,b", "a,b"); + matches!(matchalt2, ",", ","); + matches!(matchalt3, "{a,b}", "a"); + matches!(matchalt4, "{a,b}", "b"); + matches!(matchalt5, "{**/src/**,foo}", "abc/src/bar"); + matches!(matchalt6, "{**/src/**,foo}", "foo"); + matches!(matchalt7, "{[}],foo}", "}"); + matches!(matchalt8, "{foo}", "foo"); + matches!(matchalt9, "{}", ""); + matches!(matchalt10, "{,}", ""); + matches!(matchalt11, "{*.foo,*.bar,*.wat}", "test.foo"); + matches!(matchalt12, "{*.foo,*.bar,*.wat}", "test.bar"); + matches!(matchalt13, "{*.foo,*.bar,*.wat}", "test.wat"); + + matches!(matchslash1, "abc/def", "abc/def", SLASHLIT); + nmatches!(matchslash2, "abc?def", "abc/def", SLASHLIT); + nmatches!(matchslash2_win, "abc?def", "abc\\def", SLASHLIT); + nmatches!(matchslash3, "abc*def", "abc/def", SLASHLIT); + matches!(matchslash4, "abc[/]def", "abc/def", SLASHLIT); // differs + + nmatches!(matchnot1, "a*b*c", "abcd"); + nmatches!(matchnot2, "abc*abc*abc", "abcabcabcabcabcabcabca"); + nmatches!(matchnot3, "some/**/needle.txt", "some/other/notthis.txt"); + nmatches!(matchnot4, "some/**/**/needle.txt", "some/other/notthis.txt"); + nmatches!(matchnot5, "/**/test", "test"); + nmatches!(matchnot6, "/**/test", "/one/notthis"); + nmatches!(matchnot7, "/**/test", "/notthis"); + nmatches!(matchnot8, "**/.*", "ab.c"); + nmatches!(matchnot9, "**/.*", "abc/ab.c"); + nmatches!(matchnot10, ".*/**", "a.bc"); + nmatches!(matchnot11, ".*/**", "abc/a.bc"); + nmatches!(matchnot12, "a[0-9]b", "a_b"); + nmatches!(matchnot13, "a[!0-9]b", "a0b"); + nmatches!(matchnot14, "a[!0-9]b", "a9b"); + nmatches!(matchnot15, "[!-]", "-"); + nmatches!(matchnot16, "*hello.txt", "hello.txt-and-then-some"); + nmatches!(matchnot17, "*hello.txt", "goodbye.txt"); + nmatches!(matchnot18, "*some/path/to/hello.txt", + "some/path/to/hello.txt-and-then-some"); + nmatches!(matchnot19, "*some/path/to/hello.txt", + "some/other/path/to/hello.txt"); + nmatches!(matchnot20, "a", "foo/a"); + nmatches!(matchnot21, "./foo", "foo"); + nmatches!(matchnot22, "**/foo", "foofoo"); + nmatches!(matchnot23, "**/foo/bar", "foofoo/bar"); + nmatches!(matchnot24, "/*.c", "mozilla-sha1/sha1.c"); + nmatches!(matchnot25, "*.c", "mozilla-sha1/sha1.c", SLASHLIT); + nmatches!(matchnot26, "**/m4/ltoptions.m4", + "csharp/src/packages/repositories.config", SLASHLIT); + + macro_rules! extract { + ($which:ident, $name:ident, $pat:expr, $expect:expr) => { + extract!($which, $name, $pat, $expect, Options::default()); + }; + ($which:ident, $name:ident, $pat:expr, $expect:expr, $opts:expr) => { + #[test] + fn $name() { + let pat = PatternBuilder::new($pat) + .case_insensitive($opts.casei) + .literal_separator($opts.litsep) + .build().unwrap(); + assert_eq!($expect, pat.$which()); + } + }; + } + + macro_rules! literal { + ($($tt:tt)*) => { extract!(literal, $($tt)*); } + } + + macro_rules! basetokens { + ($($tt:tt)*) => { extract!(basename_tokens, $($tt)*); } + } + + macro_rules! ext { + ($($tt:tt)*) => { extract!(ext, $($tt)*); } + } + + macro_rules! required_ext { + ($($tt:tt)*) => { extract!(required_ext, $($tt)*); } + } + + macro_rules! prefix { + ($($tt:tt)*) => { extract!(prefix, $($tt)*); } + } + + macro_rules! suffix { + ($($tt:tt)*) => { extract!(suffix, $($tt)*); } + } + + macro_rules! 
baseliteral { + ($($tt:tt)*) => { extract!(basename_literal, $($tt)*); } + } + + literal!(extract_lit1, "foo", Some(s("foo"))); + literal!(extract_lit2, "foo", None, CASEI); + literal!(extract_lit3, "/foo", Some(s("/foo"))); + literal!(extract_lit4, "/foo/", Some(s("/foo/"))); + literal!(extract_lit5, "/foo/bar", Some(s("/foo/bar"))); + literal!(extract_lit6, "*.foo", None); + literal!(extract_lit7, "foo/bar", Some(s("foo/bar"))); + literal!(extract_lit8, "**/foo/bar", None); + + basetokens!(extract_basetoks1, "**/foo", Some(&*vec![ + Literal('f'), Literal('o'), Literal('o'), + ])); + basetokens!(extract_basetoks2, "**/foo", None, CASEI); + basetokens!(extract_basetoks3, "**/foo", Some(&*vec![ + Literal('f'), Literal('o'), Literal('o'), + ]), SLASHLIT); + basetokens!(extract_basetoks4, "*foo", None, SLASHLIT); + basetokens!(extract_basetoks5, "*foo", None); + basetokens!(extract_basetoks6, "**/fo*o", None); + basetokens!(extract_basetoks7, "**/fo*o", Some(&*vec![ + Literal('f'), Literal('o'), ZeroOrMore, Literal('o'), + ]), SLASHLIT); + + ext!(extract_ext1, "**/*.rs", Some(os(".rs"))); + ext!(extract_ext2, "**/*.rs.bak", None); + ext!(extract_ext3, "*.rs", Some(os(".rs"))); + ext!(extract_ext4, "a*.rs", None); + ext!(extract_ext5, "/*.c", None); + ext!(extract_ext6, "*.c", None, SLASHLIT); + ext!(extract_ext7, "*.c", Some(os(".c"))); + + required_ext!(extract_req_ext1, "*.rs", Some(os(".rs"))); + required_ext!(extract_req_ext2, "/foo/bar/*.rs", Some(os(".rs"))); + required_ext!(extract_req_ext3, "/foo/bar/*.rs", Some(os(".rs"))); + required_ext!(extract_req_ext4, "/foo/bar/.rs", Some(os(".rs"))); + required_ext!(extract_req_ext5, ".rs", Some(os(".rs"))); + required_ext!(extract_req_ext6, "./rs", None); + required_ext!(extract_req_ext7, "foo", None); + required_ext!(extract_req_ext8, ".foo/", None); + required_ext!(extract_req_ext9, "foo/", None); + + prefix!(extract_prefix1, "/foo", Some(s("/foo"))); + prefix!(extract_prefix2, "/foo/*", Some(s("/foo/"))); + prefix!(extract_prefix3, "**/foo", None); + prefix!(extract_prefix4, "foo/**", None); + + suffix!(extract_suffix1, "**/foo/bar", Some((s("/foo/bar"), true))); + suffix!(extract_suffix2, "*/foo/bar", Some((s("/foo/bar"), false))); + suffix!(extract_suffix3, "*/foo/bar", None, SLASHLIT); + suffix!(extract_suffix4, "foo/bar", Some((s("foo/bar"), false))); + suffix!(extract_suffix5, "*.foo", Some((s(".foo"), false))); + suffix!(extract_suffix6, "*.foo", None, SLASHLIT); + suffix!(extract_suffix7, "**/*_test", Some((s("_test"), false))); + + baseliteral!(extract_baselit1, "**/foo", Some(s("foo"))); + baseliteral!(extract_baselit2, "foo", None); + baseliteral!(extract_baselit3, "*foo", None); + baseliteral!(extract_baselit4, "*/foo", None); +} diff --git a/src/gitignore.rs b/src/gitignore.rs index 6191f0b5..5e07531d 100644 --- a/src/gitignore.rs +++ b/src/gitignore.rs @@ -28,7 +28,7 @@ use std::fs::File; use std::io::{self, BufRead}; use std::path::{Path, PathBuf}; -use globset; +use globset::{self, PatternBuilder, Set, SetBuilder}; use regex; use pathutil::{is_file_name, strip_prefix}; @@ -82,7 +82,7 @@ impl From for Error { /// Gitignore is a matcher for the glob patterns in a single gitignore file. #[derive(Clone, Debug)] pub struct Gitignore { - set: globset::Set, + set: Set, root: PathBuf, patterns: Vec, num_ignores: u64, @@ -207,7 +207,7 @@ impl<'a> Match<'a> { /// GitignoreBuilder constructs a matcher for a single set of globs from a /// .gitignore file. 
pub struct GitignoreBuilder { - builder: globset::SetBuilder, + builder: SetBuilder, root: PathBuf, patterns: Vec, } @@ -237,7 +237,7 @@ impl GitignoreBuilder { pub fn new>(root: P) -> GitignoreBuilder { let root = strip_prefix("./", root.as_ref()).unwrap_or(root.as_ref()); GitignoreBuilder { - builder: globset::SetBuilder::new(), + builder: SetBuilder::new(), root: root.to_path_buf(), patterns: vec![], } @@ -261,6 +261,7 @@ impl GitignoreBuilder { /// Add each pattern line from the file path given. pub fn add_path>(&mut self, path: P) -> Result<(), Error> { let rdr = io::BufReader::new(try!(File::open(&path))); + debug!("gitignore: {}", path.as_ref().display()); for line in rdr.lines() { try!(self.add(&path, &try!(line))); } @@ -299,7 +300,7 @@ impl GitignoreBuilder { whitelist: false, only_dir: false, }; - let mut opts = globset::MatchOptions::default(); + let mut literal_separator = false; let has_slash = line.chars().any(|c| c == '/'); let is_absolute = line.chars().nth(0).unwrap() == '/'; if line.starts_with("\\!") || line.starts_with("\\#") { @@ -314,7 +315,7 @@ impl GitignoreBuilder { // then the glob can only match the beginning of a path // (relative to the location of gitignore). We achieve this by // simply banning wildcards from matching /. - opts.require_literal_separator = true; + literal_separator = true; line = &line[1..]; } } @@ -330,7 +331,7 @@ impl GitignoreBuilder { // doesn't let wildcards match slashes. pat.pat = line.to_string(); if has_slash { - opts.require_literal_separator = true; + literal_separator = true; } // If there was a leading slash, then this is a pattern that must // match the entire path name. Otherwise, we should let it match @@ -347,7 +348,11 @@ impl GitignoreBuilder { if pat.pat.ends_with("/**") { pat.pat = format!("{}/*", pat.pat); } - try!(self.builder.add_with(&pat.pat, &opts)); + let parsed = try!( + PatternBuilder::new(&pat.pat) + .literal_separator(literal_separator) + .build()); + self.builder.add(parsed); self.patterns.push(pat); Ok(()) } @@ -429,6 +434,9 @@ mod tests { not_ignored!(ignot11, ROOT, "#foo", "#foo"); not_ignored!(ignot12, ROOT, "\n\n\n", "foo"); not_ignored!(ignot13, ROOT, "foo/**", "foo", true); + not_ignored!( + ignot14, "./third_party/protobuf", "m4/ltoptions.m4", + "./third_party/protobuf/csharp/src/packages/repositories.config"); // See: https://github.com/BurntSushi/ripgrep/issues/106 #[test] diff --git a/src/types.rs b/src/types.rs index 90b83391..af2a857d 100644 --- a/src/types.rs +++ b/src/types.rs @@ -11,7 +11,7 @@ use std::path::Path; use regex; use gitignore::{Match, Pattern}; -use globset::{self, MatchOptions}; +use globset::{self, PatternBuilder, Set, SetBuilder}; const TYPE_EXTENSIONS: &'static [(&'static str, &'static [&'static str])] = &[ ("asm", &["*.asm", "*.s", "*.S"]), @@ -161,8 +161,8 @@ impl FileTypeDef { #[derive(Clone, Debug)] pub struct Types { defs: Vec, - selected: Option, - negated: Option, + selected: Option, + negated: Option, has_selected: bool, unmatched_pat: Pattern, } @@ -175,8 +175,8 @@ impl Types { /// If has_selected is true, then at least one file type was selected. /// Therefore, any non-matches should be ignored. fn new( - selected: Option, - negated: Option, + selected: Option, + negated: Option, has_selected: bool, defs: Vec, ) -> Types { @@ -265,14 +265,11 @@ impl TypesBuilder { /// Build the current set of file type definitions *and* selections into /// a file type matcher. 
pub fn build(&self) -> Result { - let opts = MatchOptions { - require_literal_separator: true, ..MatchOptions::default() - }; let selected_globs = if self.selected.is_empty() { None } else { - let mut bset = globset::SetBuilder::new(); + let mut bset = SetBuilder::new(); for name in &self.selected { let globs = match self.types.get(name) { Some(globs) => globs, @@ -282,16 +279,19 @@ impl TypesBuilder { } }; for glob in globs { - try!(bset.add_with(glob, &opts)); + let pat = try!( + PatternBuilder::new(glob) + .literal_separator(true).build()); + bset.add(pat); } } - Some(try!(bset.build_yesno())) + Some(try!(bset.build())) }; let negated_globs = if self.negated.is_empty() { None } else { - let mut bset = globset::SetBuilder::new(); + let mut bset = SetBuilder::new(); for name in &self.negated { let globs = match self.types.get(name) { Some(globs) => globs, @@ -301,10 +301,13 @@ impl TypesBuilder { } }; for glob in globs { - try!(bset.add_with(glob, &opts)); + let pat = try!( + PatternBuilder::new(glob) + .literal_separator(true).build()); + bset.add(pat); } } - Some(try!(bset.build_yesno())) + Some(try!(bset.build())) }; Ok(Types::new( selected_globs, diff --git a/tests/tests.rs b/tests/tests.rs index 62fc55a0..d27db8ce 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -659,7 +659,6 @@ clean!(regression_30, "test", ".", |wd: WorkDir, mut cmd: Command| { } wd.create_dir("vendor"); wd.create("vendor/manifest", "test"); - cmd.arg("--debug"); let lines: String = wd.stdout(&mut cmd); let expected = path("vendor/manifest:test\n");
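
For orientation, here is a minimal, hypothetical sketch (not part of the diff) of how the builder API introduced by this change fits together, mirroring the usage shown in the pattern tests, `gitignore.rs`, and `types.rs` above; the glob strings and paths are illustrative only.

```rust
extern crate globset;

use globset::{Pattern, PatternBuilder, SetBuilder};

fn main() {
    // Single glob: parse once, then match many paths with the compiled matcher.
    let pat = Pattern::new("*.rs").unwrap();
    let matcher = pat.compile_matcher();
    assert!(matcher.is_match("src/lib.rs"));

    // Gitignore-style semantics: with literal_separator enabled, `*` and `?`
    // no longer match `/`, which is how gitignore.rs and types.rs build
    // their patterns in this diff.
    let pat = PatternBuilder::new("src/**/*.rs")
        .literal_separator(true)
        .build()
        .unwrap();

    // A Set answers "does any glob match?" in a single pass over the path.
    let mut builder = SetBuilder::new();
    builder.add(pat);
    let set = builder.build().unwrap();
    assert!(set.is_match("src/glob/pattern.rs"));
    assert!(!set.is_match("benches/bench.rs"));
}
```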