mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-05-19 09:40:22 -07:00
grep: add "perfect" smart case detection
This commit removes the previous smart case detection logic and replaces it with detection based on the regex AST. This particular AST is a faithful representation of the concrete syntax, which lets us be very precise in how we handle it. Closes #851
This commit is contained in:
parent
cd08707c7c
commit
42b8132d0a
@ -43,6 +43,8 @@ Bug fixes:
|
|||||||
Support backslash escapes in globs.
|
Support backslash escapes in globs.
|
||||||
* [BUG #832](https://github.com/BurntSushi/ripgrep/issues/832):
|
* [BUG #832](https://github.com/BurntSushi/ripgrep/issues/832):
|
||||||
Clarify usage instructions for `-f/--file` flag.
|
Clarify usage instructions for `-f/--file` flag.
|
||||||
|
* [BUG #851](https://github.com/BurntSushi/ripgrep/issues/851):
|
||||||
|
Fix `-S/--smart-case` detection once and for all.
|
||||||
* [BUG #852](https://github.com/BurntSushi/ripgrep/issues/852):
|
* [BUG #852](https://github.com/BurntSushi/ripgrep/issues/852):
|
||||||
Be robust with respect to `ENOMEM` errors returned by `mmap`.
|
Be robust with respect to `ENOMEM` errors returned by `mmap`.
|
||||||
* [BUG #853](https://github.com/BurntSushi/ripgrep/issues/853):
|
* [BUG #853](https://github.com/BurntSushi/ripgrep/issues/853):
|
||||||
|
@ -19,6 +19,7 @@ pub use search::{Grep, GrepBuilder, Iter, Match};
|
|||||||
mod literals;
|
mod literals;
|
||||||
mod nonl;
|
mod nonl;
|
||||||
mod search;
|
mod search;
|
||||||
|
mod smart_case;
|
||||||
mod word_boundary;
|
mod word_boundary;
|
||||||
|
|
||||||
/// Result is a convenient type alias that fixes the type of the error to
|
/// Result is a convenient type alias that fixes the type of the error to
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
use memchr::{memchr, memrchr};
|
use memchr::{memchr, memrchr};
|
||||||
|
use syntax::ParserBuilder;
|
||||||
|
use syntax::hir::Hir;
|
||||||
use regex::bytes::{Regex, RegexBuilder};
|
use regex::bytes::{Regex, RegexBuilder};
|
||||||
|
|
||||||
use literals::LiteralSets;
|
use literals::LiteralSets;
|
||||||
use nonl;
|
use nonl;
|
||||||
use syntax::ParserBuilder;
|
use smart_case::Cased;
|
||||||
use syntax::hir::Hir;
|
|
||||||
use word_boundary::strip_unicode_word_boundaries;
|
use word_boundary::strip_unicode_word_boundaries;
|
||||||
use Result;
|
use Result;
|
||||||
|
|
||||||
@ -205,7 +206,11 @@ impl GrepBuilder {
|
|||||||
if !self.opts.case_smart {
|
if !self.opts.case_smart {
|
||||||
return Ok(false);
|
return Ok(false);
|
||||||
}
|
}
|
||||||
Ok(!has_uppercase_literal(&self.pattern))
|
let cased = match Cased::from_pattern(&self.pattern) {
|
||||||
|
None => return Ok(false),
|
||||||
|
Some(cased) => cased,
|
||||||
|
};
|
||||||
|
Ok(cased.any_literal && !cased.any_uppercase)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -311,44 +316,15 @@ impl<'b, 's> Iterator for Iter<'b, 's> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Determine whether the pattern contains an uppercase character which should
|
|
||||||
/// negate the effect of the smart-case option.
|
|
||||||
///
|
|
||||||
/// Ideally we would be able to check the AST in order to correctly handle
|
|
||||||
/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
|
|
||||||
/// cased), but we don't currently have that option. For now, our 'good enough'
|
|
||||||
/// solution is to simply perform a semi-naïve scan of the input pattern and
|
|
||||||
/// ignore all characters following a '\'. The ExprBuilder will handle any
|
|
||||||
/// actual errors, and this at least lets us support the most common cases,
|
|
||||||
/// like 'foo\w' and 'foo\S', in an intuitive manner.
|
|
||||||
fn has_uppercase_literal(pattern: &str) -> bool {
|
|
||||||
let mut chars = pattern.chars();
|
|
||||||
while let Some(c) = chars.next() {
|
|
||||||
if c == '\\' {
|
|
||||||
chars.next();
|
|
||||||
} else if c.is_uppercase() {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
false
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
#![allow(unused_imports)]
|
|
||||||
|
|
||||||
use memchr::{memchr, memrchr};
|
use memchr::{memchr, memrchr};
|
||||||
use regex::bytes::Regex;
|
use regex::bytes::Regex;
|
||||||
|
|
||||||
use super::{GrepBuilder, Match, has_uppercase_literal};
|
use super::{GrepBuilder, Match};
|
||||||
|
|
||||||
static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");
|
static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");
|
||||||
|
|
||||||
#[allow(dead_code)]
|
|
||||||
fn s(bytes: &[u8]) -> String {
|
|
||||||
String::from_utf8(bytes.to_vec()).unwrap()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn find_lines(pat: &str, haystack: &[u8]) -> Vec<Match> {
|
fn find_lines(pat: &str, haystack: &[u8]) -> Vec<Match> {
|
||||||
let re = Regex::new(pat).unwrap();
|
let re = Regex::new(pat).unwrap();
|
||||||
let mut lines = vec![];
|
let mut lines = vec![];
|
||||||
@ -377,20 +353,4 @@ mod tests {
|
|||||||
assert_eq!(expected.len(), got.len());
|
assert_eq!(expected.len(), got.len());
|
||||||
assert_eq!(expected, got);
|
assert_eq!(expected, got);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn pattern_case() {
|
|
||||||
assert_eq!(has_uppercase_literal(&"".to_string()), false);
|
|
||||||
assert_eq!(has_uppercase_literal(&"foo".to_string()), false);
|
|
||||||
assert_eq!(has_uppercase_literal(&"Foo".to_string()), true);
|
|
||||||
assert_eq!(has_uppercase_literal(&"foO".to_string()), true);
|
|
||||||
assert_eq!(has_uppercase_literal(&"foo\\\\".to_string()), false);
|
|
||||||
assert_eq!(has_uppercase_literal(&"foo\\w".to_string()), false);
|
|
||||||
assert_eq!(has_uppercase_literal(&"foo\\S".to_string()), false);
|
|
||||||
assert_eq!(has_uppercase_literal(&"foo\\p{Ll}".to_string()), true);
|
|
||||||
assert_eq!(has_uppercase_literal(&"foo[a-z]".to_string()), false);
|
|
||||||
assert_eq!(has_uppercase_literal(&"foo[A-Z]".to_string()), true);
|
|
||||||
assert_eq!(has_uppercase_literal(&"foo[\\S\\t]".to_string()), false);
|
|
||||||
assert_eq!(has_uppercase_literal(&"foo\\\\S".to_string()), true);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
191
grep/src/smart_case.rs
Normal file
191
grep/src/smart_case.rs
Normal file
@ -0,0 +1,191 @@
|
|||||||
|
use syntax::ast::{self, Ast};
|
||||||
|
use syntax::ast::parse::Parser;
|
||||||
|
|
||||||
|
/// The results of analyzing a regex for cased literals.
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
pub struct Cased {
|
||||||
|
/// True if and only if a literal uppercase character occurs in the regex.
|
||||||
|
///
|
||||||
|
/// A regex like `\pL` contains no uppercase literals, even though `L`
|
||||||
|
/// is uppercase and the `\pL` class contains uppercase characters.
|
||||||
|
pub any_uppercase: bool,
|
||||||
|
/// True if and only if the regex contains any literal at all. A regex like
|
||||||
|
/// `\pL` has this set to false.
|
||||||
|
pub any_literal: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Cased {
|
||||||
|
/// Returns a `Cased` value by doing analysis on the AST of `pattern`.
|
||||||
|
///
|
||||||
|
/// If `pattern` is not a valid regular expression, then `None` is
|
||||||
|
/// returned.
|
||||||
|
pub fn from_pattern(pattern: &str) -> Option<Cased> {
|
||||||
|
Parser::new()
|
||||||
|
.parse(pattern)
|
||||||
|
.map(|ast| Cased::from_ast(&ast))
|
||||||
|
.ok()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn from_ast(ast: &Ast) -> Cased {
|
||||||
|
let mut cased = Cased::default();
|
||||||
|
cased.from_ast_impl(ast);
|
||||||
|
cased
|
||||||
|
}
|
||||||
|
|
||||||
|
fn from_ast_impl(&mut self, ast: &Ast) {
|
||||||
|
if self.done() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
match *ast {
|
||||||
|
Ast::Empty(_)
|
||||||
|
| Ast::Flags(_)
|
||||||
|
| Ast::Dot(_)
|
||||||
|
| Ast::Assertion(_)
|
||||||
|
| Ast::Class(ast::Class::Unicode(_))
|
||||||
|
| Ast::Class(ast::Class::Perl(_)) => {}
|
||||||
|
Ast::Literal(ref x) => {
|
||||||
|
self.from_ast_literal(x);
|
||||||
|
}
|
||||||
|
Ast::Class(ast::Class::Bracketed(ref x)) => {
|
||||||
|
self.from_ast_class_set(&x.kind);
|
||||||
|
}
|
||||||
|
Ast::Repetition(ref x) => {
|
||||||
|
self.from_ast_impl(&x.ast);
|
||||||
|
}
|
||||||
|
Ast::Group(ref x) => {
|
||||||
|
self.from_ast_impl(&x.ast);
|
||||||
|
}
|
||||||
|
Ast::Alternation(ref alt) => {
|
||||||
|
for x in &alt.asts {
|
||||||
|
self.from_ast_impl(x);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ast::Concat(ref alt) => {
|
||||||
|
for x in &alt.asts {
|
||||||
|
self.from_ast_impl(x);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn from_ast_class_set(&mut self, ast: &ast::ClassSet) {
|
||||||
|
if self.done() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
match *ast {
|
||||||
|
ast::ClassSet::Item(ref item) => {
|
||||||
|
self.from_ast_class_set_item(item);
|
||||||
|
}
|
||||||
|
ast::ClassSet::BinaryOp(ref x) => {
|
||||||
|
self.from_ast_class_set(&x.lhs);
|
||||||
|
self.from_ast_class_set(&x.rhs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) {
|
||||||
|
if self.done() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
match *ast {
|
||||||
|
ast::ClassSetItem::Empty(_)
|
||||||
|
| ast::ClassSetItem::Ascii(_)
|
||||||
|
| ast::ClassSetItem::Unicode(_)
|
||||||
|
| ast::ClassSetItem::Perl(_) => {}
|
||||||
|
ast::ClassSetItem::Literal(ref x) => {
|
||||||
|
self.from_ast_literal(x);
|
||||||
|
}
|
||||||
|
ast::ClassSetItem::Range(ref x) => {
|
||||||
|
self.from_ast_literal(&x.start);
|
||||||
|
self.from_ast_literal(&x.end);
|
||||||
|
}
|
||||||
|
ast::ClassSetItem::Bracketed(ref x) => {
|
||||||
|
self.from_ast_class_set(&x.kind);
|
||||||
|
}
|
||||||
|
ast::ClassSetItem::Union(ref union) => {
|
||||||
|
for x in &union.items {
|
||||||
|
self.from_ast_class_set_item(x);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn from_ast_literal(&mut self, ast: &ast::Literal) {
|
||||||
|
self.any_literal = true;
|
||||||
|
self.any_uppercase = self.any_uppercase || ast.c.is_uppercase();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true if and only if the attributes can never change no matter
|
||||||
|
/// what other AST it might see.
|
||||||
|
fn done(&self) -> bool {
|
||||||
|
self.any_uppercase && self.any_literal
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
fn cased(pattern: &str) -> Cased {
|
||||||
|
Cased::from_pattern(pattern).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn various() {
|
||||||
|
let x = cased("");
|
||||||
|
assert!(!x.any_uppercase);
|
||||||
|
assert!(!x.any_literal);
|
||||||
|
|
||||||
|
let x = cased("foo");
|
||||||
|
assert!(!x.any_uppercase);
|
||||||
|
assert!(x.any_literal);
|
||||||
|
|
||||||
|
let x = cased("Foo");
|
||||||
|
assert!(x.any_uppercase);
|
||||||
|
assert!(x.any_literal);
|
||||||
|
|
||||||
|
let x = cased("foO");
|
||||||
|
assert!(x.any_uppercase);
|
||||||
|
assert!(x.any_literal);
|
||||||
|
|
||||||
|
let x = cased(r"foo\\");
|
||||||
|
assert!(!x.any_uppercase);
|
||||||
|
assert!(x.any_literal);
|
||||||
|
|
||||||
|
let x = cased(r"foo\w");
|
||||||
|
assert!(!x.any_uppercase);
|
||||||
|
assert!(x.any_literal);
|
||||||
|
|
||||||
|
let x = cased(r"foo\S");
|
||||||
|
assert!(!x.any_uppercase);
|
||||||
|
assert!(x.any_literal);
|
||||||
|
|
||||||
|
let x = cased(r"foo\p{Ll}");
|
||||||
|
assert!(!x.any_uppercase);
|
||||||
|
assert!(x.any_literal);
|
||||||
|
|
||||||
|
let x = cased(r"foo[a-z]");
|
||||||
|
assert!(!x.any_uppercase);
|
||||||
|
assert!(x.any_literal);
|
||||||
|
|
||||||
|
let x = cased(r"foo[A-Z]");
|
||||||
|
assert!(x.any_uppercase);
|
||||||
|
assert!(x.any_literal);
|
||||||
|
|
||||||
|
let x = cased(r"foo[\S\t]");
|
||||||
|
assert!(!x.any_uppercase);
|
||||||
|
assert!(x.any_literal);
|
||||||
|
|
||||||
|
let x = cased(r"foo\\S");
|
||||||
|
assert!(x.any_uppercase);
|
||||||
|
assert!(x.any_literal);
|
||||||
|
|
||||||
|
let x = cased(r"\p{Ll}");
|
||||||
|
assert!(!x.any_uppercase);
|
||||||
|
assert!(!x.any_literal);
|
||||||
|
|
||||||
|
let x = cased(r"aBc\w");
|
||||||
|
assert!(x.any_uppercase);
|
||||||
|
assert!(x.any_literal);
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user