mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-05-19 09:40:22 -07:00
grep: upgrade to regex-syntax 0.5
This update brings with it many bug fixes: * Better error messages are printed overall. We also include explicit call out for unsupported features like backreferences and look-around. * Regexes like `\s*{` no longer emit incomprehensible errors. * Unicode escape sequences, such as `\u{..}` are now supported. For the most part, this upgrade was done in a straight-forward way. We resist the urge to refactor the `grep` crate, in anticipation of it being rewritten anyway. Note that we removed the `--fixed-strings` suggestion whenever a regex syntax error occurs. In practice, I've found that it results in a lot of false positives, and I believe that its use is not as paramount now that regex parse errors are much more readable. Closes #268, Closes #395, Closes #702, Closes #853
This commit is contained in:
parent
c2e97cd858
commit
cd08707c7c
12
CHANGELOG.md
12
CHANGELOG.md
@ -14,11 +14,16 @@ CPUs (such as AVX2) for additional optimizations.
|
||||
per line. Previously, the behavior of ripgrep was to report the total number
|
||||
of matching lines. (Note that this behavior diverges from the behavior of
|
||||
GNU grep.)
|
||||
* Octal syntax is no longer supported. ripgrep previously accepted expressions
|
||||
like `\1` as syntax for matching `U+0001`, but ripgrep will now report an
|
||||
error instead.
|
||||
|
||||
Feature enhancements:
|
||||
|
||||
* [FEATURE #411](https://github.com/BurntSushi/ripgrep/issues/411):
|
||||
Add a `--stats` flag, which emits aggregate statistics after search results.
|
||||
* [FEATURE #702](https://github.com/BurntSushi/ripgrep/issues/702):
|
||||
Support `\u{..}` Unicode escape sequences.
|
||||
* [FEATURE #812](https://github.com/BurntSushi/ripgrep/issues/812):
|
||||
Add `-b/--byte-offset` flag that reports byte offset of each matching line.
|
||||
* [FEATURE #814](https://github.com/BurntSushi/ripgrep/issues/814):
|
||||
@ -29,12 +34,19 @@ Bug fixes:
|
||||
* [BUG #135](https://github.com/BurntSushi/ripgrep/issues/135):
|
||||
Release portable binaries that conditionally use SSSE3, AVX2, etc., at
|
||||
runtime.
|
||||
* [BUG #268](https://github.com/BurntSushi/ripgrep/issues/268):
|
||||
Print descriptive error message when trying to use look-around or
|
||||
backreferences.
|
||||
* [BUG #395](https://github.com/BurntSushi/ripgrep/issues/395):
|
||||
Show comprehensible error messages for regexes like `\s*{`.
|
||||
* [BUG #526](https://github.com/BurntSushi/ripgrep/issues/526):
|
||||
Support backslash escapes in globs.
|
||||
* [BUG #832](https://github.com/BurntSushi/ripgrep/issues/832):
|
||||
Clarify usage instructions for `-f/--file` flag.
|
||||
* [BUG #852](https://github.com/BurntSushi/ripgrep/issues/852):
|
||||
Be robust with respect to `ENOMEM` errors returned by `mmap`.
|
||||
* [BUG #853](https://github.com/BurntSushi/ripgrep/issues/853):
|
||||
Upgrade `grep` crate to `regex-syntax 0.5.0`.
|
||||
|
||||
|
||||
0.8.1 (2018-02-20)
|
||||
|
14
Cargo.lock
generated
14
Cargo.lock
generated
@ -109,7 +109,7 @@ dependencies = [
|
||||
"log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"regex 0.2.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"regex-syntax 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"regex-syntax 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -212,19 +212,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"aho-corasick 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"regex-syntax 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"regex-syntax 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"thread_local 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.5.2"
|
||||
version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"ucd-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -401,8 +396,7 @@ dependencies = [
|
||||
"checksum redox_syscall 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = "0d92eecebad22b767915e4d529f89f28ee96dbbf5a4810d2b844373f136417fd"
|
||||
"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76"
|
||||
"checksum regex 0.2.9 (registry+https://github.com/rust-lang/crates.io-index)" = "bde64a9b799f85750f6469fd658cff5fce8d910a7d510858a1f9d15ca9f023bf"
|
||||
"checksum regex-syntax 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "8e931c58b93d86f080c734bfd2bce7dd0079ae2331235818133c8be7f422e20e"
|
||||
"checksum regex-syntax 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d8337992c33b62cf49462a1ce4d0eedbb021f48de017e40ec307cca445df0ca9"
|
||||
"checksum regex-syntax 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "b2550876c31dc914696a6c2e01cbce8afba79a93c8ae979d2fe051c0230b3756"
|
||||
"checksum same-file 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "cfb6eded0b06a0b512c8ddbcf04089138c9b4362c2f696f3c3d76039d68f3637"
|
||||
"checksum simd 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3dd0805c7363ab51a829a1511ad24b6ed0349feaa756c4bc2f977f9f496e6673"
|
||||
"checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550"
|
||||
|
@ -16,4 +16,4 @@ license = "Unlicense/MIT"
|
||||
log = "0.4"
|
||||
memchr = "2"
|
||||
regex = "0.2.9"
|
||||
regex-syntax = "0.4.0"
|
||||
regex-syntax = "0.5.3"
|
||||
|
@ -10,10 +10,8 @@ principled.
|
||||
use std::cmp;
|
||||
|
||||
use regex::bytes::RegexBuilder;
|
||||
use syntax::{
|
||||
Expr, Literals, Lit,
|
||||
ByteClass, ByteRange, CharClass, ClassRange, Repeater,
|
||||
};
|
||||
use syntax::hir::{self, Hir, HirKind};
|
||||
use syntax::hir::literal::{Literal, Literals};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LiteralSets {
|
||||
@ -23,12 +21,12 @@ pub struct LiteralSets {
|
||||
}
|
||||
|
||||
impl LiteralSets {
|
||||
pub fn create(expr: &Expr) -> Self {
|
||||
pub fn create(expr: &Hir) -> Self {
|
||||
let mut required = Literals::empty();
|
||||
union_required(expr, &mut required);
|
||||
LiteralSets {
|
||||
prefixes: expr.prefixes(),
|
||||
suffixes: expr.suffixes(),
|
||||
prefixes: Literals::prefixes(expr),
|
||||
suffixes: Literals::suffixes(expr),
|
||||
required: required,
|
||||
}
|
||||
}
|
||||
@ -93,60 +91,52 @@ impl LiteralSets {
|
||||
}
|
||||
}
|
||||
|
||||
fn union_required(expr: &Expr, lits: &mut Literals) {
|
||||
use syntax::Expr::*;
|
||||
match *expr {
|
||||
Literal { ref chars, casei: false } => {
|
||||
let s: String = chars.iter().cloned().collect();
|
||||
lits.cross_add(s.as_bytes());
|
||||
fn union_required(expr: &Hir, lits: &mut Literals) {
|
||||
match *expr.kind() {
|
||||
HirKind::Literal(hir::Literal::Unicode(c)) => {
|
||||
let mut buf = [0u8; 4];
|
||||
lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
|
||||
}
|
||||
Literal { ref chars, casei: true } => {
|
||||
for &c in chars {
|
||||
let cls = CharClass::new(vec![
|
||||
ClassRange { start: c, end: c },
|
||||
]).case_fold();
|
||||
if !lits.add_char_class(&cls) {
|
||||
lits.cut();
|
||||
return;
|
||||
HirKind::Literal(hir::Literal::Byte(b)) => {
|
||||
lits.cross_add(&[b]);
|
||||
}
|
||||
}
|
||||
}
|
||||
LiteralBytes { ref bytes, casei: false } => {
|
||||
lits.cross_add(bytes);
|
||||
}
|
||||
LiteralBytes { ref bytes, casei: true } => {
|
||||
for &b in bytes {
|
||||
let cls = ByteClass::new(vec![
|
||||
ByteRange { start: b, end: b },
|
||||
]).case_fold();
|
||||
if !lits.add_byte_class(&cls) {
|
||||
lits.cut();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
Class(_) => {
|
||||
HirKind::Class(hir::Class::Unicode(ref cls)) => {
|
||||
if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) {
|
||||
lits.cut();
|
||||
}
|
||||
ClassBytes(_) => {
|
||||
}
|
||||
HirKind::Class(hir::Class::Bytes(ref cls)) => {
|
||||
if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) {
|
||||
lits.cut();
|
||||
}
|
||||
Group { ref e, .. } => {
|
||||
union_required(&**e, lits);
|
||||
}
|
||||
Repeat { r: Repeater::ZeroOrOne, .. } => lits.cut(),
|
||||
Repeat { r: Repeater::ZeroOrMore, .. } => lits.cut(),
|
||||
Repeat { ref e, r: Repeater::OneOrMore, .. } => {
|
||||
union_required(&**e, lits);
|
||||
HirKind::Group(hir::Group { ref hir, .. }) => {
|
||||
union_required(&**hir, lits);
|
||||
}
|
||||
HirKind::Repetition(ref x) => {
|
||||
match x.kind {
|
||||
hir::RepetitionKind::ZeroOrOne => lits.cut(),
|
||||
hir::RepetitionKind::ZeroOrMore => lits.cut(),
|
||||
hir::RepetitionKind::OneOrMore => {
|
||||
union_required(&x.hir, lits);
|
||||
lits.cut();
|
||||
}
|
||||
Repeat { ref e, r: Repeater::Range { min, max }, greedy } => {
|
||||
hir::RepetitionKind::Range(ref rng) => {
|
||||
let (min, max) = match *rng {
|
||||
hir::RepetitionRange::Exactly(m) => (m, Some(m)),
|
||||
hir::RepetitionRange::AtLeast(m) => (m, None),
|
||||
hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
|
||||
};
|
||||
repeat_range_literals(
|
||||
&**e, min, max, greedy, lits, union_required);
|
||||
&x.hir, min, max, x.greedy, lits, union_required);
|
||||
}
|
||||
Concat(ref es) if es.is_empty() => {}
|
||||
Concat(ref es) if es.len() == 1 => union_required(&es[0], lits),
|
||||
Concat(ref es) => {
|
||||
}
|
||||
}
|
||||
HirKind::Concat(ref es) if es.is_empty() => {}
|
||||
HirKind::Concat(ref es) if es.len() == 1 => {
|
||||
union_required(&es[0], lits)
|
||||
}
|
||||
HirKind::Concat(ref es) => {
|
||||
for e in es {
|
||||
let mut lits2 = lits.to_empty();
|
||||
union_required(e, &mut lits2);
|
||||
@ -157,7 +147,6 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
|
||||
if lits2.contains_empty() {
|
||||
lits.cut();
|
||||
}
|
||||
// if !lits.union(lits2) {
|
||||
if !lits.cross_product(&lits2) {
|
||||
// If this expression couldn't yield any literal that
|
||||
// could be extended, then we need to quit. Since we're
|
||||
@ -167,15 +156,15 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
|
||||
}
|
||||
}
|
||||
}
|
||||
Alternate(ref es) => {
|
||||
HirKind::Alternation(ref es) => {
|
||||
alternate_literals(es, lits, union_required);
|
||||
}
|
||||
_ => lits.cut(),
|
||||
}
|
||||
}
|
||||
|
||||
fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
|
||||
e: &Expr,
|
||||
fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
|
||||
e: &Hir,
|
||||
min: u32,
|
||||
max: Option<u32>,
|
||||
_greedy: bool,
|
||||
@ -204,8 +193,8 @@ fn repeat_range_literals<F: FnMut(&Expr, &mut Literals)>(
|
||||
}
|
||||
}
|
||||
|
||||
fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
|
||||
es: &[Expr],
|
||||
fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
|
||||
es: &[Hir],
|
||||
lits: &mut Literals,
|
||||
mut f: F,
|
||||
) {
|
||||
@ -234,11 +223,21 @@ fn alternate_literals<F: FnMut(&Expr, &mut Literals)>(
|
||||
}
|
||||
lits.cut();
|
||||
if !lcs.is_empty() {
|
||||
lits.add(Lit::empty());
|
||||
lits.add(Lit::new(lcs.to_vec()));
|
||||
lits.add(Literal::empty());
|
||||
lits.add(Literal::new(lcs.to_vec()));
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the number of characters in the given class.
|
||||
fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
|
||||
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
|
||||
}
|
||||
|
||||
/// Return the number of bytes in the given class.
|
||||
fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
|
||||
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
|
||||
}
|
||||
|
||||
/// Converts an arbitrary sequence of bytes to a literal suitable for building
|
||||
/// a regular expression.
|
||||
fn bytes_to_regex(bs: &[u8]) -> String {
|
||||
|
@ -1,4 +1,4 @@
|
||||
use syntax::Expr;
|
||||
use syntax::hir::{self, Hir, HirKind};
|
||||
|
||||
use {Error, Result};
|
||||
|
||||
@ -9,59 +9,66 @@ use {Error, Result};
|
||||
///
|
||||
/// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this
|
||||
/// function panics.
|
||||
pub fn remove(expr: Expr, byte: u8) -> Result<Expr> {
|
||||
// TODO(burntsushi): There is a bug in this routine where only `\n` is
|
||||
// handled correctly. Namely, `AnyChar` and `AnyByte` need to be translated
|
||||
// to proper character classes instead of the special `AnyCharNoNL` and
|
||||
// `AnyByteNoNL` classes.
|
||||
use syntax::Expr::*;
|
||||
pub fn remove(expr: Hir, byte: u8) -> Result<Hir> {
|
||||
assert!(byte <= 0x7F);
|
||||
let chr = byte as char;
|
||||
assert!(chr.len_utf8() == 1);
|
||||
|
||||
Ok(match expr {
|
||||
Literal { chars, casei } => {
|
||||
if chars.iter().position(|&c| c == chr).is_some() {
|
||||
Ok(match expr.into_kind() {
|
||||
HirKind::Empty => Hir::empty(),
|
||||
HirKind::Literal(hir::Literal::Unicode(c)) => {
|
||||
if c == chr {
|
||||
return Err(Error::LiteralNotAllowed(chr));
|
||||
}
|
||||
Literal { chars: chars, casei: casei }
|
||||
Hir::literal(hir::Literal::Unicode(c))
|
||||
}
|
||||
LiteralBytes { bytes, casei } => {
|
||||
if bytes.iter().position(|&b| b == byte).is_some() {
|
||||
HirKind::Literal(hir::Literal::Byte(b)) => {
|
||||
if b as char == chr {
|
||||
return Err(Error::LiteralNotAllowed(chr));
|
||||
}
|
||||
LiteralBytes { bytes: bytes, casei: casei }
|
||||
Hir::literal(hir::Literal::Byte(b))
|
||||
}
|
||||
AnyChar => AnyCharNoNL,
|
||||
AnyByte => AnyByteNoNL,
|
||||
Class(mut cls) => {
|
||||
cls.remove(chr);
|
||||
Class(cls)
|
||||
HirKind::Class(hir::Class::Unicode(mut cls)) => {
|
||||
let remove = hir::ClassUnicode::new(Some(
|
||||
hir::ClassUnicodeRange::new(chr, chr),
|
||||
));
|
||||
cls.difference(&remove);
|
||||
if cls.iter().next().is_none() {
|
||||
return Err(Error::LiteralNotAllowed(chr));
|
||||
}
|
||||
ClassBytes(mut cls) => {
|
||||
cls.remove(byte);
|
||||
ClassBytes(cls)
|
||||
Hir::class(hir::Class::Unicode(cls))
|
||||
}
|
||||
Group { e, i, name } => {
|
||||
Group {
|
||||
e: Box::new(remove(*e, byte)?),
|
||||
i: i,
|
||||
name: name,
|
||||
HirKind::Class(hir::Class::Bytes(mut cls)) => {
|
||||
let remove = hir::ClassBytes::new(Some(
|
||||
hir::ClassBytesRange::new(byte, byte),
|
||||
));
|
||||
cls.difference(&remove);
|
||||
if cls.iter().next().is_none() {
|
||||
return Err(Error::LiteralNotAllowed(chr));
|
||||
}
|
||||
Hir::class(hir::Class::Bytes(cls))
|
||||
}
|
||||
Repeat { e, r, greedy } => {
|
||||
Repeat {
|
||||
e: Box::new(remove(*e, byte)?),
|
||||
r: r,
|
||||
greedy: greedy,
|
||||
HirKind::Anchor(x) => Hir::anchor(x),
|
||||
HirKind::WordBoundary(x) => Hir::word_boundary(x),
|
||||
HirKind::Repetition(mut x) => {
|
||||
x.hir = Box::new(remove(*x.hir, byte)?);
|
||||
Hir::repetition(x)
|
||||
}
|
||||
HirKind::Group(mut x) => {
|
||||
x.hir = Box::new(remove(*x.hir, byte)?);
|
||||
Hir::group(x)
|
||||
}
|
||||
Concat(exprs) => {
|
||||
Concat(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?)
|
||||
HirKind::Concat(xs) => {
|
||||
let xs = xs.into_iter()
|
||||
.map(|e| remove(e, byte))
|
||||
.collect::<Result<Vec<Hir>>>()?;
|
||||
Hir::concat(xs)
|
||||
}
|
||||
Alternate(exprs) => {
|
||||
Alternate(exprs.into_iter().map(|e| remove(e, byte)).collect::<Result<Vec<Expr>>>()?)
|
||||
HirKind::Alternation(xs) => {
|
||||
let xs = xs.into_iter()
|
||||
.map(|e| remove(e, byte))
|
||||
.collect::<Result<Vec<Hir>>>()?;
|
||||
Hir::alternation(xs)
|
||||
}
|
||||
e => e,
|
||||
})
|
||||
}
|
||||
|
@ -1,10 +1,10 @@
|
||||
use memchr::{memchr, memrchr};
|
||||
use regex::bytes::{Regex, RegexBuilder};
|
||||
use syntax;
|
||||
|
||||
use literals::LiteralSets;
|
||||
use nonl;
|
||||
use syntax::Expr;
|
||||
use syntax::ParserBuilder;
|
||||
use syntax::hir::Hir;
|
||||
use word_boundary::strip_unicode_word_boundaries;
|
||||
use Result;
|
||||
|
||||
@ -166,7 +166,7 @@ impl GrepBuilder {
|
||||
|
||||
/// Creates a new regex from the given expression with the current
|
||||
/// configuration.
|
||||
fn regex(&self, expr: &Expr) -> Result<Regex> {
|
||||
fn regex(&self, expr: &Hir) -> Result<Regex> {
|
||||
let mut builder = RegexBuilder::new(&expr.to_string());
|
||||
builder.unicode(true);
|
||||
self.regex_build(builder)
|
||||
@ -184,15 +184,16 @@ impl GrepBuilder {
|
||||
|
||||
/// Parses the underlying pattern and ensures the pattern can never match
|
||||
/// the line terminator.
|
||||
fn parse(&self) -> Result<syntax::Expr> {
|
||||
let expr =
|
||||
syntax::ExprBuilder::new()
|
||||
.allow_bytes(true)
|
||||
.unicode(true)
|
||||
fn parse(&self) -> Result<Hir> {
|
||||
let expr = ParserBuilder::new()
|
||||
.allow_invalid_utf8(true)
|
||||
.case_insensitive(self.is_case_insensitive()?)
|
||||
.multi_line(true)
|
||||
.build()
|
||||
.parse(&self.pattern)?;
|
||||
debug!("original regex HIR pattern:\n{}", expr);
|
||||
let expr = nonl::remove(expr, self.opts.line_terminator)?;
|
||||
debug!("regex ast:\n{:#?}", expr);
|
||||
debug!("transformed regex HIR pattern:\n{}", expr);
|
||||
Ok(expr)
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
use syntax::Expr;
|
||||
use syntax::hir::{self, Hir, HirKind};
|
||||
|
||||
/// Strips Unicode word boundaries from the given expression.
|
||||
///
|
||||
@ -8,7 +8,7 @@ use syntax::Expr;
|
||||
/// false negatives.
|
||||
///
|
||||
/// If no word boundaries could be stripped, then None is returned.
|
||||
pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
|
||||
pub fn strip_unicode_word_boundaries(expr: &Hir) -> Option<Hir> {
|
||||
// The real reason we do this is because Unicode word boundaries are the
|
||||
// one thing that Rust's regex DFA engine can't handle. When it sees a
|
||||
// Unicode word boundary among non-ASCII text, it falls back to one of the
|
||||
@ -16,23 +16,24 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
|
||||
// a regex to find candidate matches without a Unicode word boundary. We'll
|
||||
// only then use the full (and slower) regex to confirm a candidate as a
|
||||
// match or not during search.
|
||||
use syntax::Expr::*;
|
||||
|
||||
match *expr {
|
||||
Concat(ref es) if !es.is_empty() => {
|
||||
//
|
||||
// It looks like we only check the outer edges for `\b`? I guess this is
|
||||
// an attempt to optimize for the `-w/--word-regexp` flag? ---AG
|
||||
match *expr.kind() {
|
||||
HirKind::Concat(ref es) if !es.is_empty() => {
|
||||
let first = is_unicode_word_boundary(&es[0]);
|
||||
let last = is_unicode_word_boundary(es.last().unwrap());
|
||||
// Be careful not to strip word boundaries if there are no other
|
||||
// expressions to match.
|
||||
match (first, last) {
|
||||
(true, false) if es.len() > 1 => {
|
||||
Some(Concat(es[1..].to_vec()))
|
||||
Some(Hir::concat(es[1..].to_vec()))
|
||||
}
|
||||
(false, true) if es.len() > 1 => {
|
||||
Some(Concat(es[..es.len() - 1].to_vec()))
|
||||
Some(Hir::concat(es[..es.len() - 1].to_vec()))
|
||||
}
|
||||
(true, true) if es.len() > 2 => {
|
||||
Some(Concat(es[1..es.len() - 1].to_vec()))
|
||||
Some(Hir::concat(es[1..es.len() - 1].to_vec()))
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
@ -42,13 +43,11 @@ pub fn strip_unicode_word_boundaries(expr: &Expr) -> Option<Expr> {
|
||||
}
|
||||
|
||||
/// Returns true if the given expression is a Unicode word boundary.
|
||||
fn is_unicode_word_boundary(expr: &Expr) -> bool {
|
||||
use syntax::Expr::*;
|
||||
|
||||
match *expr {
|
||||
WordBoundary => true,
|
||||
NotWordBoundary => true,
|
||||
Group { ref e, .. } => is_unicode_word_boundary(e),
|
||||
fn is_unicode_word_boundary(expr: &Hir) -> bool {
|
||||
match *expr.kind() {
|
||||
HirKind::WordBoundary(hir::WordBoundary::Unicode) => true,
|
||||
HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => true,
|
||||
HirKind::Group(ref x) => is_unicode_word_boundary(&x.hir),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
13
src/args.rs
13
src/args.rs
@ -9,7 +9,7 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
||||
|
||||
use clap;
|
||||
use encoding_rs::Encoding;
|
||||
use grep::{Grep, GrepBuilder, Error as GrepError};
|
||||
use grep::{Grep, GrepBuilder};
|
||||
use log;
|
||||
use num_cpus;
|
||||
use regex;
|
||||
@ -882,16 +882,7 @@ impl<'a> ArgMatches<'a> {
|
||||
if let Some(limit) = self.regex_size_limit()? {
|
||||
gb = gb.size_limit(limit);
|
||||
}
|
||||
gb.build().map_err(|err| {
|
||||
match err {
|
||||
GrepError::Regex(err) => {
|
||||
let s = format!("{}\n(Hint: Try the --fixed-strings flag \
|
||||
to search for a literal string.)", err.to_string());
|
||||
From::from(s)
|
||||
},
|
||||
err => From::from(err)
|
||||
}
|
||||
})
|
||||
Ok(gb.build()?)
|
||||
}
|
||||
|
||||
/// Builds the set of glob overrides from the command line flags.
|
||||
|
@ -1687,16 +1687,6 @@ sherlock!(feature_419_zero_as_shortcut_for_null, "Sherlock", ".",
|
||||
assert_eq!(lines, "sherlock\x002\n");
|
||||
});
|
||||
|
||||
// See: https://github.com/BurntSushi/ripgrep/issues/709
|
||||
clean!(suggest_fixed_strings_for_invalid_regex, "foo(", ".",
|
||||
|wd: WorkDir, mut cmd: Command| {
|
||||
wd.assert_non_empty_stderr(&mut cmd);
|
||||
|
||||
let output = cmd.output().unwrap();
|
||||
let err = String::from_utf8_lossy(&output.stderr);
|
||||
assert_eq!(err.contains("--fixed-strings"), true);
|
||||
});
|
||||
|
||||
#[test]
|
||||
fn compressed_gzip() {
|
||||
if !cmd_exists("gzip") {
|
||||
|
Loading…
x
Reference in New Issue
Block a user