mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-05-19 09:40:22 -07:00
Fix required literal handling and add debug prints.
In particular, if we had an inner literal and were doing a case-insensitive search, the literals were dropped because we previously allowed only a single inner literal to have an effect. Now we allow alternations of inner literals, but still don't quite take full advantage of them.
This commit is contained in:
parent
0891b4a3c0
commit
fd3e5069b6
@ -14,6 +14,7 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"]
|
|||||||
license = "Unlicense/MIT"
|
license = "Unlicense/MIT"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
log = "0.3"
|
||||||
memchr = "0.1"
|
memchr = "0.1"
|
||||||
memmap = "0.2"
|
memmap = "0.2"
|
||||||
regex = "0.1.75"
|
regex = "0.1.75"
|
||||||
|
@ -4,6 +4,8 @@
|
|||||||
A fast line oriented regex searcher.
|
A fast line oriented regex searcher.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#[macro_use]
|
||||||
|
extern crate log;
|
||||||
extern crate memchr;
|
extern crate memchr;
|
||||||
extern crate regex;
|
extern crate regex;
|
||||||
extern crate regex_syntax as syntax;
|
extern crate regex_syntax as syntax;
|
||||||
|
@ -1,13 +1,22 @@
|
|||||||
|
/*!
|
||||||
|
The literals module is responsible for extracting *inner* literals out of the
|
||||||
|
AST of a regular expression. Normally this is the job of the regex engine
|
||||||
|
itself, but the regex engine doesn't look for inner literals. Since we're doing
|
||||||
|
line based searching, we can use them, so we need to do it ourselves.
|
||||||
|
|
||||||
|
Note that this implementation is incredibly suspicious. We need something more
|
||||||
|
principled.
|
||||||
|
*/
|
||||||
use std::cmp;
|
use std::cmp;
|
||||||
use std::iter;
|
use std::iter;
|
||||||
|
|
||||||
use regex::bytes::Regex;
|
use regex::bytes::Regex;
|
||||||
use syntax::{
|
use syntax::{
|
||||||
Expr, Literals, Lit,
|
Expr, Literals, Lit,
|
||||||
Repeater,
|
ByteClass, ByteRange, CharClass, ClassRange, Repeater,
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct LiteralSets {
|
pub struct LiteralSets {
|
||||||
prefixes: Literals,
|
prefixes: Literals,
|
||||||
suffixes: Literals,
|
suffixes: Literals,
|
||||||
@ -27,6 +36,7 @@ impl LiteralSets {
|
|||||||
|
|
||||||
pub fn to_regex(&self) -> Option<Regex> {
|
pub fn to_regex(&self) -> Option<Regex> {
|
||||||
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
|
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
|
||||||
|
debug!("literal prefixes detected: {:?}", self.prefixes);
|
||||||
// When this is true, the regex engine will do a literal scan.
|
// When this is true, the regex engine will do a literal scan.
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
@ -56,13 +66,27 @@ impl LiteralSets {
|
|||||||
if suf_lcs.len() > lit.len() {
|
if suf_lcs.len() > lit.len() {
|
||||||
lit = suf_lcs;
|
lit = suf_lcs;
|
||||||
}
|
}
|
||||||
if req.len() > lit.len() {
|
if req_lits.len() == 1 && req.len() > lit.len() {
|
||||||
lit = req;
|
lit = req;
|
||||||
}
|
}
|
||||||
if lit.is_empty() {
|
|
||||||
|
// Special case: if we detected an alternation of inner required
|
||||||
|
// literals and its longest literal is bigger than the longest
|
||||||
|
// prefix/suffix, then choose the alternation. In practice, this
|
||||||
|
// helps with case insensitive matching, which can generate lots of
|
||||||
|
// inner required literals.
|
||||||
|
let any_empty = req_lits.iter().any(|lit| lit.is_empty());
|
||||||
|
if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
|
||||||
|
debug!("required literals found: {:?}", req_lits);
|
||||||
|
let alts: Vec<String> =
|
||||||
|
req_lits.into_iter().map(|x| bytes_to_regex(x)).collect();
|
||||||
|
// Literals always compile.
|
||||||
|
Some(Regex::new(&alts.join("|")).unwrap())
|
||||||
|
} else if lit.is_empty() {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
// Literals always compile.
|
// Literals always compile.
|
||||||
|
debug!("required literal found: {:?}", show(lit));
|
||||||
Some(Regex::new(&bytes_to_regex(lit)).unwrap())
|
Some(Regex::new(&bytes_to_regex(lit)).unwrap())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -75,14 +99,30 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
|
|||||||
let s: String = chars.iter().cloned().collect();
|
let s: String = chars.iter().cloned().collect();
|
||||||
lits.cross_add(s.as_bytes());
|
lits.cross_add(s.as_bytes());
|
||||||
}
|
}
|
||||||
Literal { casei: true, .. } => {
|
Literal { ref chars, casei: true } => {
|
||||||
lits.cut();
|
for &c in chars {
|
||||||
|
let cls = CharClass::new(vec![
|
||||||
|
ClassRange { start: c, end: c },
|
||||||
|
]).case_fold();
|
||||||
|
if !lits.add_char_class(&cls) {
|
||||||
|
lits.cut();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
LiteralBytes { ref bytes, casei: false } => {
|
LiteralBytes { ref bytes, casei: false } => {
|
||||||
lits.cross_add(bytes);
|
lits.cross_add(bytes);
|
||||||
}
|
}
|
||||||
LiteralBytes { casei: true, .. } => {
|
LiteralBytes { ref bytes, casei: true } => {
|
||||||
lits.cut();
|
for &b in bytes {
|
||||||
|
let cls = ByteClass::new(vec![
|
||||||
|
ByteRange { start: b, end: b },
|
||||||
|
]).case_fold();
|
||||||
|
if !lits.add_byte_class(&cls) {
|
||||||
|
lits.cut();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Class(_) => {
|
Class(_) => {
|
||||||
lits.cut();
|
lits.cut();
|
||||||
@ -205,3 +245,18 @@ fn bytes_to_regex(bs: &[u8]) -> String {
|
|||||||
}
|
}
|
||||||
s
|
s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Converts arbitrary bytes to a nice string.
|
||||||
|
fn show(bs: &[u8]) -> String {
|
||||||
|
// Why aren't we using this to feed to the regex? Doesn't really matter
|
||||||
|
// I guess. ---AG
|
||||||
|
use std::ascii::escape_default;
|
||||||
|
use std::str;
|
||||||
|
|
||||||
|
let mut nice = String::new();
|
||||||
|
for &b in bs {
|
||||||
|
let part: Vec<u8> = escape_default(b).collect();
|
||||||
|
nice.push_str(str::from_utf8(&part).unwrap());
|
||||||
|
}
|
||||||
|
nice
|
||||||
|
}
|
||||||
|
@ -152,6 +152,7 @@ impl GrepBuilder {
|
|||||||
.unicode(true)
|
.unicode(true)
|
||||||
.case_insensitive(self.opts.case_insensitive)
|
.case_insensitive(self.opts.case_insensitive)
|
||||||
.parse(&self.pattern));
|
.parse(&self.pattern));
|
||||||
|
debug!("regex ast:\n{:#?}", expr);
|
||||||
Ok(try!(nonl::remove(expr, self.opts.line_terminator)))
|
Ok(try!(nonl::remove(expr, self.opts.line_terminator)))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user