diff --git a/src/literals.rs b/src/literals.rs index c45656a8..be91d550 100644 --- a/src/literals.rs +++ b/src/literals.rs @@ -1,12 +1,10 @@ use std::cmp; use std::iter; -use std::str; -use regex::quote; use regex::bytes::Regex; use syntax::{ Expr, Literals, Lit, - ByteClass, CharClass, Repeater, ClassRange, ByteRange, + Repeater, }; #[derive(Debug)] @@ -16,11 +14,6 @@ pub struct LiteralSets { required: Literals, } -#[derive(Debug)] -pub struct LiteralMatcher { - re: Regex, -} - impl LiteralSets { pub fn create(expr: &Expr) -> Self { let mut required = Literals::empty(); @@ -32,7 +25,11 @@ impl LiteralSets { } } - pub fn to_matcher(&self) -> Option { + pub fn to_matcher(&self) -> Option { + if self.prefixes.all_complete() && !self.prefixes.is_empty() { + // When this is true, the regex engine will do a literal scan. + return None; + } let pre_lcp = self.prefixes.longest_common_prefix(); let pre_lcs = self.prefixes.longest_common_suffix(); let suf_lcp = self.suffixes.longest_common_prefix(); @@ -60,8 +57,8 @@ impl LiteralSets { if lit.is_empty() { None } else { - let s = str::from_utf8(lit).unwrap(); - Some(LiteralMatcher { re: Regex::new("e(s)).unwrap() }) + // Literals always compile. + Some(Regex::new(&bytes_to_regex(lit)).unwrap()) } } } @@ -74,39 +71,19 @@ fn union_required(expr: &Expr, lits: &mut Literals) { lits.cross_add(s.as_bytes()); } Literal { ref chars, casei: true } => { - for &c in chars { - let cls = CharClass::new(vec![ - ClassRange { start: c, end: c }, - ]).case_fold(); - if !lits.add_char_class(&cls) { - lits.cut(); - return; - } - } + lits.cut(); } LiteralBytes { ref bytes, casei: false } => { lits.cross_add(bytes); } LiteralBytes { ref bytes, casei: true } => { - for &b in bytes { - let cls = ByteClass::new(vec![ - ByteRange { start: b, end: b }, - ]).case_fold(); - if !lits.add_byte_class(&cls) { - lits.cut(); - return; - } - } + lits.cut(); } Class(ref cls) => { - if !lits.add_char_class(cls) { - lits.cut(); - } + lits.cut(); } ClassBytes(ref cls) => { - if !lits.add_byte_class(cls) { - lits.cut(); - } + lits.cut(); } Group { ref e, .. } => { union_required(&**e, lits); @@ -212,3 +189,13 @@ fn alternate_literals( lits.add(Lit::new(lcs.to_vec())); } } + +/// Converts an arbitrary sequence of bytes to a literal suitable for building +/// a regular expression. +fn bytes_to_regex(bs: &[u8]) -> String { + let mut s = String::with_capacity(bs.len()); + for &b in bs { + s.push_str(&format!("\\x{:02x}", b)); + } + s +} diff --git a/src/main.rs b/src/main.rs index b328e249..59a08ce4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,9 +20,11 @@ use docopt::Docopt; use regex::bytes::Regex; use literals::LiteralSets; +use search::{LineSearcher, LineSearcherBuilder}; mod literals; mod nonl; +mod search; pub type Result = result::Result>; @@ -46,20 +48,36 @@ fn main() { } fn run(args: &Args) -> Result { - let expr = try!(parse(&args.arg_pattern)); - let literals = LiteralSets::create(&expr); - // println!("{:?}", literals); - // println!("{:?}", literals.to_matcher()); - let re = Regex::new(&expr.to_string()).unwrap(); if args.arg_file.is_empty() { + let expr = try!(parse(&args.arg_pattern)); + let literals = LiteralSets::create(&expr); + let re = Regex::new(&expr.to_string()).unwrap(); let _stdin = io::stdin(); let stdin = _stdin.lock(); run_by_line(args, &re, stdin) } else { - run_mmap(args, &re) + let searcher = + try!(LineSearcherBuilder::new(&args.arg_pattern).create()); + run_mmap(args, &searcher) } } +fn run_mmap(args: &Args, searcher: &LineSearcher) -> Result { + use memmap::{Mmap, Protection}; + + assert!(args.arg_file.len() == 1); + let mut wtr = io::BufWriter::new(io::stdout()); + let mut count = 0; + let mmap = try!(Mmap::open_path(&args.arg_file[0], Protection::Read)); + let text = unsafe { mmap.as_slice() }; + for m in searcher.search(text) { + try!(wtr.write(&text[m.start..m.end])); + try!(wtr.write(b"\n")); + count += 1; + } + Ok(count) +} + fn run_by_line( args: &Args, re: &Regex, @@ -84,31 +102,6 @@ fn run_by_line( Ok(count) } -fn run_mmap(args: &Args, re: &Regex) -> Result { - use memchr::{memchr, memrchr}; - use memmap::{Mmap, Protection}; - - assert!(args.arg_file.len() == 1); - let mut wtr = io::BufWriter::new(io::stdout()); - let mut count = 0; - let mmap = try!(Mmap::open_path(&args.arg_file[0], Protection::Read)); - let text = unsafe { mmap.as_slice() }; - let mut start = 0; - while let Some((s, e)) = re.find(&text[start..]) { - let (s, e) = (start + s, start + e); - let prevnl = memrchr(b'\n', &text[0..s]).map_or(0, |i| i + 1); - let nextnl = memchr(b'\n', &text[e..]).map_or(text.len(), |i| e + i); - try!(wtr.write(&text[prevnl..nextnl])); - try!(wtr.write(b"\n")); - start = nextnl + 1; - count += 1; - if start >= text.len() { - break; - } - } - Ok(count) -} - fn parse(re: &str) -> Result { let expr = try!(syntax::ExprBuilder::new() diff --git a/src/search.rs b/src/search.rs new file mode 100644 index 00000000..f47feae3 --- /dev/null +++ b/src/search.rs @@ -0,0 +1,164 @@ +use memchr::{memchr, memrchr}; +use regex::bytes::Regex; +use syntax; + +use literals::LiteralSets; +use nonl; +use Result; + +#[derive(Clone, Debug)] +pub struct LineSearcher { + re: Regex, + required: Option, + opts: Options, +} + +#[derive(Clone, Debug)] +pub struct LineSearcherBuilder { + pattern: String, + opts: Options, +} + +#[derive(Clone, Debug, Default)] +struct Options { + case_insensitive: bool, + lines: bool, + locations: bool, +} + +impl LineSearcherBuilder { + pub fn new(pattern: &str) -> LineSearcherBuilder { + LineSearcherBuilder { + pattern: pattern.to_string(), + opts: Options::default(), + } + } + + pub fn case_insensitive(mut self, yes: bool) -> LineSearcherBuilder { + self.opts.case_insensitive = yes; + self + } + + pub fn line_numbers(mut self, yes: bool) -> LineSearcherBuilder { + self.opts.lines = yes; + self + } + + pub fn locations(mut self, yes: bool) -> LineSearcherBuilder { + self.opts.locations = yes; + self + } + + pub fn create(self) -> Result { + let expr = try!(parse(&self.pattern)); + let literals = LiteralSets::create(&expr); + let pat = + if self.opts.case_insensitive { + format!("(?i){}", expr) + } else { + expr.to_string() + }; + // We've already parsed the pattern, so we know it will compiled. + let re = Regex::new(&pat).unwrap(); + Ok(LineSearcher { + re: re, + required: literals.to_matcher(), + opts: self.opts, + }) + } +} + +impl LineSearcher { + pub fn search<'b, 's>(&'s self, buf: &'b [u8]) -> Iter<'b, 's> { + Iter { + searcher: self, + buf: buf, + start: 0, + count: 0, + } + } +} + +pub struct Match { + pub start: usize, + pub end: usize, + pub count: usize, + pub line: Option, + pub locations: Vec<(usize, usize)>, +} + +pub struct Iter<'b, 's> { + searcher: &'s LineSearcher, + buf: &'b [u8], + start: usize, + count: usize, +} + +impl<'b, 's> Iter<'b, 's> { + fn next_line_match(&mut self) -> Option<(usize, usize)> { + if self.start >= self.buf.len() { + return None; + } + if let Some(ref req) = self.searcher.required { + while self.start < self.buf.len() { + let (s, e) = match req.find(&self.buf[self.start..]) { + None => return None, + Some((s, e)) => (self.start + s, self.start + e), + }; + let (prevnl, nextnl) = self.find_line(s, e); + match self.searcher.re.find(&self.buf[prevnl..nextnl]) { + None => { + self.start = nextnl + 1; + continue; + } + Some(_) => return Some((prevnl, nextnl)), + } + } + None + } else { + let (s, e) = match self.searcher.re.find(&self.buf[self.start..]) { + None => return None, + Some((s, e)) => (self.start + s, self.start + e), + }; + Some(self.find_line(s, e)) + } + } + + fn find_line(&self, s: usize, e: usize) -> (usize, usize) { + let prevnl = + memrchr(b'\n', &self.buf[0..s]).map_or(0, |i| i + 1); + let nextnl = + memchr(b'\n', &self.buf[e..]).map_or(self.buf.len(), |i| e + i); + (prevnl, nextnl) + } +} + +impl<'b, 's> Iterator for Iter<'b, 's> { + type Item = Match; + + fn next(&mut self) -> Option { + let (prevnl, nextnl) = match self.next_line_match() { + None => return None, + Some((s, e)) => (s, e), + }; + let count = self.count; + self.start = nextnl + 1; + self.count += 1; + Some(Match { + start: prevnl, + end: nextnl, + count: count, + line: None, + locations: vec![], + }) + } +} + +fn parse(re: &str) -> Result { + let expr = + try!(syntax::ExprBuilder::new() + .allow_bytes(true) + .unicode(false) + .parse(re)); + Ok(try!(nonl::remove(expr))) +}