diff --git a/Cargo.toml b/Cargo.toml index e562a584..24b34617 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,10 @@ license = "Unlicense/MIT" [dependencies] docopt = "0.6" +memchr = "0.1" +memmap = "0.2" regex = { version = "0.1", path = "/home/andrew/rust/regex" } +regex-syntax = { version = "0.2", path = "/home/andrew/rust/regex/regex-syntax" } rustc-serialize = "0.3" [profile.release] diff --git a/src/main.rs b/src/main.rs index 62fe205c..583246cb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,10 @@ -#![allow(dead_code)] +#![allow(dead_code, unused_variables)] extern crate docopt; +extern crate memchr; +extern crate memmap; extern crate regex; +extern crate regex_syntax as syntax; extern crate rustc_serialize; const USAGE: &'static str = " @@ -14,9 +17,11 @@ use std::process; use std::result; use docopt::Docopt; -use regex::internal::{ExecBuilder, Search}; +use regex::bytes::Regex; -type Result = result::Result>; +mod nonl; + +pub type Result = result::Result>; #[derive(RustcDecodable)] struct Args { @@ -38,17 +43,26 @@ fn main() { } fn run(args: &Args) -> Result { - let _stdin = io::stdin(); - let mut rdr = io::BufReader::new(_stdin.lock()); + let expr = try!(parse(&args.arg_pattern)); + let re = Regex::new(&expr.to_string()).unwrap(); + if args.arg_file.is_empty() { + let _stdin = io::stdin(); + let stdin = _stdin.lock(); + run_by_line(args, &re, stdin) + } else { + run_mmap(args, &re) + } +} + +fn run_by_line( + args: &Args, + re: &Regex, + mut rdr: B, +) -> Result { let mut wtr = io::BufWriter::new(io::stdout()); let mut count = 0; let mut nline = 0; let mut line = vec![]; - let re = try!(ExecBuilder::new(&args.arg_pattern).only_utf8(false).build()); - let mut search = Search { - captures: &mut [], - matches: &mut [false], - }; loop { line.clear(); let n = try!(rdr.read_until(b'\n', &mut line)); @@ -56,17 +70,44 @@ fn run(args: &Args) -> Result { break; } nline += 1; - if line.last().map_or(false, |&b| b == b'\n') { - line.pop().unwrap(); - } - search.matches[0] = false; - if re.exec(&mut search, &line, 0) { + if re.is_match(&line) { count += 1; - try!(wtr.write(nline.to_string().as_bytes())); - try!(wtr.write(&[b':'])); try!(wtr.write(&line)); - try!(wtr.write(&[b'\n'])); } } Ok(count) } + +fn run_mmap(args: &Args, re: &Regex) -> Result { + use memchr::{memchr, memrchr}; + use memmap::{Mmap, Protection}; + + assert!(args.arg_file.len() == 1); + let mut wtr = io::BufWriter::new(io::stdout()); + let mut count = 0; + let mmap = try!(Mmap::open_path(&args.arg_file[0], Protection::Read)); + let text = unsafe { mmap.as_slice() }; + let mut start = 0; + while let Some((s, e)) = re.find(&text[start..]) { + let (s, e) = (start + s, start + e); + let prevnl = memrchr(b'\n', &text[0..s]).map_or(0, |i| i + 1); + let nextnl = memchr(b'\n', &text[e..]).map_or(text.len(), |i| e + i); + try!(wtr.write(&text[prevnl..nextnl])); + try!(wtr.write(b"\n")); + start = nextnl + 1; + count += 1; + if start >= text.len() { + break; + } + } + Ok(count) +} + +fn parse(re: &str) -> Result { + let expr = + try!(syntax::ExprBuilder::new() + .allow_bytes(true) + .unicode(false) + .parse(re)); + Ok(try!(nonl::remove(expr))) +} diff --git a/src/nonl.rs b/src/nonl.rs new file mode 100644 index 00000000..96ae3937 --- /dev/null +++ b/src/nonl.rs @@ -0,0 +1,55 @@ +use syntax::Expr; + +use Result; + +/// Returns a new expression that is guaranteed to never match `\n`. +/// +/// If the expression contains a literal `\n`, then an error is returned. +pub fn remove(expr: Expr) -> Result { + use syntax::Expr::*; + Ok(match expr { + Literal { chars, casei } => { + if chars.iter().position(|&c| c == '\n').is_some() { + return Err(format!("Literal '\\n' are not allowed.").into()); + } + Literal { chars: chars, casei: casei } + } + LiteralBytes { bytes, casei } => { + if bytes.iter().position(|&b| b == b'\n').is_some() { + return Err(format!("Literal '\\n' are not allowed.").into()); + } + LiteralBytes { bytes: bytes, casei: casei } + } + AnyChar => AnyCharNoNL, + AnyByte => AnyByteNoNL, + Class(mut cls) => { + cls.remove('\n'); + Class(cls) + } + ClassBytes(mut cls) => { + cls.remove(b'\n'); + ClassBytes(cls) + } + Group { e, i, name } => { + Group { + e: Box::new(try!(remove(*e))), + i: i, + name: name, + } + } + Repeat { e, r, greedy } => { + Repeat { + e: Box::new(try!(remove(*e))), + r: r, + greedy: greedy, + } + } + Concat(exprs) => { + Concat(try!(exprs.into_iter().map(remove).collect())) + } + Alternate(exprs) => { + Alternate(try!(exprs.into_iter().map(remove).collect())) + } + e => e, + }) +}