repo: move all source code in crates directory

The top-level listing was just getting a bit too long for my taste. So
put all of the code in one directory and shrink the large top-level mess
to a small top-level mess.

NOTE: This commit only contains renames. The subsequent commit will
actually make ripgrep build again. We do it this way with the naive hope
that this will make it easier for git history to track the renames.
Sigh.
This commit is contained in:
Andrew Gallant
2020-02-17 18:19:19 -05:00
parent 0bc4f0447b
commit fdd8510fdd
113 changed files with 0 additions and 0 deletions

24
crates/matcher/Cargo.toml Normal file
View File

@@ -0,0 +1,24 @@
[package]
name = "grep-matcher"
version = "0.1.3" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = """
A trait for regular expressions, with a focus on line oriented search.
"""
documentation = "https://docs.rs/grep-matcher"
homepage = "https://github.com/BurntSushi/ripgrep"
repository = "https://github.com/BurntSushi/ripgrep"
readme = "README.md"
keywords = ["regex", "pattern", "trait"]
license = "Unlicense/MIT"
autotests = false
[dependencies]
memchr = "2.1"
[dev-dependencies]
regex = "1.1"
[[test]]
name = "integration"
path = "tests/tests.rs"

View File

@@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2015 Andrew Gallant
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

36
crates/matcher/README.md Normal file
View File

@@ -0,0 +1,36 @@
grep-matcher
------------
This crate provides a low level interface for describing regular expression
matchers. The `grep` crate uses this interface in order to make the regex
engine it uses pluggable.
[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep)
[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep)
[![](https://img.shields.io/crates/v/grep-matcher.svg)](https://crates.io/crates/grep-matcher)
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
### Documentation
[https://docs.rs/grep-matcher](https://docs.rs/grep-matcher)
**NOTE:** You probably don't want to use this crate directly. Instead, you
should prefer the facade defined in the
[`grep`](https://docs.rs/grep)
crate.
### Usage
Add this to your `Cargo.toml`:
```toml
[dependencies]
grep-matcher = "0.1"
```
and this to your crate root:
```rust
extern crate grep_matcher;
```

24
crates/matcher/UNLICENSE Normal file
View File

@@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>

View File

@@ -0,0 +1,328 @@
use std::str;
use memchr::memchr;
/// Interpolate capture references in `replacement` and write the interpolation
/// result to `dst`. References in `replacement` take the form of $N or $name,
/// where `N` is a capture group index and `name` is a capture group name. The
/// function provided, `name_to_index`, maps capture group names to indices.
///
/// The `append` function given is responsible for writing the replacement
/// to the `dst` buffer. That is, it is called with the capture group index
/// of a capture group reference and is expected to resolve the index to its
/// corresponding matched text. If no such match exists, then `append` should
/// not write anything to its given buffer.
pub fn interpolate<A, N>(
mut replacement: &[u8],
mut append: A,
mut name_to_index: N,
dst: &mut Vec<u8>,
) where
A: FnMut(usize, &mut Vec<u8>),
N: FnMut(&str) -> Option<usize>,
{
while !replacement.is_empty() {
match memchr(b'$', replacement) {
None => break,
Some(i) => {
dst.extend(&replacement[..i]);
replacement = &replacement[i..];
}
}
if replacement.get(1).map_or(false, |&b| b == b'$') {
dst.push(b'$');
replacement = &replacement[2..];
continue;
}
debug_assert!(!replacement.is_empty());
let cap_ref = match find_cap_ref(replacement) {
Some(cap_ref) => cap_ref,
None => {
dst.push(b'$');
replacement = &replacement[1..];
continue;
}
};
replacement = &replacement[cap_ref.end..];
match cap_ref.cap {
Ref::Number(i) => append(i, dst),
Ref::Named(name) => {
if let Some(i) = name_to_index(name) {
append(i, dst);
}
}
}
}
dst.extend(replacement);
}
/// `CaptureRef` represents a reference to a capture group inside some text.
/// The reference is either a capture group name or a number.
///
/// It is also tagged with the position in the text immediately proceding the
/// capture reference.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct CaptureRef<'a> {
cap: Ref<'a>,
end: usize,
}
/// A reference to a capture group in some text.
///
/// e.g., `$2`, `$foo`, `${foo}`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
Named(&'a str),
Number(usize),
}
impl<'a> From<&'a str> for Ref<'a> {
fn from(x: &'a str) -> Ref<'a> {
Ref::Named(x)
}
}
impl From<usize> for Ref<'static> {
fn from(x: usize) -> Ref<'static> {
Ref::Number(x)
}
}
/// Parses a possible reference to a capture group name in the given text,
/// starting at the beginning of `replacement`.
///
/// If no such valid reference could be found, None is returned.
fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> {
let mut i = 0;
if replacement.len() <= 1 || replacement[0] != b'$' {
return None;
}
let mut brace = false;
i += 1;
if replacement[i] == b'{' {
brace = true;
i += 1;
}
let mut cap_end = i;
while replacement.get(cap_end).map_or(false, is_valid_cap_letter) {
cap_end += 1;
}
if cap_end == i {
return None;
}
// We just verified that the range 0..cap_end is valid ASCII, so it must
// therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
// check with an unchecked conversion or by parsing the number straight
// from &[u8].
let cap = str::from_utf8(&replacement[i..cap_end])
.expect("valid UTF-8 capture name");
if brace {
if !replacement.get(cap_end).map_or(false, |&b| b == b'}') {
return None;
}
cap_end += 1;
}
Some(CaptureRef {
cap: match cap.parse::<u32>() {
Ok(i) => Ref::Number(i as usize),
Err(_) => Ref::Named(cap),
},
end: cap_end,
})
}
/// Returns true if and only if the given byte is allowed in a capture name.
fn is_valid_cap_letter(b: &u8) -> bool {
match *b {
b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
_ => false,
}
}
#[cfg(test)]
mod tests {
use super::{find_cap_ref, interpolate, CaptureRef};
macro_rules! find {
($name:ident, $text:expr) => {
#[test]
fn $name() {
assert_eq!(None, find_cap_ref($text.as_bytes()));
}
};
($name:ident, $text:expr, $capref:expr) => {
#[test]
fn $name() {
assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
}
};
}
macro_rules! c {
($name_or_number:expr, $pos:expr) => {
CaptureRef { cap: $name_or_number.into(), end: $pos }
};
}
find!(find_cap_ref1, "$foo", c!("foo", 4));
find!(find_cap_ref2, "${foo}", c!("foo", 6));
find!(find_cap_ref3, "$0", c!(0, 2));
find!(find_cap_ref4, "$5", c!(5, 2));
find!(find_cap_ref5, "$10", c!(10, 3));
find!(find_cap_ref6, "$42a", c!("42a", 4));
find!(find_cap_ref7, "${42}a", c!(42, 5));
find!(find_cap_ref8, "${42");
find!(find_cap_ref9, "${42 ");
find!(find_cap_ref10, " $0 ");
find!(find_cap_ref11, "$");
find!(find_cap_ref12, " ");
find!(find_cap_ref13, "");
// A convenience routine for using interpolate's unwieldy but flexible API.
fn interpolate_string(
mut name_to_index: Vec<(&'static str, usize)>,
caps: Vec<&'static str>,
replacement: &str,
) -> String {
name_to_index.sort_by_key(|x| x.0);
let mut dst = vec![];
interpolate(
replacement.as_bytes(),
|i, dst| {
if let Some(&s) = caps.get(i) {
dst.extend(s.as_bytes());
}
},
|name| -> Option<usize> {
name_to_index
.binary_search_by_key(&name, |x| x.0)
.ok()
.map(|i| name_to_index[i].1)
},
&mut dst,
);
String::from_utf8(dst).unwrap()
}
macro_rules! interp {
($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
#[test]
fn $name() {
assert_eq!($expected, interpolate_string($map, $caps, $hay));
}
};
}
interp!(
interp1,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test $foo test",
"test xxx test",
);
interp!(
interp2,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test$footest",
"test",
);
interp!(
interp3,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test${foo}test",
"testxxxtest",
);
interp!(
interp4,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test$2test",
"test",
);
interp!(
interp5,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test${2}test",
"testxxxtest",
);
interp!(
interp6,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test $$foo test",
"test $foo test",
);
interp!(
interp7,
vec![("foo", 2)],
vec!["", "", "xxx"],
"test $foo",
"test xxx",
);
interp!(
interp8,
vec![("foo", 2)],
vec!["", "", "xxx"],
"$foo test",
"xxx test",
);
interp!(
interp9,
vec![("bar", 1), ("foo", 2)],
vec!["", "yyy", "xxx"],
"test $bar$foo",
"test yyyxxx",
);
interp!(
interp10,
vec![("bar", 1), ("foo", 2)],
vec!["", "yyy", "xxx"],
"test $ test",
"test $ test",
);
interp!(
interp11,
vec![("bar", 1), ("foo", 2)],
vec!["", "yyy", "xxx"],
"test ${} test",
"test ${} test",
);
interp!(
interp12,
vec![("bar", 1), ("foo", 2)],
vec!["", "yyy", "xxx"],
"test ${ } test",
"test ${ } test",
);
interp!(
interp13,
vec![("bar", 1), ("foo", 2)],
vec!["", "yyy", "xxx"],
"test ${a b} test",
"test ${a b} test",
);
interp!(
interp14,
vec![("bar", 1), ("foo", 2)],
vec!["", "yyy", "xxx"],
"test ${a} test",
"test test",
);
}

1151
crates/matcher/src/lib.rs Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,230 @@
use grep_matcher::{Captures, Match, Matcher};
use regex::bytes::Regex;
use util::{RegexMatcher, RegexMatcherNoCaps};
fn matcher(pattern: &str) -> RegexMatcher {
RegexMatcher::new(Regex::new(pattern).unwrap())
}
fn matcher_no_caps(pattern: &str) -> RegexMatcherNoCaps {
RegexMatcherNoCaps(Regex::new(pattern).unwrap())
}
fn m(start: usize, end: usize) -> Match {
Match::new(start, end)
}
#[test]
fn find() {
let matcher = matcher(r"(\w+)\s+(\w+)");
assert_eq!(matcher.find(b" homer simpson ").unwrap(), Some(m(1, 14)));
}
#[test]
fn find_iter() {
let matcher = matcher(r"(\w+)\s+(\w+)");
let mut matches = vec![];
matcher
.find_iter(b"aa bb cc dd", |m| {
matches.push(m);
true
})
.unwrap();
assert_eq!(matches, vec![m(0, 5), m(6, 11)]);
// Test that find_iter respects short circuiting.
matches.clear();
matcher
.find_iter(b"aa bb cc dd", |m| {
matches.push(m);
false
})
.unwrap();
assert_eq!(matches, vec![m(0, 5)]);
}
#[test]
fn try_find_iter() {
#[derive(Clone, Debug, Eq, PartialEq)]
struct MyError;
let matcher = matcher(r"(\w+)\s+(\w+)");
let mut matches = vec![];
let err = matcher
.try_find_iter(b"aa bb cc dd", |m| {
if matches.is_empty() {
matches.push(m);
Ok(true)
} else {
Err(MyError)
}
})
.unwrap()
.unwrap_err();
assert_eq!(matches, vec![m(0, 5)]);
assert_eq!(err, MyError);
}
#[test]
fn shortest_match() {
let matcher = matcher(r"a+");
// This tests that the default impl isn't doing anything smart, and simply
// defers to `find`.
assert_eq!(matcher.shortest_match(b"aaa").unwrap(), Some(3));
// The actual underlying regex is smarter.
assert_eq!(matcher.re.shortest_match(b"aaa"), Some(1));
}
#[test]
fn captures() {
let matcher = matcher(r"(?P<a>\w+)\s+(?P<b>\w+)");
assert_eq!(matcher.capture_count(), 3);
assert_eq!(matcher.capture_index("a"), Some(1));
assert_eq!(matcher.capture_index("b"), Some(2));
assert_eq!(matcher.capture_index("nada"), None);
let mut caps = matcher.new_captures().unwrap();
assert!(matcher.captures(b" homer simpson ", &mut caps).unwrap());
assert_eq!(caps.get(0), Some(m(1, 14)));
assert_eq!(caps.get(1), Some(m(1, 6)));
assert_eq!(caps.get(2), Some(m(7, 14)));
}
#[test]
fn captures_iter() {
let matcher = matcher(r"(?P<a>\w+)\s+(?P<b>\w+)");
let mut caps = matcher.new_captures().unwrap();
let mut matches = vec![];
matcher
.captures_iter(b"aa bb cc dd", &mut caps, |caps| {
matches.push(caps.get(0).unwrap());
matches.push(caps.get(1).unwrap());
matches.push(caps.get(2).unwrap());
true
})
.unwrap();
assert_eq!(
matches,
vec![m(0, 5), m(0, 2), m(3, 5), m(6, 11), m(6, 8), m(9, 11),]
);
// Test that captures_iter respects short circuiting.
matches.clear();
matcher
.captures_iter(b"aa bb cc dd", &mut caps, |caps| {
matches.push(caps.get(0).unwrap());
matches.push(caps.get(1).unwrap());
matches.push(caps.get(2).unwrap());
false
})
.unwrap();
assert_eq!(matches, vec![m(0, 5), m(0, 2), m(3, 5),]);
}
#[test]
fn try_captures_iter() {
#[derive(Clone, Debug, Eq, PartialEq)]
struct MyError;
let matcher = matcher(r"(?P<a>\w+)\s+(?P<b>\w+)");
let mut caps = matcher.new_captures().unwrap();
let mut matches = vec![];
let err = matcher
.try_captures_iter(b"aa bb cc dd", &mut caps, |caps| {
if matches.is_empty() {
matches.push(caps.get(0).unwrap());
matches.push(caps.get(1).unwrap());
matches.push(caps.get(2).unwrap());
Ok(true)
} else {
Err(MyError)
}
})
.unwrap()
.unwrap_err();
assert_eq!(matches, vec![m(0, 5), m(0, 2), m(3, 5)]);
assert_eq!(err, MyError);
}
// Test that our default impls for capturing are correct. Namely, when
// capturing isn't supported by the underlying matcher, then all of the
// various capturing related APIs fail fast.
#[test]
fn no_captures() {
let matcher = matcher_no_caps(r"(?P<a>\w+)\s+(?P<b>\w+)");
assert_eq!(matcher.capture_count(), 0);
assert_eq!(matcher.capture_index("a"), None);
assert_eq!(matcher.capture_index("b"), None);
assert_eq!(matcher.capture_index("nada"), None);
let mut caps = matcher.new_captures().unwrap();
assert!(!matcher.captures(b"homer simpson", &mut caps).unwrap());
let mut called = false;
matcher
.captures_iter(b"homer simpson", &mut caps, |_| {
called = true;
true
})
.unwrap();
assert!(!called);
}
#[test]
fn replace() {
let matcher = matcher(r"(\w+)\s+(\w+)");
let mut dst = vec![];
matcher
.replace(b"aa bb cc dd", &mut dst, |_, dst| {
dst.push(b'z');
true
})
.unwrap();
assert_eq!(dst, b"z z");
// Test that replacements respect short circuiting.
dst.clear();
matcher
.replace(b"aa bb cc dd", &mut dst, |_, dst| {
dst.push(b'z');
false
})
.unwrap();
assert_eq!(dst, b"z cc dd");
}
#[test]
fn replace_with_captures() {
let matcher = matcher(r"(\w+)\s+(\w+)");
let haystack = b"aa bb cc dd";
let mut caps = matcher.new_captures().unwrap();
let mut dst = vec![];
matcher
.replace_with_captures(haystack, &mut caps, &mut dst, |caps, dst| {
caps.interpolate(
|name| matcher.capture_index(name),
haystack,
b"$2 $1",
dst,
);
true
})
.unwrap();
assert_eq!(dst, b"bb aa dd cc");
// Test that replacements respect short circuiting.
dst.clear();
matcher
.replace_with_captures(haystack, &mut caps, &mut dst, |caps, dst| {
caps.interpolate(
|name| matcher.capture_index(name),
haystack,
b"$2 $1",
dst,
);
false
})
.unwrap();
assert_eq!(dst, b"bb aa cc dd");
}

View File

@@ -0,0 +1,6 @@
extern crate grep_matcher;
extern crate regex;
mod util;
mod test_matcher;

View File

@@ -0,0 +1,95 @@
use std::collections::HashMap;
use std::result;
use grep_matcher::{Captures, Match, Matcher, NoCaptures, NoError};
use regex::bytes::{CaptureLocations, Regex};
#[derive(Debug)]
pub struct RegexMatcher {
pub re: Regex,
pub names: HashMap<String, usize>,
}
impl RegexMatcher {
pub fn new(re: Regex) -> RegexMatcher {
let mut names = HashMap::new();
for (i, optional_name) in re.capture_names().enumerate() {
if let Some(name) = optional_name {
names.insert(name.to_string(), i);
}
}
RegexMatcher { re: re, names: names }
}
}
type Result<T> = result::Result<T, NoError>;
impl Matcher for RegexMatcher {
type Captures = RegexCaptures;
type Error = NoError;
fn find_at(&self, haystack: &[u8], at: usize) -> Result<Option<Match>> {
Ok(self
.re
.find_at(haystack, at)
.map(|m| Match::new(m.start(), m.end())))
}
fn new_captures(&self) -> Result<RegexCaptures> {
Ok(RegexCaptures(self.re.capture_locations()))
}
fn captures_at(
&self,
haystack: &[u8],
at: usize,
caps: &mut RegexCaptures,
) -> Result<bool> {
Ok(self.re.captures_read_at(&mut caps.0, haystack, at).is_some())
}
fn capture_count(&self) -> usize {
self.re.captures_len()
}
fn capture_index(&self, name: &str) -> Option<usize> {
self.names.get(name).map(|i| *i)
}
// We purposely don't implement any other methods, so that we test the
// default impls. The "real" Regex impl for Matcher provides a few more
// impls. e.g., Its `find_iter` impl is faster than what we can do here,
// since the regex crate avoids synchronization overhead.
}
#[derive(Debug)]
pub struct RegexMatcherNoCaps(pub Regex);
impl Matcher for RegexMatcherNoCaps {
type Captures = NoCaptures;
type Error = NoError;
fn find_at(&self, haystack: &[u8], at: usize) -> Result<Option<Match>> {
Ok(self
.0
.find_at(haystack, at)
.map(|m| Match::new(m.start(), m.end())))
}
fn new_captures(&self) -> Result<NoCaptures> {
Ok(NoCaptures::new())
}
}
#[derive(Clone, Debug)]
pub struct RegexCaptures(CaptureLocations);
impl Captures for RegexCaptures {
fn len(&self) -> usize {
self.0.len()
}
fn get(&self, i: usize) -> Option<Match> {
self.0.pos(i).map(|(s, e)| Match::new(s, e))
}
}