mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-05-19 09:40:22 -07:00
regex: some small cleanup in 'strip.rs'
We also utilize bstr's methods to get rid of some helpers we had written by hand.
This commit is contained in:
parent
9d62eb997a
commit
d9bd261be8
@ -11,7 +11,7 @@ repository = "https://github.com/BurntSushi/ripgrep/tree/master/crates/regex"
|
|||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
keywords = ["regex", "grep", "search", "pattern", "line"]
|
keywords = ["regex", "grep", "search", "pattern", "line"]
|
||||||
license = "Unlicense OR MIT"
|
license = "Unlicense OR MIT"
|
||||||
edition = "2018"
|
edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
aho-corasick = "1.0.2"
|
aho-corasick = "1.0.2"
|
||||||
|
@ -1,8 +1,3 @@
|
|||||||
use std::error;
|
|
||||||
use std::fmt;
|
|
||||||
|
|
||||||
use crate::util;
|
|
||||||
|
|
||||||
/// An error that can occur in this crate.
|
/// An error that can occur in this crate.
|
||||||
///
|
///
|
||||||
/// Generally, this error corresponds to problems building a regular
|
/// Generally, this error corresponds to problems building a regular
|
||||||
@ -32,7 +27,7 @@ impl Error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn generic<E: error::Error>(err: E) -> Error {
|
pub(crate) fn generic<E: std::error::Error>(err: E) -> Error {
|
||||||
Error { kind: ErrorKind::Regex(err.to_string()) }
|
Error { kind: ErrorKind::Regex(err.to_string()) }
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -68,18 +63,23 @@ pub enum ErrorKind {
|
|||||||
InvalidLineTerminator(u8),
|
InvalidLineTerminator(u8),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl error::Error for Error {}
|
impl std::error::Error for Error {}
|
||||||
|
|
||||||
|
impl std::fmt::Display for Error {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
use bstr::ByteSlice;
|
||||||
|
|
||||||
impl fmt::Display for Error {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
||||||
match self.kind {
|
match self.kind {
|
||||||
ErrorKind::Regex(ref s) => write!(f, "{}", s),
|
ErrorKind::Regex(ref s) => write!(f, "{}", s),
|
||||||
ErrorKind::NotAllowed(ref lit) => {
|
ErrorKind::NotAllowed(ref lit) => {
|
||||||
write!(f, "the literal '{:?}' is not allowed in a regex", lit)
|
write!(f, "the literal {:?} is not allowed in a regex", lit)
|
||||||
}
|
}
|
||||||
ErrorKind::InvalidLineTerminator(byte) => {
|
ErrorKind::InvalidLineTerminator(byte) => {
|
||||||
let x = util::show_bytes(&[byte]);
|
write!(
|
||||||
write!(f, "line terminators must be ASCII, but '{}' is not", x)
|
f,
|
||||||
|
"line terminators must be ASCII, but {} is not",
|
||||||
|
[byte].as_bstr()
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -14,5 +14,4 @@ mod matcher;
|
|||||||
mod multi;
|
mod multi;
|
||||||
mod non_matching;
|
mod non_matching;
|
||||||
mod strip;
|
mod strip;
|
||||||
mod util;
|
|
||||||
mod word;
|
mod word;
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
use grep_matcher::LineTerminator;
|
use {
|
||||||
use regex_syntax::hir::{self, Hir, HirKind};
|
grep_matcher::LineTerminator,
|
||||||
|
regex_syntax::hir::{self, Hir, HirKind},
|
||||||
|
};
|
||||||
|
|
||||||
use crate::error::{Error, ErrorKind};
|
use crate::error::{Error, ErrorKind};
|
||||||
|
|
||||||
@ -15,7 +17,26 @@ use crate::error::{Error, ErrorKind};
|
|||||||
///
|
///
|
||||||
/// If the given line terminator is not ASCII, then this function returns an
|
/// If the given line terminator is not ASCII, then this function returns an
|
||||||
/// error.
|
/// error.
|
||||||
pub fn strip_from_match(
|
///
|
||||||
|
/// Note that as of regex 1.9, this routine could theoretically be implemented
|
||||||
|
/// without returning an error. Namely, for example, we could turn
|
||||||
|
/// `foo\nbar` into `foo[a&&b]bar`. That is, replace line terminators with a
|
||||||
|
/// sub-expression that can never match anything. Thus, ripgrep would accept
|
||||||
|
/// such regexes and just silently not match anything. Regex versions prior to 1.8
|
||||||
|
/// don't support such constructs. I ended up deciding to leave the existing
|
||||||
|
/// behavior of returning an error instead. For example:
|
||||||
|
///
|
||||||
|
/// ```text
|
||||||
|
/// $ echo -n 'foo\nbar\n' | rg 'foo\nbar'
|
||||||
|
/// the literal "\n" is not allowed in a regex
|
||||||
|
///
|
||||||
|
/// Consider enabling multiline mode with the --multiline flag (or -U for short).
|
||||||
|
/// When multiline mode is enabled, new line characters can be matched.
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// This looks like a good error message to me, and even suggests a flag that
|
||||||
|
/// the user can use instead.
|
||||||
|
pub(crate) fn strip_from_match(
|
||||||
expr: Hir,
|
expr: Hir,
|
||||||
line_term: LineTerminator,
|
line_term: LineTerminator,
|
||||||
) -> Result<Hir, Error> {
|
) -> Result<Hir, Error> {
|
||||||
@ -23,23 +44,20 @@ pub fn strip_from_match(
|
|||||||
let expr1 = strip_from_match_ascii(expr, b'\r')?;
|
let expr1 = strip_from_match_ascii(expr, b'\r')?;
|
||||||
strip_from_match_ascii(expr1, b'\n')
|
strip_from_match_ascii(expr1, b'\n')
|
||||||
} else {
|
} else {
|
||||||
let b = line_term.as_byte();
|
strip_from_match_ascii(expr, line_term.as_byte())
|
||||||
if b > 0x7F {
|
|
||||||
return Err(Error::new(ErrorKind::InvalidLineTerminator(b)));
|
|
||||||
}
|
|
||||||
strip_from_match_ascii(expr, b)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The implementation of strip_from_match. The given byte must be ASCII. This
|
/// The implementation of strip_from_match. The given byte must be ASCII.
|
||||||
/// function panics otherwise.
|
/// This function returns an error otherwise. It also returns an error if
|
||||||
|
/// it couldn't remove the given byte from the regex without leaving an empty
|
||||||
|
/// character class in its place.
|
||||||
fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result<Hir, Error> {
|
fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result<Hir, Error> {
|
||||||
assert!(byte <= 0x7F);
|
if !byte.is_ascii() {
|
||||||
let chr = byte as char;
|
return Err(Error::new(ErrorKind::InvalidLineTerminator(byte)));
|
||||||
assert_eq!(chr.len_utf8(), 1);
|
}
|
||||||
|
let ch = char::from(byte);
|
||||||
let invalid = || Err(Error::new(ErrorKind::NotAllowed(chr.to_string())));
|
let invalid = || Err(Error::new(ErrorKind::NotAllowed(ch.to_string())));
|
||||||
|
|
||||||
Ok(match expr.into_kind() {
|
Ok(match expr.into_kind() {
|
||||||
HirKind::Empty => Hir::empty(),
|
HirKind::Empty => Hir::empty(),
|
||||||
HirKind::Literal(hir::Literal(lit)) => {
|
HirKind::Literal(hir::Literal(lit)) => {
|
||||||
@ -50,7 +68,7 @@ fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result<Hir, Error> {
|
|||||||
}
|
}
|
||||||
HirKind::Class(hir::Class::Unicode(mut cls)) => {
|
HirKind::Class(hir::Class::Unicode(mut cls)) => {
|
||||||
let remove = hir::ClassUnicode::new(Some(
|
let remove = hir::ClassUnicode::new(Some(
|
||||||
hir::ClassUnicodeRange::new(chr, chr),
|
hir::ClassUnicodeRange::new(ch, ch),
|
||||||
));
|
));
|
||||||
cls.difference(&remove);
|
cls.difference(&remove);
|
||||||
if cls.ranges().is_empty() {
|
if cls.ranges().is_empty() {
|
||||||
|
@ -1,30 +0,0 @@
|
|||||||
/// Converts an arbitrary sequence of bytes to a literal suitable for building
|
|
||||||
/// a regular expression.
|
|
||||||
#[allow(dead_code)]
|
|
||||||
pub fn bytes_to_regex(bs: &[u8]) -> String {
|
|
||||||
use regex_syntax::is_meta_character;
|
|
||||||
use std::fmt::Write;
|
|
||||||
|
|
||||||
let mut s = String::with_capacity(bs.len());
|
|
||||||
for &b in bs {
|
|
||||||
if b <= 0x7F && !is_meta_character(b as char) {
|
|
||||||
write!(s, r"{}", b as char).unwrap();
|
|
||||||
} else {
|
|
||||||
write!(s, r"\x{:02x}", b).unwrap();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
s
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Converts arbitrary bytes to a nice string.
|
|
||||||
pub fn show_bytes(bs: &[u8]) -> String {
|
|
||||||
use std::ascii::escape_default;
|
|
||||||
use std::str;
|
|
||||||
|
|
||||||
let mut nice = String::new();
|
|
||||||
for &b in bs {
|
|
||||||
let part: Vec<u8> = escape_default(b).collect();
|
|
||||||
nice.push_str(str::from_utf8(&part).unwrap());
|
|
||||||
}
|
|
||||||
nice
|
|
||||||
}
|
|
Loading…
x
Reference in New Issue
Block a user