regex: simplify AST analysis a bit

The verbatim literal stuff hasn't been used for a while and I don't
foresee it being used. If it's really needed, it would probably be better
to just implement it by looking at the pattern string itself, which
avoids parsing it into an AST altogether.
This commit is contained in:
Andrew Gallant 2023-06-17 10:00:07 -04:00
parent d9bd261be8
commit 51480d57a6

View File

@ -1,17 +1,13 @@
use regex_syntax::ast::parse::Parser;
use regex_syntax::ast::{self, Ast}; use regex_syntax::ast::{self, Ast};
/// The results of analyzing AST of a regular expression (e.g., for supporting /// The results of analyzing AST of a regular expression (e.g., for supporting
/// smart case). /// smart case).
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct AstAnalysis { pub(crate) struct AstAnalysis {
/// True if and only if a literal uppercase character occurs in the regex. /// True if and only if a literal uppercase character occurs in the regex.
any_uppercase: bool, any_uppercase: bool,
/// True if and only if the regex contains any literal at all. /// True if and only if the regex contains any literal at all.
any_literal: bool, any_literal: bool,
/// True if and only if the regex consists entirely of a literal and no
/// other special regex characters.
all_verbatim_literal: bool,
} }
impl AstAnalysis { impl AstAnalysis {
@ -19,16 +15,16 @@ impl AstAnalysis {
/// ///
/// If `pattern` is not a valid regular expression, then `None` is /// If `pattern` is not a valid regular expression, then `None` is
/// returned. /// returned.
#[allow(dead_code)] #[cfg(test)]
pub fn from_pattern(pattern: &str) -> Option<AstAnalysis> { pub(crate) fn from_pattern(pattern: &str) -> Option<AstAnalysis> {
Parser::new() regex_syntax::ast::parse::Parser::new()
.parse(pattern) .parse(pattern)
.map(|ast| AstAnalysis::from_ast(&ast)) .map(|ast| AstAnalysis::from_ast(&ast))
.ok() .ok()
} }
/// Perform an AST analysis given the AST. /// Perform an AST analysis given the AST.
pub fn from_ast(ast: &Ast) -> AstAnalysis { pub(crate) fn from_ast(ast: &Ast) -> AstAnalysis {
let mut analysis = AstAnalysis::new(); let mut analysis = AstAnalysis::new();
analysis.from_ast_impl(ast); analysis.from_ast_impl(ast);
analysis analysis
@ -40,7 +36,7 @@ impl AstAnalysis {
/// For example, a pattern like `\pL` contains no uppercase literals, /// For example, a pattern like `\pL` contains no uppercase literals,
/// even though `L` is uppercase and the `\pL` class contains uppercase /// even though `L` is uppercase and the `\pL` class contains uppercase
/// characters. /// characters.
pub fn any_uppercase(&self) -> bool { pub(crate) fn any_uppercase(&self) -> bool {
self.any_uppercase self.any_uppercase
} }
@ -48,32 +44,13 @@ impl AstAnalysis {
/// ///
/// For example, a pattern like `\pL` reports `false`, but a pattern like /// For example, a pattern like `\pL` reports `false`, but a pattern like
/// `\pLfoo` reports `true`. /// `\pLfoo` reports `true`.
pub fn any_literal(&self) -> bool { pub(crate) fn any_literal(&self) -> bool {
self.any_literal self.any_literal
} }
/// Returns true if and only if the entire pattern is a verbatim literal
/// with no special meta characters.
///
/// When this is true, then the pattern satisfies the following law:
/// `escape(pattern) == pattern`. Notable examples where this returns
/// `false` include patterns like `a\u0061` even though `\u0061` is just
/// a literal `a`.
///
/// The purpose of this flag is to determine whether the patterns can be
/// given to non-regex substring search algorithms as-is.
#[allow(dead_code)]
pub fn all_verbatim_literal(&self) -> bool {
self.all_verbatim_literal
}
/// Creates a new `AstAnalysis` value with an initial configuration. /// Creates a new `AstAnalysis` value with an initial configuration.
fn new() -> AstAnalysis { fn new() -> AstAnalysis {
AstAnalysis { AstAnalysis { any_uppercase: false, any_literal: false }
any_uppercase: false,
any_literal: false,
all_verbatim_literal: true,
}
} }
fn from_ast_impl(&mut self, ast: &Ast) { fn from_ast_impl(&mut self, ast: &Ast) {
@ -86,26 +63,20 @@ impl AstAnalysis {
| Ast::Dot(_) | Ast::Dot(_)
| Ast::Assertion(_) | Ast::Assertion(_)
| Ast::Class(ast::Class::Unicode(_)) | Ast::Class(ast::Class::Unicode(_))
| Ast::Class(ast::Class::Perl(_)) => { | Ast::Class(ast::Class::Perl(_)) => {}
self.all_verbatim_literal = false;
}
Ast::Literal(ref x) => { Ast::Literal(ref x) => {
self.from_ast_literal(x); self.from_ast_literal(x);
} }
Ast::Class(ast::Class::Bracketed(ref x)) => { Ast::Class(ast::Class::Bracketed(ref x)) => {
self.all_verbatim_literal = false;
self.from_ast_class_set(&x.kind); self.from_ast_class_set(&x.kind);
} }
Ast::Repetition(ref x) => { Ast::Repetition(ref x) => {
self.all_verbatim_literal = false;
self.from_ast_impl(&x.ast); self.from_ast_impl(&x.ast);
} }
Ast::Group(ref x) => { Ast::Group(ref x) => {
self.all_verbatim_literal = false;
self.from_ast_impl(&x.ast); self.from_ast_impl(&x.ast);
} }
Ast::Alternation(ref alt) => { Ast::Alternation(ref alt) => {
self.all_verbatim_literal = false;
for x in &alt.asts { for x in &alt.asts {
self.from_ast_impl(x); self.from_ast_impl(x);
} }
@ -161,9 +132,6 @@ impl AstAnalysis {
} }
fn from_ast_literal(&mut self, ast: &ast::Literal) { fn from_ast_literal(&mut self, ast: &ast::Literal) {
if ast.kind != ast::LiteralKind::Verbatim {
self.all_verbatim_literal = false;
}
self.any_literal = true; self.any_literal = true;
self.any_uppercase = self.any_uppercase || ast.c.is_uppercase(); self.any_uppercase = self.any_uppercase || ast.c.is_uppercase();
} }
@ -171,7 +139,7 @@ impl AstAnalysis {
/// Returns true if and only if the attributes can never change no matter /// Returns true if and only if the attributes can never change no matter
/// what other AST it might see. /// what other AST it might see.
fn done(&self) -> bool { fn done(&self) -> bool {
self.any_uppercase && self.any_literal && !self.all_verbatim_literal self.any_uppercase && self.any_literal
} }
} }
@ -188,76 +156,61 @@ mod tests {
let x = analysis(""); let x = analysis("");
assert!(!x.any_uppercase); assert!(!x.any_uppercase);
assert!(!x.any_literal); assert!(!x.any_literal);
assert!(x.all_verbatim_literal);
let x = analysis("foo"); let x = analysis("foo");
assert!(!x.any_uppercase); assert!(!x.any_uppercase);
assert!(x.any_literal); assert!(x.any_literal);
assert!(x.all_verbatim_literal);
let x = analysis("Foo"); let x = analysis("Foo");
assert!(x.any_uppercase); assert!(x.any_uppercase);
assert!(x.any_literal); assert!(x.any_literal);
assert!(x.all_verbatim_literal);
let x = analysis("foO"); let x = analysis("foO");
assert!(x.any_uppercase); assert!(x.any_uppercase);
assert!(x.any_literal); assert!(x.any_literal);
assert!(x.all_verbatim_literal);
let x = analysis(r"foo\\"); let x = analysis(r"foo\\");
assert!(!x.any_uppercase); assert!(!x.any_uppercase);
assert!(x.any_literal); assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo\w"); let x = analysis(r"foo\w");
assert!(!x.any_uppercase); assert!(!x.any_uppercase);
assert!(x.any_literal); assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo\S"); let x = analysis(r"foo\S");
assert!(!x.any_uppercase); assert!(!x.any_uppercase);
assert!(x.any_literal); assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo\p{Ll}"); let x = analysis(r"foo\p{Ll}");
assert!(!x.any_uppercase); assert!(!x.any_uppercase);
assert!(x.any_literal); assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo[a-z]"); let x = analysis(r"foo[a-z]");
assert!(!x.any_uppercase); assert!(!x.any_uppercase);
assert!(x.any_literal); assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo[A-Z]"); let x = analysis(r"foo[A-Z]");
assert!(x.any_uppercase); assert!(x.any_uppercase);
assert!(x.any_literal); assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo[\S\t]"); let x = analysis(r"foo[\S\t]");
assert!(!x.any_uppercase); assert!(!x.any_uppercase);
assert!(x.any_literal); assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"foo\\S"); let x = analysis(r"foo\\S");
assert!(x.any_uppercase); assert!(x.any_uppercase);
assert!(x.any_literal); assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"\p{Ll}"); let x = analysis(r"\p{Ll}");
assert!(!x.any_uppercase); assert!(!x.any_uppercase);
assert!(!x.any_literal); assert!(!x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"aBc\w"); let x = analysis(r"aBc\w");
assert!(x.any_uppercase); assert!(x.any_uppercase);
assert!(x.any_literal); assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
let x = analysis(r"a\u0061"); let x = analysis(r"a\u0061");
assert!(!x.any_uppercase); assert!(!x.any_uppercase);
assert!(x.any_literal); assert!(x.any_literal);
assert!(!x.all_verbatim_literal);
} }
} }