Compare commits

...

4 Commits

Author SHA1 Message Date
Andrew Gallant
61733f6378 globset-0.4.13 2023-08-05 09:34:36 -04:00
Andrew Gallant
7227e94ce5 globset: use non-capture groups in regex transform
We currently implement globs by converting them to regexes, and in doing
so, sometimes use grouping. In all but one case, we used non-capturing
groups. But for alternations, we used capturing groups, which was likely
just an oversight. We don't make use of capture groups at all, and while
they usually don't have any overhead, they lead to weird cases like this
one: https://github.com/rust-lang/regex/issues/1059

That particular issue is also a bug in the regex crate itself, which is
fixed in https://github.com/rust-lang/regex/pull/1062. Note though that
the bug fix in the regex crate is required. Even with this patch to
globset, memory usage is reduced (by about half in rust-lang/regex#1059)
but is not returned to where it was prior to the regex 1.9 release.
2023-08-05 09:33:57 -04:00
Andrew Gallant
341a19e0d0 regex: fix fast path for -w/--word-regexp flag (#2576)
It turns out our fast path for -w/--word-regexp wasn't quite correct in
some cases. Namely, we use `(?m:^|\W)(<original-regex>)(?m:\W|$)` as the
implementation of -w/--word-regexp since `\b(<original-regex>)\b` has
some unintuitive results in certain cases, specifically when
<original-regex> matches non-word characters at match boundaries.

The problem is that using this formulation means that you need to
extract the capture group around <original-regex> to find the "real"
match, since the surrounding (^|\W) and (\W|$) aren't part of the match.
This is fine, but the capture group engine is usually slow, so we have a
fast path where we try to deduce the correct match boundary after an
initial match (before running capture groups). The problem is that doing
this is rather tricky because it's hard to know, in general, whether the
`^` or the `\W` matched.

This still doesn't seem quite right overall, but we at least fix one
more case.

Fixes #2574
2023-07-31 08:51:09 -04:00
Vidar
fed4fea217 ignore/types: add csproj
Supports the .NET C# Project file extension.

PR #2575
2023-07-31 07:08:44 -04:00
7 changed files with 36 additions and 5 deletions

View File

@@ -38,6 +38,8 @@ Bug fixes:
Fix bug when using inline regex flags with `-e/--regexp`.
* [BUG #2523](https://github.com/BurntSushi/ripgrep/issues/2523):
Make executable searching take `.com` into account on Windows.
* [BUG #2574](https://github.com/BurntSushi/ripgrep/issues/2574):
Fix bug in `-w/--word-regexp` that would result in incorrect match offsets.
13.0.0 (2021-06-12)

2
Cargo.lock generated
View File

@@ -119,7 +119,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "globset"
version = "0.4.12"
version = "0.4.13"
dependencies = [
"aho-corasick",
"bstr",

View File

@@ -1,6 +1,6 @@
[package]
name = "globset"
version = "0.4.12" #:version
version = "0.4.13" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = """
Cross platform single glob and glob set matching. Glob set matching is the

View File

@@ -736,7 +736,7 @@ impl Tokens {
// It is possible to have an empty set in which case the
// resulting alternation '()' would be an error.
if !parts.is_empty() {
re.push('(');
re.push_str("(?:");
re.push_str(&parts.join("|"));
re.push(')');
}
@@ -1276,6 +1276,7 @@ mod tests {
toregex!(re32, "/a**", r"^/a.*.*$");
toregex!(re33, "/**a", r"^/.*.*a$");
toregex!(re34, "/a**b", r"^/a.*.*b$");
toregex!(re35, "{a,b}", r"^(?:b|a)$");
matches!(match1, "a", "a");
matches!(match2, "a*b", "a_b");

View File

@@ -55,6 +55,7 @@ pub const DEFAULT_TYPES: &[(&[&str], &[&str])] = &[
(&["cs"], &["*.cs"]),
(&["csharp"], &["*.cs"]),
(&["cshtml"], &["*.cshtml"]),
(&["csproj"], &["*.csproj"]),
(&["css"], &["*.css", "*.scss"]),
(&["csv"], &["*.csv"]),
(&["cuda"], &["*.cu", "*.cuh"]),

View File

@@ -128,6 +128,9 @@ impl WordMatcher {
// The reason why we cannot handle the ^/$ cases here is because we
// can't assume anything about the original pattern. (Try commenting
// out the checks for ^/$ below and run the tests to see examples.)
//
// NOTE(2023-07-31): After fixing #2574, this logic honestly still
// doesn't seem correct. Regex composition is hard.
let input = Input::new(haystack).span(at..haystack.len());
let mut cand = match self.regex.find(input) {
None => return Ok(None),
@@ -136,8 +139,17 @@ impl WordMatcher {
if cand.start() == 0 || cand.end() == haystack.len() {
return Err(());
}
let (_, slen) = bstr::decode_utf8(&haystack[cand]);
let (_, elen) = bstr::decode_last_utf8(&haystack[cand]);
// We decode the chars on either side of the match. If either char is
// a word character, then that means the ^/$ matched and not \W. In
// that case, we defer to the slower engine.
let (ch, slen) = bstr::decode_utf8(&haystack[cand]);
if ch.map_or(true, regex_syntax::is_word_character) {
return Err(());
}
let (ch, elen) = bstr::decode_last_utf8(&haystack[cand]);
if ch.map_or(true, regex_syntax::is_word_character) {
return Err(());
}
let new_start = cand.start() + slen;
let new_end = cand.end() - elen;
// This occurs the original regex can match the empty string. In this

View File

@@ -1173,3 +1173,18 @@ rgtest!(r2480, |dir: Dir, mut cmd: TestCommand| {
cmd.args(&["--only-matching", "-e", "(?i)notfoo", "-e", "bar", "file"]);
cmd.assert_err();
});
// See: https://github.com/BurntSushi/ripgrep/issues/2574
rgtest!(r2574, |dir: Dir, mut cmd: TestCommand| {
dir.create("haystack", "some.domain.com\nsome.domain.com/x\n");
let got = cmd
.args(&[
"--no-filename",
"--no-unicode",
"-w",
"-o",
r"(\w+\.)*domain\.(\w+)",
])
.stdout();
eqnice!("some.domain.com\nsome.domain.com\n", got);
});