mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-07-26 01:31:57 -07:00
regex: make CRLF hack more robust
This commit improves the CRLF hack to be more robust. In particular, in addition to rewriting `$` as `(?:\r??$)`, we now strip `\r` from the end of a match if and only if the regex has an ending line anchor required for a match. This doesn't quite make the hack 100% correct, but should fix most use cases in practice. An example of a regex that will still be incorrect is `foo|bar$`, since the analysis isn't quite sophisticated enough to determine that a `\r` can be safely stripped from any match. Even if we fix that, regexes like `foo\r|bar$` still won't be handled correctly. Alas, more work on this front should really be focused on enabling this in the regex engine itself.

The specific cause of this bug was that grep-searcher was sneakily stripping CRLF from matching lines when it really shouldn't have. We remove that code now, and instead rely on better match semantics provided at a lower level.

Fixes #1095
This commit is contained in:
@@ -153,7 +153,10 @@ rgtest!(basic, |dir: Dir, mut cmd: TestCommand| {
|
||||
msgs[1].unwrap_context(),
|
||||
Context {
|
||||
path: Some(Data::text("sherlock")),
|
||||
lines: Data::text("Holmeses, success in the province of detective work must always\n"),
|
||||
lines: Data::text(
|
||||
"Holmeses, success in the province of \
|
||||
detective work must always\n",
|
||||
),
|
||||
line_number: Some(2),
|
||||
absolute_offset: 65,
|
||||
submatches: vec![],
|
||||
@@ -163,7 +166,10 @@ rgtest!(basic, |dir: Dir, mut cmd: TestCommand| {
|
||||
msgs[2].unwrap_match(),
|
||||
Match {
|
||||
path: Some(Data::text("sherlock")),
|
||||
lines: Data::text("be, to a very large extent, the result of luck. Sherlock Holmes\n"),
|
||||
lines: Data::text(
|
||||
"be, to a very large extent, the result of luck. \
|
||||
Sherlock Holmes\n",
|
||||
),
|
||||
line_number: Some(3),
|
||||
absolute_offset: 129,
|
||||
submatches: vec![
|
||||
@@ -212,7 +218,9 @@ rgtest!(notutf8, |dir: Dir, mut cmd: TestCommand| {
|
||||
let contents = &b"quux\xFFbaz"[..];
|
||||
|
||||
// APFS does not support creating files with invalid UTF-8 bytes, so just
|
||||
// skip the test if we can't create our file.
|
||||
// skip the test if we can't create our file. Presumably we don't need this
|
||||
// check if we're already skipping it on macOS, but maybe other file
|
||||
// systems won't like this test either?
|
||||
if !dir.try_create_bytes(OsStr::from_bytes(name), contents).is_ok() {
|
||||
return;
|
||||
}
|
||||
@@ -305,3 +313,52 @@ rgtest!(crlf, |dir: Dir, mut cmd: TestCommand| {
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
// See: https://github.com/BurntSushi/ripgrep/issues/1095
|
||||
//
|
||||
// This test checks that we don't drop the \r\n in a matching line when --crlf
|
||||
// mode is enabled.
|
||||
rgtest!(r1095_missing_crlf, |dir: Dir, mut cmd: TestCommand| {
    // The fixture is a single line terminated by CRLF. In both passes below
    // the reported matching line must still carry its full `\r\n`.
    dir.create("foo", "test\r\n");
    let expected_line = Data::text("test\r\n");

    // First pass: JSON output for the pattern `test`, without --crlf.
    let plain = json_decode(&cmd.arg("--json").arg("test").stdout());
    assert_eq!(plain.len(), 4);
    assert_eq!(plain[1].unwrap_match().lines, expected_line);

    // Second pass: the same command with --crlf appended.
    let with_crlf = json_decode(&cmd.arg("--crlf").stdout());
    assert_eq!(with_crlf.len(), 4);
    assert_eq!(with_crlf[1].unwrap_match().lines, expected_line);
});
|
||||
|
||||
// See: https://github.com/BurntSushi/ripgrep/issues/1095
|
||||
//
|
||||
// This test checks that we don't return empty submatches when matching a `\n`
|
||||
// in CRLF mode.
|
||||
rgtest!(r1095_crlf_empty_match, |dir: Dir, mut cmd: TestCommand| {
    // Two lines: one terminated by CRLF, then an empty line terminated by a
    // bare LF. Searching for `\n` with -U must report a non-empty `\n`
    // submatch on each of them.
    dir.create("foo", "test\r\n\n");

    // Check without --crlf flag.
    let msgs = json_decode(&cmd.arg("-U").arg("--json").arg("\n").stdout());
    assert_eq!(msgs.len(), 5);

    let m = msgs[1].unwrap_match();
    assert_eq!(m.lines, Data::text("test\r\n"));
    assert_eq!(m.submatches[0].m, Data::text("\n"));

    let m = msgs[2].unwrap_match();
    assert_eq!(m.lines, Data::text("\n"));
    assert_eq!(m.submatches[0].m, Data::text("\n"));

    // Now check with --crlf flag.
    let msgs = json_decode(&cmd.arg("--crlf").stdout());
    // Pin the message count here as well, mirroring the check made without
    // the flag, so that a spurious extra message in CRLF mode cannot slip
    // through unnoticed.
    assert_eq!(msgs.len(), 5);

    let m = msgs[1].unwrap_match();
    assert_eq!(m.lines, Data::text("test\r\n"));
    assert_eq!(m.submatches[0].m, Data::text("\n"));

    let m = msgs[2].unwrap_match();
    assert_eq!(m.lines, Data::text("\n"));
    assert_eq!(m.submatches[0].m, Data::text("\n"));
});
|
||||
|
Reference in New Issue
Block a user