From 1d1ca1aa5d921f4abc9e8305feb261a44e0eb9a6 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 17 Aug 2025 16:14:51 -0400 Subject: [PATCH] test: check binary file detection when using memory maps This resolves a TODO comment I wrote a while back. Memory maps behave a little differently in terms of detecting binary data, so the tests have somewhat different results than the tests that disable memory maps. Closes #3002 --- tests/binary.rs | 122 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 118 insertions(+), 4 deletions(-) diff --git a/tests/binary.rs b/tests/binary.rs index 93f6844c..dc481aaa 100644 --- a/tests/binary.rs +++ b/tests/binary.rs @@ -4,8 +4,8 @@ use crate::util::{Dir, TestCommand}; // handling of binary files. There's quite a bit of discussion on this in this // bug report: https://github.com/BurntSushi/ripgrep/issues/306 -// Our haystack is the first 500 lines of Gutenberg's copy of "A Study in -// Scarlet," with a NUL byte at line 1898: `abcdef\x00`. +// Our haystack is the first 2,133 lines of Gutenberg's copy of "A Study in +// Scarlet," with a NUL byte at line 1870: `abcdef\x00`. // // The position and size of the haystack is, unfortunately, significant. In // particular, the NUL byte is specifically inserted at some point *after* the @@ -21,10 +21,124 @@ use crate::util::{Dir, TestCommand}; // detection with memory maps is a bit different. Namely, NUL bytes are only // searched for in the first few KB of the file and in a match. Normally, NUL // bytes are searched for everywhere. -// -// TODO: Add tests for binary file detection when using memory maps. const HAY: &'static [u8] = include_bytes!("./data/sherlock-nul.txt"); +// Tests for binary file detection when using memory maps. +// As noted in the original comments, with memory maps binary detection +// works differently - NUL bytes are only searched for in the first few KB +// of the file and in matches. + +// Test that matches in a binary file with memory maps work as expected +// with implicit file search (via glob pattern). +rgtest!(mmap_match_implicit, |dir: Dir, mut cmd: TestCommand| { + dir.create_bytes("hay", HAY); + cmd.args(&["--mmap", "-n", "Project Gutenberg EBook", "-g", "hay"]); + + // With mmap, we get a match and a warning about binary content + let expected = "\ +hay:1:The Project Gutenberg EBook of A Study In Scarlet, by Arthur Conan Doyle +"; + eqnice!(expected, cmd.stdout()); +}); + +// Test with an explicit file argument when using memory maps. +rgtest!(mmap_match_explicit, |dir: Dir, mut cmd: TestCommand| { + dir.create_bytes("hay", HAY); + cmd.args(&["--mmap", "-n", "Project Gutenberg EBook", "hay"]); + + let expected = "\ +1:The Project Gutenberg EBook of A Study In Scarlet, by Arthur Conan Doyle +"; + eqnice!(expected, cmd.stdout()); +}); + +// Test specifically with a pattern that matches near the NUL byte which should +// trigger binary detection with memory maps. +rgtest!(mmap_match_near_nul, |dir: Dir, mut cmd: TestCommand| { + dir.create_bytes("hay", HAY); + // Pattern that matches around line 1898 where the NUL byte is. + // Note: Using direct file path instead of glob. + cmd.args(&["--mmap", "-n", "abcdef", "hay"]); + + let expected = "\ +binary file matches (found \"\\0\" byte around offset 77041) +"; + eqnice!(expected, cmd.stdout()); +}); + +// Test with --count option to ensure full file scanning works with mmap. +rgtest!(mmap_match_count, |dir: Dir, mut cmd: TestCommand| { + dir.create_bytes("hay", HAY); + cmd.args(&["--mmap", "-c", "Project Gutenberg EBook|Heaven", "hay"]); + + // With mmap, since we're counting all matches and might not + // encounter the NUL byte during initial detection, the count + // should still be reported. + eqnice!("2\n", cmd.stdout()); +}); + +// Test binary detection with mmap when pattern would match before and after NUL +// byte. +rgtest!(mmap_match_multiple, |dir: Dir, mut cmd: TestCommand| { + dir.create_bytes("hay", HAY); + // Use explicit file path. + cmd.args(&["--mmap", "-n", "Project Gutenberg EBook|Heaven", "hay"]); + + // With explicit file and memory maps, matches before and after NUL byte + // are shown. + let expected = "\ +1:The Project Gutenberg EBook of A Study In Scarlet, by Arthur Conan Doyle +1871:\"No. Heaven knows what the objects of his studies are. But here we +"; + eqnice!(expected, cmd.stdout()); +}); + +// Test that --binary flag can have odd results when searching with a memory +// map. +rgtest!(mmap_binary_flag, |dir: Dir, mut cmd: TestCommand| { + dir.create_bytes("hay", HAY); + // Use glob pattern. + cmd.args(&["--mmap", "-n", "--binary", "Heaven", "-g", "hay"]); + + let expected = "\ +hay:1871:\"No. Heaven knows what the objects of his studies are. But here we +"; + eqnice!(expected, cmd.stdout()); +}); + +// Test that using -a/--text flag works as expected with mmap. +rgtest!(mmap_text_flag, |dir: Dir, mut cmd: TestCommand| { + dir.create_bytes("hay", HAY); + cmd.args(&["--mmap", "-n", "--text", "Heaven", "-g", "hay"]); + + // With --text flag, binary detection should be disabled. + let expected = "\ +hay:1871:\"No. Heaven knows what the objects of his studies are. But here we +"; + eqnice!(expected, cmd.stdout()); +}); + +// Test pattern that matches before and after the NUL byte with memory maps. +rgtest!(mmap_after_nul_match, |dir: Dir, mut cmd: TestCommand| { + dir.create_bytes("hay", HAY); + // Use explicit file path. + cmd.args(&["--mmap", "-n", "medical student", "hay"]); + + // With explicit file and memory maps, all matches are shown + let expected = "\ +176:\"A medical student, I suppose?\" said I. +409:\"A medical student, I suppose?\" said I. +642:\"A medical student, I suppose?\" said I. +875:\"A medical student, I suppose?\" said I. +1108:\"A medical student, I suppose?\" said I. +1341:\"A medical student, I suppose?\" said I. +1574:\"A medical student, I suppose?\" said I. +1807:\"A medical student, I suppose?\" said I. +1867:\"And yet you say he is not a medical student?\" +"; + eqnice!(expected, cmd.stdout()); +}); + // This tests that ripgrep prints a warning message if it finds and prints a // match in a binary file before detecting that it is a binary file. The point // here is to notify that user that the search of the file is only partially