From 6cdb99ea61d585cc32716d181517d71b4df769b5 Mon Sep 17 00:00:00 2001
From: Andrew Gallant
Date: Sat, 2 Sep 2023 12:25:34 -0400
Subject: [PATCH] deps: drop bytecount in favor of memchr_iter(..).count()

As of the memchr 2.6 release, its Iterator::count method is specialized
to only count the number of occurrences instead of finding the offset of
each occurrence. This replaces ripgrep's use of the bytecount crate.

While micro-benchmarks suggest that memchr's method has better
throughput than bytecount, it turned out to be an illusion. Namely, on
a ~13GB haystack prior to this change:

    $ time rg-bytecount 'You killed my friend, my best friend, my lifelong friend!' OpenSubtitles2018.raw.en --line-number
    441450441:- You killed my friend, my best friend, my lifelong friend!

    real    1.473
    user    1.186
    sys     0.286
    maxmem  12512 MB
    faults  0

And then after:

    $ time rg 'You killed my friend, my best friend, my lifelong friend!' OpenSubtitles2018.raw.en --line-number
    441450441:- You killed my friend, my best friend, my lifelong friend!

    real    1.532
    user    1.280
    sys     0.250
    maxmem  12512 MB
    faults  0

But perf is just about in the same ballpark. That's good enough for me
at the moment in order to drop the extra dependency. I did this because
the marginal cost of adding the Iterator::count() specialization to
memchr was extremely small.
--- Cargo.lock | 8 +------- crates/searcher/Cargo.toml | 5 ++--- crates/searcher/src/lines.rs | 3 +-- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 50b377c3..fa1ea97e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -40,12 +40,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bytecount" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" - [[package]] name = "cc" version = "1.0.83" @@ -215,12 +209,12 @@ name = "grep-searcher" version = "0.1.11" dependencies = [ "bstr", - "bytecount", "encoding_rs", "encoding_rs_io", "grep-matcher", "grep-regex", "log", + "memchr", "memmap2", "regex", ] diff --git a/crates/searcher/Cargo.toml b/crates/searcher/Cargo.toml index 68864cee..579d1370 100644 --- a/crates/searcher/Cargo.toml +++ b/crates/searcher/Cargo.toml @@ -15,19 +15,18 @@ edition = "2018" [dependencies] bstr = { version = "1.6.0", default-features = false, features = ["std"] } -bytecount = "0.6" encoding_rs = "0.8.14" encoding_rs_io = "0.1.6" grep-matcher = { version = "0.1.6", path = "../matcher" } log = "0.4.5" +memchr = "2.6.2" memmap = { package = "memmap2", version = "0.5.3" } [dev-dependencies] grep-regex = { version = "0.1.11", path = "../regex" } -regex = "1.1" +regex = "1.9.5" [features] -default = ["bytecount/runtime-dispatch-simd"] simd-accel = ["encoding_rs/simd-accel"] # This feature is DEPRECATED. Runtime dispatch is used for SIMD now. diff --git a/crates/searcher/src/lines.rs b/crates/searcher/src/lines.rs index 387a1b46..5e47c9b3 100644 --- a/crates/searcher/src/lines.rs +++ b/crates/searcher/src/lines.rs @@ -3,7 +3,6 @@ A collection of routines for performing operations on lines. */ use bstr::ByteSlice; -use bytecount; use grep_matcher::{LineTerminator, Match}; /// An iterator over lines in a particular slice of bytes. 
@@ -110,7 +109,7 @@ impl LineStep { /// Count the number of occurrences of `line_term` in `bytes`. pub fn count(bytes: &[u8], line_term: u8) -> u64 { - bytecount::count(bytes, line_term) as u64 + memchr::memchr_iter(line_term, bytes).count() as u64 } /// Given a line that possibly ends with a terminator, return that line without