Compare commits

...

20 Commits
0.1.0 ... 0.1.2

Author SHA1 Message Date
Andrew Gallant
8f87a4e8ac 0.1.2 2016-09-17 11:36:11 -04:00
Andrew Gallant
d27d3e675f bump grep 2016-09-17 11:34:27 -04:00
Andrew Gallant
bf5d873099 grep 0.1.1 2016-09-17 11:32:47 -04:00
Andrew Gallant
bc9d12c4c8 Improve ergonomics of benchsuite.
The runner now detects if commands exist and permits running incomplete
benchmarks.

Also, explicitly use Python 3 since that's what default Ubuntu 16.04 seems
to want.
2016-09-17 11:30:01 -04:00
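In practice, "detects if commands exist" means the Python runner (shown in the benchsuite diff below) checks `shutil.which` for each command and either skips it or reports it as missing. For illustration only, a rough Rust analogue of that existence check; the function name and the PATH walk are assumptions, not part of this change:

```rust
use std::env;

/// Rough analogue of the benchsuite's `shutil.which` check: a command
/// "exists" if some directory on PATH contains a file with its name.
/// (Illustrative sketch only; ignores Windows `.exe` suffixes.)
fn command_exists(name: &str) -> bool {
    env::var_os("PATH")
        .map(|paths| env::split_paths(&paths).any(|dir| dir.join(name).is_file()))
        .unwrap_or(false)
}
```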
Andrew Gallant
5a0c873f61 Fixing, polishing and adding benchmarks. 2016-09-16 21:02:46 -04:00
Andrew Gallant
65fec147d6 rename 2016-09-16 18:27:34 -04:00
Andrew Gallant
7fbf2f014c Reorganize some files. 2016-09-16 18:22:35 -04:00
Andrew Gallant
d22a3ca3e5 Improve the "bad literal" error message.
Incidentally, this was done by using the Debug impl for `char` instead
of the Display impl. Cute.

Fixes #5.
2016-09-16 18:12:00 -04:00
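The one-line change appears in the grep crate's `Display` impl further down: formatting the offending character with `{:?}` (its `Debug` impl) instead of `{}` (its `Display` impl) quotes and escapes it, which matters most for characters like `\n`. A minimal illustration:

```rust
fn main() {
    let chr = '\n';
    // Display: embeds the raw character, so the message wraps awkwardly
    // around a literal newline.
    println!("Literal '{}' not allowed.", chr);
    // Debug: prints a quoted, escaped character: Literal '\n' not allowed.
    println!("Literal {:?} not allowed.", chr);
}
```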
Andrew Gallant
e9ec52b7f9 Update walkdir 2016-09-16 17:56:44 -04:00
Andrew Gallant
0d14c74e63 Some minor performance tweaks.
This includes moving basename-only globs into separate regexes. The hope
is that if the regex processes less input, it will be faster.
2016-09-16 16:13:28 -04:00
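A hedged sketch of the idea (assumed structure and names, not the actual ignore-matching code): globs that can only ever match a basename, such as `*.o` or `Makefile`, get their own regex that is run against `Path::file_name()` alone, so the regex engine scans a short file name rather than the full path.

```rust
use std::path::Path;

use regex::Regex; // the regex crate is already a dependency here

/// Illustrative only: basename-only globs compiled separately from
/// globs that need to see the whole path.
struct GlobMatcher {
    basename_re: Regex, // built from globs like `*.o`
    fullpath_re: Regex, // built from globs like `src/**/*.c`
}

impl GlobMatcher {
    fn is_match(&self, path: &Path) -> bool {
        // Basename-only patterns only need the file name, which is
        // usually much shorter than the whole path.
        if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
            if self.basename_re.is_match(name) {
                return true;
            }
        }
        self.fullpath_re.is_match(&path.to_string_lossy())
    }
}
```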
Andrew Gallant
1c5884b2f9 try again... 2016-09-16 07:12:06 -04:00
Andrew Gallant
8203a80ac7 fix tests 2016-09-16 06:58:10 -04:00
Andrew Gallant
0e46171e3b Rework glob sets.
We try to reduce the pressure on regexes and offload some of it to
Aho-Corasick or exact lookups.
2016-09-15 22:06:04 -04:00
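In spirit (a sketch under assumed names, not the real implementation): many ignore globs are plain literals or bare extensions, so they can be answered with hash lookups, and only the remaining complex globs fall through to a combined regex or a multi-pattern matcher such as Aho-Corasick.

```rust
use std::collections::HashSet;

use regex::Regex;

/// Illustrative glob set: exact names and extensions resolve via hash
/// lookups; only leftover patterns ever reach the regex engine.
struct GlobSet {
    exact: HashSet<String>,      // e.g. "Makefile", "LICENSE"
    extensions: HashSet<String>, // e.g. "o", "pyc"
    regex: Option<Regex>,        // everything else, OR'd together
}

impl GlobSet {
    fn is_match(&self, name: &str) -> bool {
        if self.exact.contains(name) {
            return true;
        }
        if let Some((_, ext)) = name.rsplit_once('.') {
            if self.extensions.contains(ext) {
                return true;
            }
        }
        self.regex.as_ref().map_or(false, |re| re.is_match(name))
    }
}
```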
Andrew Gallant
f5c85827ce Don't traverse directory stack if we don't need to. 2016-09-15 12:40:28 -04:00
Andrew Gallant
7cefc55238 Remove .agignore from ignore file list. 2016-09-15 12:40:08 -04:00
Andrew Gallant
92c918ebd9 --no-ignore implies --no-ignore-parent 2016-09-14 14:33:37 -04:00
Andrew Gallant
c24f8fd50f Replace crossbeam with deque.
deque appears faster.
2016-09-14 07:40:46 -04:00
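For context, a sketch of what the switch buys: the parallel directory walker distributes work through a work-stealing deque, where the owning thread pushes and pops locally and idle threads steal from the other end. This example uses today's crossbeam-deque crate for illustration; it is not the `deque` 0.3 API this commit actually adopts.

```rust
use crossbeam_deque::{Steal, Worker};

fn main() {
    // The owner pushes and pops on its own deque; other threads steal.
    let worker: Worker<&str> = Worker::new_fifo();
    let stealer = worker.stealer();

    worker.push("src/main.rs");
    worker.push("src/args.rs");

    // The owning thread takes work locally...
    assert_eq!(worker.pop(), Some("src/main.rs"));

    // ...while an idle worker thread steals from the same deque.
    match stealer.steal() {
        Steal::Success(path) => println!("stole {}", path),
        Steal::Empty | Steal::Retry => {}
    }
}
```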
Andrew Gallant
73272cf8a6 notice 2016-09-13 21:23:22 -04:00
Andrew Gallant
4212a8b9cb 0.1.1 2016-09-13 21:21:45 -04:00
Andrew Gallant
983c7fd6f9 We don't use thread_local any more, so remove it. 2016-09-13 21:21:36 -04:00
21 changed files with 1030 additions and 352 deletions

Cargo.lock (generated, 70 changed lines)

@@ -1,24 +1,24 @@
[root] [root]
name = "ripgrep" name = "ripgrep"
version = "0.1.0" version = "0.1.2"
dependencies = [ dependencies = [
"crossbeam 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)", "deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"docopt 0.6.83 (registry+https://github.com/rust-lang/crates.io-index)", "docopt 0.6.83 (registry+https://github.com/rust-lang/crates.io-index)",
"env_logger 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"fnv 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", "glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"grep 0.1.0", "grep 0.1.1",
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"num_cpus 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.76 (registry+https://github.com/rust-lang/crates.io-index)", "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)", "rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)",
"term 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", "term 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)",
"thread_local 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", "walkdir 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)",
"walkdir 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", "winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
@@ -31,9 +31,12 @@ dependencies = [
] ]
[[package]] [[package]]
name = "crossbeam" name = "deque"
version = "0.2.10" version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rand 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]] [[package]]
name = "docopt" name = "docopt"
@@ -41,20 +44,25 @@ version = "0.6.83"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.76 (registry+https://github.com/rust-lang/crates.io-index)", "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)", "rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)",
"strsim 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)", "strsim 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
name = "env_logger" name = "env_logger"
version = "0.3.4" version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.76 (registry+https://github.com/rust-lang/crates.io-index)", "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "fnv"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "fs2" name = "fs2"
version = "0.2.5" version = "0.2.5"
@@ -72,12 +80,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "grep" name = "grep"
version = "0.1.0" version = "0.1.1"
dependencies = [ dependencies = [
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.76 (registry+https://github.com/rust-lang/crates.io-index)", "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
@@ -126,7 +134,15 @@ dependencies = [
[[package]] [[package]]
name = "num_cpus" name = "num_cpus"
version = "1.0.0" version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -134,14 +150,14 @@ dependencies = [
[[package]] [[package]]
name = "regex" name = "regex"
version = "0.1.76" version = "0.1.77"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", "aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"simd 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", "simd 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"thread_local 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)", "thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)",
"utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
@@ -185,7 +201,7 @@ dependencies = [
[[package]] [[package]]
name = "thread_local" name = "thread_local"
version = "0.2.6" version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -198,7 +214,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "walkdir" name = "walkdir"
version = "0.1.6" version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -217,9 +233,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[metadata] [metadata]
"checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" "checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66"
"checksum crossbeam 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)" = "0c5ea215664ca264da8a9d9c3be80d2eaf30923c259d03e870388eb927508f97" "checksum deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1614659040e711785ed8ea24219140654da1729f3ec8a47a9719d041112fe7bf"
"checksum docopt 0.6.83 (registry+https://github.com/rust-lang/crates.io-index)" = "fc42c6077823a361410c37d47c2535b73a190cbe10838dc4f400fe87c10c8c3b" "checksum docopt 0.6.83 (registry+https://github.com/rust-lang/crates.io-index)" = "fc42c6077823a361410c37d47c2535b73a190cbe10838dc4f400fe87c10c8c3b"
"checksum env_logger 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "82dcb9ceed3868a03b335657b85a159736c961900f7e7747d3b0b97b9ccb5ccb" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f"
"checksum fnv 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8e8af7b5408ab0c4910cad114c8f9eb454bf75df7afe8964307eeafb68a13a5e"
"checksum fs2 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "bcd414e5a1a979b931bb92f41b7a54106d3f6d2e6c253e9ce943b7cd468251ef" "checksum fs2 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "bcd414e5a1a979b931bb92f41b7a54106d3f6d2e6c253e9ce943b7cd468251ef"
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" "checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
@@ -228,16 +245,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054" "checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054"
"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" "checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20"
"checksum memmap 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "f20f72ed93291a72e22e8b16bb18762183bb4943f0f483da5b8be1a9e8192752" "checksum memmap 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "f20f72ed93291a72e22e8b16bb18762183bb4943f0f483da5b8be1a9e8192752"
"checksum num_cpus 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a859041cbf7a70ea1ece4b87d1a2c6ef364dcb68749c88db1f97304b9ec09d5f" "checksum num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8890e6084723d57d0df8d2720b0d60c6ee67d6c93e7169630e4371e88765dcad"
"checksum regex 0.1.76 (registry+https://github.com/rust-lang/crates.io-index)" = "63b49f873f36ddc838d773972511e5fed2ef7350885af07d58e2f48ce8073dcd" "checksum rand 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "2791d88c6defac799c3f20d74f094ca33b9332612d9aef9078519c82e4fe04a5"
"checksum regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)" = "64b03446c466d35b42f2a8b203c8e03ed8b91c0f17b56e1f84f7210a257aa665"
"checksum regex-syntax 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "279401017ae31cf4e15344aa3f085d0e2e5c1e70067289ef906906fdbe92c8fd" "checksum regex-syntax 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "279401017ae31cf4e15344aa3f085d0e2e5c1e70067289ef906906fdbe92c8fd"
"checksum rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)" = "6159e4e6e559c81bd706afe9c8fd68f547d3e851ce12e76b1de7914bab61691b" "checksum rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)" = "6159e4e6e559c81bd706afe9c8fd68f547d3e851ce12e76b1de7914bab61691b"
"checksum simd 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "63b5847c2d766ca7ce7227672850955802fabd779ba616aeabead4c2c3877023" "checksum simd 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "63b5847c2d766ca7ce7227672850955802fabd779ba616aeabead4c2c3877023"
"checksum strsim 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "50c069df92e4b01425a8bf3576d5d417943a6a7272fbabaf5bd80b1aaa76442e" "checksum strsim 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "50c069df92e4b01425a8bf3576d5d417943a6a7272fbabaf5bd80b1aaa76442e"
"checksum term 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "3deff8a2b3b6607d6d7cc32ac25c0b33709453ca9cceac006caac51e963cf94a" "checksum term 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "3deff8a2b3b6607d6d7cc32ac25c0b33709453ca9cceac006caac51e963cf94a"
"checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" "checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03"
"checksum thread_local 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "55dd963dbaeadc08aa7266bf7f91c3154a7805e32bb94b820b769d2ef3b4744d" "checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5"
"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" "checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f"
"checksum walkdir 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "d42144c31c9909882ce76e696b306b88a5b091721251137d5d522d1ef3da7cf9" "checksum walkdir 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "c66c0b9792f0a765345452775f3adbd28dde9d33f30d13e5dcc5ae17cf6f3780"
"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" "checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"

Cargo.toml

@@ -1,6 +1,6 @@
[package] [package]
name = "ripgrep" name = "ripgrep"
version = "0.1.0" #:version version = "0.1.2" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"] authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = """ description = """
Line oriented search tool using Rust's regex library. Combines the raw Line oriented search tool using Rust's regex library. Combines the raw
@@ -23,10 +23,11 @@ name = "integration"
path = "tests/tests.rs" path = "tests/tests.rs"
[dependencies] [dependencies]
crossbeam = "0.2" deque = "0.3"
docopt = "0.6" docopt = "0.6"
env_logger = "0.3" env_logger = "0.3"
grep = { version = "0.1", path = "grep" } fnv = "1.0"
grep = { version = "0.1.1", path = "grep" }
lazy_static = "0.2" lazy_static = "0.2"
libc = "0.2" libc = "0.2"
log = "0.3" log = "0.3"
@@ -36,7 +37,6 @@ num_cpus = "1"
regex = "0.1.76" regex = "0.1.76"
rustc-serialize = "0.3" rustc-serialize = "0.3"
term = "0.4" term = "0.4"
thread_local = "0.2"
walkdir = "0.1" walkdir = "0.1"
[target.'cfg(windows)'.dependencies] [target.'cfg(windows)'.dependencies]

Makefile (deleted)

@@ -1,14 +0,0 @@
all:
echo Nothing to do...
ctags:
ctags --options=ctags.rust --languages=Rust src/*.rs src/*/*.rs
docs:
cargo doc
in-dir ./target/doc fix-perms
rscp ./target/doc/* gopher:~/www/burntsushi.net/rustdoc/
push:
git push origin master
git push github master

README.md

@@ -1,3 +1,6 @@
**UNDER DEVELOPMENT.**
ripgrep (rg) ripgrep (rg)
------------ ------------
ripgrep combines the usability of the silver searcher with the raw speed of grep. ripgrep combines the usability of the silver searcher with the raw speed of
grep.

benches/README.md (new file, 5 lines)

@@ -0,0 +1,5 @@
These are internal microbenchmarks for tracking the performance of individual
components inside of ripgrep. At the moment, they aren't heavily used.
For performance benchmarks of ripgrep proper, see the sibling `benchsuite`
directory.

benchsuite (benchmark runner script)

@@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python3
''' '''
benchsuite is a benchmark runner for comparing command line search tools. benchsuite is a benchmark runner for comparing command line search tools.
@@ -10,6 +10,7 @@ import os
import os.path as path import os.path as path
from multiprocessing import cpu_count from multiprocessing import cpu_count
import re import re
import shutil
import statistics import statistics
import subprocess import subprocess
import sys import sys
@@ -39,13 +40,23 @@ LINUX_CLONE = 'git://github.com/BurntSushi/linux'
GREP_ASCII = {'LC_ALL': 'C'} GREP_ASCII = {'LC_ALL': 'C'}
GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'} GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'}
# Sift tries really hard to search everything by default. In our code search
# benchmarks, we don't want that.
SIFT = [
'sift',
'--binary-skip',
'--exclude-files', '.*',
'--exclude-files', '*.pdf',
]
def bench_linux_literal_default(suite_dir): def bench_linux_literal_default(suite_dir):
''' '''
Benchmark the speed of a literal using *default* settings. Benchmark the speed of a literal using *default* settings.
This is a purposefully unfair benchmark for use in performance This is a purposefully unfair benchmark for use in performance
analysis, but it is pedagogically useful. analysis, but it is pedagogically useful to demonstrate how
default behaviors differ.
''' '''
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
@@ -55,8 +66,6 @@ def bench_linux_literal_default(suite_dir):
kwargs['cwd'] = cwd kwargs['cwd'] = cwd
return Command(*args, **kwargs) return Command(*args, **kwargs)
# N.B. This is a purposefully unfair benchmark for illustrative purposes
# of how the default modes for each search tool differ.
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', pat]), mkcmd('rg', ['rg', pat]),
mkcmd('ag', ['ag', pat]), mkcmd('ag', ['ag', pat]),
@@ -64,10 +73,12 @@ def bench_linux_literal_default(suite_dir):
# doesn't read gitignore files. Instead, it has a file whitelist # doesn't read gitignore files. Instead, it has a file whitelist
# that happens to match up exactly with the gitignores for this search. # that happens to match up exactly with the gitignores for this search.
mkcmd('ucg', ['ucg', pat]), mkcmd('ucg', ['ucg', pat]),
mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'C'}), # I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the
# default, but I'd guess it to be on most desktop systems.
mkcmd('pt', ['pt', pat]), mkcmd('pt', ['pt', pat]),
# sift reports an extra line here for a binary file matched. # sift reports an extra line here for a binary file matched.
mkcmd('sift', ['sift', pat]), mkcmd('sift', ['sift', pat]),
mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}),
]) ])
@@ -76,8 +87,9 @@ def bench_linux_literal(suite_dir):
Benchmark the speed of a literal, attempting to be fair. Benchmark the speed of a literal, attempting to be fair.
This tries to use the minimum set of options available in all tools This tries to use the minimum set of options available in all tools
to test how fast they are. For example, it makes sure there is no to test how fast they are. For example, it makes sure there is
case insensitive matching and that line numbers are computed. no case insensitive matching and that line numbers are computed
(because some tools don't permit disabling line numbers).
''' '''
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
@@ -88,19 +100,16 @@ def bench_linux_literal(suite_dir):
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]), mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]), mkcmd('rg (ignore) (mmap)', ['rg', '-n', '--mmap', pat]),
mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]), mkcmd('ag (ignore) (mmap)', ['ag', '-s', pat]),
mkcmd('ag', ['ag', '-s', pat]), mkcmd('pt (ignore)', ['pt', pat]),
mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]), mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
mkcmd('ucg', ['ucg', '--nosmart-case', pat]), mkcmd('git grep (ignore)', [
mkcmd('git grep', [
'git', 'grep', '-I', '-n', pat, 'git', 'grep', '-I', '-n', pat,
], env={'LC_ALL': 'C'}), ], env={'LC_ALL': 'C'}),
mkcmd('pt', ['pt', pat]), mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
mkcmd('sift', [ mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
]),
]) ])
@@ -120,23 +129,21 @@ def bench_linux_literal_casei(suite_dir):
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', '-i', pat]), mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]), mkcmd('rg (ignore) (mmap)', ['rg', '-n', '-i', '--mmap', pat]),
mkcmd('rg-novcs-mmap', [ mkcmd('ag (ignore) (mmap)', ['ag', '-i', pat]),
'rg', '--mmap', '--no-ignore', '-n', '-i', pat, mkcmd('sift (ignore)', SIFT + ['-n', '-i', '--git', pat]),
]), # It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here,
mkcmd('ag', ['ag', '-i', pat]), # since that is certainly what ripgrep is doing, but this is for an
mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-i', pat]), # ASCII literal, so we should give `git grep` all the opportunity to
mkcmd('ucg', ['ucg', '-i', pat]), # do its best.
mkcmd('git grep', [ mkcmd('git grep (ignore)', [
'git', 'grep', '-I', '-n', '-i', pat, 'git', 'grep', '-I', '-n', '-i', pat,
], env={'LC_ALL': 'C'}), ], env={'LC_ALL': 'C'}),
# sift yields more matches than it should here. Specifically, it gets mkcmd('rg (whitelist)', [
# matches in Module.symvers and System.map in the repo root. Both of 'rg', '-n', '-i', '--no-ignore', '-tall', pat,
# those files show up in the repo root's .gitignore file.
mkcmd('sift', [
'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-i', pat,
]), ]),
mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
]) ])
@@ -156,20 +163,16 @@ def bench_linux_re_literal_suffix(suite_dir):
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]), mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]), mkcmd('ag (ignore)', ['ag', '-s', pat]),
mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]), mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
mkcmd('ag', ['ag', '-s', pat]),
mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]),
mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
mkcmd( mkcmd(
'git grep', 'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', pat], ['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'}, env={'LC_ALL': 'C'},
), ),
mkcmd('sift', [ mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat, mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
]),
]) ])
@@ -189,22 +192,18 @@ def bench_linux_word(suite_dir):
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', '-w', pat]), mkcmd('rg (ignore)', ['rg', '-n', '-w', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-w', pat]), mkcmd('ag (ignore)', ['ag', '-s', '-w', pat]),
mkcmd('rg-novcs-mmap', [ mkcmd('sift (ignore)', SIFT + ['-n', '-w', '--git', pat]),
'rg', '--mmap', '--no-ignore', '-n', '-w', pat,
]),
mkcmd('ag', ['ag', '-s', '-w', pat]),
mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', '-w', pat]),
mkcmd('ucg', ['ucg', '--nosmart-case', '-w', pat]),
mkcmd( mkcmd(
'git grep', 'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', '-w', pat], ['git', 'grep', '-E', '-I', '-n', '-w', pat],
env={'LC_ALL': 'C'}, env={'LC_ALL': 'C'},
), ),
mkcmd('sift', [ mkcmd('rg (whitelist)', [
'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-w', pat, 'rg', '-n', '-w', '--no-ignore', '-tall', pat,
]), ]),
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', '-w', pat]),
]) ])
@@ -212,7 +211,8 @@ def bench_linux_unicode_greek(suite_dir):
''' '''
Benchmark matching of a Unicode category. Benchmark matching of a Unicode category.
Only three tools (ripgrep, sift and pt) support this. Only three tools (ripgrep, sift and pt) support this. We omit
pt because it is too slow.
''' '''
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
@@ -224,15 +224,7 @@ def bench_linux_unicode_greek(suite_dir):
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]), mkcmd('rg', ['rg', '-n', pat]),
# sift tries to search a bunch of PDF files and clutters up the mkcmd('sift', SIFT + ['-n', '--git', pat]),
# results, even though --binary-skip is provided. They are excluded
# here explicitly, but don't have a measurable impact on performance.
mkcmd('sift', [
'sift', '-n', '--binary-skip',
'--exclude-files', '.*',
'--exclude-files', '*.pdf',
pat,
]),
]) ])
@@ -252,15 +244,7 @@ def bench_linux_unicode_greek_casei(suite_dir):
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', '-i', pat]), mkcmd('rg', ['rg', '-n', '-i', pat]),
# sift tries to search a bunch of PDF files and clutters up the mkcmd('sift', SIFT + ['-n', '-i', '--git', pat]),
# results, even though --binary-skip is provided. They are excluded
# here explicitly, but don't have a measurable impact on performance.
mkcmd('sift', [
'sift', '-n', '--binary-skip',
'--exclude-files', '.*',
'--exclude-files', '*.pdf',
pat,
]),
]) ])
@@ -281,30 +265,25 @@ def bench_linux_unicode_word(suite_dir):
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]), mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]), mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]), mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
mkcmd('rg-novcs-mmap', [ mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', pat]),
'rg', '--mmap', '--no-ignore', '-n', pat,
]),
mkcmd('ag (no Unicode)', ['ag', '-s', pat]),
mkcmd('ag-novcs (no Unicode)', [
'ag', '--skip-vcs-ignores', '-s', pat,
]),
mkcmd('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]),
mkcmd( mkcmd(
'git grep', 'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', pat], ['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'en_US.UTF-8'}, env={'LC_ALL': 'en_US.UTF-8'},
), ),
mkcmd( mkcmd(
'git grep (no Unicode)', 'git grep (ignore) (ASCII)',
['git', 'grep', '-E', '-I', '-n', pat], ['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'}, env={'LC_ALL': 'C'},
), ),
mkcmd('sift (no Unicode)', [ mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat, mkcmd('rg (whitelist) (ASCII)', [
'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
]), ]),
mkcmd('ucg (ASCII)', ['ucg', '--nosmart-case', pat]),
]) ])
@@ -326,30 +305,25 @@ def bench_linux_no_literal(suite_dir):
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]), mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('rg-whitelist', ['rg', '-tall', '--no-ignore', '-n', pat]), mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]), mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
mkcmd('rg-whitelist (no Unicode)', [ mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', pat]),
'rg', '-tall', '--no-ignore', '-n', '(?-u)' + pat,
]),
mkcmd('ag (no Unicode)', ['ag', '-s', pat]),
mkcmd('ag-novcs (no Unicode)', [
'ag', '--skip-vcs-ignores', '-s', pat,
]),
mkcmd('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]),
mkcmd( mkcmd(
'git grep', 'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', pat], ['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'en_US.UTF-8'}, env={'LC_ALL': 'en_US.UTF-8'},
), ),
mkcmd( mkcmd(
'git grep (no Unicode)', 'git grep (ignore) (ASCII)',
['git', 'grep', '-E', '-I', '-n', pat], ['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'}, env={'LC_ALL': 'C'},
), ),
mkcmd('sift (no Unicode)', [ mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat, mkcmd('rg (whitelist) (ASCII)', [
'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
]), ]),
mkcmd('ucg (whitelist) (ASCII)', ['ucg', '--nosmart-case', pat]),
]) ])
@@ -371,21 +345,15 @@ def bench_linux_alternates(suite_dir):
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]), mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]), mkcmd('ag (ignore)', ['ag', '-s', pat]),
mkcmd('rg-novcs-mmap', [
'rg', '--mmap', '--no-ignore', '-n', pat,
]),
mkcmd('ag', ['ag', '-s', pat]),
mkcmd('ag-novcs', [
'ag', '--skip-vcs-ignores', '-s', pat,
]),
mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
mkcmd( mkcmd(
'git grep', 'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', pat], ['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'}, env={'LC_ALL': 'C'},
), ),
mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', pat]),
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
]) ])
@@ -400,21 +368,15 @@ def bench_linux_alternates_casei(suite_dir):
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', '-i', pat]), mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]), mkcmd('ag (ignore)', ['ag', '-i', pat]),
mkcmd('rg-novcs-mmap', [
'rg', '--mmap', '--no-ignore', '-n', '-i', pat,
]),
mkcmd('ag', ['ag', '-i', pat]),
mkcmd('ag-novcs', [
'ag', '--skip-vcs-ignores', '-i', pat,
]),
mkcmd('ucg', ['ucg', '-i', pat]),
mkcmd( mkcmd(
'git grep', 'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', '-i', pat], ['git', 'grep', '-E', '-I', '-n', '-i', pat],
env={'LC_ALL': 'C'}, env={'LC_ALL': 'C'},
), ),
mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', '-i', pat]),
mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
]) ])
@@ -423,22 +385,159 @@ def bench_subtitles_en_literal(suite_dir):
Benchmark the speed of an ASCII string literal. Benchmark the speed of an ASCII string literal.
''' '''
require(suite_dir, 'subtitles-en') require(suite_dir, 'subtitles-en')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = 'Sherlock Holmes' pat = 'Sherlock Holmes'
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]), Command('rg', ['rg', pat, en]),
Command('rg (no line numbers)', ['rg', pat, ru]), Command('pt', ['pt', '-N', pat, en]),
Command('ag', ['ag', '-s', pat, ru]), Command('sift', ['sift', pat, en]),
Command('ucg', ['ucg', '--nosmart-case', pat, ru]), Command('grep', ['grep', '-a', pat, en], env=GREP_ASCII),
Command('grep', ['grep', '-an', pat, ru], env=GREP_ASCII), Command('rg (lines)', ['rg', '-n', pat, en]),
Command('grep (no line numbers)', [ Command('ag (lines)', ['ag', '-s', pat, en]),
'grep', '-a', pat, ru, Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
Command('pt (lines)', ['pt', pat, en]),
Command('sift (lines)', ['sift', '-n', pat, en]),
Command('grep (lines)', ['grep', '-an', pat, en], env=GREP_ASCII),
])
def bench_subtitles_en_literal_casei(suite_dir):
'''
Benchmark the speed of a Unicode-y string case insensitively.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = 'Sherlock Holmes'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-i', pat, en]),
Command('grep', ['grep', '-ai', pat, en], env=GREP_UNICODE),
Command('grep (ASCII)', [
'grep', '-E', '-ai', pat, en,
], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', '-i', pat, en]),
Command('ag (lines) (ASCII)', ['ag', '-i', pat, en]),
Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, en]),
])
def bench_subtitles_en_literal_word(suite_dir):
'''
Benchmark the speed of finding a literal inside word boundaries.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = 'Sherlock Holmes'
return Benchmark(pattern=pat, commands=[
Command('rg (ASCII)', [
'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', en,
]),
Command('ag (ASCII)', ['ag', '-sw', pat, en]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
Command('grep (ASCII)', [
'grep', '-anw', pat, en,
], env=GREP_ASCII),
Command('rg', ['rg', '-nw', pat, en]),
Command('grep', ['grep', '-anw', pat, en], env=GREP_UNICODE),
])
def bench_subtitles_en_alternate(suite_dir):
'''
Benchmark the speed of a set of alternate literals.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = '|'.join([
'Sherlock Holmes',
'John Watson',
'Irene Adler',
'Inspector Lestrade',
'Professor Moriarty',
])
return Benchmark(pattern=pat, commands=[
Command('rg (lines)', ['rg', '-n', pat, en]),
Command('ag (lines)', ['ag', '-s', pat, en]),
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
Command('grep (lines)', [
'grep', '-E', '-an', pat, en,
], env=GREP_ASCII),
Command('rg', ['rg', pat, en]),
Command('grep', [
'grep', '-E', '-a', pat, en,
], env=GREP_ASCII),
])
def bench_subtitles_en_alternate_casei(suite_dir):
'''
Benchmark the speed of a set of alternate literals.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = '|'.join([
'Sherlock Holmes',
'John Watson',
'Irene Adler',
'Inspector Lestrade',
'Professor Moriarty',
])
return Benchmark(pattern=pat, commands=[
Command('ag (ASCII)', ['ag', '-s', '-i', pat, en]),
Command('ucg (ASCII)', ['ucg', '-i', pat, en]),
Command('grep (ASCII)', [
'grep', '-E', '-ani', pat, en,
], env=GREP_ASCII),
Command('rg', ['rg', '-n', '-i', pat, en]),
Command('grep', ['grep', '-E', '-ani', pat, en], env=GREP_UNICODE),
])
def bench_subtitles_en_surrounding_words(suite_dir):
'''
Benchmark a more complex regex with an inner literal.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = r'\w+\s+Holmes\s+\w+'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, en]),
Command('grep', ['grep', '-E', '-an', pat, en], env=GREP_UNICODE),
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
Command('ag (ASCII)', ['ag', '-s', pat, en]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
Command('grep (ASCII)', [
'grep', '-E', '-an', pat, en,
], env=GREP_ASCII),
])
def bench_subtitles_en_no_literal(suite_dir):
'''
Benchmark the speed of a regex with no literals.
Note that we don't even try to run grep with Unicode support
on this one. While it should eventually get the right answer,
I killed it after it had already been running for two minutes
and showed no signs of finishing soon.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, en]),
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
Command('ag (ASCII)', ['ag', '-s', pat, en]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
Command('grep (ASCII)', [
'grep', '-E', '-an', pat, en,
], env=GREP_ASCII), ], env=GREP_ASCII),
Command('pt', ['pt', pat, ru]),
Command('pt (no line numbers)', ['pt', '-N', pat, ru]),
Command('sift', ['sift', '-n', pat, ru]),
Command('sift (no line numbers)', ['sift', pat, ru]),
]) ])
@@ -451,18 +550,16 @@ def bench_subtitles_ru_literal(suite_dir):
pat = 'Шерлок Холмс' # Sherlock Holmes pat = 'Шерлок Холмс' # Sherlock Holmes
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]), Command('rg', ['rg', pat, ru]),
Command('rg (no line numbers)', ['rg', pat, ru]), Command('pt', ['pt', '-N', pat, ru]),
Command('ag', ['ag', '-s', pat, ru]), Command('sift', ['sift', pat, ru]),
Command('ucg', ['ucg', '--nosmart-case', pat, ru]), Command('grep', ['grep', '-a', pat, ru], env=GREP_ASCII),
Command('grep', ['grep', '-an', pat, ru], env=GREP_ASCII), Command('rg (lines)', ['rg', '-n', pat, ru]),
Command('grep (no line numbers)', [ Command('ag (lines)', ['ag', '-s', pat, ru]),
'grep', '-a', pat, ru, Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
], env=GREP_ASCII), Command('pt (lines)', ['pt', pat, ru]),
Command('pt', ['pt', pat, ru]), Command('sift (lines)', ['sift', '-n', pat, ru]),
Command('pt (no line numbers)', ['pt', '-N', pat, ru]), Command('grep (lines)', ['grep', '-an', pat, ru], env=GREP_ASCII),
Command('sift', ['sift', '-n', pat, ru]),
Command('sift (no line numbers)', ['sift', pat, ru]),
]) ])
@@ -475,13 +572,14 @@ def bench_subtitles_ru_literal_casei(suite_dir):
pat = 'Шерлок Холмс' # Sherlock Holmes pat = 'Шерлок Холмс' # Sherlock Holmes
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', '-i', pat, ru]), Command('rg', ['rg', '-i', pat, ru]),
Command('ag (not Unicode)', ['ag', '-i', pat, ru]), Command('grep', ['grep', '-ai', pat, ru], env=GREP_UNICODE),
Command('ucg (not Unicode)', ['ucg', '-i', pat, ru]), Command('grep (ASCII)', [
Command('grep', ['grep', '-ani', pat, ru], env=GREP_UNICODE), 'grep', '-E', '-ai', pat, ru,
Command('grep (not Unicode)', [
'grep', '-E', '-ani', pat, ru,
], env=GREP_ASCII), ], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', '-i', pat, ru]),
Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]),
Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, ru]),
]) ])
@@ -494,15 +592,15 @@ def bench_subtitles_ru_literal_word(suite_dir):
pat = 'Шерлок Холмс' # Sherlock Holmes pat = 'Шерлок Холмс' # Sherlock Holmes
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-nw', pat, ru]), Command('rg (ASCII)', [
Command('rg (not Unicode)', [
'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru, 'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru,
]), ]),
Command('ag (not Unicode)', ['ag', '-sw', pat, ru]), Command('ag (ASCII)', ['ag', '-sw', pat, ru]),
Command('ucg (not Unicode)', ['ucg', '--nosmart-case', pat, ru]), Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (not Unicode)', [ Command('grep (ASCII)', [
'grep', '-anw', pat, ru, 'grep', '-anw', pat, ru,
], env=GREP_ASCII), ], env=GREP_ASCII),
Command('rg', ['rg', '-nw', pat, ru]),
Command('grep', ['grep', '-anw', pat, ru], env=GREP_UNICODE), Command('grep', ['grep', '-anw', pat, ru], env=GREP_UNICODE),
]) ])
@@ -522,11 +620,14 @@ def bench_subtitles_ru_alternate(suite_dir):
]) ])
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]), Command('rg (lines)', ['rg', '-n', pat, ru]),
Command('rg (no line numbers)', ['rg', pat, ru]), Command('ag (lines)', ['ag', '-s', pat, ru]),
Command('ucg', ['ucg', '--nosmart-case', pat, ru]), Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep', ['grep', '-E', '-an', pat, ru], env=GREP_ASCII), Command('grep (lines)', [
Command('grep (no line numbers)', [ 'grep', '-E', '-an', pat, ru,
], env=GREP_ASCII),
Command('rg', ['rg', pat, ru]),
Command('grep', [
'grep', '-E', '-a', pat, ru, 'grep', '-E', '-a', pat, ru,
], env=GREP_ASCII), ], env=GREP_ASCII),
]) ])
@@ -547,12 +648,32 @@ def bench_subtitles_ru_alternate_casei(suite_dir):
]) ])
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', '-i', pat, ru]), Command('ag (ASCII)', ['ag', '-s', '-i', pat, ru]),
Command('ucg (not Unicode)', ['ucg', '-i', pat, ru]), Command('ucg (ASCII)', ['ucg', '-i', pat, ru]),
Command('grep', ['grep', '-E', '-ani', pat, ru], env=GREP_UNICODE), Command('grep (ASCII)', [
Command('grep (not Unicode)', [
'grep', '-E', '-ani', pat, ru, 'grep', '-E', '-ani', pat, ru,
], env=GREP_ASCII), ], env=GREP_ASCII),
Command('rg', ['rg', '-n', '-i', pat, ru]),
Command('grep', ['grep', '-E', '-ani', pat, ru], env=GREP_UNICODE),
])
def bench_subtitles_ru_surrounding_words(suite_dir):
'''
Benchmark a more complex regex with an inner literal.
'''
require(suite_dir, 'subtitles-en')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = r'\w+\s+Холмс\s+\w+'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]),
Command('grep', ['grep', '-E', '-an', pat, ru], env=GREP_UNICODE),
Command('ag (ASCII)', ['ag', '-s', pat, ru]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (ASCII)', [
'grep', '-E', '-an', pat, ru,
], env=GREP_ASCII),
]) ])
@@ -571,9 +692,10 @@ def bench_subtitles_ru_no_literal(suite_dir):
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]), Command('rg', ['rg', '-n', pat, ru]),
Command('rg (no line numbers)', ['rg', pat, ru]), Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]),
Command('ucg (no Unicode)', ['ucg', '--nosmart-case', pat, ru]), Command('ag (ASCII)', ['ag', '-s', pat, ru]),
Command('grep (no Unicode)', [ Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (ASCII)', [
'grep', '-E', '-an', pat, ru, 'grep', '-E', '-an', pat, ru,
], env=GREP_ASCII), ], env=GREP_ASCII),
]) ])
@@ -597,6 +719,23 @@ class MissingDependencies(Exception):
return 'MissingDependency(%s)' % repr(self.missing_names) return 'MissingDependency(%s)' % repr(self.missing_names)
class MissingCommands(Exception):
'''
A missing command exception.
This exception occurs when running a command in a benchmark
where the command could not be found on the current system.
:ivar list(str) missing_names:
The names of the command binaries that could not be found.
'''
def __init__(self, missing_names):
self.missing_names = sorted(set(missing_names))
def __str__(self):
return 'MissingCommands(%s)' % repr(self.missing_names)
class Benchmark(object): class Benchmark(object):
''' '''
A single benchmark corresponding to a grouping of commands. A single benchmark corresponding to a grouping of commands.
@@ -606,7 +745,8 @@ class Benchmark(object):
''' '''
def __init__(self, name=None, pattern=None, commands=None, def __init__(self, name=None, pattern=None, commands=None,
warmup_count=1, count=3, line_count=True): warmup_count=1, count=3, line_count=True,
allow_missing_commands=False):
''' '''
Create a single benchmark. Create a single benchmark.
@@ -644,15 +784,37 @@ class Benchmark(object):
self.warmup_count = warmup_count self.warmup_count = warmup_count
self.count = count self.count = count
self.line_count = line_count self.line_count = line_count
self.allow_missing_commands = allow_missing_commands
def raise_if_missing(self):
'''
Raises a MissingCommands exception if applicable.
A MissingCommands exception is raised when the following
criteria are met: 1) allow_missing_commands is False, and 2) at
least one command in this benchmark could not be found on this
system.
'''
missing_commands = \
[c.binary_name for c in self.commands if not c.exists()]
if not self.allow_missing_commands and len(missing_commands) > 0:
raise MissingCommands(missing_commands)
def run(self): def run(self):
''' '''
Runs this benchmark and returns the results. Runs this benchmark and returns the results.
:rtype: Result :rtype: Result
:raises:
MissingCommands if any command doesn't exist.
(Unless allow_missing_commands is enabled.)
''' '''
self.raise_if_missing()
result = Result(self) result = Result(self)
for cmd in self.commands: for cmd in self.commands:
if self.allow_missing_commands and not cmd.exists():
# Skip this command if we're OK with it.
continue
# Do a warmup first. # Do a warmup first.
for _ in range(self.warmup_count): for _ in range(self.warmup_count):
self.run_one(cmd) self.run_one(cmd)
@@ -677,6 +839,8 @@ class Benchmark(object):
it is the number of lines in the search output. it is the number of lines in the search output.
:rtype: int :rtype: int
''' '''
if not cmd.exists():
raise MissingCommand(cmd.cmd[0])
cmd.kwargs['stderr'] = subprocess.DEVNULL cmd.kwargs['stderr'] = subprocess.DEVNULL
if self.line_count: if self.line_count:
cmd.kwargs['stdout'] = subprocess.PIPE cmd.kwargs['stdout'] = subprocess.PIPE
@@ -746,6 +910,8 @@ class Result(object):
means = [] means = []
for cmd in self.benchmark.commands: for cmd in self.benchmark.commands:
mean, _ = self.distribution_for(cmd) mean, _ = self.distribution_for(cmd)
if mean is None:
continue
means.append((cmd, mean)) means.append((cmd, mean))
return min(means, key=lambda tup: tup[1])[0] return min(means, key=lambda tup: tup[1])[0]
@@ -768,16 +934,18 @@ class Result(object):
''' '''
Returns the distribution (mean +/- std) of the given command. Returns the distribution (mean +/- std) of the given command.
If there are no samples for this command (i.e., it was skipped),
then return ``(None, None)``.
:rtype: (float, float) :rtype: (float, float)
:returns: :returns:
A tuple containing the mean and standard deviation, in that A tuple containing the mean and standard deviation, in that
order. order.
''' '''
mean = statistics.mean( samples = list(s['duration'] for s in self.samples_for(cmd))
s['duration'] for s in self.samples_for(cmd)) if len(samples) == 0:
stdev = statistics.stdev( return None, None
s['duration'] for s in self.samples_for(cmd)) return statistics.mean(samples), statistics.stdev(samples)
return mean, stdev
class Command(object): class Command(object):
@@ -807,6 +975,15 @@ class Command(object):
self.args = args self.args = args
self.kwargs = kwargs self.kwargs = kwargs
def exists(self):
'Returns true if and only if this command exists.'
return shutil.which(self.binary_name) is not None
@property
def binary_name(self):
'Return the binary name of this command.'
return self.cmd[0]
def run(self): def run(self):
''' '''
Runs this command and returns its status. Runs this command and returns its status.
@@ -947,7 +1124,8 @@ def download(suite_dir, choices):
sys.exit(1) sys.exit(1)
def collect_benchmarks(suite_dir, filter_pat=None): def collect_benchmarks(suite_dir, filter_pat=None,
allow_missing_commands=False):
''' '''
Return an iterable of all runnable benchmarks. Return an iterable of all runnable benchmarks.
@@ -969,6 +1147,9 @@ def collect_benchmarks(suite_dir, filter_pat=None):
continue continue
try: try:
benchmark = globals()[fun](suite_dir) benchmark = globals()[fun](suite_dir)
benchmark.name = name
benchmark.allow_missing_commands = allow_missing_commands
benchmark.raise_if_missing()
except MissingDependencies as e: except MissingDependencies as e:
eprint( eprint(
'missing: %s, skipping benchmark %s (try running with: %s)' % ( 'missing: %s, skipping benchmark %s (try running with: %s)' % (
@@ -976,24 +1157,32 @@ def collect_benchmarks(suite_dir, filter_pat=None):
name, name,
' '.join(['--download %s' % n for n in e.missing_names]), ' '.join(['--download %s' % n for n in e.missing_names]),
)) ))
except MissingCommands as e:
fmt = 'missing commands: %s, skipping benchmark %s ' \
'(run with --allow-missing to run incomplete benchmarks)'
eprint(fmt % (', '.join(e.missing_names), name))
continue continue
benchmark.name = name
yield benchmark yield benchmark
def main(): def main():
download_choices = ['all', 'linux', 'subtitles-en', 'subtitles-ru']
p = argparse.ArgumentParser('Command line search tool benchmark suite.') p = argparse.ArgumentParser('Command line search tool benchmark suite.')
p.add_argument( p.add_argument(
'--dir', metavar='PATH', default=os.getcwd(), '--dir', metavar='PATH', default=os.getcwd(),
help='The directory in which to download data and perform searches.') help='The directory in which to download data and perform searches.')
p.add_argument( p.add_argument(
'--download', metavar='CORPUS', action='append', '--download', metavar='CORPUS', action='append',
choices=['all', 'linux', 'subtitles-en', 'subtitles-ru'], choices=download_choices,
help='Download and prepare corpus data, then exit without running ' help='Download and prepare corpus data, then exit without running '
'any benchmarks. Note that this command is intended to be ' 'any benchmarks. Note that this command is intended to be '
'idempotent. WARNING: This downloads over a gigabyte of data, ' 'idempotent. WARNING: This downloads over a gigabyte of data, '
'and also includes building the Linux kernel. If "all" is used ' 'and also includes building the Linux kernel. If "all" is used '
'then the total uncompressed size is around 13 GB.') 'then the total uncompressed size is around 13 GB. '
'Choices: %s' % ', '.join(download_choices))
p.add_argument(
'--allow-missing', action='store_true',
help='Permit benchmarks to run even if some commands are missing.')
p.add_argument( p.add_argument(
'-f', '--force', action='store_true', '-f', '--force', action='store_true',
help='Overwrite existing files if there is a conflict.') help='Overwrite existing files if there is a conflict.')
@@ -1009,6 +1198,13 @@ def main():
help='A regex pattern that will only run benchmarks that match.') help='A regex pattern that will only run benchmarks that match.')
args = p.parse_args() args = p.parse_args()
if args.list:
benchmarks = collect_benchmarks(
args.dir, filter_pat=args.bench,
allow_missing_commands=args.allow_missing)
for b in benchmarks:
print(b.name)
sys.exit(0)
if args.download is not None and len(args.download) > 0: if args.download is not None and len(args.download) > 0:
download(args.dir, args.download) download(args.dir, args.download)
sys.exit(0) sys.exit(0)
@@ -1028,7 +1224,9 @@ def main():
raw_csv_wtr = csv.DictWriter(raw_handle, fields) raw_csv_wtr = csv.DictWriter(raw_handle, fields)
raw_csv_wtr.writerow({x: x for x in fields}) raw_csv_wtr.writerow({x: x for x in fields})
benchmarks = collect_benchmarks(args.dir, filter_pat=args.bench) benchmarks = collect_benchmarks(
args.dir, filter_pat=args.bench,
allow_missing_commands=args.allow_missing)
for i, b in enumerate(benchmarks): for i, b in enumerate(benchmarks):
result = b.run() result = b.run()
fastest_cmd = result.fastest_cmd() fastest_cmd = result.fastest_cmd()
@@ -1042,6 +1240,12 @@ def main():
for cmd in b.commands: for cmd in b.commands:
name = cmd.name name = cmd.name
mean, stdev = result.distribution_for(cmd) mean, stdev = result.distribution_for(cmd)
if mean is None:
# If we couldn't get a distribution for this command then
# it was skipped.
print('{name:{pad}} SKIPPED'.format(
name=name, pad=max_name_len + 2))
continue
line_counts = result.line_counts_for(cmd) line_counts = result.line_counts_for(cmd)
show_fast_cmd, show_line_counts = '', '' show_fast_cmd, show_line_counts = '', ''
if fastest_cmd.name == cmd.name: if fastest_cmd.name == cmd.name:

View File

@@ -1,11 +0,0 @@
--langdef=Rust
--langmap=Rust:.rs
--regex-Rust=/^[ \t]*(#\[[^\]]\][ \t]*)*(pub[ \t]+)?(extern[ \t]+)?("[^"]+"[ \t]+)?(unsafe[ \t]+)?fn[ \t]+([a-zA-Z0-9_]+)/\6/f,functions,function definitions/
--regex-Rust=/^[ \t]*(pub[ \t]+)?type[ \t]+([a-zA-Z0-9_]+)/\2/T,types,type definitions/
--regex-Rust=/^[ \t]*(pub[ \t]+)?enum[ \t]+([a-zA-Z0-9_]+)/\2/g,enum,enumeration names/
--regex-Rust=/^[ \t]*(pub[ \t]+)?struct[ \t]+([a-zA-Z0-9_]+)/\2/s,structure names/
--regex-Rust=/^[ \t]*(pub[ \t]+)?mod[ \t]+([a-zA-Z0-9_]+)/\2/m,modules,module names/
--regex-Rust=/^[ \t]*(pub[ \t]+)?static[ \t]+([a-zA-Z0-9_]+)/\2/c,consts,static constants/
--regex-Rust=/^[ \t]*(pub[ \t]+)?trait[ \t]+([a-zA-Z0-9_]+)/\2/t,traits,traits/
--regex-Rust=/^[ \t]*(pub[ \t]+)?impl([ \t\n]+<.*>)?[ \t]+([a-zA-Z0-9_]+)/\3/i,impls,trait implementations/
--regex-Rust=/^[ \t]*macro_rules![ \t]+([a-zA-Z0-9_]+)/\1/d,macros,macro definitions/

grep/Cargo.toml

@@ -1,6 +1,6 @@
[package] [package]
name = "grep" name = "grep"
version = "0.1.0" #:version version = "0.1.1" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"] authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = """ description = """
Fast line oriented regex searching as a library. Fast line oriented regex searching as a library.

grep: `Error` Display impl

@@ -62,7 +62,7 @@ impl fmt::Display for Error {
match *self { match *self {
Error::Regex(ref err) => err.fmt(f), Error::Regex(ref err) => err.fmt(f),
Error::LiteralNotAllowed(chr) => { Error::LiteralNotAllowed(chr) => {
write!(f, "Literal '{}' not allowed.", chr) write!(f, "Literal {:?} not allowed.", chr)
} }
Error::__Nonexhaustive => unreachable!(), Error::__Nonexhaustive => unreachable!(),
} }

View File

@@ -10,6 +10,10 @@ use {Error, Result};
/// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this /// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this
/// function panics. /// function panics.
pub fn remove(expr: Expr, byte: u8) -> Result<Expr> { pub fn remove(expr: Expr, byte: u8) -> Result<Expr> {
// TODO(burntsushi): There is a bug in this routine where only `\n` is
// handled correctly. Namely, `AnyChar` and `AnyByte` need to be translated
// to proper character classes instead of the special `AnyCharNoNL` and
// `AnyByteNoNL` classes.
use syntax::Expr::*; use syntax::Expr::*;
assert!(byte <= 0x7F); assert!(byte <= 0x7F);
let chr = byte as char; let chr = byte as char;
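
A note on the TODO above: the intended fix is that `.` (any character, or any byte) should be rewritten into an explicit class with the removed byte carved out, instead of being mapped to the special "no newline" variants, which only happen to be correct when the removed byte is `\n`. The sketch below illustrates that rewrite on a hypothetical, simplified expression type; it is not grep's `nonl::remove` and does not use the real `regex-syntax` `Expr` API.

// Hypothetical mini-AST used only to illustrate the rewrite described in the
// TODO; the real code operates on regex_syntax::Expr.
#[derive(Debug, PartialEq)]
enum Expr {
    Any,                      // `.` with the `s` flag: matches every char
    Class(Vec<(char, char)>), // a set of inclusive character ranges
    Concat(Vec<Expr>),
}

/// Rewrite `expr` so it can never match `chr`, by turning `Any` into an
/// explicit class that excludes `chr` and splitting existing ranges.
fn remove_char(expr: Expr, chr: char) -> Expr {
    assert!(chr.is_ascii() && chr != '\0');
    let below = char::from_u32(chr as u32 - 1).unwrap();
    let above = char::from_u32(chr as u32 + 1).unwrap();
    match expr {
        // `.` becomes "everything below chr" plus "everything above chr".
        Expr::Any => Expr::Class(vec![('\0', below), (above, char::MAX)]),
        // Any range containing `chr` is split around it.
        Expr::Class(ranges) => {
            let mut out = vec![];
            for (s, e) in ranges {
                if chr < s || chr > e {
                    out.push((s, e)); // range doesn't contain `chr`: keep it
                    continue;
                }
                if s < chr {
                    out.push((s, below));
                }
                if chr < e {
                    out.push((above, e));
                }
            }
            Expr::Class(out)
        }
        Expr::Concat(exprs) => {
            Expr::Concat(exprs.into_iter().map(|e| remove_char(e, chr)).collect())
        }
    }
}

fn main() {
    // `.` with `\n` removed becomes [\x00-\x09\x0B-\u{10FFFF}].
    println!("{:?}", remove_char(Expr::Any, '\n'));
}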

View File

@@ -1 +0,0 @@
au BufWritePost *.rs silent!make ctags > /dev/null 2>&1

View File

@@ -124,6 +124,7 @@ Less common options:
--no-ignore --no-ignore
Don't respect ignore files (.gitignore, .rgignore, etc.) Don't respect ignore files (.gitignore, .rgignore, etc.)
This implies --no-ignore-parent.
--no-ignore-parent --no-ignore-parent
Don't respect ignore files in parent directories. Don't respect ignore files in parent directories.
@@ -338,7 +339,9 @@ impl RawArgs {
line_number: !self.flag_no_line_number && self.flag_line_number, line_number: !self.flag_no_line_number && self.flag_line_number,
mmap: mmap, mmap: mmap,
no_ignore: self.flag_no_ignore, no_ignore: self.flag_no_ignore,
no_ignore_parent: self.flag_no_ignore_parent, no_ignore_parent:
// --no-ignore implies --no-ignore-parent
self.flag_no_ignore_parent || self.flag_no_ignore,
quiet: self.flag_quiet, quiet: self.flag_quiet,
replace: self.flag_replace.clone().map(|s| s.into_bytes()), replace: self.flag_replace.clone().map(|s| s.into_bytes()),
text: self.flag_text, text: self.flag_text,

View File

@@ -21,6 +21,7 @@ additional rules such as whitelists (prefix of `!`) or directory-only globs
// TODO(burntsushi): Implement something similar, but for Mercurial. We can't // TODO(burntsushi): Implement something similar, but for Mercurial. We can't
// use this exact implementation because hgignore files are different. // use this exact implementation because hgignore files are different.
use std::cell::RefCell;
use std::error::Error as StdError; use std::error::Error as StdError;
use std::fmt; use std::fmt;
use std::fs::File; use std::fs::File;
@@ -30,6 +31,7 @@ use std::path::{Path, PathBuf};
use regex; use regex;
use glob; use glob;
use pathutil::strip_prefix;
/// Represents an error that can occur when parsing a gitignore file. /// Represents an error that can occur when parsing a gitignore file.
#[derive(Debug)] #[derive(Debug)]
@@ -110,37 +112,37 @@ impl Gitignore {
/// same directory as this gitignore file. /// same directory as this gitignore file.
pub fn matched<P: AsRef<Path>>(&self, path: P, is_dir: bool) -> Match { pub fn matched<P: AsRef<Path>>(&self, path: P, is_dir: bool) -> Match {
let mut path = path.as_ref(); let mut path = path.as_ref();
if let Ok(p) = path.strip_prefix(&self.root) { if let Some(p) = strip_prefix("./", path) {
path = p; path = p;
} }
self.matched_utf8(&*path.to_string_lossy(), is_dir) if let Some(p) = strip_prefix(&self.root, path) {
path = p;
}
self.matched_stripped(path, is_dir)
} }
/// Like matched, but takes a path that has already been stripped and /// Like matched, but takes a path that has already been stripped.
/// converted to UTF-8. pub fn matched_stripped(&self, path: &Path, is_dir: bool) -> Match {
pub fn matched_utf8(&self, path: &str, is_dir: bool) -> Match { thread_local! {
// A single regex with a bunch of alternations of glob patterns is static MATCHES: RefCell<Vec<usize>> = {
// unfortunately typically faster than a RegexSet, so we use it as a RefCell::new(vec![])
// first pass filter. We still need to run the RegexSet to get the most
// recently defined glob that matched.
if !self.set.is_match(path) {
return Match::None;
}
// The regex set can't actually pick the right glob that matched all
// on its own. In particular, some globs require that only directories
// can match. Thus, only accept a match from the regex set if the given
// path satisfies the corresponding glob's directory criteria.
for i in self.set.matches(path).iter().rev() {
let pat = &self.patterns[i];
if !pat.only_dir || is_dir {
return if pat.whitelist {
Match::Whitelist(pat)
} else {
Match::Ignored(pat)
};
} }
} };
Match::None MATCHES.with(|matches| {
let mut matches = matches.borrow_mut();
self.set.matches_into(path, &mut *matches);
for &i in matches.iter().rev() {
let pat = &self.patterns[i];
if !pat.only_dir || is_dir {
return if pat.whitelist {
Match::Whitelist(pat)
} else {
Match::Ignored(pat)
};
}
}
Match::None
})
} }
/// Returns the total number of ignore patterns. /// Returns the total number of ignore patterns.
@@ -390,6 +392,7 @@ mod tests {
ignored!(ig23, ROOT, "foo", "./foo"); ignored!(ig23, ROOT, "foo", "./foo");
ignored!(ig24, ROOT, "target", "grep/target"); ignored!(ig24, ROOT, "target", "grep/target");
ignored!(ig25, ROOT, "Cargo.lock", "./tabwriter-bin/Cargo.lock"); ignored!(ig25, ROOT, "Cargo.lock", "./tabwriter-bin/Cargo.lock");
ignored!(ig26, ROOT, "/foo/bar/baz", "./foo/bar/baz");
not_ignored!(ignot1, ROOT, "amonths", "months"); not_ignored!(ignot1, ROOT, "amonths", "months");
not_ignored!(ignot2, ROOT, "monthsa", "months"); not_ignored!(ignot2, ROOT, "monthsa", "months");
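
The new `matched_stripped` above reuses a thread-local `Vec<usize>` as a scratch buffer for `matches_into`, so the hot matching path does not allocate a fresh vector on every call. Below is a minimal, self-contained sketch of that pattern using only the standard library; `matches_into` and `is_ignored` here are made-up stand-ins, not the real `Gitignore` methods.

use std::cell::RefCell;

// Toy stand-in for the matcher: writes the indices of the patterns that
// match `path` into `into`, reusing the vector's capacity.
fn matches_into(path: &str, into: &mut Vec<usize>) {
    into.clear();
    if path.ends_with(".rs") {
        into.push(0); // pretend pattern 0 is `*.rs`
    }
    if path.starts_with("target/") {
        into.push(1); // pretend pattern 1 is `target/`
    }
}

fn is_ignored(path: &str) -> bool {
    thread_local! {
        // One scratch buffer per thread, reused across calls.
        static MATCHES: RefCell<Vec<usize>> = RefCell::new(vec![]);
    }
    MATCHES.with(|matches| {
        let mut matches = matches.borrow_mut();
        matches_into(path, &mut *matches);
        // The most recently defined pattern wins, so scan in reverse,
        // mirroring the gitignore semantics used above.
        matches.iter().rev().next().is_some()
    })
}

fn main() {
    assert!(is_ignored("src/main.rs"));
    assert!(is_ignored("target/debug/rg"));
    assert!(!is_ignored("README.md"));
    println!("ok");
}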

View File

@@ -26,13 +26,22 @@ to make its way into `glob` proper.
// at the .gitignore for the chromium repo---just about every pattern satisfies // at the .gitignore for the chromium repo---just about every pattern satisfies
// that assumption.) // that assumption.)
use std::borrow::Cow;
use std::collections::HashMap;
use std::error::Error as StdError; use std::error::Error as StdError;
use std::ffi::{OsStr, OsString};
use std::fmt; use std::fmt;
use std::hash;
use std::iter; use std::iter;
use std::path::Path;
use std::str; use std::str;
use fnv;
use regex; use regex;
use regex::bytes::{Regex, RegexSet, SetMatches}; use regex::bytes::Regex;
use regex::bytes::RegexSet;
use pathutil::file_name;
/// Represents an error that can occur when parsing a glob pattern. /// Represents an error that can occur when parsing a glob pattern.
#[derive(Clone, Debug, Eq, PartialEq)] #[derive(Clone, Debug, Eq, PartialEq)]
@@ -71,33 +80,181 @@ impl fmt::Display for Error {
} }
} }
/// SetYesNo represents a group of globs that can be matched together in a
/// single pass. SetYesNo can only determine whether a particular path matched
/// any pattern in the set.
#[derive(Clone, Debug)]
pub struct SetYesNo {
re: Regex,
}
impl SetYesNo {
/// Returns true if and only if the given path matches at least one glob
/// in this set.
pub fn is_match<T: AsRef<Path>>(&self, path: T) -> bool {
self.re.is_match(&*path_bytes(path.as_ref()))
}
fn new(
pats: &[(Pattern, MatchOptions)],
) -> Result<SetYesNo, regex::Error> {
let mut joined = String::new();
for &(ref p, ref o) in pats {
let part = format!("(?:{})", p.to_regex_with(o));
if !joined.is_empty() {
joined.push('|');
}
joined.push_str(&part);
}
Ok(SetYesNo { re: try!(Regex::new(&joined)) })
}
}
type Fnv = hash::BuildHasherDefault<fnv::FnvHasher>;
/// Set represents a group of globs that can be matched together in a single /// Set represents a group of globs that can be matched together in a single
/// pass. /// pass.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct Set { pub struct Set {
re: Regex, yesno: SetYesNo,
set: RegexSet, exts: HashMap<OsString, Vec<usize>, Fnv>,
literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
base_literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
base_prefixes: Vec<Vec<u8>>,
base_prefixes_map: Vec<usize>,
base_suffixes: Vec<Vec<u8>>,
base_suffixes_map: Vec<usize>,
base_regexes: RegexSet,
base_regexes_map: Vec<usize>,
regexes: RegexSet,
regexes_map: Vec<usize>,
} }
impl Set { impl Set {
/// Returns true if and only if the given path matches at least one glob /// Returns the sequence number of every glob pattern that matches the
/// in this set. /// given path.
pub fn is_match<T: AsRef<[u8]>>(&self, path: T) -> bool {
self.re.is_match(path.as_ref())
}
/// Returns every glob pattern (by sequence number) that matches the given
/// path.
pub fn matches<T: AsRef<[u8]>>(&self, path: T) -> SetMatches {
// TODO(burntsushi): If we split this out into a separate crate, don't
// expose the regex::SetMatches type in the public API.
self.set.matches(path.as_ref())
}
/// Returns the number of glob patterns in this set.
#[allow(dead_code)] #[allow(dead_code)]
pub fn len(&self) -> usize { pub fn matches<T: AsRef<Path>>(&self, path: T) -> Vec<usize> {
self.set.len() let mut into = vec![];
self.matches_into(path, &mut into);
into
}
/// Adds the sequence number of every glob pattern that matches the given
/// path to the vec given.
pub fn matches_into<T: AsRef<Path>>(
&self,
path: T,
into: &mut Vec<usize>,
) {
into.clear();
let path = path.as_ref();
let path_bytes = &*path_bytes(path);
let basename = file_name(path).map(|b| os_str_bytes(b));
if !self.yesno.is_match(path) {
return;
}
if !self.exts.is_empty() {
if let Some(ext) = path.extension() {
if let Some(matches) = self.exts.get(ext) {
into.extend(matches.as_slice());
}
}
}
if !self.literals.is_empty() {
if let Some(matches) = self.literals.get(path_bytes) {
into.extend(matches.as_slice());
}
}
if !self.base_literals.is_empty() {
if let Some(ref basename) = basename {
if let Some(matches) = self.base_literals.get(&**basename) {
into.extend(matches.as_slice());
}
}
}
if !self.base_prefixes.is_empty() {
if let Some(ref basename) = basename {
let basename = &**basename;
for (i, pre) in self.base_prefixes.iter().enumerate() {
if pre.len() <= basename.len() && &**pre == &basename[0..pre.len()] {
into.push(self.base_prefixes_map[i]);
}
}
}
}
if !self.base_suffixes.is_empty() {
if let Some(ref basename) = basename {
let basename = &**basename;
for (i, suf) in self.base_suffixes.iter().enumerate() {
if suf.len() > basename.len() {
continue;
}
let (s, e) = (basename.len() - suf.len(), basename.len());
if &**suf == &basename[s..e] {
into.push(self.base_suffixes_map[i]);
}
}
}
}
if let Some(ref basename) = basename {
for i in self.base_regexes.matches(&**basename) {
into.push(self.base_regexes_map[i]);
}
}
for i in self.regexes.matches(path_bytes) {
into.push(self.regexes_map[i]);
}
into.sort();
}
fn new(pats: &[(Pattern, MatchOptions)]) -> Result<Set, regex::Error> {
let fnv = Fnv::default();
let mut exts = HashMap::with_hasher(fnv.clone());
let mut literals = HashMap::with_hasher(fnv.clone());
let mut base_literals = HashMap::with_hasher(fnv.clone());
let (mut base_prefixes, mut base_prefixes_map) = (vec![], vec![]);
let (mut base_suffixes, mut base_suffixes_map) = (vec![], vec![]);
let (mut regexes, mut regexes_map) = (vec![], vec![]);
let (mut base_regexes, mut base_regexes_map) = (vec![], vec![]);
for (i, &(ref p, ref o)) in pats.iter().enumerate() {
if let Some(ext) = p.ext() {
exts.entry(ext).or_insert(vec![]).push(i);
} else if let Some(literal) = p.literal() {
literals.entry(literal.into_bytes()).or_insert(vec![]).push(i);
} else if let Some(literal) = p.base_literal() {
base_literals
.entry(literal.into_bytes()).or_insert(vec![]).push(i);
} else if let Some(literal) = p.base_literal_prefix() {
base_prefixes.push(literal.into_bytes());
base_prefixes_map.push(i);
} else if let Some(literal) = p.base_literal_suffix() {
base_suffixes.push(literal.into_bytes());
base_suffixes_map.push(i);
} else if p.is_only_basename() {
let part = format!("(?:{})", p.to_regex_with(o));
base_regexes.push(part);
base_regexes_map.push(i);
} else {
let part = format!("(?:{})", p.to_regex_with(o));
regexes.push(part);
regexes_map.push(i);
}
}
Ok(Set {
yesno: try!(SetYesNo::new(pats)),
exts: exts,
literals: literals,
base_literals: base_literals,
base_prefixes: base_prefixes,
base_prefixes_map: base_prefixes_map,
base_suffixes: base_suffixes,
base_suffixes_map: base_suffixes_map,
base_regexes: try!(RegexSet::new(base_regexes)),
base_regexes_map: base_regexes_map,
regexes: try!(RegexSet::new(regexes)),
regexes_map: regexes_map,
})
} }
} }
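
The rewritten `Set::new` above is essentially a router: each glob is classified once at build time and dropped into the cheapest bucket that can still answer it exactly (extension map, whole-path literal map, basename literal map, basename prefix/suffix lists, and finally two regex sets), while `SetYesNo` stays a single alternation used as a fast pre-filter. The toy sketch below shows only the classify-then-lookup idea; `ToySet` and `classify` are hypothetical, work on plain strings rather than the parsed `Pattern` tokens, and omit the prefix/suffix and regex buckets.

use std::collections::HashMap;

enum Bucket {
    Ext(String),         // e.g. "**/*.rs"       -> extension "rs"
    BaseLiteral(String), // e.g. "**/Cargo.lock" -> basename "Cargo.lock"
    Regex(String),       // everything else falls back to a regex alternate
}

fn classify(glob: &str) -> Bucket {
    let is_plain = |s: &str| !s.chars().any(|c| matches!(c, '/' | '*' | '?' | '['));
    if let Some(ext) = glob.strip_prefix("**/*.") {
        if is_plain(ext) {
            return Bucket::Ext(ext.to_string());
        }
    }
    if let Some(name) = glob.strip_prefix("**/") {
        if is_plain(name) {
            return Bucket::BaseLiteral(name.to_string());
        }
    }
    Bucket::Regex(glob.to_string())
}

struct ToySet {
    exts: HashMap<String, Vec<usize>>,
    base_literals: HashMap<String, Vec<usize>>,
    regexes: Vec<(usize, String)>, // globs we couldn't specialize
}

impl ToySet {
    fn new(globs: &[&str]) -> ToySet {
        let mut set = ToySet {
            exts: HashMap::new(),
            base_literals: HashMap::new(),
            regexes: vec![],
        };
        for (i, glob) in globs.iter().enumerate() {
            match classify(glob) {
                Bucket::Ext(e) => set.exts.entry(e).or_insert_with(Vec::new).push(i),
                Bucket::BaseLiteral(b) => {
                    set.base_literals.entry(b).or_insert_with(Vec::new).push(i)
                }
                Bucket::Regex(r) => set.regexes.push((i, r)),
            }
        }
        set
    }

    // Sequence numbers of the globs matching `path`, cheapest checks first.
    fn matches(&self, path: &str) -> Vec<usize> {
        let mut into = vec![];
        let basename = path.rsplit('/').next().unwrap_or(path);
        let ext = if basename.contains('.') {
            basename.rsplit('.').next()
        } else {
            None
        };
        if let Some(ext) = ext {
            if let Some(hits) = self.exts.get(ext) {
                into.extend(hits);
            }
        }
        if let Some(hits) = self.base_literals.get(basename) {
            into.extend(hits);
        }
        // The real Set finishes with two RegexSets here; omitted in this toy.
        let _ = &self.regexes;
        into.sort();
        into
    }
}

fn main() {
    let set = ToySet::new(&["**/*.rs", "**/Cargo.lock", "foo/**/bar"]);
    assert_eq!(set.matches("src/main.rs"), vec![0]);
    assert_eq!(set.matches("sub/Cargo.lock"), vec![1]);
    println!("ok");
}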
@@ -119,19 +276,12 @@ impl SetBuilder {
/// ///
/// Once a matcher is built, no new patterns can be added to it. /// Once a matcher is built, no new patterns can be added to it.
pub fn build(&self) -> Result<Set, regex::Error> { pub fn build(&self) -> Result<Set, regex::Error> {
let it = self.pats.iter().map(|&(ref p, ref o)| p.to_regex_with(o)); Set::new(&self.pats)
let set = try!(RegexSet::new(it)); }
let mut joined = String::new(); /// Like `build`, but returns a matcher that can only answer yes/no.
for &(ref p, ref o) in &self.pats { pub fn build_yesno(&self) -> Result<SetYesNo, regex::Error> {
let part = format!("(?:{})", p.to_regex_with(o)); SetYesNo::new(&self.pats)
if !joined.is_empty() {
joined.push('|');
}
joined.push_str(&part);
}
let re = try!(Regex::new(&joined));
Ok(Set { re: re, set: set })
} }
/// Add a new pattern to this set. /// Add a new pattern to this set.
@@ -149,8 +299,21 @@ impl SetBuilder {
pat: &str, pat: &str,
opts: &MatchOptions, opts: &MatchOptions,
) -> Result<(), Error> { ) -> Result<(), Error> {
let pat = try!(Pattern::new(pat)); let parsed = try!(Pattern::new(pat));
self.pats.push((pat, opts.clone())); // if let Some(ext) = parsed.ext() {
// eprintln!("ext :: {:?} :: {:?}", ext, pat);
// } else if let Some(lit) = parsed.literal() {
// eprintln!("literal :: {:?} :: {:?}", lit, pat);
// } else if let Some(lit) = parsed.base_literal() {
// eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
// } else if let Some(lit) = parsed.base_literal_prefix() {
// eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
// } else if let Some(lit) = parsed.base_literal_suffix() {
// eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
// } else {
// eprintln!("regex :: {:?} :: {:?}", pat, parsed);
// }
self.pats.push((parsed, opts.clone()));
Ok(()) Ok(())
} }
} }
@@ -204,6 +367,133 @@ impl Pattern {
Ok(p.p) Ok(p.p)
} }
/// Returns an extension if this pattern exclusively matches it.
pub fn ext(&self) -> Option<OsString> {
if self.tokens.len() <= 3 {
return None;
}
match self.tokens.get(0) {
Some(&Token::RecursivePrefix) => {}
_ => return None,
}
match self.tokens.get(1) {
Some(&Token::ZeroOrMore) => {}
_ => return None,
}
match self.tokens.get(2) {
Some(&Token::Literal(c)) if c == '.' => {}
_ => return None,
}
let mut lit = OsString::new();
for t in self.tokens[3..].iter() {
match *t {
Token::Literal(c) if c == '/' || c == '\\' || c == '.' => {
return None;
}
Token::Literal(c) => lit.push(c.to_string()),
_ => return None,
}
}
Some(lit)
}
/// Returns the pattern as a literal if and only if the pattern exclusively
/// matches the basename of a file path *and* is a literal.
///
/// The basic format of these patterns is `**/{literal}`, where `{literal}`
/// does not contain a path separator.
pub fn base_literal(&self) -> Option<String> {
match self.tokens.get(0) {
Some(&Token::RecursivePrefix) => {}
_ => return None,
}
let mut lit = String::new();
for t in &self.tokens[1..] {
match *t {
Token::Literal(c) if c == '/' || c == '\\' => return None,
Token::Literal(c) => lit.push(c),
_ => return None,
}
}
Some(lit)
}
/// Returns true if and only if this pattern only inspects the basename
/// of a path.
pub fn is_only_basename(&self) -> bool {
match self.tokens.get(0) {
Some(&Token::RecursivePrefix) => {}
_ => return false,
}
for t in &self.tokens[1..] {
match *t {
Token::Literal(c) if c == '/' || c == '\\' => return false,
Token::RecursivePrefix
| Token::RecursiveSuffix
| Token::RecursiveZeroOrMore => return false,
_ => {}
}
}
true
}
/// Returns the pattern as a literal if and only if the pattern must match
/// an entire path exactly.
///
/// The basic format of these patterns is `{literal}`.
pub fn literal(&self) -> Option<String> {
let mut lit = String::new();
for t in &self.tokens {
match *t {
Token::Literal(c) => lit.push(c),
_ => return None,
}
}
Some(lit)
}
/// Returns a basename literal prefix of this pattern.
pub fn base_literal_prefix(&self) -> Option<String> {
match self.tokens.get(0) {
Some(&Token::RecursivePrefix) => {}
_ => return None,
}
match self.tokens.last() {
Some(&Token::ZeroOrMore) => {}
_ => return None,
}
let mut lit = String::new();
for t in &self.tokens[1..self.tokens.len()-1] {
match *t {
Token::Literal(c) if c == '/' || c == '\\' => return None,
Token::Literal(c) => lit.push(c),
_ => return None,
}
}
Some(lit)
}
/// Returns a basename literal suffix of this pattern.
pub fn base_literal_suffix(&self) -> Option<String> {
match self.tokens.get(0) {
Some(&Token::RecursivePrefix) => {}
_ => return None,
}
match self.tokens.get(1) {
Some(&Token::ZeroOrMore) => {}
_ => return None,
}
let mut lit = String::new();
for t in &self.tokens[2..] {
match *t {
Token::Literal(c) if c == '/' || c == '\\' => return None,
Token::Literal(c) => lit.push(c),
_ => return None,
}
}
Some(lit)
}
/// Convert this pattern to a string that is guaranteed to be a valid /// Convert this pattern to a string that is guaranteed to be a valid
/// regular expression and will represent the matching semantics of this /// regular expression and will represent the matching semantics of this
/// glob pattern. This uses a default set of options. /// glob pattern. This uses a default set of options.
@@ -415,13 +705,34 @@ impl<'a> Parser<'a> {
} }
} }
fn path_bytes(path: &Path) -> Cow<[u8]> {
os_str_bytes(path.as_os_str())
}
#[cfg(unix)]
fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
use std::os::unix::ffi::OsStrExt;
Cow::Borrowed(s.as_bytes())
}
#[cfg(not(unix))]
fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
// TODO(burntsushi): On Windows, OS strings are probably UTF-16, so even
// if we could get at the raw bytes, they wouldn't be useful. We *must*
// convert to UTF-8 before doing path matching. Unfortunate, but necessary.
match s.to_string_lossy() {
Cow::Owned(s) => Cow::Owned(s.into_bytes()),
Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::path::Path; use std::path::Path;
use regex::bytes::Regex; use regex::bytes::Regex;
use super::{Error, Pattern, MatchOptions, SetBuilder, Token}; use super::{Error, Pattern, MatchOptions, Set, SetBuilder, Token};
use super::Token::*; use super::Token::*;
macro_rules! syntax { macro_rules! syntax {
@@ -483,14 +794,42 @@ mod tests {
let pat = Pattern::new($pat).unwrap(); let pat = Pattern::new($pat).unwrap();
let path = &Path::new($path).to_str().unwrap(); let path = &Path::new($path).to_str().unwrap();
let re = Regex::new(&pat.to_regex_with(&$options)).unwrap(); let re = Regex::new(&pat.to_regex_with(&$options)).unwrap();
// println!("PATTERN: {}", $pat);
// println!("REGEX: {:?}", re);
// println!("PATH: {}", path);
assert!(!re.is_match(path.as_bytes())); assert!(!re.is_match(path.as_bytes()));
} }
}; };
} }
macro_rules! ext {
($name:ident, $pat:expr, $ext:expr) => {
#[test]
fn $name() {
let pat = Pattern::new($pat).unwrap();
let ext = pat.ext().map(|e| e.to_string_lossy().into_owned());
assert_eq!($ext, ext.as_ref().map(|s| &**s));
}
};
}
macro_rules! baseliteral {
($name:ident, $pat:expr, $yes:expr) => {
#[test]
fn $name() {
let pat = Pattern::new($pat).unwrap();
assert_eq!($yes, pat.base_literal().is_some());
}
};
}
macro_rules! basesuffix {
($name:ident, $pat:expr, $yes:expr) => {
#[test]
fn $name() {
let pat = Pattern::new($pat).unwrap();
assert_eq!($yes, pat.is_literal_suffix());
}
};
}
fn class(s: char, e: char) -> Token { fn class(s: char, e: char) -> Token {
Class { negated: false, ranges: vec![(s, e)] } Class { negated: false, ranges: vec![(s, e)] }
} }
@@ -585,6 +924,26 @@ mod tests {
toregex!(re10, "+", r"^\+$"); toregex!(re10, "+", r"^\+$");
toregex!(re11, "**", r"^.*$"); toregex!(re11, "**", r"^.*$");
ext!(ext1, "**/*.rs", Some("rs"));
baseliteral!(lit1, "**", true);
baseliteral!(lit2, "**/a", true);
baseliteral!(lit3, "**/ab", true);
baseliteral!(lit4, "**/a*b", false);
baseliteral!(lit5, "z/**/a*b", false);
baseliteral!(lit6, "[ab]", false);
baseliteral!(lit7, "?", false);
/*
issuffix!(suf1, "", false);
issuffix!(suf2, "a", true);
issuffix!(suf3, "ab", true);
issuffix!(suf4, "*ab", true);
issuffix!(suf5, "*.ab", true);
issuffix!(suf6, "?.ab", true);
issuffix!(suf7, "ab*", false);
*/
matches!(match1, "a", "a"); matches!(match1, "a", "a");
matches!(match2, "a*b", "a_b"); matches!(match2, "a*b", "a_b");
matches!(match3, "a*b*c", "abc"); matches!(match3, "a*b*c", "abc");
@@ -681,16 +1040,22 @@ mod tests {
builder.add("src/lib.rs").unwrap(); builder.add("src/lib.rs").unwrap();
let set = builder.build().unwrap(); let set = builder.build().unwrap();
assert!(set.is_match("foo.c")); fn is_match(set: &Set, s: &str) -> bool {
assert!(set.is_match("src/foo.c")); let mut matches = vec![];
assert!(!set.is_match("foo.rs")); set.matches_into(s, &mut matches);
assert!(!set.is_match("tests/foo.rs")); !matches.is_empty()
assert!(set.is_match("src/foo.rs")); }
assert!(set.is_match("src/grep/src/main.rs"));
assert_eq!(2, set.matches("src/lib.rs").iter().count()); assert!(is_match(&set, "foo.c"));
assert!(set.matches("src/lib.rs").matched(0)); assert!(is_match(&set, "src/foo.c"));
assert!(!set.matches("src/lib.rs").matched(1)); assert!(!is_match(&set, "foo.rs"));
assert!(set.matches("src/lib.rs").matched(2)); assert!(!is_match(&set, "tests/foo.rs"));
assert!(is_match(&set, "src/foo.rs"));
assert!(is_match(&set, "src/grep/src/main.rs"));
let matches = set.matches("src/lib.rs");
assert_eq!(2, matches.len());
assert_eq!(0, matches[0]);
assert_eq!(2, matches[1]);
} }
} }

View File

@@ -19,11 +19,11 @@ use std::io;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use gitignore::{self, Gitignore, GitignoreBuilder, Match, Pattern}; use gitignore::{self, Gitignore, GitignoreBuilder, Match, Pattern};
use pathutil::is_hidden;
use types::Types; use types::Types;
const IGNORE_NAMES: &'static [&'static str] = &[ const IGNORE_NAMES: &'static [&'static str] = &[
".gitignore", ".gitignore",
".agignore",
".rgignore", ".rgignore",
]; ];
@@ -83,7 +83,10 @@ pub struct Ignore {
overrides: Overrides, overrides: Overrides,
/// A file type matcher. /// A file type matcher.
types: Types, types: Types,
/// Whether to ignore hidden files or not.
ignore_hidden: bool, ignore_hidden: bool,
/// When true, don't look at .gitignore or .agignore files for ignore
/// rules.
no_ignore: bool, no_ignore: bool,
} }
@@ -208,15 +211,17 @@ impl Ignore {
debug!("{} ignored because it is hidden", path.display()); debug!("{} ignored because it is hidden", path.display());
return true; return true;
} }
for id in self.stack.iter().rev().filter_map(|id| id.as_ref()) { if !self.no_ignore {
let mat = id.matched(path, is_dir); for id in self.stack.iter().rev().filter_map(|id| id.as_ref()) {
if let Some(is_ignored) = self.ignore_match(path, mat) { let mat = id.matched(path, is_dir);
if is_ignored { if let Some(is_ignored) = self.ignore_match(path, mat) {
return true; if is_ignored {
return true;
}
// If this path is whitelisted by an ignore, then
// fallthrough and let the file type matcher have a say.
break;
} }
// If this path is whitelisted by an ignore, then fallthrough
// and let the file type matcher have a say.
break;
} }
} }
let mat = self.types.matched(path, is_dir); let mat = self.types.matched(path, is_dir);
@@ -361,8 +366,7 @@ impl Overrides {
let path = path.as_ref(); let path = path.as_ref();
self.gi.as_ref() self.gi.as_ref()
.map(|gi| { .map(|gi| {
let path = &*path.to_string_lossy(); let mat = gi.matched_stripped(path, is_dir).invert();
let mat = gi.matched_utf8(path, is_dir).invert();
if mat.is_none() && !is_dir { if mat.is_none() && !is_dir {
if gi.num_ignores() > 0 { if gi.num_ignores() > 0 {
return Match::Ignored(&self.unmatched_pat); return Match::Ignored(&self.unmatched_pat);
@@ -374,14 +378,6 @@ impl Overrides {
} }
} }
fn is_hidden<P: AsRef<Path>>(path: P) -> bool {
if let Some(name) = path.as_ref().file_name() {
name.to_str().map(|s| s.starts_with(".")).unwrap_or(false)
} else {
false
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::path::Path; use std::path::Path;

View File

@@ -1,6 +1,7 @@
extern crate crossbeam; extern crate deque;
extern crate docopt; extern crate docopt;
extern crate env_logger; extern crate env_logger;
extern crate fnv;
extern crate grep; extern crate grep;
#[cfg(windows)] #[cfg(windows)]
extern crate kernel32; extern crate kernel32;
@@ -15,7 +16,6 @@ extern crate num_cpus;
extern crate regex; extern crate regex;
extern crate rustc_serialize; extern crate rustc_serialize;
extern crate term; extern crate term;
extern crate thread_local;
extern crate walkdir; extern crate walkdir;
#[cfg(windows)] #[cfg(windows)]
extern crate winapi; extern crate winapi;
@@ -29,7 +29,7 @@ use std::result;
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
use std::thread; use std::thread;
use crossbeam::sync::chase_lev::{self, Steal, Stealer}; use deque::{Stealer, Stolen};
use grep::Grep; use grep::Grep;
use memmap::{Mmap, Protection}; use memmap::{Mmap, Protection};
use term::Terminal; use term::Terminal;
@@ -37,6 +37,7 @@ use walkdir::DirEntry;
use args::Args; use args::Args;
use out::{ColoredTerminal, Out}; use out::{ColoredTerminal, Out};
use pathutil::strip_prefix;
use printer::Printer; use printer::Printer;
use search_stream::InputBuffer; use search_stream::InputBuffer;
#[cfg(windows)] #[cfg(windows)]
@@ -61,6 +62,7 @@ mod gitignore;
mod glob; mod glob;
mod ignore; mod ignore;
mod out; mod out;
mod pathutil;
mod printer; mod printer;
mod search_buffer; mod search_buffer;
mod search_stream; mod search_stream;
@@ -98,8 +100,8 @@ fn run(args: Args) -> Result<u64> {
let out = Arc::new(Mutex::new(args.out())); let out = Arc::new(Mutex::new(args.out()));
let mut workers = vec![]; let mut workers = vec![];
let mut workq = { let workq = {
let (workq, stealer) = chase_lev::deque(); let (workq, stealer) = deque::new();
for _ in 0..args.threads() { for _ in 0..args.threads() {
let worker = MultiWorker { let worker = MultiWorker {
chan_work: stealer.clone(), chan_work: stealer.clone(),
@@ -216,10 +218,10 @@ impl MultiWorker {
fn run(mut self) -> u64 { fn run(mut self) -> u64 {
loop { loop {
let work = match self.chan_work.steal() { let work = match self.chan_work.steal() {
Steal::Empty | Steal::Abort => continue, Stolen::Empty | Stolen::Abort => continue,
Steal::Data(Work::Quit) => break, Stolen::Data(Work::Quit) => break,
Steal::Data(Work::Stdin) => WorkReady::Stdin, Stolen::Data(Work::Stdin) => WorkReady::Stdin,
Steal::Data(Work::File(ent)) => { Stolen::Data(Work::File(ent)) => {
match File::open(ent.path()) { match File::open(ent.path()) {
Ok(file) => WorkReady::DirFile(ent, file), Ok(file) => WorkReady::DirFile(ent, file),
Err(err) => { Err(err) => {
@@ -258,7 +260,7 @@ impl Worker {
} }
WorkReady::DirFile(ent, file) => { WorkReady::DirFile(ent, file) => {
let mut path = ent.path(); let mut path = ent.path();
if let Ok(p) = path.strip_prefix("./") { if let Some(p) = strip_prefix("./", path) {
path = p; path = p;
} }
if self.args.mmap() { if self.args.mmap() {
@@ -269,7 +271,7 @@ impl Worker {
} }
WorkReady::PathFile(path, file) => { WorkReady::PathFile(path, file) => {
let mut path = &*path; let mut path = &*path;
if let Ok(p) = path.strip_prefix("./") { if let Some(p) = strip_prefix("./", path) {
path = p; path = p;
} }
if self.args.mmap() { if self.args.mmap() {
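
The `crossbeam` → `deque` switch above keeps the same loop shape: the main thread pushes `Work` items onto a work-stealing deque and each worker spins on `steal()`, treating `Empty`/`Abort` as "try again" and a `Quit` message as the shutdown signal. The sketch below is a stripped-down, runnable version of that loop shape; the `Stealer`/`Stolen` types here are toy stand-ins backed by a mutex-guarded queue, mirroring only the interface seen in the diff, not the real lock-free `deque` crate.

use std::collections::VecDeque;
use std::sync::{Arc, Mutex};
use std::thread;

// Mirrors the Stolen::{Empty, Abort, Data} shape used in MultiWorker::run.
enum Stolen<T> {
    Empty,
    Abort,
    Data(T),
}

enum Work {
    Quit,
    File(String),
}

#[derive(Clone)]
struct Stealer(Arc<Mutex<VecDeque<Work>>>);

impl Stealer {
    fn steal(&self) -> Stolen<Work> {
        match self.0.lock().unwrap().pop_front() {
            Some(work) => Stolen::Data(work),
            None => Stolen::Empty,
        }
    }
}

fn worker(id: usize, chan_work: Stealer) -> u64 {
    let mut searched = 0;
    loop {
        // Same control flow as the diff: retry on Empty/Abort, stop on Quit,
        // otherwise handle one unit of work.
        let path = match chan_work.steal() {
            Stolen::Empty | Stolen::Abort => continue,
            Stolen::Data(Work::Quit) => break,
            Stolen::Data(Work::File(path)) => path,
        };
        println!("worker {} searching {}", id, path);
        searched += 1;
    }
    searched
}

fn main() {
    let queue = Arc::new(Mutex::new(VecDeque::new()));
    let stealer = Stealer(queue.clone());
    let workers: Vec<_> = (0..2)
        .map(|id| {
            let stealer = stealer.clone();
            thread::spawn(move || worker(id, stealer))
        })
        .collect();
    {
        let mut q = queue.lock().unwrap();
        q.push_back(Work::File("src/main.rs".to_string()));
        q.push_back(Work::File("Cargo.toml".to_string()));
        // One Quit per worker so every thread eventually shuts down.
        q.push_back(Work::Quit);
        q.push_back(Work::Quit);
    }
    let total: u64 = workers.into_iter().map(|h| h.join().unwrap()).sum();
    println!("searched {} files", total);
}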

98
src/pathutil.rs Normal file
View File

@@ -0,0 +1,98 @@
/*!
The pathutil module provides platform specific operations on paths that are
typically faster than the same operations as provided in std::path. In
particular, we really want to avoid the costly operation of parsing the path
into its constituent components. We give up on Windows, but on Unix, we deal
with the raw bytes directly.
On large repositories (like chromium), this can have a ~25% performance
improvement on just listing the files to search (!).
*/
use std::ffi::OsStr;
use std::path::Path;
use memchr::memrchr;
/// Strip `prefix` from the `path` and return the remainder.
///
/// If `path` doesn't have a prefix `prefix`, then return `None`.
#[cfg(unix)]
pub fn strip_prefix<'a, P: AsRef<Path>>(
prefix: P,
path: &'a Path,
) -> Option<&'a Path> {
use std::os::unix::ffi::OsStrExt;
let prefix = prefix.as_ref().as_os_str().as_bytes();
let path = path.as_os_str().as_bytes();
if prefix.len() > path.len() || prefix != &path[0..prefix.len()] {
None
} else {
Some(&Path::new(OsStr::from_bytes(&path[prefix.len()..])))
}
}
/// Strip `prefix` from the `path` and return the remainder.
///
/// If `path` doesn't have a prefix `prefix`, then return `None`.
#[cfg(not(unix))]
pub fn strip_prefix<'a>(prefix: &Path, path: &'a Path) -> Option<&'a Path> {
path.strip_prefix(prefix).ok()
}
/// The final component of the path, if it is a normal file.
///
/// If the path terminates in ., .., or consists solely of a root or prefix,
/// file_name will return None.
#[cfg(unix)]
pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
path: &'a P,
) -> Option<&'a OsStr> {
use std::os::unix::ffi::OsStrExt;
let path = path.as_ref().as_os_str().as_bytes();
if path.is_empty() {
return None;
} else if path.len() == 1 && path[0] == b'.' {
return None;
} else if path.last() == Some(&b'.') {
return None;
} else if path.len() >= 2 && &path[path.len() - 2..] == &b".."[..] {
return None;
}
let last_slash = memrchr(b'/', path).map(|i| i + 1).unwrap_or(0);
Some(OsStr::from_bytes(&path[last_slash..]))
}
/// The final component of the path, if it is a normal file.
///
/// If the path terminates in ., .., or consists solely of a root or prefix,
/// file_name will return None.
#[cfg(not(unix))]
pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
path: &'a P,
) -> Option<&'a OsStr> {
path.as_ref().file_name()
}
/// Returns true if and only if this file path is considered to be hidden.
#[cfg(unix)]
pub fn is_hidden<P: AsRef<Path>>(path: P) -> bool {
use std::os::unix::ffi::OsStrExt;
if let Some(name) = file_name(path.as_ref()) {
name.as_bytes().get(0) == Some(&b'.')
} else {
false
}
}
/// Returns true if and only if this file path is considered to be hidden.
#[cfg(not(unix))]
pub fn is_hidden<P: AsRef<Path>>(path: P) -> bool {
if let Some(name) = file_name(path) {
name.to_str().map(|s| s.starts_with(".")).unwrap_or(false)
} else {
false
}
}
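
Since all of `src/pathutil.rs` is shown above, a short usage sketch is easy to ground: on any platform the three helpers behave like their `std::path` counterparts, they just avoid re-parsing the path on Unix. The `describe` and `main` below are illustrative only and assume the functions above are in scope (e.g. via `use pathutil::*;` inside the crate, as main.rs does for `strip_prefix`).

use std::path::Path;

fn describe(path: &Path) {
    // Drop a leading "./" the same way the worker in main.rs does.
    let path = strip_prefix("./", path).unwrap_or(path);
    match file_name(path) {
        Some(name) => println!(
            "{}: basename={:?} hidden={}",
            path.display(),
            name,
            is_hidden(path)
        ),
        None => println!("{}: no basename (root, `.` or `..`)", path.display()),
    }
}

fn main() {
    describe(Path::new("./src/main.rs")); // basename="main.rs", hidden=false
    describe(Path::new(".gitignore"));    // basename=".gitignore", hidden=true
    describe(Path::new(".."));            // no basename
}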

View File

@@ -151,8 +151,8 @@ impl FileTypeDef {
/// Types is a file type matcher. /// Types is a file type matcher.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct Types { pub struct Types {
selected: Option<glob::Set>, selected: Option<glob::SetYesNo>,
negated: Option<glob::Set>, negated: Option<glob::SetYesNo>,
has_selected: bool, has_selected: bool,
unmatched_pat: Pattern, unmatched_pat: Pattern,
} }
@@ -165,8 +165,8 @@ impl Types {
/// If has_selected is true, then at least one file type was selected. /// If has_selected is true, then at least one file type was selected.
/// Therefore, any non-matches should be ignored. /// Therefore, any non-matches should be ignored.
fn new( fn new(
selected: Option<glob::Set>, selected: Option<glob::SetYesNo>,
negated: Option<glob::Set>, negated: Option<glob::SetYesNo>,
has_selected: bool, has_selected: bool,
) -> Types { ) -> Types {
Types { Types {
@@ -268,7 +268,7 @@ impl TypesBuilder {
try!(bset.add_with(glob, &opts)); try!(bset.add_with(glob, &opts));
} }
} }
Some(try!(bset.build())) Some(try!(bset.build_yesno()))
}; };
let negated_globs = let negated_globs =
if self.negated.is_empty() { if self.negated.is_empty() {
@@ -287,7 +287,7 @@ impl TypesBuilder {
try!(bset.add_with(glob, &opts)); try!(bset.add_with(glob, &opts));
} }
} }
Some(try!(bset.build())) Some(try!(bset.build_yesno()))
}; };
Ok(Types::new( Ok(Types::new(
selected_globs, negated_globs, !self.selected.is_empty())) selected_globs, negated_globs, !self.selected.is_empty()))

View File

@@ -26,6 +26,7 @@ impl Iter {
} }
/// Returns true if this entry should be skipped. /// Returns true if this entry should be skipped.
#[inline(always)]
fn skip_entry(&self, ent: &DirEntry) -> bool { fn skip_entry(&self, ent: &DirEntry) -> bool {
if ent.depth() == 0 { if ent.depth() == 0 {
// Never skip the root directory. // Never skip the root directory.
@@ -41,6 +42,7 @@ impl Iter {
impl Iterator for Iter { impl Iterator for Iter {
type Item = DirEntry; type Item = DirEntry;
#[inline(always)]
fn next(&mut self) -> Option<DirEntry> { fn next(&mut self) -> Option<DirEntry> {
while let Some(ev) = self.it.next() { while let Some(ev) = self.it.next() {
match ev { match ev {
@@ -108,6 +110,7 @@ impl From<WalkDir> for WalkEventIter {
impl Iterator for WalkEventIter { impl Iterator for WalkEventIter {
type Item = walkdir::Result<WalkEvent>; type Item = walkdir::Result<WalkEvent>;
#[inline(always)]
fn next(&mut self) -> Option<walkdir::Result<WalkEvent>> { fn next(&mut self) -> Option<walkdir::Result<WalkEvent>> {
let dent = self.next.take().or_else(|| self.it.next()); let dent = self.next.take().or_else(|| self.it.next());
let depth = match dent { let depth = match dent {