Compare commits


20 Commits
0.1.0 ... 0.1.2

Author SHA1 Message Date
Andrew Gallant
8f87a4e8ac 0.1.2 2016-09-17 11:36:11 -04:00
Andrew Gallant
d27d3e675f bump grep 2016-09-17 11:34:27 -04:00
Andrew Gallant
bf5d873099 grep 0.1.1 2016-09-17 11:32:47 -04:00
Andrew Gallant
bc9d12c4c8 Improve ergonomics of benchsuite.
The runner now detects if commands exist and permits running incomplete
benchmarks.

Also, explicitly use Python 3 since that's what default Ubuntu 16.04 seems
to want.
2016-09-17 11:30:01 -04:00
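(The command detection this commit mentions is done with `shutil.which` in the benchsuite diff further down. As a rough illustration of the same PATH-lookup idea, here is a small Rust sketch; it is not code from the repo and it skips executable-bit checks.)

```rust
use std::env;
use std::path::PathBuf;

// Rough equivalent of Python's `shutil.which`: walk PATH and report the first hit.
fn find_command(name: &str) -> Option<PathBuf> {
    let path = env::var_os("PATH")?;
    env::split_paths(&path)
        .map(|dir| dir.join(name))
        .find(|candidate| candidate.is_file())
}

fn main() {
    for cmd in ["rg", "ag", "ucg", "definitely-not-installed"] {
        match find_command(cmd) {
            Some(p) => println!("{cmd}: {}", p.display()),
            None => println!("{cmd}: missing (the benchmark entry would be skipped)"),
        }
    }
}
```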
Andrew Gallant
5a0c873f61 Fixing, polishing and adding benchmarks. 2016-09-16 21:02:46 -04:00
Andrew Gallant
65fec147d6 rename 2016-09-16 18:27:34 -04:00
Andrew Gallant
7fbf2f014c Reorganize some files. 2016-09-16 18:22:35 -04:00
Andrew Gallant
d22a3ca3e5 Improve the "bad literal" error message.
Incidentally, this was done by using the Debug impl for `char` instead
of the Display impl. Cute.

Fixes #5.
2016-09-16 18:12:00 -04:00
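(For readers unfamiliar with the distinction: `char`'s `Debug` impl quotes the character and escapes non-printable ones, while `Display` writes it raw, which is why the swap shown in the grep error diff below makes control characters legible. A minimal standalone sketch:)

```rust
fn main() {
    // A control character that would be invisible or mangled in an error message.
    let chr = '\u{7}'; // ASCII BEL

    // Display prints the raw character, so the message shows nothing useful.
    println!("Literal '{}' not allowed.", chr);

    // Debug quotes and escapes it, printing: Literal '\u{7}' not allowed.
    println!("Literal {:?} not allowed.", chr);
}
```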
Andrew Gallant
e9ec52b7f9 Update walkdir 2016-09-16 17:56:44 -04:00
Andrew Gallant
0d14c74e63 Some minor performance tweaks.
This includes moving basename-only globs into separate regexes. The hope
is that if the regex processes less input, it will be faster.
2016-09-16 16:13:28 -04:00
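(The intuition: a basename-only glob such as `*.rs` never needs to see the directory part of a path, so its regex can be run against the file name alone. A hedged sketch of that idea; the pattern and names are illustrative, not taken from the repo.)

```rust
use std::path::Path;
use regex::Regex;

fn main() {
    // A glob like `*.rs` compiled to a regex that only describes a basename.
    let basename_re = Regex::new(r"^[^/]*\.rs$").unwrap();

    let path = Path::new("src/glob/mod.rs");

    // Instead of running the regex over the whole path, feed it only the
    // file name, so the regex scans far less input per path.
    let matched = path
        .file_name()
        .and_then(|n| n.to_str())
        .map_or(false, |name| basename_re.is_match(name));

    println!("{}: {}", path.display(), matched);
}
```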
Andrew Gallant
1c5884b2f9 try again... 2016-09-16 07:12:06 -04:00
Andrew Gallant
8203a80ac7 fix tests 2016-09-16 06:58:10 -04:00
Andrew Gallant
0e46171e3b Rework glob sets.
We try to reduce the pressure on regexes and offload some of it to
Aho-Corasick or exact lookups.
2016-09-15 22:06:04 -04:00
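(The glob matcher diff near the end of this compare shows how that offloading works: patterns that are really just an extension or a basename literal are answered with hash-map lookups, and only the leftovers fall back to a RegexSet. A simplified sketch of the routing, with illustrative names rather than the crate's real types:)

```rust
use std::collections::HashMap;

// A toy router: each glob is classified once at build time, so matching a path
// later mostly costs a couple of hash lookups instead of a regex scan.
struct GlobRouter {
    // "**/*.rs"        -> keyed by extension
    exts: HashMap<String, Vec<usize>>,
    // "**/Cargo.lock"  -> keyed by exact basename
    base_literals: HashMap<String, Vec<usize>>,
    // everything else would go to a RegexSet in the real matcher
    fallback: Vec<usize>,
}

impl GlobRouter {
    fn matches(&self, ext: Option<&str>, basename: &str) -> Vec<usize> {
        let mut hits = vec![];
        if let Some(ext) = ext {
            if let Some(ids) = self.exts.get(ext) {
                hits.extend(ids);
            }
        }
        if let Some(ids) = self.base_literals.get(basename) {
            hits.extend(ids);
        }
        // In the real code, the remaining patterns are checked with a RegexSet here.
        hits.extend(&self.fallback);
        hits.sort();
        hits
    }
}

fn main() {
    let mut exts = HashMap::new();
    exts.insert("rs".to_string(), vec![0]);
    let mut base_literals = HashMap::new();
    base_literals.insert("Cargo.lock".to_string(), vec![1]);
    let router = GlobRouter { exts, base_literals, fallback: vec![] };

    println!("{:?}", router.matches(Some("rs"), "main.rs")); // [0]
    println!("{:?}", router.matches(None, "Cargo.lock"));    // [1]
}
```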
Andrew Gallant
f5c85827ce Don't traverse directory stack if we don't need to. 2016-09-15 12:40:28 -04:00
Andrew Gallant
7cefc55238 Remove .agignore from ignore file list. 2016-09-15 12:40:08 -04:00
Andrew Gallant
92c918ebd9 --no-ignore implies --no-ignore-parent 2016-09-14 14:33:37 -04:00
Andrew Gallant
c24f8fd50f Replace crossbeam with deque.
deque appears faster.
2016-09-14 07:40:46 -04:00
Andrew Gallant
73272cf8a6 notice 2016-09-13 21:23:22 -04:00
Andrew Gallant
4212a8b9cb 0.1.1 2016-09-13 21:21:45 -04:00
Andrew Gallant
983c7fd6f9 We don't use thread_local any more, so remove it. 2016-09-13 21:21:36 -04:00
21 changed files with 1030 additions and 352 deletions

Cargo.lock generated

@@ -1,24 +1,24 @@
[root]
name = "ripgrep"
version = "0.1.0"
version = "0.1.2"
dependencies = [
"crossbeam 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)",
"deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"docopt 0.6.83 (registry+https://github.com/rust-lang/crates.io-index)",
"env_logger 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"fnv 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
"glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"grep 0.1.0",
"grep 0.1.1",
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"num_cpus 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.76 (registry+https://github.com/rust-lang/crates.io-index)",
"num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)",
"term 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)",
"thread_local 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
"walkdir 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"walkdir 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -31,9 +31,12 @@ dependencies = [
]
[[package]]
name = "crossbeam"
version = "0.2.10"
name = "deque"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rand 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "docopt"
@@ -41,20 +44,25 @@ version = "0.6.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.76 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)",
"strsim 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "env_logger"
version = "0.3.4"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.76 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "fnv"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "fs2"
version = "0.2.5"
@@ -72,12 +80,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "grep"
version = "0.1.0"
version = "0.1.1"
dependencies = [
"log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.76 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -126,7 +134,15 @@ dependencies = [
[[package]]
name = "num_cpus"
version = "1.0.0"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand"
version = "0.3.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.16 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -134,14 +150,14 @@ dependencies = [
[[package]]
name = "regex"
version = "0.1.76"
version = "0.1.77"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"simd 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"thread_local 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
"thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)",
"utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -185,7 +201,7 @@ dependencies = [
[[package]]
name = "thread_local"
version = "0.2.6"
version = "0.2.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -198,7 +214,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "walkdir"
version = "0.1.6"
version = "0.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -217,9 +233,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[metadata]
"checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66"
"checksum crossbeam 0.2.10 (registry+https://github.com/rust-lang/crates.io-index)" = "0c5ea215664ca264da8a9d9c3be80d2eaf30923c259d03e870388eb927508f97"
"checksum deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1614659040e711785ed8ea24219140654da1729f3ec8a47a9719d041112fe7bf"
"checksum docopt 0.6.83 (registry+https://github.com/rust-lang/crates.io-index)" = "fc42c6077823a361410c37d47c2535b73a190cbe10838dc4f400fe87c10c8c3b"
"checksum env_logger 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "82dcb9ceed3868a03b335657b85a159736c961900f7e7747d3b0b97b9ccb5ccb"
"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f"
"checksum fnv 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8e8af7b5408ab0c4910cad114c8f9eb454bf75df7afe8964307eeafb68a13a5e"
"checksum fs2 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "bcd414e5a1a979b931bb92f41b7a54106d3f6d2e6c253e9ce943b7cd468251ef"
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
@@ -228,16 +245,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054"
"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20"
"checksum memmap 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "f20f72ed93291a72e22e8b16bb18762183bb4943f0f483da5b8be1a9e8192752"
"checksum num_cpus 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a859041cbf7a70ea1ece4b87d1a2c6ef364dcb68749c88db1f97304b9ec09d5f"
"checksum regex 0.1.76 (registry+https://github.com/rust-lang/crates.io-index)" = "63b49f873f36ddc838d773972511e5fed2ef7350885af07d58e2f48ce8073dcd"
"checksum num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8890e6084723d57d0df8d2720b0d60c6ee67d6c93e7169630e4371e88765dcad"
"checksum rand 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "2791d88c6defac799c3f20d74f094ca33b9332612d9aef9078519c82e4fe04a5"
"checksum regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)" = "64b03446c466d35b42f2a8b203c8e03ed8b91c0f17b56e1f84f7210a257aa665"
"checksum regex-syntax 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "279401017ae31cf4e15344aa3f085d0e2e5c1e70067289ef906906fdbe92c8fd"
"checksum rustc-serialize 0.3.19 (registry+https://github.com/rust-lang/crates.io-index)" = "6159e4e6e559c81bd706afe9c8fd68f547d3e851ce12e76b1de7914bab61691b"
"checksum simd 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "63b5847c2d766ca7ce7227672850955802fabd779ba616aeabead4c2c3877023"
"checksum strsim 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "50c069df92e4b01425a8bf3576d5d417943a6a7272fbabaf5bd80b1aaa76442e"
"checksum term 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "3deff8a2b3b6607d6d7cc32ac25c0b33709453ca9cceac006caac51e963cf94a"
"checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03"
"checksum thread_local 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "55dd963dbaeadc08aa7266bf7f91c3154a7805e32bb94b820b769d2ef3b4744d"
"checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5"
"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f"
"checksum walkdir 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "d42144c31c9909882ce76e696b306b88a5b091721251137d5d522d1ef3da7cf9"
"checksum walkdir 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "c66c0b9792f0a765345452775f3adbd28dde9d33f30d13e5dcc5ae17cf6f3780"
"checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a"
"checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc"


@@ -1,6 +1,6 @@
[package]
name = "ripgrep"
version = "0.1.0" #:version
version = "0.1.2" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = """
Line oriented search tool using Rust's regex library. Combines the raw
@@ -23,10 +23,11 @@ name = "integration"
path = "tests/tests.rs"
[dependencies]
crossbeam = "0.2"
deque = "0.3"
docopt = "0.6"
env_logger = "0.3"
grep = { version = "0.1", path = "grep" }
fnv = "1.0"
grep = { version = "0.1.1", path = "grep" }
lazy_static = "0.2"
libc = "0.2"
log = "0.3"
@@ -36,7 +37,6 @@ num_cpus = "1"
regex = "0.1.76"
rustc-serialize = "0.3"
term = "0.4"
thread_local = "0.2"
walkdir = "0.1"
[target.'cfg(windows)'.dependencies]


@@ -1,14 +0,0 @@
all:
echo Nothing to do...
ctags:
ctags --options=ctags.rust --languages=Rust src/*.rs src/*/*.rs
docs:
cargo doc
in-dir ./target/doc fix-perms
rscp ./target/doc/* gopher:~/www/burntsushi.net/rustdoc/
push:
git push origin master
git push github master


@@ -1,3 +1,6 @@
**UNDER DEVELOPMENT.**
ripgrep (rg)
------------
ripgrep combines the usability of the silver searcher with the raw speed of grep.
ripgrep combines the usability of the silver searcher with the raw speed of
grep.

benches/README.md Normal file

@@ -0,0 +1,5 @@
These are internal microbenchmarks for tracking the performance of individual
components inside of ripgrep. At the moment, they aren't heavily used.
For performance benchmarks of ripgrep proper, see the sibling `benchsuite`
directory.


@@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
'''
benchsuite is a benchmark runner for comparing command line search tools.
@@ -10,6 +10,7 @@ import os
import os.path as path
from multiprocessing import cpu_count
import re
import shutil
import statistics
import subprocess
import sys
@@ -39,13 +40,23 @@ LINUX_CLONE = 'git://github.com/BurntSushi/linux'
GREP_ASCII = {'LC_ALL': 'C'}
GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'}
# Sift tries really hard to search everything by default. In our code search
# benchmarks, we don't want that.
SIFT = [
'sift',
'--binary-skip',
'--exclude-files', '.*',
'--exclude-files', '*.pdf',
]
def bench_linux_literal_default(suite_dir):
'''
Benchmark the speed of a literal using *default* settings.
This is a purposefully unfair benchmark for use in performance
analysis, but it is pedagogically useful.
analysis, but it is pedagogically useful to demonstrate how
default behaviors differ.
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
@@ -55,8 +66,6 @@ def bench_linux_literal_default(suite_dir):
kwargs['cwd'] = cwd
return Command(*args, **kwargs)
# N.B. This is a purposefully unfair benchmark for illustrative purposes
# of how the default modes for each search tool differ.
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', pat]),
mkcmd('ag', ['ag', pat]),
@@ -64,10 +73,12 @@ def bench_linux_literal_default(suite_dir):
# doesn't read gitignore files. Instead, it has a file whitelist
# that happens to match up exactly with the gitignores for this search.
mkcmd('ucg', ['ucg', pat]),
mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'C'}),
# I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the
# default, but I'd guess it to be on most desktop systems.
mkcmd('pt', ['pt', pat]),
# sift reports an extra line here for a binary file matched.
mkcmd('sift', ['sift', pat]),
mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}),
])
@@ -76,8 +87,9 @@ def bench_linux_literal(suite_dir):
Benchmark the speed of a literal, attempting to be fair.
This tries to use the minimum set of options available in all tools
to test how fast they are. For example, it makes sure there is no
case insensitive matching and that line numbers are computed.
to test how fast they are. For example, it makes sure there is
no case insensitive matching and that line numbers are computed
(because some tools don't permit disabling line numbers).
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
@@ -88,19 +100,16 @@ def bench_linux_literal(suite_dir):
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]),
mkcmd('ag', ['ag', '-s', pat]),
mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]),
mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
mkcmd('git grep', [
mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('rg (ignore) (mmap)', ['rg', '-n', '--mmap', pat]),
mkcmd('ag (ignore) (mmap)', ['ag', '-s', pat]),
mkcmd('pt (ignore)', ['pt', pat]),
mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
mkcmd('git grep (ignore)', [
'git', 'grep', '-I', '-n', pat,
], env={'LC_ALL': 'C'}),
mkcmd('pt', ['pt', pat]),
mkcmd('sift', [
'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
]),
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
])
@@ -120,23 +129,21 @@ def bench_linux_literal_casei(suite_dir):
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', '-i', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]),
mkcmd('rg-novcs-mmap', [
'rg', '--mmap', '--no-ignore', '-n', '-i', pat,
]),
mkcmd('ag', ['ag', '-i', pat]),
mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-i', pat]),
mkcmd('ucg', ['ucg', '-i', pat]),
mkcmd('git grep', [
mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
mkcmd('rg (ignore) (mmap)', ['rg', '-n', '-i', '--mmap', pat]),
mkcmd('ag (ignore) (mmap)', ['ag', '-i', pat]),
mkcmd('sift (ignore)', SIFT + ['-n', '-i', '--git', pat]),
# It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here,
# since that is certainly what ripgrep is doing, but this is for an
# ASCII literal, so we should give `git grep` all the opportunity to
# do its best.
mkcmd('git grep (ignore)', [
'git', 'grep', '-I', '-n', '-i', pat,
], env={'LC_ALL': 'C'}),
# sift yields more matches than it should here. Specifically, it gets
# matches in Module.symvers and System.map in the repo root. Both of
# those files show up in the repo root's .gitignore file.
mkcmd('sift', [
'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-i', pat,
mkcmd('rg (whitelist)', [
'rg', '-n', '-i', '--no-ignore', '-tall', pat,
]),
mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
])
@@ -156,20 +163,16 @@ def bench_linux_re_literal_suffix(suite_dir):
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
mkcmd('rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', pat]),
mkcmd('ag', ['ag', '-s', pat]),
mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', pat]),
mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('ag (ignore)', ['ag', '-s', pat]),
mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
mkcmd(
'git grep',
'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'},
),
mkcmd('sift', [
'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
]),
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
])
@@ -189,22 +192,18 @@ def bench_linux_word(suite_dir):
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', '-w', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-w', pat]),
mkcmd('rg-novcs-mmap', [
'rg', '--mmap', '--no-ignore', '-n', '-w', pat,
]),
mkcmd('ag', ['ag', '-s', '-w', pat]),
mkcmd('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', '-w', pat]),
mkcmd('ucg', ['ucg', '--nosmart-case', '-w', pat]),
mkcmd('rg (ignore)', ['rg', '-n', '-w', pat]),
mkcmd('ag (ignore)', ['ag', '-s', '-w', pat]),
mkcmd('sift (ignore)', SIFT + ['-n', '-w', '--git', pat]),
mkcmd(
'git grep',
'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', '-w', pat],
env={'LC_ALL': 'C'},
),
mkcmd('sift', [
'sift', '-n', '--binary-skip', '--exclude-files', '.*', '-w', pat,
mkcmd('rg (whitelist)', [
'rg', '-n', '-w', '--no-ignore', '-tall', pat,
]),
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', '-w', pat]),
])
@@ -212,7 +211,8 @@ def bench_linux_unicode_greek(suite_dir):
'''
Benchmark matching of a Unicode category.
Only three tools (ripgrep, sift and pt) support this.
Only three tools (ripgrep, sift and pt) support this. We omit
pt because it is too slow.
'''
require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR)
@@ -224,15 +224,7 @@ def bench_linux_unicode_greek(suite_dir):
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]),
# sift tries to search a bunch of PDF files and clutters up the
# results, even though --binary-skip is provided. They are excluded
# here explicitly, but don't have a measurable impact on performance.
mkcmd('sift', [
'sift', '-n', '--binary-skip',
'--exclude-files', '.*',
'--exclude-files', '*.pdf',
pat,
]),
mkcmd('sift', SIFT + ['-n', '--git', pat]),
])
@@ -252,15 +244,7 @@ def bench_linux_unicode_greek_casei(suite_dir):
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', '-i', pat]),
# sift tries to search a bunch of PDF files and clutters up the
# results, even though --binary-skip is provided. They are excluded
# here explicitly, but don't have a measurable impact on performance.
mkcmd('sift', [
'sift', '-n', '--binary-skip',
'--exclude-files', '.*',
'--exclude-files', '*.pdf',
pat,
]),
mkcmd('sift', SIFT + ['-n', '-i', '--git', pat]),
])
@@ -281,30 +265,25 @@ def bench_linux_unicode_word(suite_dir):
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]),
mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
mkcmd('rg-novcs-mmap', [
'rg', '--mmap', '--no-ignore', '-n', pat,
]),
mkcmd('ag (no Unicode)', ['ag', '-s', pat]),
mkcmd('ag-novcs (no Unicode)', [
'ag', '--skip-vcs-ignores', '-s', pat,
]),
mkcmd('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]),
mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', pat]),
mkcmd(
'git grep',
'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'en_US.UTF-8'},
),
mkcmd(
'git grep (no Unicode)',
'git grep (ignore) (ASCII)',
['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'},
),
mkcmd('sift (no Unicode)', [
'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
mkcmd('rg (whitelist) (ASCII)', [
'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
]),
mkcmd('ucg (ASCII)', ['ucg', '--nosmart-case', pat]),
])
@@ -326,30 +305,25 @@ def bench_linux_no_literal(suite_dir):
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]),
mkcmd('rg-whitelist', ['rg', '-tall', '--no-ignore', '-n', pat]),
mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]),
mkcmd('rg-whitelist (no Unicode)', [
'rg', '-tall', '--no-ignore', '-n', '(?-u)' + pat,
]),
mkcmd('ag (no Unicode)', ['ag', '-s', pat]),
mkcmd('ag-novcs (no Unicode)', [
'ag', '--skip-vcs-ignores', '-s', pat,
]),
mkcmd('ucg (no Unicode)', ['ucg', '--nosmart-case', pat]),
mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', pat]),
mkcmd(
'git grep',
'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'en_US.UTF-8'},
),
mkcmd(
'git grep (no Unicode)',
'git grep (ignore) (ASCII)',
['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'},
),
mkcmd('sift (no Unicode)', [
'sift', '-n', '--binary-skip', '--exclude-files', '.*', pat,
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
mkcmd('rg (whitelist) (ASCII)', [
'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
]),
mkcmd('ucg (whitelist) (ASCII)', ['ucg', '--nosmart-case', pat]),
])
@@ -371,21 +345,15 @@ def bench_linux_alternates(suite_dir):
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
mkcmd('rg-novcs-mmap', [
'rg', '--mmap', '--no-ignore', '-n', pat,
]),
mkcmd('ag', ['ag', '-s', pat]),
mkcmd('ag-novcs', [
'ag', '--skip-vcs-ignores', '-s', pat,
]),
mkcmd('ucg', ['ucg', '--nosmart-case', pat]),
mkcmd('rg (ignore)', ['rg', '-n', pat]),
mkcmd('ag (ignore)', ['ag', '-s', pat]),
mkcmd(
'git grep',
'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'},
),
mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', pat]),
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
])
@@ -400,21 +368,15 @@ def bench_linux_alternates_casei(suite_dir):
return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', '-i', pat]),
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', '-i', pat]),
mkcmd('rg-novcs-mmap', [
'rg', '--mmap', '--no-ignore', '-n', '-i', pat,
]),
mkcmd('ag', ['ag', '-i', pat]),
mkcmd('ag-novcs', [
'ag', '--skip-vcs-ignores', '-i', pat,
]),
mkcmd('ucg', ['ucg', '-i', pat]),
mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
mkcmd('ag (ignore)', ['ag', '-i', pat]),
mkcmd(
'git grep',
'git grep (ignore)',
['git', 'grep', '-E', '-I', '-n', '-i', pat],
env={'LC_ALL': 'C'},
),
mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', '-i', pat]),
mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
])
@@ -423,22 +385,159 @@ def bench_subtitles_en_literal(suite_dir):
Benchmark the speed of an ASCII string literal.
'''
require(suite_dir, 'subtitles-en')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = 'Sherlock Holmes'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]),
Command('rg (no line numbers)', ['rg', pat, ru]),
Command('ag', ['ag', '-s', pat, ru]),
Command('ucg', ['ucg', '--nosmart-case', pat, ru]),
Command('grep', ['grep', '-an', pat, ru], env=GREP_ASCII),
Command('grep (no line numbers)', [
'grep', '-a', pat, ru,
Command('rg', ['rg', pat, en]),
Command('pt', ['pt', '-N', pat, en]),
Command('sift', ['sift', pat, en]),
Command('grep', ['grep', '-a', pat, en], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', pat, en]),
Command('ag (lines)', ['ag', '-s', pat, en]),
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
Command('pt (lines)', ['pt', pat, en]),
Command('sift (lines)', ['sift', '-n', pat, en]),
Command('grep (lines)', ['grep', '-an', pat, en], env=GREP_ASCII),
])
def bench_subtitles_en_literal_casei(suite_dir):
'''
Benchmark the speed of a Unicode-y string case insensitively.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = 'Sherlock Holmes'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-i', pat, en]),
Command('grep', ['grep', '-ai', pat, en], env=GREP_UNICODE),
Command('grep (ASCII)', [
'grep', '-E', '-ai', pat, en,
], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', '-i', pat, en]),
Command('ag (lines) (ASCII)', ['ag', '-i', pat, en]),
Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, en]),
])
def bench_subtitles_en_literal_word(suite_dir):
'''
Benchmark the speed of finding a literal inside word boundaries.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = 'Sherlock Holmes'
return Benchmark(pattern=pat, commands=[
Command('rg (ASCII)', [
'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', en,
]),
Command('ag (ASCII)', ['ag', '-sw', pat, en]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
Command('grep (ASCII)', [
'grep', '-anw', pat, en,
], env=GREP_ASCII),
Command('rg', ['rg', '-nw', pat, en]),
Command('grep', ['grep', '-anw', pat, en], env=GREP_UNICODE),
])
def bench_subtitles_en_alternate(suite_dir):
'''
Benchmark the speed of a set of alternate literals.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = '|'.join([
'Sherlock Holmes',
'John Watson',
'Irene Adler',
'Inspector Lestrade',
'Professor Moriarty',
])
return Benchmark(pattern=pat, commands=[
Command('rg (lines)', ['rg', '-n', pat, en]),
Command('ag (lines)', ['ag', '-s', pat, en]),
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
Command('grep (lines)', [
'grep', '-E', '-an', pat, en,
], env=GREP_ASCII),
Command('rg', ['rg', pat, en]),
Command('grep', [
'grep', '-E', '-a', pat, en,
], env=GREP_ASCII),
])
def bench_subtitles_en_alternate_casei(suite_dir):
'''
Benchmark the speed of a set of alternate literals.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = '|'.join([
'Sherlock Holmes',
'John Watson',
'Irene Adler',
'Inspector Lestrade',
'Professor Moriarty',
])
return Benchmark(pattern=pat, commands=[
Command('ag (ASCII)', ['ag', '-s', '-i', pat, en]),
Command('ucg (ASCII)', ['ucg', '-i', pat, en]),
Command('grep (ASCII)', [
'grep', '-E', '-ani', pat, en,
], env=GREP_ASCII),
Command('rg', ['rg', '-n', '-i', pat, en]),
Command('grep', ['grep', '-E', '-ani', pat, en], env=GREP_UNICODE),
])
def bench_subtitles_en_surrounding_words(suite_dir):
'''
Benchmark a more complex regex with an inner literal.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = r'\w+\s+Holmes\s+\w+'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, en]),
Command('grep', ['grep', '-E', '-an', pat, en], env=GREP_UNICODE),
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
Command('ag (ASCII)', ['ag', '-s', pat, en]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
Command('grep (ASCII)', [
'grep', '-E', '-an', pat, en,
], env=GREP_ASCII),
])
def bench_subtitles_en_no_literal(suite_dir):
'''
Benchmark the speed of a regex with no literals.
Note that we don't even try to run grep with Unicode support
on this one. While it should eventually get the right answer,
I killed it after it had already been running for two minutes
and showed no signs of finishing soon.
'''
require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, en]),
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
Command('ag (ASCII)', ['ag', '-s', pat, en]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
Command('grep (ASCII)', [
'grep', '-E', '-an', pat, en,
], env=GREP_ASCII),
Command('pt', ['pt', pat, ru]),
Command('pt (no line numbers)', ['pt', '-N', pat, ru]),
Command('sift', ['sift', '-n', pat, ru]),
Command('sift (no line numbers)', ['sift', pat, ru]),
])
@@ -451,18 +550,16 @@ def bench_subtitles_ru_literal(suite_dir):
pat = 'Шерлок Холмс' # Sherlock Holmes
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]),
Command('rg (no line numbers)', ['rg', pat, ru]),
Command('ag', ['ag', '-s', pat, ru]),
Command('ucg', ['ucg', '--nosmart-case', pat, ru]),
Command('grep', ['grep', '-an', pat, ru], env=GREP_ASCII),
Command('grep (no line numbers)', [
'grep', '-a', pat, ru,
], env=GREP_ASCII),
Command('pt', ['pt', pat, ru]),
Command('pt (no line numbers)', ['pt', '-N', pat, ru]),
Command('sift', ['sift', '-n', pat, ru]),
Command('sift (no line numbers)', ['sift', pat, ru]),
Command('rg', ['rg', pat, ru]),
Command('pt', ['pt', '-N', pat, ru]),
Command('sift', ['sift', pat, ru]),
Command('grep', ['grep', '-a', pat, ru], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', pat, ru]),
Command('ag (lines)', ['ag', '-s', pat, ru]),
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
Command('pt (lines)', ['pt', pat, ru]),
Command('sift (lines)', ['sift', '-n', pat, ru]),
Command('grep (lines)', ['grep', '-an', pat, ru], env=GREP_ASCII),
])
@@ -475,13 +572,14 @@ def bench_subtitles_ru_literal_casei(suite_dir):
pat = 'Шерлок Холмс' # Sherlock Holmes
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', '-i', pat, ru]),
Command('ag (not Unicode)', ['ag', '-i', pat, ru]),
Command('ucg (not Unicode)', ['ucg', '-i', pat, ru]),
Command('grep', ['grep', '-ani', pat, ru], env=GREP_UNICODE),
Command('grep (not Unicode)', [
'grep', '-E', '-ani', pat, ru,
Command('rg', ['rg', '-i', pat, ru]),
Command('grep', ['grep', '-ai', pat, ru], env=GREP_UNICODE),
Command('grep (ASCII)', [
'grep', '-E', '-ai', pat, ru,
], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', '-i', pat, ru]),
Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]),
Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, ru]),
])
@@ -494,15 +592,15 @@ def bench_subtitles_ru_literal_word(suite_dir):
pat = 'Шерлок Холмс' # Sherlock Holmes
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-nw', pat, ru]),
Command('rg (not Unicode)', [
Command('rg (ASCII)', [
'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru,
]),
Command('ag (not Unicode)', ['ag', '-sw', pat, ru]),
Command('ucg (not Unicode)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (not Unicode)', [
Command('ag (ASCII)', ['ag', '-sw', pat, ru]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (ASCII)', [
'grep', '-anw', pat, ru,
], env=GREP_ASCII),
Command('rg', ['rg', '-nw', pat, ru]),
Command('grep', ['grep', '-anw', pat, ru], env=GREP_UNICODE),
])
@@ -522,11 +620,14 @@ def bench_subtitles_ru_alternate(suite_dir):
])
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]),
Command('rg (no line numbers)', ['rg', pat, ru]),
Command('ucg', ['ucg', '--nosmart-case', pat, ru]),
Command('grep', ['grep', '-E', '-an', pat, ru], env=GREP_ASCII),
Command('grep (no line numbers)', [
Command('rg (lines)', ['rg', '-n', pat, ru]),
Command('ag (lines)', ['ag', '-s', pat, ru]),
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (lines)', [
'grep', '-E', '-an', pat, ru,
], env=GREP_ASCII),
Command('rg', ['rg', pat, ru]),
Command('grep', [
'grep', '-E', '-a', pat, ru,
], env=GREP_ASCII),
])
@@ -547,12 +648,32 @@ def bench_subtitles_ru_alternate_casei(suite_dir):
])
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', '-i', pat, ru]),
Command('ucg (not Unicode)', ['ucg', '-i', pat, ru]),
Command('grep', ['grep', '-E', '-ani', pat, ru], env=GREP_UNICODE),
Command('grep (not Unicode)', [
Command('ag (ASCII)', ['ag', '-s', '-i', pat, ru]),
Command('ucg (ASCII)', ['ucg', '-i', pat, ru]),
Command('grep (ASCII)', [
'grep', '-E', '-ani', pat, ru,
], env=GREP_ASCII),
Command('rg', ['rg', '-n', '-i', pat, ru]),
Command('grep', ['grep', '-E', '-ani', pat, ru], env=GREP_UNICODE),
])
def bench_subtitles_ru_surrounding_words(suite_dir):
'''
Benchmark a more complex regex with an inner literal.
'''
require(suite_dir, 'subtitles-en')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = r'\w+\s+Холмс\s+\w+'
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]),
Command('grep', ['grep', '-E', '-an', pat, ru], env=GREP_UNICODE),
Command('ag (ASCII)', ['ag', '-s', pat, ru]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (ASCII)', [
'grep', '-E', '-an', pat, ru,
], env=GREP_ASCII),
])
@@ -571,9 +692,10 @@ def bench_subtitles_ru_no_literal(suite_dir):
return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]),
Command('rg (no line numbers)', ['rg', pat, ru]),
Command('ucg (no Unicode)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (no Unicode)', [
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]),
Command('ag (ASCII)', ['ag', '-s', pat, ru]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (ASCII)', [
'grep', '-E', '-an', pat, ru,
], env=GREP_ASCII),
])
@@ -597,6 +719,23 @@ class MissingDependencies(Exception):
return 'MissingDependency(%s)' % repr(self.missing_names)
class MissingCommands(Exception):
'''
A missing command exception.
This exception occurs when running a command in a benchmark
where the command could not be found on the current system.
:ivar list(str) missing_names:
The names of the command binaries that could not be found.
'''
def __init__(self, missing_names):
self.missing_names = sorted(set(missing_names))
def __str__(self):
return 'MissingCommands(%s)' % repr(self.missing_names)
class Benchmark(object):
'''
A single benchmark corresponding to a grouping of commands.
@@ -606,7 +745,8 @@ class Benchmark(object):
'''
def __init__(self, name=None, pattern=None, commands=None,
warmup_count=1, count=3, line_count=True):
warmup_count=1, count=3, line_count=True,
allow_missing_commands=False):
'''
Create a single benchmark.
@@ -644,15 +784,37 @@ class Benchmark(object):
self.warmup_count = warmup_count
self.count = count
self.line_count = line_count
self.allow_missing_commands = allow_missing_commands
def raise_if_missing(self):
'''
Raises a MissingCommands exception if applicable.
A MissingCommands exception is raised when the following
criteria are met: 1) allow_missing_commands is False, and 2) at
least one command in this benchmark could not be found on this
system.
'''
missing_commands = \
[c.binary_name for c in self.commands if not c.exists()]
if not self.allow_missing_commands and len(missing_commands) > 0:
raise MissingCommands(missing_commands)
def run(self):
'''
Runs this benchmark and returns the results.
:rtype: Result
:raises:
MissingCommands if any command doesn't exist.
(Unless allow_missing_commands is enabled.)
'''
self.raise_if_missing()
result = Result(self)
for cmd in self.commands:
if self.allow_missing_commands and not cmd.exists():
# Skip this command if we're OK with it.
continue
# Do a warmup first.
for _ in range(self.warmup_count):
self.run_one(cmd)
@@ -677,6 +839,8 @@ class Benchmark(object):
it is the number of lines in the search output.
:rtype: int
'''
if not cmd.exists():
raise MissingCommand(cmd.cmd[0])
cmd.kwargs['stderr'] = subprocess.DEVNULL
if self.line_count:
cmd.kwargs['stdout'] = subprocess.PIPE
@@ -746,6 +910,8 @@ class Result(object):
means = []
for cmd in self.benchmark.commands:
mean, _ = self.distribution_for(cmd)
if mean is None:
continue
means.append((cmd, mean))
return min(means, key=lambda tup: tup[1])[0]
@@ -768,16 +934,18 @@ class Result(object):
'''
Returns the distribution (mean +/- std) of the given command.
If there are no samples for this command (i.e., it was skipped),
then return ``(None, None)``.
:rtype: (float, float)
:returns:
A tuple containing the mean and standard deviation, in that
order.
'''
mean = statistics.mean(
s['duration'] for s in self.samples_for(cmd))
stdev = statistics.stdev(
s['duration'] for s in self.samples_for(cmd))
return mean, stdev
samples = list(s['duration'] for s in self.samples_for(cmd))
if len(samples) == 0:
return None, None
return statistics.mean(samples), statistics.stdev(samples)
class Command(object):
@@ -807,6 +975,15 @@ class Command(object):
self.args = args
self.kwargs = kwargs
def exists(self):
'Returns true if and only if this command exists.'
return shutil.which(self.binary_name) is not None
@property
def binary_name(self):
'Return the binary name of this command.'
return self.cmd[0]
def run(self):
'''
Runs this command and returns its status.
@@ -947,7 +1124,8 @@ def download(suite_dir, choices):
sys.exit(1)
def collect_benchmarks(suite_dir, filter_pat=None):
def collect_benchmarks(suite_dir, filter_pat=None,
allow_missing_commands=False):
'''
Return an iterable of all runnable benchmarks.
@@ -969,6 +1147,9 @@ def collect_benchmarks(suite_dir, filter_pat=None):
continue
try:
benchmark = globals()[fun](suite_dir)
benchmark.name = name
benchmark.allow_missing_commands = allow_missing_commands
benchmark.raise_if_missing()
except MissingDependencies as e:
eprint(
'missing: %s, skipping benchmark %s (try running with: %s)' % (
@@ -976,24 +1157,32 @@ def collect_benchmarks(suite_dir, filter_pat=None):
name,
' '.join(['--download %s' % n for n in e.missing_names]),
))
except MissingCommands as e:
fmt = 'missing commands: %s, skipping benchmark %s ' \
'(run with --allow-missing to run incomplete benchmarks)'
eprint(fmt % (', '.join(e.missing_names), name))
continue
benchmark.name = name
yield benchmark
def main():
download_choices = ['all', 'linux', 'subtitles-en', 'subtitles-ru']
p = argparse.ArgumentParser('Command line search tool benchmark suite.')
p.add_argument(
'--dir', metavar='PATH', default=os.getcwd(),
help='The directory in which to download data and perform searches.')
p.add_argument(
'--download', metavar='CORPUS', action='append',
choices=['all', 'linux', 'subtitles-en', 'subtitles-ru'],
choices=download_choices,
help='Download and prepare corpus data, then exit without running '
'any benchmarks. Note that this command is intended to be '
'idempotent. WARNING: This downloads over a gigabyte of data, '
'and also includes building the Linux kernel. If "all" is used '
'then the total uncompressed size is around 13 GB.')
'then the total uncompressed size is around 13 GB. '
'Choices: %s' % ', '.join(download_choices))
p.add_argument(
'--allow-missing', action='store_true',
help='Permit benchmarks to run even if some commands are missing.')
p.add_argument(
'-f', '--force', action='store_true',
help='Overwrite existing files if there is a conflict.')
@@ -1009,6 +1198,13 @@ def main():
help='A regex pattern that will only run benchmarks that match.')
args = p.parse_args()
if args.list:
benchmarks = collect_benchmarks(
args.dir, filter_pat=args.bench,
allow_missing_commands=args.allow_missing)
for b in benchmarks:
print(b.name)
sys.exit(0)
if args.download is not None and len(args.download) > 0:
download(args.dir, args.download)
sys.exit(0)
@@ -1028,7 +1224,9 @@ def main():
raw_csv_wtr = csv.DictWriter(raw_handle, fields)
raw_csv_wtr.writerow({x: x for x in fields})
benchmarks = collect_benchmarks(args.dir, filter_pat=args.bench)
benchmarks = collect_benchmarks(
args.dir, filter_pat=args.bench,
allow_missing_commands=args.allow_missing)
for i, b in enumerate(benchmarks):
result = b.run()
fastest_cmd = result.fastest_cmd()
@@ -1042,6 +1240,12 @@ def main():
for cmd in b.commands:
name = cmd.name
mean, stdev = result.distribution_for(cmd)
if mean is None:
# If we couldn't get a distribution for this command then
# it was skipped.
print('{name:{pad}} SKIPPED'.format(
name=name, pad=max_name_len + 2))
continue
line_counts = result.line_counts_for(cmd)
show_fast_cmd, show_line_counts = '', ''
if fastest_cmd.name == cmd.name:


@@ -1,11 +0,0 @@
--langdef=Rust
--langmap=Rust:.rs
--regex-Rust=/^[ \t]*(#\[[^\]]\][ \t]*)*(pub[ \t]+)?(extern[ \t]+)?("[^"]+"[ \t]+)?(unsafe[ \t]+)?fn[ \t]+([a-zA-Z0-9_]+)/\6/f,functions,function definitions/
--regex-Rust=/^[ \t]*(pub[ \t]+)?type[ \t]+([a-zA-Z0-9_]+)/\2/T,types,type definitions/
--regex-Rust=/^[ \t]*(pub[ \t]+)?enum[ \t]+([a-zA-Z0-9_]+)/\2/g,enum,enumeration names/
--regex-Rust=/^[ \t]*(pub[ \t]+)?struct[ \t]+([a-zA-Z0-9_]+)/\2/s,structure names/
--regex-Rust=/^[ \t]*(pub[ \t]+)?mod[ \t]+([a-zA-Z0-9_]+)/\2/m,modules,module names/
--regex-Rust=/^[ \t]*(pub[ \t]+)?static[ \t]+([a-zA-Z0-9_]+)/\2/c,consts,static constants/
--regex-Rust=/^[ \t]*(pub[ \t]+)?trait[ \t]+([a-zA-Z0-9_]+)/\2/t,traits,traits/
--regex-Rust=/^[ \t]*(pub[ \t]+)?impl([ \t\n]+<.*>)?[ \t]+([a-zA-Z0-9_]+)/\3/i,impls,trait implementations/
--regex-Rust=/^[ \t]*macro_rules![ \t]+([a-zA-Z0-9_]+)/\1/d,macros,macro definitions/


@@ -1,6 +1,6 @@
[package]
name = "grep"
version = "0.1.0" #:version
version = "0.1.1" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = """
Fast line oriented regex searching as a library.


@@ -62,7 +62,7 @@ impl fmt::Display for Error {
match *self {
Error::Regex(ref err) => err.fmt(f),
Error::LiteralNotAllowed(chr) => {
write!(f, "Literal '{}' not allowed.", chr)
write!(f, "Literal {:?} not allowed.", chr)
}
Error::__Nonexhaustive => unreachable!(),
}


@@ -10,6 +10,10 @@ use {Error, Result};
/// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this
/// function panics.
pub fn remove(expr: Expr, byte: u8) -> Result<Expr> {
// TODO(burntsushi): There is a bug in this routine where only `\n` is
// handled correctly. Namely, `AnyChar` and `AnyByte` need to be translated
// to proper character classes instead of the special `AnyCharNoNL` and
// `AnyByteNoNL` classes.
use syntax::Expr::*;
assert!(byte <= 0x7F);
let chr = byte as char;


@@ -1 +0,0 @@
au BufWritePost *.rs silent!make ctags > /dev/null 2>&1


@@ -124,6 +124,7 @@ Less common options:
--no-ignore
Don't respect ignore files (.gitignore, .rgignore, etc.)
This implies --no-ignore-parent.
--no-ignore-parent
Don't respect ignore files in parent directories.
@@ -338,7 +339,9 @@ impl RawArgs {
line_number: !self.flag_no_line_number && self.flag_line_number,
mmap: mmap,
no_ignore: self.flag_no_ignore,
no_ignore_parent: self.flag_no_ignore_parent,
no_ignore_parent:
// --no-ignore implies --no-ignore-parent
self.flag_no_ignore_parent || self.flag_no_ignore,
quiet: self.flag_quiet,
replace: self.flag_replace.clone().map(|s| s.into_bytes()),
text: self.flag_text,


@@ -21,6 +21,7 @@ additional rules such as whitelists (prefix of `!`) or directory-only globs
// TODO(burntsushi): Implement something similar, but for Mercurial. We can't
// use this exact implementation because hgignore files are different.
use std::cell::RefCell;
use std::error::Error as StdError;
use std::fmt;
use std::fs::File;
@@ -30,6 +31,7 @@ use std::path::{Path, PathBuf};
use regex;
use glob;
use pathutil::strip_prefix;
/// Represents an error that can occur when parsing a gitignore file.
#[derive(Debug)]
@@ -110,37 +112,37 @@ impl Gitignore {
/// same directory as this gitignore file.
pub fn matched<P: AsRef<Path>>(&self, path: P, is_dir: bool) -> Match {
let mut path = path.as_ref();
if let Ok(p) = path.strip_prefix(&self.root) {
if let Some(p) = strip_prefix("./", path) {
path = p;
}
self.matched_utf8(&*path.to_string_lossy(), is_dir)
if let Some(p) = strip_prefix(&self.root, path) {
path = p;
}
self.matched_stripped(path, is_dir)
}
/// Like matched, but takes a path that has already been stripped and
/// converted to UTF-8.
pub fn matched_utf8(&self, path: &str, is_dir: bool) -> Match {
// A single regex with a bunch of alternations of glob patterns is
// unfortunately typically faster than a regex set, so we use it as a
// first pass filter. We still need to run the RegexSet to get the most
// recently defined glob that matched.
if !self.set.is_match(path) {
return Match::None;
}
// The regex set can't actually pick the right glob that matched all
// on its own. In particular, some globs require that only directories
// can match. Thus, only accept a match from the regex set if the given
// path satisfies the corresponding glob's directory criteria.
for i in self.set.matches(path).iter().rev() {
let pat = &self.patterns[i];
if !pat.only_dir || is_dir {
return if pat.whitelist {
Match::Whitelist(pat)
} else {
Match::Ignored(pat)
};
/// Like matched, but takes a path that has already been stripped.
pub fn matched_stripped(&self, path: &Path, is_dir: bool) -> Match {
thread_local! {
static MATCHES: RefCell<Vec<usize>> = {
RefCell::new(vec![])
}
}
Match::None
};
MATCHES.with(|matches| {
let mut matches = matches.borrow_mut();
self.set.matches_into(path, &mut *matches);
for &i in matches.iter().rev() {
let pat = &self.patterns[i];
if !pat.only_dir || is_dir {
return if pat.whitelist {
Match::Whitelist(pat)
} else {
Match::Ignored(pat)
};
}
}
Match::None
})
}
/// Returns the total number of ignore patterns.
@@ -390,6 +392,7 @@ mod tests {
ignored!(ig23, ROOT, "foo", "./foo");
ignored!(ig24, ROOT, "target", "grep/target");
ignored!(ig25, ROOT, "Cargo.lock", "./tabwriter-bin/Cargo.lock");
ignored!(ig26, ROOT, "/foo/bar/baz", "./foo/bar/baz");
not_ignored!(ignot1, ROOT, "amonths", "months");
not_ignored!(ignot2, ROOT, "monthsa", "months");


@@ -26,13 +26,22 @@ to make its way into `glob` proper.
// at the .gitignore for the chromium repo---just about every pattern satisfies
// that assumption.)
use std::borrow::Cow;
use std::collections::HashMap;
use std::error::Error as StdError;
use std::ffi::{OsStr, OsString};
use std::fmt;
use std::hash;
use std::iter;
use std::path::Path;
use std::str;
use fnv;
use regex;
use regex::bytes::{Regex, RegexSet, SetMatches};
use regex::bytes::Regex;
use regex::bytes::RegexSet;
use pathutil::file_name;
/// Represents an error that can occur when parsing a glob pattern.
#[derive(Clone, Debug, Eq, PartialEq)]
@@ -71,33 +80,181 @@ impl fmt::Display for Error {
}
}
/// SetYesNo represents a group of globs that can be matched together in a
/// single pass. SetYesNo can only determine whether a particular path matched
/// any pattern in the set.
#[derive(Clone, Debug)]
pub struct SetYesNo {
re: Regex,
}
impl SetYesNo {
/// Returns true if and only if the given path matches at least one glob
/// in this set.
pub fn is_match<T: AsRef<Path>>(&self, path: T) -> bool {
self.re.is_match(&*path_bytes(path.as_ref()))
}
fn new(
pats: &[(Pattern, MatchOptions)],
) -> Result<SetYesNo, regex::Error> {
let mut joined = String::new();
for &(ref p, ref o) in pats {
let part = format!("(?:{})", p.to_regex_with(o));
if !joined.is_empty() {
joined.push('|');
}
joined.push_str(&part);
}
Ok(SetYesNo { re: try!(Regex::new(&joined)) })
}
}
type Fnv = hash::BuildHasherDefault<fnv::FnvHasher>;
/// Set represents a group of globs that can be matched together in a single
/// pass.
#[derive(Clone, Debug)]
pub struct Set {
re: Regex,
set: RegexSet,
yesno: SetYesNo,
exts: HashMap<OsString, Vec<usize>, Fnv>,
literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
base_literals: HashMap<Vec<u8>, Vec<usize>, Fnv>,
base_prefixes: Vec<Vec<u8>>,
base_prefixes_map: Vec<usize>,
base_suffixes: Vec<Vec<u8>>,
base_suffixes_map: Vec<usize>,
base_regexes: RegexSet,
base_regexes_map: Vec<usize>,
regexes: RegexSet,
regexes_map: Vec<usize>,
}
impl Set {
/// Returns true if and only if the given path matches at least one glob
/// in this set.
pub fn is_match<T: AsRef<[u8]>>(&self, path: T) -> bool {
self.re.is_match(path.as_ref())
}
/// Returns every glob pattern (by sequence number) that matches the given
/// path.
pub fn matches<T: AsRef<[u8]>>(&self, path: T) -> SetMatches {
// TODO(burntsushi): If we split this out into a separate crate, don't
// expose the regex::SetMatches type in the public API.
self.set.matches(path.as_ref())
}
/// Returns the number of glob patterns in this set.
/// Returns the sequence number of every glob pattern that matches the
/// given path.
#[allow(dead_code)]
pub fn len(&self) -> usize {
self.set.len()
pub fn matches<T: AsRef<Path>>(&self, path: T) -> Vec<usize> {
let mut into = vec![];
self.matches_into(path, &mut into);
into
}
/// Adds the sequence number of every glob pattern that matches the given
/// path to the vec given.
pub fn matches_into<T: AsRef<Path>>(
&self,
path: T,
into: &mut Vec<usize>,
) {
into.clear();
let path = path.as_ref();
let path_bytes = &*path_bytes(path);
let basename = file_name(path).map(|b| os_str_bytes(b));
if !self.yesno.is_match(path) {
return;
}
if !self.exts.is_empty() {
if let Some(ext) = path.extension() {
if let Some(matches) = self.exts.get(ext) {
into.extend(matches.as_slice());
}
}
}
if !self.literals.is_empty() {
if let Some(matches) = self.literals.get(path_bytes) {
into.extend(matches.as_slice());
}
}
if !self.base_literals.is_empty() {
if let Some(ref basename) = basename {
if let Some(matches) = self.base_literals.get(&**basename) {
into.extend(matches.as_slice());
}
}
}
if !self.base_prefixes.is_empty() {
if let Some(ref basename) = basename {
let basename = &**basename;
for (i, pre) in self.base_prefixes.iter().enumerate() {
if pre.len() <= basename.len() && &**pre == &basename[0..pre.len()] {
into.push(self.base_prefixes_map[i]);
}
}
}
}
if !self.base_suffixes.is_empty() {
if let Some(ref basename) = basename {
let basename = &**basename;
for (i, suf) in self.base_suffixes.iter().enumerate() {
if suf.len() > basename.len() {
continue;
}
let (s, e) = (basename.len() - suf.len(), basename.len());
if &**suf == &basename[s..e] {
into.push(self.base_suffixes_map[i]);
}
}
}
}
if let Some(ref basename) = basename {
for i in self.base_regexes.matches(&**basename) {
into.push(self.base_regexes_map[i]);
}
}
for i in self.regexes.matches(path_bytes) {
into.push(self.regexes_map[i]);
}
into.sort();
}
fn new(pats: &[(Pattern, MatchOptions)]) -> Result<Set, regex::Error> {
let fnv = Fnv::default();
let mut exts = HashMap::with_hasher(fnv.clone());
let mut literals = HashMap::with_hasher(fnv.clone());
let mut base_literals = HashMap::with_hasher(fnv.clone());
let (mut base_prefixes, mut base_prefixes_map) = (vec![], vec![]);
let (mut base_suffixes, mut base_suffixes_map) = (vec![], vec![]);
let (mut regexes, mut regexes_map) = (vec![], vec![]);
let (mut base_regexes, mut base_regexes_map) = (vec![], vec![]);
for (i, &(ref p, ref o)) in pats.iter().enumerate() {
if let Some(ext) = p.ext() {
exts.entry(ext).or_insert(vec![]).push(i);
} else if let Some(literal) = p.literal() {
literals.entry(literal.into_bytes()).or_insert(vec![]).push(i);
} else if let Some(literal) = p.base_literal() {
base_literals
.entry(literal.into_bytes()).or_insert(vec![]).push(i);
} else if let Some(literal) = p.base_literal_prefix() {
base_prefixes.push(literal.into_bytes());
base_prefixes_map.push(i);
} else if let Some(literal) = p.base_literal_suffix() {
base_suffixes.push(literal.into_bytes());
base_suffixes_map.push(i);
} else if p.is_only_basename() {
let part = format!("(?:{})", p.to_regex_with(o));
base_regexes.push(part);
base_regexes_map.push(i);
} else {
let part = format!("(?:{})", p.to_regex_with(o));
regexes.push(part);
regexes_map.push(i);
}
}
Ok(Set {
yesno: try!(SetYesNo::new(pats)),
exts: exts,
literals: literals,
base_literals: base_literals,
base_prefixes: base_prefixes,
base_prefixes_map: base_prefixes_map,
base_suffixes: base_suffixes,
base_suffixes_map: base_suffixes_map,
base_regexes: try!(RegexSet::new(base_regexes)),
base_regexes_map: base_regexes_map,
regexes: try!(RegexSet::new(regexes)),
regexes_map: regexes_map,
})
}
}
@@ -119,19 +276,12 @@ impl SetBuilder {
///
/// Once a matcher is built, no new patterns can be added to it.
pub fn build(&self) -> Result<Set, regex::Error> {
let it = self.pats.iter().map(|&(ref p, ref o)| p.to_regex_with(o));
let set = try!(RegexSet::new(it));
Set::new(&self.pats)
}
let mut joined = String::new();
for &(ref p, ref o) in &self.pats {
let part = format!("(?:{})", p.to_regex_with(o));
if !joined.is_empty() {
joined.push('|');
}
joined.push_str(&part);
}
let re = try!(Regex::new(&joined));
Ok(Set { re: re, set: set })
/// Like `build`, but returns a matcher that can only answer yes/no.
pub fn build_yesno(&self) -> Result<SetYesNo, regex::Error> {
SetYesNo::new(&self.pats)
}
/// Add a new pattern to this set.
@@ -149,8 +299,21 @@ impl SetBuilder {
pat: &str,
opts: &MatchOptions,
) -> Result<(), Error> {
let pat = try!(Pattern::new(pat));
self.pats.push((pat, opts.clone()));
let parsed = try!(Pattern::new(pat));
// if let Some(ext) = parsed.ext() {
// eprintln!("ext :: {:?} :: {:?}", ext, pat);
// } else if let Some(lit) = parsed.literal() {
// eprintln!("literal :: {:?} :: {:?}", lit, pat);
// } else if let Some(lit) = parsed.base_literal() {
// eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
// } else if let Some(lit) = parsed.base_literal_prefix() {
// eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
// } else if let Some(lit) = parsed.base_literal_suffix() {
// eprintln!("base_literal :: {:?} :: {:?}", lit, pat);
// } else {
// eprintln!("regex :: {:?} :: {:?}", pat, parsed);
// }
self.pats.push((parsed, opts.clone()));
Ok(())
}
}
@@ -204,6 +367,133 @@ impl Pattern {
Ok(p.p)
}
/// Returns an extension if this pattern exclusively matches it.
pub fn ext(&self) -> Option<OsString> {
if self.tokens.len() <= 3 {
return None;
}
match self.tokens.get(0) {
Some(&Token::RecursivePrefix) => {}
_ => return None,
}
match self.tokens.get(1) {
Some(&Token::ZeroOrMore) => {}
_ => return None,
}
match self.tokens.get(2) {
Some(&Token::Literal(c)) if c == '.' => {}
_ => return None,
}
let mut lit = OsString::new();
for t in self.tokens[3..].iter() {
match *t {
Token::Literal(c) if c == '/' || c == '\\' || c == '.' => {
return None;
}
Token::Literal(c) => lit.push(c.to_string()),
_ => return None,
}
}
Some(lit)
}
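Concretely, `ext` only fires for patterns shaped like `**/*.{ext}` (the `ext!(ext1, ...)` test below exercises exactly this); a second `.`, or any `/` or `\`, after the leading `**/*.` makes the pattern fall through to the other buckets. A sketch, assuming the module's test context:

#[test]
fn ext_sketch() {
    use std::ffi::OsString;
    // "**/*.rs" parses to [RecursivePrefix, ZeroOrMore, Literal('.'), Literal('r'), Literal('s')].
    assert_eq!(Some(OsString::from("rs")), Pattern::new("**/*.rs").unwrap().ext());
    // The extra '.' in "**/*.tar.gz" disqualifies it as a plain extension pattern.
    assert_eq!(None, Pattern::new("**/*.tar.gz").unwrap().ext());
}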
/// Returns the pattern as a literal if and only if the pattern exclusively
/// matches the basename of a file path *and* is a literal.
///
/// The basic format of these patterns is `**/{literal}`, where `{literal}`
/// does not contain a path separator.
pub fn base_literal(&self) -> Option<String> {
match self.tokens.get(0) {
Some(&Token::RecursivePrefix) => {}
_ => return None,
}
let mut lit = String::new();
for t in &self.tokens[1..] {
match *t {
Token::Literal(c) if c == '/' || c == '\\' => return None,
Token::Literal(c) => lit.push(c),
_ => return None,
}
}
Some(lit)
}
/// Returns true if and only if this pattern only inspects the basename
/// of a path.
pub fn is_only_basename(&self) -> bool {
match self.tokens.get(0) {
Some(&Token::RecursivePrefix) => {}
_ => return false,
}
for t in &self.tokens[1..] {
match *t {
Token::Literal(c) if c == '/' || c == '\\' => return false,
Token::RecursivePrefix
| Token::RecursiveSuffix
| Token::RecursiveZeroOrMore => return false,
_ => {}
}
}
true
}
/// Returns the pattern as a literal if and only if the pattern must match
/// an entire path exactly.
///
/// The basic format of these patterns is `{literal}`.
pub fn literal(&self) -> Option<String> {
let mut lit = String::new();
for t in &self.tokens {
match *t {
Token::Literal(c) => lit.push(c),
_ => return None,
}
}
Some(lit)
}
/// Returns a basename literal prefix of this pattern.
pub fn base_literal_prefix(&self) -> Option<String> {
match self.tokens.get(0) {
Some(&Token::RecursivePrefix) => {}
_ => return None,
}
match self.tokens.last() {
Some(&Token::ZeroOrMore) => {}
_ => return None,
}
let mut lit = String::new();
for t in &self.tokens[1..self.tokens.len()-1] {
match *t {
Token::Literal(c) if c == '/' || c == '\\' => return None,
Token::Literal(c) => lit.push(c),
_ => return None,
}
}
Some(lit)
}
/// Returns a basename literal suffix of this pattern.
pub fn base_literal_suffix(&self) -> Option<String> {
match self.tokens.get(0) {
Some(&Token::RecursivePrefix) => {}
_ => return None,
}
match self.tokens.get(1) {
Some(&Token::ZeroOrMore) => {}
_ => return None,
}
let mut lit = String::new();
for t in &self.tokens[2..] {
match *t {
Token::Literal(c) if c == '/' || c == '\\' => return None,
Token::Literal(c) => lit.push(c),
_ => return None,
}
}
Some(lit)
}
/// Convert this pattern to a string that is guaranteed to be a valid
/// regular expression and will represent the matching semantics of this
/// glob pattern. This uses a default set of options.
@@ -415,13 +705,34 @@ impl<'a> Parser<'a> {
}
}
fn path_bytes(path: &Path) -> Cow<[u8]> {
os_str_bytes(path.as_os_str())
}
#[cfg(unix)]
fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
use std::os::unix::ffi::OsStrExt;
Cow::Borrowed(s.as_bytes())
}
#[cfg(not(unix))]
fn os_str_bytes(s: &OsStr) -> Cow<[u8]> {
// TODO(burntsushi): On Windows, OS strings are probably UTF-16, so even
// if we could get at the raw bytes, they wouldn't be useful. We *must*
// convert to UTF-8 before doing path matching. Unfortunate, but necessary.
match s.to_string_lossy() {
Cow::Owned(s) => Cow::Owned(s.into_bytes()),
Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()),
}
}
#[cfg(test)]
mod tests {
use std::path::Path;
use regex::bytes::Regex;
use super::{Error, Pattern, MatchOptions, SetBuilder, Token};
use super::{Error, Pattern, MatchOptions, Set, SetBuilder, Token};
use super::Token::*;
macro_rules! syntax {
@@ -483,14 +794,42 @@ mod tests {
let pat = Pattern::new($pat).unwrap();
let path = &Path::new($path).to_str().unwrap();
let re = Regex::new(&pat.to_regex_with(&$options)).unwrap();
// println!("PATTERN: {}", $pat);
// println!("REGEX: {:?}", re);
// println!("PATH: {}", path);
assert!(!re.is_match(path.as_bytes()));
}
};
}
macro_rules! ext {
($name:ident, $pat:expr, $ext:expr) => {
#[test]
fn $name() {
let pat = Pattern::new($pat).unwrap();
let ext = pat.ext().map(|e| e.to_string_lossy().into_owned());
assert_eq!($ext, ext.as_ref().map(|s| &**s));
}
};
}
macro_rules! baseliteral {
($name:ident, $pat:expr, $yes:expr) => {
#[test]
fn $name() {
let pat = Pattern::new($pat).unwrap();
assert_eq!($yes, pat.base_literal().is_some());
}
};
}
macro_rules! basesuffix {
($name:ident, $pat:expr, $yes:expr) => {
#[test]
fn $name() {
let pat = Pattern::new($pat).unwrap();
assert_eq!($yes, pat.is_literal_suffix());
}
};
}
fn class(s: char, e: char) -> Token {
Class { negated: false, ranges: vec![(s, e)] }
}
@@ -585,6 +924,26 @@ mod tests {
toregex!(re10, "+", r"^\+$");
toregex!(re11, "**", r"^.*$");
ext!(ext1, "**/*.rs", Some("rs"));
baseliteral!(lit1, "**", true);
baseliteral!(lit2, "**/a", true);
baseliteral!(lit3, "**/ab", true);
baseliteral!(lit4, "**/a*b", false);
baseliteral!(lit5, "z/**/a*b", false);
baseliteral!(lit6, "[ab]", false);
baseliteral!(lit7, "?", false);
/*
issuffix!(suf1, "", false);
issuffix!(suf2, "a", true);
issuffix!(suf3, "ab", true);
issuffix!(suf4, "*ab", true);
issuffix!(suf5, "*.ab", true);
issuffix!(suf6, "?.ab", true);
issuffix!(suf7, "ab*", false);
*/
matches!(match1, "a", "a");
matches!(match2, "a*b", "a_b");
matches!(match3, "a*b*c", "abc");
@@ -681,16 +1040,22 @@ mod tests {
builder.add("src/lib.rs").unwrap();
let set = builder.build().unwrap();
assert!(set.is_match("foo.c"));
assert!(set.is_match("src/foo.c"));
assert!(!set.is_match("foo.rs"));
assert!(!set.is_match("tests/foo.rs"));
assert!(set.is_match("src/foo.rs"));
assert!(set.is_match("src/grep/src/main.rs"));
fn is_match(set: &Set, s: &str) -> bool {
let mut matches = vec![];
set.matches_into(s, &mut matches);
!matches.is_empty()
}
assert_eq!(2, set.matches("src/lib.rs").iter().count());
assert!(set.matches("src/lib.rs").matched(0));
assert!(!set.matches("src/lib.rs").matched(1));
assert!(set.matches("src/lib.rs").matched(2));
assert!(is_match(&set, "foo.c"));
assert!(is_match(&set, "src/foo.c"));
assert!(!is_match(&set, "foo.rs"));
assert!(!is_match(&set, "tests/foo.rs"));
assert!(is_match(&set, "src/foo.rs"));
assert!(is_match(&set, "src/grep/src/main.rs"));
let matches = set.matches("src/lib.rs");
assert_eq!(2, matches.len());
assert_eq!(0, matches[0]);
assert_eq!(2, matches[1]);
}
}


@@ -19,11 +19,11 @@ use std::io;
use std::path::{Path, PathBuf};
use gitignore::{self, Gitignore, GitignoreBuilder, Match, Pattern};
use pathutil::is_hidden;
use types::Types;
const IGNORE_NAMES: &'static [&'static str] = &[
".gitignore",
".agignore",
".rgignore",
];
@@ -83,7 +83,10 @@ pub struct Ignore {
overrides: Overrides,
/// A file type matcher.
types: Types,
/// Whether to ignore hidden files or not.
ignore_hidden: bool,
/// When true, don't look at .gitignore or .agignore files for ignore
/// rules.
no_ignore: bool,
}
@@ -208,15 +211,17 @@ impl Ignore {
debug!("{} ignored because it is hidden", path.display());
return true;
}
for id in self.stack.iter().rev().filter_map(|id| id.as_ref()) {
let mat = id.matched(path, is_dir);
if let Some(is_ignored) = self.ignore_match(path, mat) {
if is_ignored {
return true;
if !self.no_ignore {
for id in self.stack.iter().rev().filter_map(|id| id.as_ref()) {
let mat = id.matched(path, is_dir);
if let Some(is_ignored) = self.ignore_match(path, mat) {
if is_ignored {
return true;
}
// If this path is whitelisted by an ignore, then
// fallthrough and let the file type matcher have a say.
break;
}
// If this path is whitelisted by an ignore, then fallthrough
// and let the file type matcher have a say.
break;
}
}
let mat = self.types.matched(path, is_dir);
@@ -361,8 +366,7 @@ impl Overrides {
let path = path.as_ref();
self.gi.as_ref()
.map(|gi| {
let path = &*path.to_string_lossy();
let mat = gi.matched_utf8(path, is_dir).invert();
let mat = gi.matched_stripped(path, is_dir).invert();
if mat.is_none() && !is_dir {
if gi.num_ignores() > 0 {
return Match::Ignored(&self.unmatched_pat);
@@ -374,14 +378,6 @@ impl Overrides {
}
}
fn is_hidden<P: AsRef<Path>>(path: P) -> bool {
if let Some(name) = path.as_ref().file_name() {
name.to_str().map(|s| s.starts_with(".")).unwrap_or(false)
} else {
false
}
}
#[cfg(test)]
mod tests {
use std::path::Path;


@@ -1,6 +1,7 @@
extern crate crossbeam;
extern crate deque;
extern crate docopt;
extern crate env_logger;
extern crate fnv;
extern crate grep;
#[cfg(windows)]
extern crate kernel32;
@@ -15,7 +16,6 @@ extern crate num_cpus;
extern crate regex;
extern crate rustc_serialize;
extern crate term;
extern crate thread_local;
extern crate walkdir;
#[cfg(windows)]
extern crate winapi;
@@ -29,7 +29,7 @@ use std::result;
use std::sync::{Arc, Mutex};
use std::thread;
use crossbeam::sync::chase_lev::{self, Steal, Stealer};
use deque::{Stealer, Stolen};
use grep::Grep;
use memmap::{Mmap, Protection};
use term::Terminal;
@@ -37,6 +37,7 @@ use walkdir::DirEntry;
use args::Args;
use out::{ColoredTerminal, Out};
use pathutil::strip_prefix;
use printer::Printer;
use search_stream::InputBuffer;
#[cfg(windows)]
@@ -61,6 +62,7 @@ mod gitignore;
mod glob;
mod ignore;
mod out;
mod pathutil;
mod printer;
mod search_buffer;
mod search_stream;
@@ -98,8 +100,8 @@ fn run(args: Args) -> Result<u64> {
let out = Arc::new(Mutex::new(args.out()));
let mut workers = vec![];
let mut workq = {
let (workq, stealer) = chase_lev::deque();
let workq = {
let (workq, stealer) = deque::new();
for _ in 0..args.threads() {
let worker = MultiWorker {
chan_work: stealer.clone(),
@@ -216,10 +218,10 @@ impl MultiWorker {
fn run(mut self) -> u64 {
loop {
let work = match self.chan_work.steal() {
Steal::Empty | Steal::Abort => continue,
Steal::Data(Work::Quit) => break,
Steal::Data(Work::Stdin) => WorkReady::Stdin,
Steal::Data(Work::File(ent)) => {
Stolen::Empty | Stolen::Abort => continue,
Stolen::Data(Work::Quit) => break,
Stolen::Data(Work::Stdin) => WorkReady::Stdin,
Stolen::Data(Work::File(ent)) => {
match File::open(ent.path()) {
Ok(file) => WorkReady::DirFile(ent, file),
Err(err) => {
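The loop above follows the stealer protocol of the `deque` crate (0.3.1 in Cargo.lock). Below is a minimal sketch of that protocol under that assumption; `Worker::push` and the string payload are illustrative, while `deque::new`, `Stealer::steal`, and the `Stolen` variants are the calls that actually appear in this diff:

extern crate deque;

use deque::Stolen;

fn steal_loop_sketch() {
    let (workq, stealer) = deque::new();
    workq.push("one unit of work");                // assumed Worker::push
    loop {
        match stealer.steal() {
            // ripgrep retries on Empty/Abort; this sketch just stops.
            Stolen::Empty | Stolen::Abort => break,
            Stolen::Data(job) => { let _ = job; }
        }
    }
}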
@@ -258,7 +260,7 @@ impl Worker {
}
WorkReady::DirFile(ent, file) => {
let mut path = ent.path();
if let Ok(p) = path.strip_prefix("./") {
if let Some(p) = strip_prefix("./", path) {
path = p;
}
if self.args.mmap() {
@@ -269,7 +271,7 @@ impl Worker {
}
WorkReady::PathFile(path, file) => {
let mut path = &*path;
if let Ok(p) = path.strip_prefix("./") {
if let Some(p) = strip_prefix("./", path) {
path = p;
}
if self.args.mmap() {

src/pathutil.rs (new file, 98 lines)

@@ -0,0 +1,98 @@
/*!
The pathutil module provides platform-specific operations on paths that are
typically faster than the corresponding operations in std::path. In
particular, we want to avoid the costly operation of parsing the path into
its constituent components. We give up on Windows, but on Unix, we deal with
the raw bytes directly.
On large repositories (like Chromium), this can yield a ~25% performance
improvement just in listing the files to search (!).
*/
use std::ffi::OsStr;
use std::path::Path;
use memchr::memrchr;
/// Strip `prefix` from the `path` and return the remainder.
///
/// If `path` doesn't have a prefix `prefix`, then return `None`.
#[cfg(unix)]
pub fn strip_prefix<'a, P: AsRef<Path>>(
prefix: P,
path: &'a Path,
) -> Option<&'a Path> {
use std::os::unix::ffi::OsStrExt;
let prefix = prefix.as_ref().as_os_str().as_bytes();
let path = path.as_os_str().as_bytes();
if prefix.len() > path.len() || prefix != &path[0..prefix.len()] {
None
} else {
Some(&Path::new(OsStr::from_bytes(&path[prefix.len()..])))
}
}
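A sketch of the expected behavior, matching the `strip_prefix("./", path)` call sites in main.rs above (the paths are illustrative; the assertions assume this module's test context):

#[test]
fn strip_prefix_sketch() {
    assert_eq!(Some(Path::new("src/main.rs")),
               strip_prefix("./", Path::new("./src/main.rs")));
    assert_eq!(None, strip_prefix("./", Path::new("src/main.rs")));
}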
/// Strip `prefix` from the `path` and return the remainder.
///
/// If `path` doesn't have a prefix `prefix`, then return `None`.
#[cfg(not(unix))]
pub fn strip_prefix<'a, P: AsRef<Path>>(prefix: P, path: &'a Path) -> Option<&'a Path> {
    path.strip_prefix(prefix.as_ref()).ok()
}
/// The final component of the path, if it is a normal file.
///
/// If the path terminates in ., .., or consists solely of a root or prefix,
/// file_name will return None.
#[cfg(unix)]
pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
path: &'a P,
) -> Option<&'a OsStr> {
use std::os::unix::ffi::OsStrExt;
let path = path.as_ref().as_os_str().as_bytes();
if path.is_empty() {
return None;
} else if path.len() == 1 && path[0] == b'.' {
return None;
} else if path.last() == Some(&b'.') {
return None;
} else if path.len() >= 2 && &path[path.len() - 2..] == &b".."[..] {
return None;
}
let last_slash = memrchr(b'/', path).map(|i| i + 1).unwrap_or(0);
Some(OsStr::from_bytes(&path[last_slash..]))
}
/// The final component of the path, if it is a normal file.
///
/// If the path terminates in ., .., or consists solely of a root or prefix,
/// file_name will return None.
#[cfg(not(unix))]
pub fn file_name<'a, P: AsRef<Path> + ?Sized>(
path: &'a P,
) -> Option<&'a OsStr> {
path.as_ref().file_name()
}
/// Returns true if and only if this file path is considered to be hidden.
#[cfg(unix)]
pub fn is_hidden<P: AsRef<Path>>(path: P) -> bool {
use std::os::unix::ffi::OsStrExt;
if let Some(name) = file_name(path.as_ref()) {
name.as_bytes().get(0) == Some(&b'.')
} else {
false
}
}
/// Returns true if and only if this file path is considered to be hidden.
#[cfg(not(unix))]
pub fn is_hidden<P: AsRef<Path>>(path: P) -> bool {
if let Some(name) = file_name(path.as_ref()) {
name.to_str().map(|s| s.starts_with(".")).unwrap_or(false)
} else {
false
}
}
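A short sketch of the other two helpers, mirroring the documented edge cases (assumes this module's test context; `OsStr` and `Path` are already imported at the top of the file):

#[test]
fn file_name_sketch() {
    assert_eq!(Some(OsStr::new(".gitignore")), file_name(Path::new("src/.gitignore")));
    assert_eq!(None, file_name(Path::new("src/..")));   // ".." is not a normal file
    assert!(is_hidden("src/.gitignore"));
    assert!(!is_hidden("src/lib.rs"));
}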


@@ -151,8 +151,8 @@ impl FileTypeDef {
/// Types is a file type matcher.
#[derive(Clone, Debug)]
pub struct Types {
selected: Option<glob::Set>,
negated: Option<glob::Set>,
selected: Option<glob::SetYesNo>,
negated: Option<glob::SetYesNo>,
has_selected: bool,
unmatched_pat: Pattern,
}
@@ -165,8 +165,8 @@ impl Types {
/// If has_selected is true, then at least one file type was selected.
/// Therefore, any non-matches should be ignored.
fn new(
selected: Option<glob::Set>,
negated: Option<glob::Set>,
selected: Option<glob::SetYesNo>,
negated: Option<glob::SetYesNo>,
has_selected: bool,
) -> Types {
Types {
@@ -268,7 +268,7 @@ impl TypesBuilder {
try!(bset.add_with(glob, &opts));
}
}
Some(try!(bset.build()))
Some(try!(bset.build_yesno()))
};
let negated_globs =
if self.negated.is_empty() {
@@ -287,7 +287,7 @@ impl TypesBuilder {
try!(bset.add_with(glob, &opts));
}
}
Some(try!(bset.build()))
Some(try!(bset.build_yesno()))
};
Ok(Types::new(
selected_globs, negated_globs, !self.selected.is_empty()))


@@ -26,6 +26,7 @@ impl Iter {
}
/// Returns true if this entry should be skipped.
#[inline(always)]
fn skip_entry(&self, ent: &DirEntry) -> bool {
if ent.depth() == 0 {
// Never skip the root directory.
@@ -41,6 +42,7 @@ impl Iter {
impl Iterator for Iter {
type Item = DirEntry;
#[inline(always)]
fn next(&mut self) -> Option<DirEntry> {
while let Some(ev) = self.it.next() {
match ev {
@@ -108,6 +110,7 @@ impl From<WalkDir> for WalkEventIter {
impl Iterator for WalkEventIter {
type Item = walkdir::Result<WalkEvent>;
#[inline(always)]
fn next(&mut self) -> Option<walkdir::Result<WalkEvent>> {
let dent = self.next.take().or_else(|| self.it.next());
let depth = match dent {