Compare commits

..

31 Commits

Author SHA1 Message Date
Andrew Gallant
9bf7696ec8 Initial cut at a benchmark suite for CLI search tools. 2016-09-11 01:05:36 -04:00
Andrew Gallant
cb0f8fd2fa Bump default thread count to 8. 2016-09-11 00:42:39 -04:00
Andrew Gallant
fa8112ec34 Add alternative compile strategy (in a comment). 2016-09-11 00:42:30 -04:00
Andrew Gallant
cf21b4a97e Add doc. 2016-09-11 00:42:19 -04:00
Andrew Gallant
19615245cd Make line counting much faster. 2016-09-10 01:35:44 -04:00
Andrew Gallant
98a48b44bc Fix off-by-one bug in searcher. 2016-09-10 01:35:30 -04:00
Andrew Gallant
e3da726836 Rename search module to search_stream.
The name better reflects the difference between it and the search_buffer
module.
2016-09-10 00:08:42 -04:00
Andrew Gallant
5b36c86c15 Rejigger the atty detection stuff. 2016-09-10 00:05:20 -04:00
Andrew Gallant
76331e5fec Fix test that relied on non-deterministic order of results. 2016-09-09 23:24:01 -04:00
Andrew Gallant
1e678d7052 Fix files test. What a pain. 2016-09-09 23:19:46 -04:00
Andrew Gallant
dd986d7fe9 Add standard Linux CI (GNU libc). 2016-09-09 23:19:37 -04:00
Andrew Gallant
f83cd63b11 Add integration tests. 2016-09-09 22:58:30 -04:00
Andrew Gallant
9a4527d107 fix Rust version number in CI 2016-09-09 18:47:05 -04:00
Andrew Gallant
8f0d3d78ca clean up CI script 2016-09-09 18:10:20 -04:00
Andrew Gallant
3f7cd977bc expand Rust versions we test on. 2016-09-09 18:07:30 -04:00
Andrew Gallant
cc6b6dcf5b fix windows build 2016-09-09 08:53:10 -04:00
Andrew Gallant
48878bbb8f update project name 2016-09-08 21:47:49 -04:00
Andrew Gallant
0766617e07 Refactor how coloring is done.
All in the name of appeasing Windows.
2016-09-08 21:46:14 -04:00
Andrew Gallant
afd99c43d7 fix deploy 2016-09-08 16:35:48 -04:00
Andrew Gallant
96e87ab738 update distributable to include readme and license 2016-09-08 16:21:37 -04:00
Andrew Gallant
a744ec133d Rename xrep to ripgrep. 2016-09-08 16:15:44 -04:00
Andrew Gallant
0042dce949 Hack in Windows console coloring.
The code has suffered and needs refactoring/commenting. BUT... IT WORKS!
2016-09-07 21:54:28 -04:00
Andrew Gallant
ca058d7584 Add support for memory maps.
I thought plain `read` had usurped them, but when searching a very small
number of files, mmaps can be around 20% faster on Linux. It'd be really
unfortunate to leave that on the table.

Mmap searching doesn't support contexts yet, but we probably don't really
care. And duplicating that logic doesn't sound fun. Without contexts, mmap
searching is delightfully simple.
2016-09-06 21:47:33 -04:00
Andrew Gallant
af3b56a623 Fix grep match iterator. 2016-09-06 21:45:41 -04:00
Andrew Gallant
5938bed339 Add support for printing column numbers. 2016-09-06 19:50:27 -04:00
Andrew Gallant
feff1849c8 Tweak colors. 2016-09-06 19:35:52 -04:00
Andrew Gallant
9948e0ca07 Only create the Grep searcher once. 2016-09-06 19:33:19 -04:00
Andrew Gallant
fd3e5069b6 Fix required literal handling and add debug prints.
In particular, if we had an inner literal and were doing a case insensitive
search, then the literals are dropped because we previously only allowed
a single inner literal to have an effect. Now we allow alternations of
inner literals, but still don't quite take full advantage.
2016-09-06 19:33:03 -04:00
Andrew Gallant
0891b4a3c0 update appveyor 2016-09-05 22:01:53 -04:00
Andrew Gallant
af48aaa647 another try 2016-09-05 21:57:57 -04:00
Andrew Gallant
ee7f300ae2 windows debug, take 1 2016-09-05 21:46:11 -04:00
25 changed files with 2938 additions and 416 deletions

View File

@@ -1,37 +1,49 @@
#language: rust
#rust:
# - stable
# - beta
# - nightly
#script:
# - cargo build --verbose
# - cargo doc
# - cargo test --verbose
# - if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then
# cargo bench --verbose;
# fi
language: rust
cache: cargo
env:
global:
- PROJECT_NAME=xrep
- PROJECT_NAME=ripgrep
matrix:
include:
# Nightly channel
- os: osx
rust: nightly
env: TARGET=i686-apple-darwin
- os: osx
rust: nightly
env: TARGET=x86_64-apple-darwin
# Nightly channel.
# (All *nix releases are done on the nightly channel to take advantage
# of the regex library's multiple pattern SIMD search.)
- os: linux
rust: nightly
env: TARGET=i686-unknown-linux-musl
- os: linux
rust: nightly
env: TARGET=x86_64-unknown-linux-musl
- os: linux
rust: nightly
env: TARGET=x86_64-unknown-linux-gnu
- os: osx
rust: nightly
env: TARGET=i686-apple-darwin
- os: osx
rust: nightly
env: TARGET=x86_64-apple-darwin
# Beta channel.
- os: linux
rust: beta
env: TARGET=x86_64-unknown-linux-musl
- os: linux
rust: beta
env: TARGET=x86_64-unknown-linux-gnu
- os: osx
rust: beta
env: TARGET=x86_64-apple-darwin
# Minimum Rust supported channel.
- os: linux
rust: 1.9.0
env: TARGET=x86_64-unknown-linux-musl
- os: linux
rust: 1.9.0
env: TARGET=x86_64-unknown-linux-gnu
- os: osx
rust: 1.9.0
env: TARGET=x86_64-apple-darwin
before_install:
- export PATH="$PATH:$HOME/.cargo/bin"

View File

@@ -1,14 +1,14 @@
[package]
publish = false
name = "xrep"
name = "ripgrep"
version = "0.1.0" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = """
Line oriented search tool using Rust's regex library.
"""
documentation = "https://github.com/BurntSushi/xrep"
homepage = "https://github.com/BurntSushi/xrep"
repository = "https://github.com/BurntSushi/xrep"
documentation = "https://github.com/BurntSushi/ripgrep"
homepage = "https://github.com/BurntSushi/ripgrep"
repository = "https://github.com/BurntSushi/ripgrep"
readme = "README.md"
keywords = ["regex", "grep", "egrep", "search", "pattern"]
license = "Unlicense/MIT"
@@ -16,7 +16,11 @@ license = "Unlicense/MIT"
[[bin]]
bench = false
path = "src/main.rs"
name = "xrep"
name = "rg"
[[test]]
name = "integration"
path = "tests/tests.rs"
[dependencies]
crossbeam = "0.2"

View File

@@ -1,6 +1,6 @@
environment:
global:
PROJECT_NAME: xrep
PROJECT_NAME: ripgrep
matrix:
# Nightly channel
- TARGET: i686-pc-windows-gnu
@@ -32,16 +32,14 @@ build: false
# Equivalent to Travis' `script` phase
# TODO modify this phase as you see fit
test_script:
- cargo build --verbose
- cargo test
- cargo test --verbose
before_deploy:
# Generate artifacts for release
- SET RUSTFLAGS="-C target-feature=+ssse3"
- cargo build --release --features simd-accel
# TODO(burntsushi): How can we enable SSSE3 on Windows?
- cargo build --release
- mkdir staging
# TODO update this part to copy the artifacts that make sense for your project
- copy target\release\xrep.exe staging
- copy target\release\rg.exe staging
- cd staging
# release zipfile will look like 'rust-everywhere-v1.2.3-x86_64-pc-windows-msvc'
- 7z a ../%PROJECT_NAME%-%APPVEYOR_REPO_TAG_NAME%-%TARGET%.zip *

View File

@@ -1,3 +1,7 @@
/*!
This module benchmarks the glob implementation. For benchmarks on the ripgrep
tool itself, see the benchsuite directory.
*/
#![feature(test)]
extern crate glob;

918
benchsuite Executable file
View File

@@ -0,0 +1,918 @@
#!/usr/bin/env python
'''
benchsuite is a benchmark runner for comparing command line search tools.
'''
import argparse
import csv
import os
import os.path as path
from multiprocessing import cpu_count
import re
import statistics
import subprocess
import sys
import time
# Some constants for identifying the corpora we use to run tests.
# We establish two very different kinds of corpora: a small number of large
# files and a large number of small files. These are vastly different use cases
# not only because of their performance characteristics, but also the
# strategies used to increase the relevance of results returned.

# Sub-directory of the suite directory that holds the subtitle corpora.
SUBTITLES_DIR = 'subtitles'
# English subtitles: decompressed file name, archive name and download URL.
SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en'
SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME
SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz'
# Russian subtitles: decompressed file name, archive name and download URL.
SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru'
SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME
SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz'
# Sub-directory of the suite directory that holds the Linux kernel checkout.
LINUX_DIR = 'linux'
# Clone over HTTPS: GitHub no longer serves the unauthenticated git://
# protocol, so a git:// URL makes the clone fail.
LINUX_CLONE = 'https://github.com/BurntSushi/linux'
def bench_linux_literal_default(suite_dir):
    '''
    Benchmark a literal search with every tool left at its defaults.

    Each tool receives nothing but the pattern, so the commands are not
    doing equivalent work. The comparison is deliberately unfair; it
    exists to illustrate how the default modes of the tools differ.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    needle = 'PM_RESUME'

    def in_linux(label, argv, **opts):
        # Every command of this benchmark runs inside the kernel checkout.
        return Command(label, argv, cwd=linux_dir, **opts)

    commands = [
        in_linux('rg', ['rg', needle]),
        in_linux('ag', ['ag', needle]),
        # ucg reports the exact same matches as ag and rg even though it
        # doesn't read gitignore files. Instead, it has a file whitelist
        # that happens to match up exactly with the gitignores for this
        # search.
        in_linux('ucg', ['ucg', needle]),
        in_linux('git grep', ['git', 'grep', needle], env={'LC_ALL': 'C'}),
        in_linux('pt', ['pt', needle]),
        # sift reports an extra line here for a binary file matched.
        in_linux('sift', ['sift', needle]),
    ]
    return Benchmark(pattern=needle, commands=commands)
def bench_linux_literal(suite_dir):
    '''
    Benchmark a literal search while trying to treat every tool fairly.

    The flags chosen here aim for the smallest common feature set: the
    search is case sensitive and line numbers are computed.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    needle = 'PM_RESUME'

    def in_linux(label, argv, **opts):
        # Every command of this benchmark runs inside the kernel checkout.
        return Command(label, argv, cwd=linux_dir, **opts)

    commands = [
        in_linux('rg', ['rg', '-n', needle]),
        in_linux('rg-novcs', ['rg', '--no-ignore', '-n', needle]),
        in_linux(
            'rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', needle]),
        in_linux('ag', ['ag', '-s', needle]),
        in_linux('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', needle]),
        in_linux('ucg', ['ucg', '--nosmart-case', needle]),
        in_linux(
            'git grep',
            ['git', 'grep', '-I', '-n', needle],
            env={'LC_ALL': 'C'},
        ),
        in_linux('pt', ['pt', needle]),
        in_linux(
            'sift',
            ['sift', '-n', '--binary-skip', '--exclude-files', '.*', needle],
        ),
    ]
    return Benchmark(pattern=needle, commands=commands)
def bench_linux_literal_casei(suite_dir):
    '''
    Benchmark a case insensitive literal search.

    Same pattern as the linux_literal benchmark, but every tool is asked
    to ignore case.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    needle = 'PM_RESUME'

    def in_linux(label, argv, **opts):
        # Every command of this benchmark runs inside the kernel checkout.
        return Command(label, argv, cwd=linux_dir, **opts)

    commands = [
        in_linux('rg', ['rg', '-n', '-i', needle]),
        in_linux('rg-novcs', ['rg', '--no-ignore', '-n', '-i', needle]),
        in_linux(
            'rg-novcs-mmap',
            ['rg', '--mmap', '--no-ignore', '-n', '-i', needle],
        ),
        in_linux('ag', ['ag', '-i', needle]),
        in_linux('ag-novcs', ['ag', '--skip-vcs-ignores', '-i', needle]),
        in_linux('ucg', ['ucg', '-i', needle]),
        in_linux(
            'git grep',
            ['git', 'grep', '-I', '-n', '-i', needle],
            env={'LC_ALL': 'C'},
        ),
        # sift yields more matches than it should here. Specifically, it
        # gets matches in Module.symvers and System.map in the repo root.
        # Both of those files show up in the repo root's .gitignore file.
        in_linux(
            'sift',
            [
                'sift', '-n', '--binary-skip', '--exclude-files', '.*',
                '-i', needle,
            ],
        ),
    ]
    return Benchmark(pattern=needle, commands=commands)
def bench_linux_re_literal_suffix(suite_dir):
    '''
    Benchmark a regex whose literal part is a suffix.

    Because the literal does not lead the pattern, this inhibits, for
    example, a prefix byte optimization used inside of Go's regex engine
    (relevant for sift and pt).
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    regex = '[A-Z]+_RESUME'

    def in_linux(label, argv, **opts):
        # Every command of this benchmark runs inside the kernel checkout.
        return Command(label, argv, cwd=linux_dir, **opts)

    commands = [
        in_linux('rg', ['rg', '-n', regex]),
        in_linux('rg-novcs', ['rg', '--no-ignore', '-n', regex]),
        in_linux(
            'rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', regex]),
        in_linux('ag', ['ag', '-s', regex]),
        in_linux('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', regex]),
        in_linux('ucg', ['ucg', '--nosmart-case', regex]),
        in_linux(
            'git grep',
            ['git', 'grep', '-E', '-I', '-n', regex],
            env={'LC_ALL': 'C'},
        ),
        in_linux(
            'sift',
            ['sift', '-n', '--binary-skip', '--exclude-files', '.*', regex],
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)
def bench_linux_word(suite_dir):
    r'''
    Benchmark the -w ("match word") flag of each tool.

    sift has a lot of trouble with this because the flag forces it into
    Go's regex engine by surrounding the pattern with \b assertions.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    needle = 'PM_RESUME'

    def in_linux(label, argv, **opts):
        # Every command of this benchmark runs inside the kernel checkout.
        return Command(label, argv, cwd=linux_dir, **opts)

    commands = [
        in_linux('rg', ['rg', '-n', '-w', needle]),
        in_linux('rg-novcs', ['rg', '--no-ignore', '-n', '-w', needle]),
        in_linux(
            'rg-novcs-mmap',
            ['rg', '--mmap', '--no-ignore', '-n', '-w', needle],
        ),
        in_linux('ag', ['ag', '-s', '-w', needle]),
        in_linux(
            'ag-novcs', ['ag', '--skip-vcs-ignores', '-s', '-w', needle]),
        in_linux('ucg', ['ucg', '--nosmart-case', '-w', needle]),
        in_linux(
            'git grep',
            ['git', 'grep', '-E', '-I', '-n', '-w', needle],
            env={'LC_ALL': 'C'},
        ),
        in_linux(
            'sift',
            [
                'sift', '-n', '--binary-skip', '--exclude-files', '.*',
                '-w', needle,
            ],
        ),
    ]
    return Benchmark(pattern=needle, commands=commands)
def bench_linux_unicode_greek(suite_dir):
    '''
    Benchmark matching of a Unicode category.

    Only three tools (ripgrep, sift and pt) support this; of those, rg
    and sift are the ones run here.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    regex = r'\p{Greek}'

    def in_linux(label, argv, **opts):
        # Every command of this benchmark runs inside the kernel checkout.
        return Command(label, argv, cwd=linux_dir, **opts)

    # sift tries to search a bunch of PDF files and clutters up the
    # results, even though --binary-skip is provided. They are excluded
    # here explicitly, but don't have a measurable impact on performance.
    sift_argv = [
        'sift', '-n', '--binary-skip',
        '--exclude-files', '.*',
        '--exclude-files', '*.pdf',
        regex,
    ]
    commands = [
        in_linux('rg', ['rg', '-n', regex]),
        in_linux('sift', sift_argv),
    ]
    return Benchmark(pattern=regex, commands=commands)
def bench_linux_unicode_greek_casei(suite_dir):
    '''
    Benchmark matching of a Unicode category, case insensitively.

    Only ripgrep gets this right (and it's still fast).
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
    pat = r'\p{Greek}'

    def mkcmd(*args, **kwargs):
        # Every command of this benchmark runs inside the kernel checkout.
        kwargs['cwd'] = cwd
        return Command(*args, **kwargs)

    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', '-n', '-i', pat]),
        # sift tries to search a bunch of PDF files and clutters up the
        # results, even though --binary-skip is provided. They are excluded
        # here explicitly, but don't have a measurable impact on performance.
        #
        # -i is required so that sift performs the same case insensitive
        # search as rg; without it this command merely repeats the case
        # sensitive linux_unicode_greek benchmark.
        mkcmd('sift', [
            'sift', '-n', '--binary-skip',
            '--exclude-files', '.*',
            '--exclude-files', '*.pdf',
            '-i', pat,
        ]),
    ])
def bench_linux_unicode_word(suite_dir):
    r'''
    Benchmark a Unicode aware \w character class.

    Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get
    this right. Everything else uses the standard ASCII interpretation
    of \w, which is why those commands are labelled "(no Unicode)".
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    regex = r'\wAh'
    ascii_regex = '(?-u)' + regex
    git_grep_argv = ['git', 'grep', '-E', '-I', '-n', regex]

    def in_linux(label, argv, **opts):
        # Every command of this benchmark runs inside the kernel checkout.
        return Command(label, argv, cwd=linux_dir, **opts)

    commands = [
        in_linux('rg', ['rg', '-n', regex]),
        in_linux('rg (no Unicode)', ['rg', '-n', ascii_regex]),
        in_linux('rg-novcs', ['rg', '--no-ignore', '-n', regex]),
        in_linux(
            'rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', regex]),
        in_linux('ag (no Unicode)', ['ag', '-s', regex]),
        in_linux(
            'ag-novcs (no Unicode)',
            ['ag', '--skip-vcs-ignores', '-s', regex],
        ),
        in_linux('ucg (no Unicode)', ['ucg', '--nosmart-case', regex]),
        in_linux(
            'git grep', list(git_grep_argv), env={'LC_ALL': 'en_US.UTF-8'}),
        in_linux(
            'git grep (no Unicode)', list(git_grep_argv),
            env={'LC_ALL': 'C'}),
        in_linux(
            'sift (no Unicode)',
            ['sift', '-n', '--binary-skip', '--exclude-files', '.*', regex],
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)
def bench_linux_no_literal(suite_dir):
    '''
    Benchmark a regex that defeats all literal optimizations.

    Most search patterns contain some kind of literal, which typically
    lets a search take shortcuts. The applicability of this benchmark is
    therefore somewhat suspicious, but the suite wouldn't feel complete
    without it.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    # Five five-character words separated by whitespace.
    regex = r'\s+'.join([r'\w{5}'] * 5)
    ascii_regex = '(?-u)' + regex
    git_grep_argv = ['git', 'grep', '-E', '-I', '-n', regex]

    def in_linux(label, argv, **opts):
        # Every command of this benchmark runs inside the kernel checkout.
        return Command(label, argv, cwd=linux_dir, **opts)

    commands = [
        in_linux('rg', ['rg', '-n', regex]),
        in_linux('rg-novcs', ['rg', '--no-ignore', '-n', regex]),
        in_linux('rg (no Unicode)', ['rg', '-n', ascii_regex]),
        in_linux(
            'rg-novcs (no Unicode)',
            ['rg', '--no-ignore', '-n', ascii_regex],
        ),
        in_linux('ag (no Unicode)', ['ag', '-s', regex]),
        in_linux(
            'ag-novcs (no Unicode)',
            ['ag', '--skip-vcs-ignores', '-s', regex],
        ),
        in_linux('ucg (no Unicode)', ['ucg', '--nosmart-case', regex]),
        in_linux(
            'git grep', list(git_grep_argv), env={'LC_ALL': 'en_US.UTF-8'}),
        in_linux(
            'git grep (no Unicode)', list(git_grep_argv),
            env={'LC_ALL': 'C'}),
        in_linux(
            'sift (no Unicode)',
            ['sift', '-n', '--binary-skip', '--exclude-files', '.*', regex],
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)
def bench_linux_alternates(suite_dir):
    '''
    Benchmark a small alternation of literals.

    sift doesn't make the cut. It's more than 10x slower than the next
    fastest result. The slowdown is likely because the Go regexp engine
    doesn't do any literal optimizations for this case (there is no
    common leading byte).
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    regex = '|'.join([
        'ERR_SYS', 'PME_TURN_OFF', 'LINK_REQ_RST', 'CFG_BME_EVT',
    ])

    def in_linux(label, argv, **opts):
        # Every command of this benchmark runs inside the kernel checkout.
        return Command(label, argv, cwd=linux_dir, **opts)

    commands = [
        in_linux('rg', ['rg', '-n', regex]),
        in_linux('rg-novcs', ['rg', '--no-ignore', '-n', regex]),
        in_linux(
            'rg-novcs-mmap', ['rg', '--mmap', '--no-ignore', '-n', regex]),
        in_linux('ag', ['ag', '-s', regex]),
        in_linux('ag-novcs', ['ag', '--skip-vcs-ignores', '-s', regex]),
        in_linux('ucg', ['ucg', '--nosmart-case', regex]),
        in_linux(
            'git grep',
            ['git', 'grep', '-E', '-I', '-n', regex],
            env={'LC_ALL': 'C'},
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)
def bench_linux_alternates_casei(suite_dir):
    '''
    Benchmark a small alternation of literals case insensitively.

    Uses the same alternation as the linux_alternates benchmark with the
    case insensitive flag of every tool switched on.
    '''
    require(suite_dir, 'linux')
    linux_dir = path.join(suite_dir, LINUX_DIR)
    regex = '|'.join([
        'ERR_SYS', 'PME_TURN_OFF', 'LINK_REQ_RST', 'CFG_BME_EVT',
    ])

    def in_linux(label, argv, **opts):
        # Every command of this benchmark runs inside the kernel checkout.
        return Command(label, argv, cwd=linux_dir, **opts)

    commands = [
        in_linux('rg', ['rg', '-n', '-i', regex]),
        in_linux('rg-novcs', ['rg', '--no-ignore', '-n', '-i', regex]),
        in_linux(
            'rg-novcs-mmap',
            ['rg', '--mmap', '--no-ignore', '-n', '-i', regex],
        ),
        in_linux('ag', ['ag', '-i', regex]),
        in_linux('ag-novcs', ['ag', '--skip-vcs-ignores', '-i', regex]),
        in_linux('ucg', ['ucg', '-i', regex]),
        in_linux(
            'git grep',
            ['git', 'grep', '-E', '-I', '-n', '-i', regex],
            env={'LC_ALL': 'C'},
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)
# BREADCRUMBS(burntsushi): We should benchmark an alternation for `linux` as
# well.
def bench_sherlock(suite_dir):
    '''
    Benchmark a literal search over the single English subtitles file.

    TODO: Fix this and add more single file benchmarks.
    '''
    require(suite_dir, 'subtitles-en')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME)
    needle = 'Sherlock'
    commands = [
        Command('rg', ['rg', needle, corpus]),
        Command('grep', ['grep', '-a', needle, corpus]),
    ]
    return Benchmark(pattern=needle, commands=commands)
class MissingDependencies(Exception):
    '''
    Raised when a benchmark needs corpora that are not available.

    :ivar list(str) missing_names:
        The names of the missing dependencies. These names correspond to
        names that can be used with the --download flag.
    '''

    def __init__(self, missing_names):
        self.missing_names = missing_names

    def __str__(self):
        return 'MissingDependency({!r})'.format(self.missing_names)
class Benchmark(object):
    '''
    A single benchmark corresponding to a grouping of commands.

    The main purpose of a benchmark is to compare the performance
    characteristics of a group of commands.
    '''

    def __init__(self, name=None, pattern=None, commands=None,
                 warmup_count=1, count=3, line_count=True):
        '''
        Create a single benchmark.

        A single benchmark is composed of a set of commands that are
        benchmarked and compared against one another. A benchmark may
        have multiple commands that use the same search tool (but
        probably should have something differentiating them).

        The grouping of commands is a purely human driven process.

        The stderr of every command is always discarded. stdout is
        captured when ``line_count`` is set (the default) and discarded
        otherwise.

        :param str name:
            A human readable string denoting the name of this
            benchmark.
        :param str pattern:
            The pattern that is used in search.
        :param list(Command) commands:
            A list of commands to initialize this benchmark with. More
            commands may be added before running the benchmark.
        :param int warmup_count:
            The number of times to run each command before recording
            samples.
        :param int count:
            The number of samples to collect from each command.
        :param bool line_count:
            When set, the lines of each search are counted and included
            in the samples produced.
        '''
        self.name = name
        self.pattern = pattern
        self.commands = commands or []
        self.warmup_count = warmup_count
        self.count = count
        self.line_count = line_count

    def run(self):
        '''
        Runs this benchmark and returns the results.

        Every command is first run ``warmup_count`` times without
        recording anything, then ``count`` times with each run recorded
        as a sample.

        :rtype: Result
        '''
        result = Result(self)
        for cmd in self.commands:
            # Do a warmup first.
            for _ in range(self.warmup_count):
                self.run_one(cmd)
            for _ in range(self.count):
                result.add(cmd, **self.run_one(cmd))
        return result

    def run_one(self, cmd):
        '''
        Runs the given command exactly once.

        Note that this sets the ``stdout`` and ``stderr`` entries of
        ``cmd.kwargs`` as a side effect.

        :param Command cmd: The command to run.
        :returns:
            A dict with two fields, duration and line_count.
            The duration is in (fractional) seconds and is guaranteed
            to be available. The line_count is set to None unless line
            counting is enabled, in which case, it is the number of
            newline bytes in the search output.
        :rtype: dict
        '''
        cmd.kwargs['stderr'] = subprocess.DEVNULL
        if self.line_count:
            cmd.kwargs['stdout'] = subprocess.PIPE
        else:
            cmd.kwargs['stdout'] = subprocess.DEVNULL

        # Use a monotonic, high resolution clock. The wall clock
        # (time.time) can be adjusted while a command is running, which
        # would corrupt the sample or even make it negative.
        start = time.perf_counter()
        completed = cmd.run()
        end = time.perf_counter()

        line_count = None
        if self.line_count:
            line_count = completed.stdout.count(b'\n')
        return {
            'duration': end - start,
            'line_count': line_count,
        }
class Result(object):
    '''
    The result of running a benchmark.

    Benchmark results consist of a set of samples, where each sample
    corresponds to a single run of a single command in the benchmark.
    Various statistics can be computed from these samples such as mean
    and standard deviation.
    '''

    def __init__(self, benchmark):
        '''
        Create a new set of results, initially empty.

        :param Benchmark benchmark:
            The benchmark that produced these results.
        '''
        self.benchmark = benchmark
        self.samples = []

    def add(self, cmd, duration, line_count=None):
        '''
        Add a new sample to this result set.

        :param Command cmd:
            The command that produced this sample.
        :param float duration:
            The duration, in seconds, that the command took to run.
        :param int line_count:
            The number of lines in the search output. This is optional.
        '''
        self.samples.append({
            'cmd': cmd,
            'duration': duration,
            'line_count': line_count,
        })

    def fastest_sample(self):
        '''
        Returns the sample with the smallest recorded duration.
        '''
        return min(self.samples, key=lambda s: s['duration'])

    def fastest_cmd(self):
        '''
        Returns the command of the benchmark with the smallest mean
        duration.
        '''
        means = []
        for cmd in self.benchmark.commands:
            mean, _ = self.distribution_for(cmd)
            means.append((cmd, mean))
        return min(means, key=lambda tup: tup[1])[0]

    def samples_for(self, cmd):
        '''
        Returns an iterable of the samples recorded for cmd.

        Samples are matched to cmd by command name.
        '''
        for sample in self.samples:
            if sample['cmd'].name == cmd.name:
                yield sample

    def line_counts_for(self, cmd):
        '''
        Returns the distinct line counts recorded for the given command.

        :rtype: set(int)
        :returns:
            The set of line counts seen across all samples of cmd.
            Samples without a line count are ignored.
        '''
        return {s['line_count'] for s in self.samples_for(cmd)
                if s['line_count'] is not None}

    def distribution_for(self, cmd):
        '''
        Returns the distribution (mean +/- std) of the given command.

        :rtype: (float, float)
        :returns:
            A tuple containing the mean and the sample standard
            deviation of the durations, in that order. The standard
            deviation is undefined for a single sample, in which case
            NaN is returned in its place.
        :raises statistics.StatisticsError:
            If no samples were recorded for cmd.
        '''
        durations = [s['duration'] for s in self.samples_for(cmd)]
        mean = statistics.mean(durations)
        if len(durations) < 2:
            # statistics.stdev requires at least two data points.
            return mean, float('nan')
        return mean, statistics.stdev(durations)
class Command(object):
    '''
    A named command line invocation that is run as part of a benchmark.
    '''

    def __init__(self, name, cmd, *args, **kwargs):
        '''
        Create a new command.

        Any extra positional and keyword arguments are forwarded to
        ``subprocess.run`` unchanged, with the exception of
        stdin/stdout/stderr: output redirection belongs to the benchmark
        harness, so supplying any of them here trips an assert.

        :param str name:
            The human readable name of this command. This is
            particularly useful if the same search tool is used
            multiple times in the same benchmark with different
            arguments.
        :param list(str) cmd:
            The command to run as a list of arguments (including the
            command name itself).
        '''
        for stream in ('stdin', 'stdout', 'stderr'):
            assert stream not in kwargs
        self.name, self.cmd = name, cmd
        self.args, self.kwargs = args, kwargs

    def run(self):
        '''
        Runs this command and returns the completed process.

        :rtype: subprocess.CompletedProcess
        '''
        completed = subprocess.run(self.cmd, *self.args, **self.kwargs)
        return completed
def eprint(*args, **kwargs):
    '''
    Like print, but always writes to stderr.

    A ``file`` keyword argument supplied by the caller is overridden.
    '''
    print(*args, **dict(kwargs, file=sys.stderr))
def run_cmd(cmd, *args, **kwargs):
    '''
    Echo the command to stderr, then run it.

    The command is always run with ``check=True``, so a failing command
    raises instead of returning.
    '''
    echoed = ' '.join(cmd)
    eprint('# %s' % echoed)
    return subprocess.run(cmd, *args, **dict(kwargs, check=True))
def require(suite_dir, *names):
    '''
    Declare that a benchmark depends on the given corpus names.

    Each name is checked with the module level ``has_<name>`` function
    (dashes in the name become underscores). If any of them is
    unavailable, MissingDependencies is raised with all missing names.
    '''
    missing = [
        name for name in names
        if not globals()['has_%s' % name.replace('-', '_')](suite_dir)
    ]
    if missing:
        raise MissingDependencies(missing)
def download_linux(suite_dir):
    '''
    Download and build the Linux kernel.

    Both steps are skipped when their result is already present: the
    clone when the checkout directory exists, the build when it contains
    a vmlinux file.
    '''
    checkout_dir = path.join(suite_dir, LINUX_DIR)
    already_cloned = os.path.isdir(checkout_dir)
    if not already_cloned:
        # Clone from my fork so that we always get the same corpus *and*
        # still do a shallow clone. Shallow clones are much much cheaper
        # than full clones.
        clone = ['git', 'clone', '--depth', '1', LINUX_CLONE, checkout_dir]
        run_cmd(clone)

    # We want to build the kernel because the process of building it
    # produces a lot of junk in the repository that a search tool probably
    # shouldn't touch.
    kernel_image = path.join(checkout_dir, 'vmlinux')
    if os.path.exists(kernel_image):
        return
    eprint('# Building Linux kernel...')
    run_cmd(['make', 'defconfig'], cwd=checkout_dir)
    run_cmd(['make', '-j', str(cpu_count())], cwd=checkout_dir)
def has_linux(suite_dir):
    '''
    Returns true if we believe the Linux kernel is built.

    The kernel counts as built when the checkout contains a vmlinux file.
    '''
    kernel_image = path.join(suite_dir, LINUX_DIR, 'vmlinux')
    return path.exists(kernel_image)
def download_subtitles_en(suite_dir):
    '''
    Download and decompress English subtitles.

    Nothing is done when the decompressed file already exists, and the
    download is skipped when the compressed archive is already present.

    :param str suite_dir:
        The directory containing corpora. May be relative or absolute.
    '''
    subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
    en_path_gz = path.join(subtitle_dir, SUBTITLES_EN_NAME_GZ)
    en_path = path.join(subtitle_dir, SUBTITLES_EN_NAME)

    os.makedirs(subtitle_dir, exist_ok=True)
    if os.path.exists(en_path):
        return
    if not os.path.exists(en_path_gz):
        # -O saves the file under its remote name inside subtitle_dir.
        run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir)
    # gunzip runs inside subtitle_dir, so the archive must be named
    # relative to that directory. Passing en_path_gz would resolve the
    # suite directory a second time whenever suite_dir is relative.
    run_cmd(['gunzip', SUBTITLES_EN_NAME_GZ], cwd=subtitle_dir)
def has_subtitles_en(suite_dir):
    '''
    Returns true if English subtitles have been downloaded.

    Only the decompressed file counts; the .gz archive is not checked.
    '''
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME)
    return path.exists(corpus)
def download_subtitles_ru(suite_dir):
    '''
    Download and decompress Russian subtitles.

    Nothing is done when the decompressed file already exists, and the
    download is skipped when the compressed archive is already present.

    :param str suite_dir:
        The directory containing corpora. May be relative or absolute.
    '''
    subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
    ru_path_gz = path.join(subtitle_dir, SUBTITLES_RU_NAME_GZ)
    ru_path = path.join(subtitle_dir, SUBTITLES_RU_NAME)

    os.makedirs(subtitle_dir, exist_ok=True)
    if os.path.exists(ru_path):
        return
    if not os.path.exists(ru_path_gz):
        # -O saves the file under its remote name inside subtitle_dir.
        run_cmd(['curl', '-LO', SUBTITLES_RU_URL], cwd=subtitle_dir)
    # gunzip runs inside subtitle_dir, so the archive must be named
    # relative to that directory. Passing ru_path_gz would resolve the
    # suite directory a second time whenever suite_dir is relative.
    run_cmd(['gunzip', SUBTITLES_RU_NAME_GZ], cwd=subtitle_dir)
def has_subtitles_ru(suite_dir):
    '''
    Returns true if Russian subtitles have been downloaded.

    Only the decompressed file counts; the .gz archive is not checked.
    '''
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    return path.exists(corpus)
def download(suite_dir, choices):
    '''
    Download choices into suite_dir.

    Specifically, choices specifies a list of corpora to fetch. The
    process exits with status 1 on the first unrecognized choice.

    :param str suite_dir:
        The directory in which to download corpora.
    :param list(str) choices:
        A list of corpora to download. Available choices are:
        all, linux, subtitles-en, subtitles-ru.
    '''
    # Iterate over the parameter; there is no `args` in this scope.
    for choice in choices:
        if choice == 'linux':
            download_linux(suite_dir)
        elif choice == 'subtitles-en':
            download_subtitles_en(suite_dir)
        elif choice == 'subtitles-ru':
            download_subtitles_ru(suite_dir)
        elif choice == 'all':
            download_linux(suite_dir)
            download_subtitles_en(suite_dir)
            download_subtitles_ru(suite_dir)
        else:
            eprint('Unrecognized download choice: %s' % choice)
            sys.exit(1)
def collect_benchmarks(suite_dir, filter_pat=None):
    '''
    Lazily produce every benchmark that can currently be run.

    Benchmarks are the module level functions whose name starts with
    ``bench_``; they are visited in sorted name order and the prefix is
    stripped to form the benchmark name.

    :param str suite_dir:
        The directory containing corpora.
    :param str filter_pat:
        A single regular expression that is used to filter benchmarks
        by their name. When not specified, all benchmarks are run.
    :returns:
        An iterable over all runnable benchmarks. If a benchmark
        requires corpora that are missing, then a log message is
        emitted to stderr and it is not yielded.
    '''
    prefix = 'bench_'
    for fun_name in sorted(globals()):
        if not fun_name.startswith(prefix):
            continue
        bench_name = fun_name[len(prefix):]
        wanted = filter_pat is None or re.search(filter_pat, bench_name)
        if not wanted:
            continue
        try:
            benchmark = globals()[fun_name](suite_dir)
        except MissingDependencies as err:
            flags = ['--download %s' % n for n in err.missing_names]
            eprint(
                'missing: %s, skipping benchmark %s (try running with: %s)' % (
                    ', '.join(err.missing_names),
                    bench_name,
                    ' '.join(flags),
                ))
            continue
        benchmark.name = bench_name
        yield benchmark
def main():
    '''
    Command line entry point of the benchmark suite.

    Depending on the flags given this either downloads corpora and
    exits, lists the available benchmark names and exits, or runs the
    selected benchmarks. When running, a summary per benchmark is
    printed to stdout and, with --raw, every sample is also written to
    a CSV file.
    '''
    # The text is the program description; passing it positionally would
    # make argparse use it as the program name in usage messages.
    p = argparse.ArgumentParser(
        description='Command line search tool benchmark suite.')
    p.add_argument(
        '--dir', metavar='PATH', default=os.getcwd(),
        help='The directory in which to download data and perform searches.')
    p.add_argument(
        '--download', metavar='CORPUS', action='append',
        choices=['all', 'linux', 'subtitles-en', 'subtitles-ru'],
        help='Download and prepare corpus data, then exit without running '
             'any benchmarks. Note that this command is intended to be '
             'idempotent. WARNING: This downloads over a gigabyte of data, '
             'and also includes building the Linux kernel. If "all" is used '
             'then the total uncompressed size is around 13 GB.')
    p.add_argument(
        '-f', '--force', action='store_true',
        help='Overwrite existing files if there is a conflict.')
    p.add_argument(
        '--list', action='store_true',
        help='List available benchmarks by name.')
    p.add_argument(
        '--raw', metavar='PATH',
        help='Dump raw data (all samples collected) in CSV format to the '
             'file path provided.')
    p.add_argument(
        'bench', metavar='PAT', nargs='?',
        help='A regex pattern that will only run benchmarks that match.')
    args = p.parse_args()

    if args.download is not None and len(args.download) > 0:
        # The corpora chosen on the command line are stored under
        # `download`; the namespace has no `choices` attribute.
        download(args.dir, args.download)
        sys.exit(0)

    if args.list:
        # Listing only needs the names, so the benchmark functions are
        # not called and no corpora are required.
        for fun in sorted(globals()):
            if not fun.startswith('bench_'):
                continue
            name = fun[len('bench_'):]
            if args.bench is not None and not re.search(args.bench, name):
                continue
            print(name)
        sys.exit(0)

    if not path.isdir(args.dir):
        os.makedirs(args.dir)
    if args.raw is not None and path.exists(args.raw) and not args.force:
        eprint('File %s already exists (delete it or use --force)' % args.raw)
        sys.exit(1)

    raw_handle, raw_csv_wtr = None, None
    if args.raw is not None:
        fields = [
            'benchmark', 'warmup_iter', 'iter',
            'name', 'command', 'duration', 'lines', 'env',
        ]
        # The csv module expects files to be opened with newline=''.
        raw_handle = open(args.raw, 'w', newline='')
        raw_csv_wtr = csv.DictWriter(raw_handle, fields)
        raw_csv_wtr.writeheader()

    try:
        benchmarks = collect_benchmarks(args.dir, filter_pat=args.bench)
        for i, b in enumerate(benchmarks):
            result = b.run()
            fastest_cmd = result.fastest_cmd()
            fastest_sample = result.fastest_sample()
            max_name_len = max(len(cmd.name) for cmd in b.commands)

            if i > 0:
                print()
            header = '%s (pattern: %s)' % (b.name, b.pattern)
            print('%s\n%s' % (header, '-' * len(header)))
            for cmd in b.commands:
                name = cmd.name
                mean, stdev = result.distribution_for(cmd)
                line_counts = result.line_counts_for(cmd)
                show_fast_cmd, show_line_counts = '', ''
                # A trailing '*' marks the command with the best mean; a
                # '*' on the name marks the command owning the single
                # fastest sample.
                if fastest_cmd.name == cmd.name:
                    show_fast_cmd = '*'
                if fastest_sample['cmd'].name == cmd.name:
                    name += '*'
                if len(line_counts) > 0:
                    # Sort so that the output does not depend on set
                    # iteration order.
                    counts = map(str, sorted(line_counts))
                    show_line_counts = ' (lines: %s)' % ', '.join(counts)
                fmt = '{name:{pad}} {mean:0.3f} +/- {stdev:0.3f}{lines}{fast_cmd}'
                print(fmt.format(
                    name=name, pad=max_name_len + 2, fast_cmd=show_fast_cmd,
                    mean=mean, stdev=stdev, lines=show_line_counts))
            sys.stdout.flush()

            if raw_csv_wtr is not None:
                for sample in result.samples:
                    cmd, duration = sample['cmd'], sample['duration']
                    env = ' '.join(['%s=%s' % (k, v)
                                    for k, v in cmd.kwargs.get('env', {}).items()])
                    # Only a missing line count becomes an empty cell; a
                    # count of zero is a real measurement.
                    lines = sample['line_count']
                    raw_csv_wtr.writerow({
                        'benchmark': b.name,
                        'warmup_iter': b.warmup_count,
                        'iter': b.count,
                        'name': sample['cmd'].name,
                        'command': ' '.join(cmd.cmd),
                        'duration': duration,
                        'lines': '' if lines is None else lines,
                        'env': env,
                    })
                # Flush after each benchmark so that an interrupted run
                # still leaves the finished benchmarks on disk.
                raw_handle.flush()
    finally:
        if raw_handle is not None:
            raw_handle.close()
# Run the benchmark suite only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

View File

@@ -6,23 +6,22 @@ set -ex
# Generate artifacts for release
mk_artifacts() {
RUSTFLAGS="-C target-feature=+ssse3" cargo build --target $TARGET --release --features simd-accel
RUSTFLAGS="-C target-feature=+ssse3" \
cargo build --target $TARGET --release --features simd-accel
}
mk_tarball() {
# create a "staging" directory
local td=$(mktempd)
local out_dir=$(pwd)
local name="${PROJECT_NAME}-${TRAVIS_TAG}-${TARGET}"
mkdir "$td/$name"
# TODO update this part to copy the artifacts that make sense for your project
# NOTE All Cargo build artifacts will be under the 'target/$TARGET/{debug,release}'
cp target/$TARGET/release/xrep $td
cp target/$TARGET/release/rg "$td/$name/"
cp {README.md,UNLICENSE,COPYING,LICENSE-MIT} "$td/$name/"
pushd $td
# release tarball will look like 'rust-everywhere-v1.2.3-x86_64-unknown-linux-gnu.tar.gz'
tar czf $out_dir/${PROJECT_NAME}-${TRAVIS_TAG}-${TARGET}.tar.gz *
tar czf "$out_dir/$name.tar.gz" *
popd
rm -r $td
}

View File

@@ -11,42 +11,20 @@ disable_cross_doctests() {
if [ "$TRAVIS_OS_NAME" = "osx" ]; then
brew install gnu-sed --default-names
fi
find src -name '*.rs' -type f | xargs sed -i -e 's:\(//.\s*```\):\1 ignore,:g'
fi
}
# TODO modify this function as you see fit
# PROTIP Always pass `--target $TARGET` to cargo commands, this makes cargo output build artifacts
# to target/$TARGET/{debug,release} which can reduce the number of needed conditionals in the
# `before_deploy`/packaging phase
run_test_suite() {
case $TARGET in
# configure emulation for transparent execution of foreign binaries
aarch64-unknown-linux-gnu)
export QEMU_LD_PREFIX=/usr/aarch64-linux-gnu
;;
arm*-unknown-linux-gnueabihf)
export QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf
;;
*)
;;
esac
if [ ! -z "$QEMU_LD_PREFIX" ]; then
# Run tests on a single thread when using QEMU user emulation
export RUST_TEST_THREADS=1
fi
cargo build --target $TARGET --verbose
cargo test --target $TARGET
cargo test --target $TARGET --verbose
# sanity check the file type
file target/$TARGET/debug/xrep
file target/$TARGET/debug/rg
}
main() {
disable_cross_doctests
# disable_cross_doctests
run_test_suite
}

View File

@@ -1,4 +1,5 @@
#!/bin/sh
export RUSTFLAGS="-C target-feature=+ssse3"
# export RUSTFLAGS="-C target-cpu=native"
cargo build --release --features simd-accel

View File

@@ -6,14 +6,15 @@ authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = """
Fast line oriented regex searching as a library.
"""
documentation = "https://github.com/BurntSushi/xrep"
homepage = "https://github.com/BurntSushi/xrep"
repository = "https://github.com/BurntSushi/xrep"
documentation = "https://github.com/BurntSushi/ripgrep"
homepage = "https://github.com/BurntSushi/ripgrep"
repository = "https://github.com/BurntSushi/ripgrep"
readme = "README.md"
keywords = ["regex", "grep", "egrep", "search", "pattern"]
license = "Unlicense/MIT"
[dependencies]
log = "0.3"
memchr = "0.1"
memmap = "0.2"
regex = "0.1.75"

View File

@@ -4,6 +4,8 @@
A fast line oriented regex searcher.
*/
#[macro_use]
extern crate log;
extern crate memchr;
extern crate regex;
extern crate regex_syntax as syntax;

View File

@@ -1,13 +1,22 @@
/*!
The literals module is responsible for extracting *inner* literals out of the
AST of a regular expression. Normally this is the job of the regex engine
itself, but the regex engine doesn't look for inner literals. Since we're doing
line based searching, we can use them, so we need to do it ourselves.
Note that this implementation is incredibly suspicious. We need something more
principled.
*/
use std::cmp;
use std::iter;
use regex::bytes::Regex;
use syntax::{
Expr, Literals, Lit,
Repeater,
ByteClass, ByteRange, CharClass, ClassRange, Repeater,
};
#[derive(Debug)]
#[derive(Clone, Debug)]
pub struct LiteralSets {
prefixes: Literals,
suffixes: Literals,
@@ -27,6 +36,7 @@ impl LiteralSets {
pub fn to_regex(&self) -> Option<Regex> {
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
debug!("literal prefixes detected: {:?}", self.prefixes);
// When this is true, the regex engine will do a literal scan.
return None;
}
@@ -56,13 +66,27 @@ impl LiteralSets {
if suf_lcs.len() > lit.len() {
lit = suf_lcs;
}
if req.len() > lit.len() {
if req_lits.len() == 1 && req.len() > lit.len() {
lit = req;
}
if lit.is_empty() {
// Special case: if we detected an alternation of inner required
// literals and its longest literal is bigger than the longest
// prefix/suffix, then choose the alternation. In practice, this
// helps with case insensitive matching, which can generate lots of
// inner required literals.
let any_empty = req_lits.iter().any(|lit| lit.is_empty());
if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
debug!("required literals found: {:?}", req_lits);
let alts: Vec<String> =
req_lits.into_iter().map(|x| bytes_to_regex(x)).collect();
// Literals always compile.
Some(Regex::new(&alts.join("|")).unwrap())
} else if lit.is_empty() {
None
} else {
// Literals always compile.
debug!("required literal found: {:?}", show(lit));
Some(Regex::new(&bytes_to_regex(lit)).unwrap())
}
}
@@ -75,14 +99,30 @@ fn union_required(expr: &Expr, lits: &mut Literals) {
let s: String = chars.iter().cloned().collect();
lits.cross_add(s.as_bytes());
}
Literal { casei: true, .. } => {
lits.cut();
Literal { ref chars, casei: true } => {
for &c in chars {
let cls = CharClass::new(vec![
ClassRange { start: c, end: c },
]).case_fold();
if !lits.add_char_class(&cls) {
lits.cut();
return;
}
}
}
LiteralBytes { ref bytes, casei: false } => {
lits.cross_add(bytes);
}
LiteralBytes { casei: true, .. } => {
lits.cut();
LiteralBytes { ref bytes, casei: true } => {
for &b in bytes {
let cls = ByteClass::new(vec![
ByteRange { start: b, end: b },
]).case_fold();
if !lits.add_byte_class(&cls) {
lits.cut();
return;
}
}
}
Class(_) => {
lits.cut();
@@ -205,3 +245,18 @@ fn bytes_to_regex(bs: &[u8]) -> String {
}
s
}
/// Renders arbitrary bytes as a printable, ASCII-only string.
///
/// Every byte is run through `std::ascii::escape_default`, so printable
/// ASCII is kept verbatim and everything else is written as an escape
/// sequence such as `\n`, `\t` or `\xff`.
fn show(bs: &[u8]) -> String {
    // Open question left by the original author (AG): this escaping could
    // probably also be used to feed literals to the regex, but it doesn't
    // really matter.
    use std::ascii::escape_default;

    // `escape_default` only ever yields ASCII bytes, so widening each one to
    // a `char` produces exactly the text that UTF-8 decoding would.
    bs.iter()
        .flat_map(|&byte| escape_default(byte))
        .map(|escaped| escaped as char)
        .collect()
}

View File

@@ -152,6 +152,7 @@ impl GrepBuilder {
.unicode(true)
.case_insensitive(self.opts.case_insensitive)
.parse(&self.pattern));
debug!("regex ast:\n{:#?}", expr);
Ok(try!(nonl::remove(expr, self.opts.line_terminator)))
}
}
@@ -194,7 +195,7 @@ impl Grep {
let (prevnl, nextnl) = self.find_line(buf, e, e);
match self.re.shortest_match(&buf[prevnl..nextnl]) {
None => {
start = nextnl + 1;
start = nextnl;
continue;
}
Some(_) => {
@@ -253,7 +254,7 @@ impl<'b, 's> Iterator for Iter<'b, 's> {
self.start = self.buf.len();
return None;
}
self.start = mat.end + 1;
self.start = mat.end;
Some(mat)
}
}

View File

@@ -9,14 +9,16 @@ use grep::{Grep, GrepBuilder};
use log;
use num_cpus;
use regex;
use term::Terminal;
use walkdir::WalkDir;
use atty;
use gitignore::{Gitignore, GitignoreBuilder};
use ignore::Ignore;
use out::Out;
use out::{Out, OutBuffer};
use printer::Printer;
use search::{InputBuffer, Searcher};
use sys;
use search_buffer::BufferSearcher;
use search_stream::{InputBuffer, Searcher};
use types::{FileTypeDef, Types, TypesBuilder};
use walk;
@@ -27,13 +29,13 @@ use Result;
/// If you've never heard of Docopt before, see: http://docopt.org
/// (TL;DR: The CLI parser is generated from the usage string below.)
const USAGE: &'static str = "
Usage: xrep [options] <pattern> [<path> ...]
xrep [options] --files [<path> ...]
xrep [options] --type-list
xrep --help
xrep --version
Usage: rg [options] <pattern> [<path> ...]
rg [options] --files [<path> ...]
rg [options] --type-list
rg --help
rg --version
xrep is like the silver searcher and grep, but faster than both.
rg combines the usability of the silver search with the raw speed of grep.
Common options:
-a, --text Search binary files as if they were text.
@@ -75,6 +77,11 @@ Less common options:
-C, --context NUM
Show NUM lines before and after each match.
--column
Show column numbers (1 based) in output. This only shows the column
numbers for the first match on each line. Note that this doesn't try
to account for Unicode. One byte is equal to one column.
--context-separator ARG
The string to use when separating non-continuous context lines. Escape
sequences may be used. [default: --]
@@ -97,17 +104,22 @@ Less common options:
Don't show any file name heading.
--hidden
Search hidden directories and files.
Search hidden directories and files. (Hidden directories and files are
skipped by default.)
-L, --follow
Follow symlinks.
--line-terminator ARG
The byte to use for a line terminator. Escape sequences may be used.
[default: \\n]
--mmap
Search using memory maps when possible. This is enabled by default
when ripgrep thinks it will be faster. (Note that mmap searching
doesn't current support the various context related options.)
--no-mmap
Never use memory maps, even when they might be faster.
--no-ignore
Don't respect ignore files (.gitignore, .xrepignore, etc.)
Don't respect ignore files (.gitignore, .rgignore, etc.)
--no-ignore-parent
Don't respect ignore files in parent directories.
@@ -123,7 +135,7 @@ Less common options:
(capped at 6). [default: 0]
--version
Show the version number of xrep and exit.
Show the version number of ripgrep and exit.
File type management options:
--type-list
@@ -138,7 +150,7 @@ File type management options:
";
/// RawArgs are the args as they are parsed from Docopt. They aren't used
/// directly by the rest of xrep.
/// directly by the rest of ripgrep.
#[derive(Debug, RustcDecodable)]
pub struct RawArgs {
arg_pattern: String,
@@ -146,6 +158,7 @@ pub struct RawArgs {
flag_after_context: usize,
flag_before_context: usize,
flag_color: String,
flag_column: bool,
flag_context: usize,
flag_context_separator: String,
flag_count: bool,
@@ -158,12 +171,13 @@ pub struct RawArgs {
flag_ignore_case: bool,
flag_invert_match: bool,
flag_line_number: bool,
flag_line_terminator: String,
flag_literal: bool,
flag_mmap: bool,
flag_no_heading: bool,
flag_no_ignore: bool,
flag_no_ignore_parent: bool,
flag_no_line_number: bool,
flag_no_mmap: bool,
flag_pretty: bool,
flag_quiet: bool,
flag_replace: Option<String>,
@@ -186,17 +200,20 @@ pub struct Args {
after_context: usize,
before_context: usize,
color: bool,
column: bool,
context_separator: Vec<u8>,
count: bool,
eol: u8,
files: bool,
follow: bool,
glob_overrides: Option<Gitignore>,
grep: Grep,
heading: bool,
hidden: bool,
ignore_case: bool,
invert_match: bool,
line_number: bool,
mmap: bool,
no_ignore: bool,
no_ignore_parent: bool,
quiet: bool,
@@ -210,7 +227,7 @@ pub struct Args {
}
impl RawArgs {
/// Convert arguments parsed into a configuration used by xrep.
/// Convert arguments parsed into a configuration used by ripgrep.
fn to_args(&self) -> Result<Args> {
let pattern = {
let pattern =
@@ -227,7 +244,9 @@ impl RawArgs {
};
let paths =
if self.arg_path.is_empty() {
if sys::stdin_is_atty() {
if atty::on_stdin()
|| self.flag_files
|| self.flag_type_list {
vec![Path::new("./").to_path_buf()]
} else {
vec![Path::new("-").to_path_buf()]
@@ -243,15 +262,19 @@ impl RawArgs {
} else {
(self.flag_after_context, self.flag_before_context)
};
let eol = {
let eol = unescape(&self.flag_line_terminator);
if eol.is_empty() {
errored!("Empty line terminator is not allowed.");
} else if eol.len() > 1 {
errored!("Line terminators are limited to exactly 1 byte.");
}
eol[0]
};
let mmap =
if before_context > 0 || after_context > 0 || self.flag_no_mmap {
false
} else if self.flag_mmap {
true
} else {
// If we're only searching a few paths and all of them are
// files, then memory maps are probably faster.
paths.len() <= 10 && paths.iter().all(|p| p.is_file())
};
if mmap {
debug!("will try to use memory maps");
}
let glob_overrides =
if self.flag_glob.is_empty() {
None
@@ -265,16 +288,17 @@ impl RawArgs {
};
let threads =
if self.flag_threads == 0 {
cmp::min(6, num_cpus::get())
cmp::min(8, num_cpus::get())
} else {
self.flag_threads
};
let color =
if self.flag_color == "auto" {
sys::stdout_is_atty() || self.flag_pretty
atty::on_stdout() || self.flag_pretty
} else {
self.flag_color == "always"
};
let eol = b'\n';
let mut with_filename = self.flag_with_filename;
if !with_filename {
with_filename = paths.len() > 1 || paths[0].is_dir();
@@ -283,23 +307,32 @@ impl RawArgs {
btypes.add_defaults();
try!(self.add_types(&mut btypes));
let types = try!(btypes.build());
let grep = try!(
GrepBuilder::new(&pattern)
.case_insensitive(self.flag_ignore_case)
.line_terminator(eol)
.build()
);
let mut args = Args {
pattern: pattern,
paths: paths,
after_context: after_context,
before_context: before_context,
color: color,
column: self.flag_column,
context_separator: unescape(&self.flag_context_separator),
count: self.flag_count,
eol: eol,
files: self.flag_files,
follow: self.flag_follow,
glob_overrides: glob_overrides,
grep: grep,
heading: !self.flag_no_heading && self.flag_heading,
hidden: self.flag_hidden,
ignore_case: self.flag_ignore_case,
invert_match: self.flag_invert_match,
line_number: !self.flag_no_line_number && self.flag_line_number,
mmap: mmap,
no_ignore: self.flag_no_ignore,
no_ignore_parent: self.flag_no_ignore_parent,
quiet: self.flag_quiet,
@@ -312,7 +345,7 @@ impl RawArgs {
with_filename: with_filename,
};
// If stdout is a tty, then apply some special default options.
if sys::stdout_is_atty() || self.flag_pretty {
if atty::on_stdout() || self.flag_pretty {
if !self.flag_no_line_number && !args.count {
args.line_number = true;
}
@@ -345,7 +378,7 @@ impl Args {
///
/// If a CLI usage error occurred, then exit the process and print a usage
/// or error message. Similarly, if the user requested the version of
/// xrep, then print the version and exit.
/// ripgrep, then print the version and exit.
///
/// Also, initialize a global logger.
pub fn parse() -> Result<Args> {
@@ -367,7 +400,7 @@ impl Args {
raw.to_args().map_err(From::from)
}
/// Returns true if xrep should print the files it will search and exit
/// Returns true if ripgrep should print the files it will search and exit
/// (but not do any actual searching).
pub fn files(&self) -> bool {
self.files
@@ -378,12 +411,8 @@ impl Args {
/// basic searching of regular expressions in a single buffer.
///
/// The pattern and other flags are taken from the command line.
pub fn grep(&self) -> Result<Grep> {
GrepBuilder::new(&self.pattern)
.case_insensitive(self.ignore_case)
.line_terminator(self.eol)
.build()
.map_err(From::from)
pub fn grep(&self) -> Grep {
self.grep.clone()
}
/// Creates a new input buffer that is used in searching.
@@ -393,10 +422,16 @@ impl Args {
inp
}
/// Whether we should prefer memory maps for searching or not.
///
/// This is a plain accessor: it returns the `mmap` setting stored on these
/// arguments and does not inspect any files itself.
pub fn mmap(&self) -> bool {
self.mmap
}
/// Create a new printer of individual search results that writes to the
/// writer given.
pub fn printer<W: Send + io::Write>(&self, wtr: W) -> Printer<W> {
let mut p = Printer::new(wtr, self.color)
pub fn printer<W: Send + Terminal>(&self, wtr: W) -> Printer<W> {
let mut p = Printer::new(wtr)
.column(self.column)
.context_separator(self.context_separator.clone())
.eol(self.eol)
.heading(self.heading)
@@ -410,8 +445,8 @@ impl Args {
/// Create a new printer of search results for an entire file that writes
/// to the writer given.
pub fn out<W: io::Write>(&self, wtr: W) -> Out<W> {
let mut out = Out::new(wtr);
pub fn out(&self) -> Out {
let mut out = Out::new(self.color);
if self.heading && !self.count {
out = out.file_separator(b"".to_vec());
} else if self.before_context > 0 || self.after_context > 0 {
@@ -420,6 +455,11 @@ impl Args {
out
}
/// Create a new buffer for use with searching.
///
/// The buffer is built by `OutBuffer::new` from this configuration's `color`
/// setting.
pub fn outbuf(&self) -> OutBuffer {
OutBuffer::new(self.color)
}
/// Return the paths that should be searched.
pub fn paths(&self) -> &[PathBuf] {
&self.paths
@@ -428,7 +468,7 @@ impl Args {
/// Create a new line based searcher whose configuration is taken from the
/// command line. This searcher supports a dizzying array of features:
/// inverted matching, line counting, context control and more.
pub fn searcher<'a, R: io::Read, W: Send + io::Write>(
pub fn searcher<'a, R: io::Read, W: Send + Terminal>(
&self,
inp: &'a mut InputBuffer,
printer: &'a mut Printer<W>,
@@ -446,6 +486,24 @@ impl Args {
.text(self.text)
}
/// Build a line based searcher that works on a complete buffer all at once
/// (the buffer may be the contents of a memory mapped file). Its
/// configuration is taken from the command line.
///
/// The count, line terminator, line number, inverted match and text
/// settings of these arguments are applied to the returned searcher.
pub fn searcher_buffer<'a, W: Send + Terminal>(
    &self,
    printer: &'a mut Printer<W>,
    grep: &'a Grep,
    path: &'a Path,
    buf: &'a [u8],
) -> BufferSearcher<'a, W> {
    let searcher = BufferSearcher::new(printer, grep, path, buf);
    searcher
        .count(self.count)
        .eol(self.eol)
        .line_number(self.line_number)
        .invert_match(self.invert_match)
        .text(self.text)
}
/// Returns the number of worker search threads that should be used.
pub fn threads(&self) -> usize {
self.threads
@@ -456,8 +514,8 @@ impl Args {
&self.type_defs
}
/// Returns true if xrep should print the type definitions currently loaded
/// and then exit.
/// Returns true if ripgrep should print the type definitions currently
/// loaded and then exit.
pub fn type_list(&self) -> bool {
self.type_list
}

View File

@@ -1,24 +1,23 @@
/*!
This io module contains various platform specific functions for detecting
how xrep is being used. e.g., Is stdin being piped into it? Is stdout being
redirected to a file? etc... We use this information to tweak various default
configuration parameters such as colors and match formatting.
This atty module contains functions for detecting whether ripgrep is being fed
from (or to) a terminal. Windows and Unix do this differently, so implement
both here.
*/
use libc;
#[cfg(unix)]
pub fn stdin_is_atty() -> bool {
pub fn on_stdin() -> bool {
use libc;
0 < unsafe { libc::isatty(libc::STDIN_FILENO) }
}
#[cfg(unix)]
pub fn stdout_is_atty() -> bool {
pub fn on_stdout() -> bool {
use libc;
0 < unsafe { libc::isatty(libc::STDOUT_FILENO) }
}
#[cfg(windows)]
pub fn stdin_is_atty() -> bool {
pub fn on_stdin() -> bool {
use kernel32;
use winapi;
@@ -30,7 +29,7 @@ pub fn stdin_is_atty() -> bool {
}
#[cfg(windows)]
pub fn stdout_is_atty() -> bool {
pub fn on_stdout() -> bool {
use kernel32;
use winapi;

View File

@@ -9,7 +9,7 @@ The motivation for this submodule is performance and portability:
2. We could shell out to a `git` sub-command like ls-files or status, but it
seems better to not rely on the existence of external programs for a search
tool. Besides, we need to implement this logic anyway to support things like
an .xrepignore file.
an .rgignore file.
The key implementation detail here is that a single gitignore file is compiled
into a single RegexSet, which can be used to report which globs match a
@@ -379,7 +379,7 @@ mod tests {
};
}
const ROOT: &'static str = "/home/foobar/rust/xrep";
const ROOT: &'static str = "/home/foobar/rust/rg";
ignored!(ig1, ROOT, "months", "months");
ignored!(ig2, ROOT, "*.lock", "Cargo.lock");

View File

@@ -29,7 +29,6 @@ to make its way into `glob` proper.
use std::error::Error as StdError;
use std::fmt;
use std::iter;
use std::path;
use std::str;
use regex;
@@ -214,7 +213,7 @@ impl Pattern {
/// regular expression and will represent the matching semantics of this
/// glob pattern and the options given.
pub fn to_regex_with(&self, options: &MatchOptions) -> String {
let sep = regex::quote(&path::MAIN_SEPARATOR.to_string());
let seps = regex::quote(r"/\");
let mut re = String::new();
re.push_str("(?-u)");
if options.case_insensitive {
@@ -235,26 +234,27 @@ impl Pattern {
}
Token::Any => {
if options.require_literal_separator {
re.push_str(&format!("[^{}]", sep));
re.push_str(&format!("[^{}]", seps));
} else {
re.push_str(".");
}
}
Token::ZeroOrMore => {
if options.require_literal_separator {
re.push_str(&format!("[^{}]*", sep));
re.push_str(&format!("[^{}]*", seps));
} else {
re.push_str(".*");
}
}
Token::RecursivePrefix => {
re.push_str(&format!("(?:{sep}?|.*{sep})", sep=sep));
re.push_str(&format!("(?:[{sep}]?|.*[{sep}])", sep=seps));
}
Token::RecursiveSuffix => {
re.push_str(&format!("(?:{sep}?|{sep}.*)", sep=sep));
re.push_str(&format!("(?:[{sep}]?|[{sep}].*)", sep=seps));
}
Token::RecursiveZeroOrMore => {
re.push_str(&format!("(?:{sep}|{sep}.*{sep})", sep=sep));
re.push_str(&format!("(?:[{sep}]|[{sep}].*[{sep}])",
sep=seps));
}
Token::Class { negated, ref ranges } => {
re.push('[');
@@ -480,6 +480,9 @@ mod tests {
let pat = Pattern::new($pat).unwrap();
let path = &Path::new($path).to_str().unwrap();
let re = Regex::new(&pat.to_regex_with(&$options)).unwrap();
// println!("PATTERN: {}", $pat);
// println!("REGEX: {:?}", re);
// println!("PATH: {}", path);
assert!(!re.is_match(path.as_bytes()));
}
};
@@ -561,12 +564,11 @@ mod tests {
case_insensitive: true,
require_literal_separator: false,
};
const SEP: char = ::std::path::MAIN_SEPARATOR;
toregex!(re_casei, "a", "(?i)^a$", &CASEI);
toregex!(re_slash1, "?", format!("^[^{}]$", SEP), SLASHLIT);
toregex!(re_slash2, "*", format!("^[^{}]*$", SEP), SLASHLIT);
toregex!(re_slash1, "?", r"^[^/\\]$", SLASHLIT);
toregex!(re_slash2, "*", r"^[^/\\]*$", SLASHLIT);
toregex!(re1, "a", "^a$");
toregex!(re2, "?", "^.$");
@@ -642,6 +644,7 @@ mod tests {
matches!(matchslash1, "abc/def", "abc/def", SLASHLIT);
nmatches!(matchslash2, "abc?def", "abc/def", SLASHLIT);
nmatches!(matchslash2_win, "abc?def", "abc\\def", SLASHLIT);
nmatches!(matchslash3, "abc*def", "abc/def", SLASHLIT);
matches!(matchslash4, "abc[/]def", "abc/def", SLASHLIT); // differs

View File

@@ -5,7 +5,7 @@ whether a *single* file path should be searched or not.
In general, there are two ways to ignore a particular file:
1. Specify an ignore rule in some "global" configuration, such as a
$HOME/.xrepignore or on the command line.
$HOME/.rgignore or on the command line.
2. A specific ignore file (like .gitignore) found during directory traversal.
The `IgnoreDir` type handles ignore patterns for any one particular directory
@@ -24,7 +24,7 @@ use types::Types;
const IGNORE_NAMES: &'static [&'static str] = &[
".gitignore",
".agignore",
".xrepignore",
".rgignore",
];
/// Represents an error that can occur when parsing a gitignore file.
@@ -257,8 +257,8 @@ pub struct IgnoreDir {
/// A single accumulation of glob patterns for this directory, matched
/// using gitignore semantics.
///
/// This will include patterns from xrepignore as well. The patterns are
/// ordered so that precedence applies automatically (e.g., xrepignore
/// This will include patterns from rgignore as well. The patterns are
/// ordered so that precedence applies automatically (e.g., rgignore
/// patterns precede gitignore patterns).
gi: Option<Gitignore>,
// TODO(burntsushi): Matching other types of glob patterns that don't
@@ -422,7 +422,7 @@ mod tests {
};
}
const ROOT: &'static str = "/home/foobar/rust/xrep";
const ROOT: &'static str = "/home/foobar/rust/rg";
ignored_dir!(id1, ROOT, "src/main.rs", "", "src/main.rs");
ignored_dir!(id2, ROOT, "", "src/main.rs", "src/main.rs");

View File

@@ -34,12 +34,14 @@ use std::thread;
use crossbeam::sync::chase_lev::{self, Steal, Stealer};
use grep::Grep;
use memmap::{Mmap, Protection};
use term::Terminal;
use walkdir::DirEntry;
use args::Args;
use out::Out;
use out::{NoColorTerminal, Out, OutBuffer};
use printer::Printer;
use search::InputBuffer;
use search_stream::InputBuffer;
macro_rules! errored {
($($tt:tt)*) => {
@@ -55,13 +57,14 @@ macro_rules! eprintln {
}
mod args;
mod atty;
mod gitignore;
mod glob;
mod ignore;
mod out;
mod printer;
mod search;
mod sys;
mod search_buffer;
mod search_stream;
mod terminal;
mod types;
mod walk;
@@ -87,7 +90,8 @@ fn run(args: Args) -> Result<u64> {
return run_types(args);
}
let args = Arc::new(args);
let out = Arc::new(Mutex::new(args.out(io::stdout())));
let out = Arc::new(Mutex::new(args.out()));
let outbuf = args.outbuf();
let mut workers = vec![];
let mut workq = {
@@ -98,8 +102,8 @@ fn run(args: Args) -> Result<u64> {
out: out.clone(),
chan_work: stealer.clone(),
inpbuf: args.input_buffer(),
outbuf: Some(vec![]),
grep: try!(args.grep()),
outbuf: Some(outbuf.clone()),
grep: args.grep(),
match_count: 0,
};
workers.push(thread::spawn(move || worker.run()));
@@ -126,7 +130,8 @@ fn run(args: Args) -> Result<u64> {
}
fn run_files(args: Args) -> Result<u64> {
let mut printer = args.printer(io::BufWriter::new(io::stdout()));
let term = NoColorTerminal::new(io::BufWriter::new(io::stdout()));
let mut printer = args.printer(term);
let mut file_count = 0;
for p in args.paths() {
if p == Path::new("-") {
@@ -143,7 +148,8 @@ fn run_files(args: Args) -> Result<u64> {
}
fn run_types(args: Args) -> Result<u64> {
let mut printer = args.printer(io::BufWriter::new(io::stdout()));
let term = NoColorTerminal::new(io::BufWriter::new(io::stdout()));
let mut printer = args.printer(term);
let mut ty_count = 0;
for def in args.type_defs() {
printer.type_def(def);
@@ -165,10 +171,10 @@ enum WorkReady {
struct Worker {
args: Arc<Args>,
out: Arc<Mutex<Out<io::Stdout>>>,
out: Arc<Mutex<Out>>,
chan_work: Stealer<Work>,
inpbuf: InputBuffer,
outbuf: Option<Vec<u8>>,
outbuf: Option<OutBuffer>,
grep: Grep,
match_count: u64,
}
@@ -196,7 +202,7 @@ impl Worker {
let mut printer = self.args.printer(outbuf);
self.do_work(&mut printer, work);
let outbuf = printer.into_inner();
if !outbuf.is_empty() {
if !outbuf.get_ref().is_empty() {
let mut out = self.out.lock().unwrap();
out.write(&outbuf);
}
@@ -205,7 +211,7 @@ impl Worker {
self.match_count
}
fn do_work<W: Send + io::Write>(
fn do_work<W: Send + Terminal>(
&mut self,
printer: &mut Printer<W>,
work: WorkReady,
@@ -221,7 +227,11 @@ impl Worker {
if let Ok(p) = path.strip_prefix("./") {
path = p;
}
self.search(printer, path, file)
if self.args.mmap() {
self.search_mmap(printer, path, &file)
} else {
self.search(printer, path, file)
}
}
};
match result {
@@ -234,7 +244,7 @@ impl Worker {
}
}
fn search<R: io::Read, W: Send + io::Write>(
fn search<R: io::Read, W: Send + Terminal>(
&mut self,
printer: &mut Printer<W>,
path: &Path,
@@ -248,4 +258,23 @@ impl Worker {
rdr,
).run().map_err(From::from)
}
/// Search `file` by memory mapping it and running the buffer searcher over
/// the mapped bytes.
///
/// Returns `Ok(0)` without searching when the file is empty; otherwise
/// returns whatever the buffer searcher's `run` reports. Failures to read
/// the file metadata or to create the map are propagated to the caller.
fn search_mmap<W: Send + Terminal>(
    &mut self,
    printer: &mut Printer<W>,
    path: &Path,
    file: &File,
) -> Result<u64> {
    let size = try!(file.metadata()).len();
    if size == 0 {
        // Opening a memory map with an empty file results in an error.
        return Ok(0);
    }
    let mmap = try!(Mmap::open(file, Protection::Read));
    // NOTE(review): `as_slice` is unsafe; presumably this is only sound
    // while the underlying file is not changed during the search. Confirm
    // against the memmap documentation.
    let bytes = unsafe { mmap.as_slice() };
    Ok(self
        .args
        .searcher_buffer(printer, &self.grep, path, bytes)
        .run())
}
}

View File

@@ -1,4 +1,40 @@
use std::io::{self, Write};
use std::sync::Arc;
use term::{self, Terminal};
use term::color::Color;
use term::terminfo::TermInfo;
#[cfg(windows)]
use term::WinConsole;
use terminal::TerminfoTerminal;
pub type StdoutTerminal = Box<Terminal<Output=io::Stdout> + Send>;
/// Gets a terminal that supports color if available.
///
/// A `WinConsole` over stdout is tried first; if it cannot be created, a
/// `NoColorTerminal` over stdout is returned instead. The `color` argument
/// is not consulted on this platform.
#[cfg(windows)]
fn term_stdout(color: bool) -> StdoutTerminal {
    match WinConsole::new(io::stdout()) {
        Ok(console) => Box::new(console) as StdoutTerminal,
        Err(_) => {
            Box::new(NoColorTerminal::new(io::stdout())) as StdoutTerminal
        }
    }
}
/// Gets a terminal that supports color if available.
#[cfg(not(windows))]
fn term_stdout(color: bool) -> StdoutTerminal {
let stdout = io::stdout();
if !color || TERMINFO.is_none() {
Box::new(NoColorTerminal::new(stdout))
} else {
let info = TERMINFO.clone().unwrap();
Box::new(TerminfoTerminal::new_with_terminfo(stdout, info))
}
}
/// Out controls the actual output of all search results for a particular file
/// to the end user.
@@ -6,17 +42,17 @@ use std::io::{self, Write};
/// (The difference between Out and Printer is that a Printer works with
/// individual search results where as Out works with search results for each
/// file as a whole. For example, it knows when to print a file separator.)
pub struct Out<W: io::Write> {
wtr: io::BufWriter<W>,
pub struct Out {
term: StdoutTerminal,
printed: bool,
file_separator: Option<Vec<u8>>,
}
impl<W: io::Write> Out<W> {
impl Out {
/// Create a new Out that writes to the wtr given.
pub fn new(wtr: W) -> Out<W> {
pub fn new(color: bool) -> Out {
Out {
wtr: io::BufWriter::new(wtr),
term: term_stdout(color),
printed: false,
file_separator: None,
}
@@ -26,22 +62,422 @@ impl<W: io::Write> Out<W> {
/// By default, no separator is printed.
///
/// If sep is empty, then no file separator is printed.
pub fn file_separator(mut self, sep: Vec<u8>) -> Out<W> {
pub fn file_separator(mut self, sep: Vec<u8>) -> Out {
self.file_separator = Some(sep);
self
}
/// Write the search results of a single file to the underlying wtr and
/// flush wtr.
pub fn write(&mut self, buf: &[u8]) {
pub fn write(&mut self, buf: &OutBuffer) {
if let Some(ref sep) = self.file_separator {
if self.printed {
let _ = self.wtr.write_all(sep);
let _ = self.wtr.write_all(b"\n");
let _ = self.term.write_all(sep);
let _ = self.term.write_all(b"\n");
}
}
let _ = self.wtr.write_all(buf);
let _ = self.wtr.flush();
match *buf {
OutBuffer::Colored(ref tt) => {
let _ = self.term.write_all(tt.get_ref());
}
OutBuffer::Windows(ref w) => {
w.print_stdout(&mut self.term);
}
OutBuffer::NoColor(ref buf) => {
let _ = self.term.write_all(buf);
}
}
let _ = self.term.flush();
self.printed = true;
}
}
/// OutBuffer corresponds to the final output buffer for search results. All
/// search results are written to a buffer and then a buffer is flushed to
/// stdout only after the full search has completed.
///
/// Which variant is used is decided by `OutBuffer::new`.
#[derive(Clone, Debug)]
pub enum OutBuffer {
/// Output written through a terminfo backed terminal into a byte vector.
Colored(TerminfoTerminal<Vec<u8>>),
/// Output bytes plus a separately recorded list of color changes; chosen
/// when color is requested in a Windows build.
Windows(WindowsBuffer),
/// Plain bytes. Color and cursor operations on this variant report
/// `term::Error::NotSupported`.
NoColor(Vec<u8>),
}
/// Buffered output for the Windows console together with the color changes
/// recorded while it was being produced.
#[derive(Clone, Debug)]
pub struct WindowsBuffer {
// Output bytes collected so far.
buf: Vec<u8>,
// Position stamped onto every color change recorded by `push`.
// NOTE(review): presumably a byte offset into `buf` -- confirm against the
// code that advances it.
pos: usize,
// Color changes in the order they were recorded.
colors: Vec<WindowsColor>,
}
/// A single color change recorded in a `WindowsBuffer`.
#[derive(Clone, Debug)]
pub struct WindowsColor {
// Value of the buffer's `pos` at the time the change was recorded.
pos: usize,
// The change itself.
opt: WindowsOption,
}
/// The kinds of color change that can be recorded in a `WindowsBuffer`.
#[derive(Clone, Debug)]
pub enum WindowsOption {
/// Carries a foreground color.
Foreground(Color),
/// Carries a background color.
Background(Color),
/// Carries no color; presumably restores the console defaults -- TODO
/// confirm against the code that replays these options.
Reset,
}
lazy_static! {
    // Terminfo entry for the current environment, shared behind an `Arc`.
    // It is `None` when `TermInfo::from_env` fails; the failure is only
    // reported through a debug log message.
    static ref TERMINFO: Option<Arc<TermInfo>> = {
        let loaded = TermInfo::from_env();
        if let Err(ref err) = loaded {
            debug!("error loading terminfo for coloring: {}", err);
        }
        loaded.ok().map(Arc::new)
    };
}
impl OutBuffer {
/// Create a new output buffer.
///
/// When color is true, the buffer will attempt to support coloring.
pub fn new(color: bool) -> OutBuffer {
// If we want color, build a TerminfoTerminal and see if the current
// environment supports coloring. If not, bail with NoColor. To avoid
// losing our writer (ownership), do this the long way.
if !color {
return OutBuffer::NoColor(vec![]);
}
if cfg!(windows) {
return OutBuffer::Windows(WindowsBuffer {
buf: vec![],
pos: 0,
colors: vec![]
});
}
if TERMINFO.is_none() {
return OutBuffer::NoColor(vec![]);
}
let info = TERMINFO.clone().unwrap();
let tt = TerminfoTerminal::new_with_terminfo(vec![], info);
if !tt.supports_color() {
debug!("environment doesn't support coloring");
return OutBuffer::NoColor(tt.into_inner());
}
OutBuffer::Colored(tt)
}
/// Clear the give buffer of all search results such that it is reusable
/// in another search.
pub fn clear(&mut self) {
match *self {
OutBuffer::Colored(ref mut tt) => {
tt.get_mut().clear();
}
OutBuffer::Windows(ref mut win) => {
win.buf.clear();
win.colors.clear();
win.pos = 0;
}
OutBuffer::NoColor(ref mut buf) => {
buf.clear();
}
}
}
fn map_result<F, G>(
&mut self,
mut f: F,
mut g: G,
) -> term::Result<()>
where F: FnMut(&mut TerminfoTerminal<Vec<u8>>) -> term::Result<()>,
G: FnMut(&mut WindowsBuffer) -> term::Result<()> {
match *self {
OutBuffer::Colored(ref mut w) => f(w),
OutBuffer::Windows(ref mut w) => g(w),
OutBuffer::NoColor(_) => Err(term::Error::NotSupported),
}
}
fn map_bool<F, G>(
&self,
mut f: F,
mut g: G,
) -> bool
where F: FnMut(&TerminfoTerminal<Vec<u8>>) -> bool,
G: FnMut(&WindowsBuffer) -> bool {
match *self {
OutBuffer::Colored(ref w) => f(w),
OutBuffer::Windows(ref w) => g(w),
OutBuffer::NoColor(_) => false,
}
}
}
/// Byte output for `OutBuffer`: writes are handed to whichever writer the
/// current variant wraps.
impl io::Write for OutBuffer {
/// Forward `buf` to the wrapped terminal, Windows buffer or byte vector and
/// return that writer's result.
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
match *self {
OutBuffer::Colored(ref mut w) => w.write(buf),
OutBuffer::Windows(ref mut w) => w.write(buf),
OutBuffer::NoColor(ref mut w) => w.write(buf),
}
}
/// Does nothing and always succeeds; the wrapped writer's `flush` is not
/// called.
fn flush(&mut self) -> io::Result<()> {
Ok(())
}
}
/// Terminal operations are delegated to the active variant through
/// `map_result`/`map_bool`. For the `NoColor` variant that means every
/// operation fails with `NotSupported` and every capability query answers
/// `false`.
impl term::Terminal for OutBuffer {
    type Output = Vec<u8>;

    // Capability queries.

    fn supports_color(&self) -> bool {
        self.map_bool(|tt| tt.supports_color(), |win| win.supports_color())
    }

    fn supports_reset(&self) -> bool {
        self.map_bool(|tt| tt.supports_reset(), |win| win.supports_reset())
    }

    fn supports_attr(&self, attr: term::Attr) -> bool {
        self.map_bool(
            |tt| tt.supports_attr(attr),
            |win| win.supports_attr(attr),
        )
    }

    // Styling.

    fn fg(&mut self, fg: term::color::Color) -> term::Result<()> {
        self.map_result(|tt| tt.fg(fg), |win| win.fg(fg))
    }

    fn bg(&mut self, bg: term::color::Color) -> term::Result<()> {
        self.map_result(|tt| tt.bg(bg), |win| win.bg(bg))
    }

    fn attr(&mut self, attr: term::Attr) -> term::Result<()> {
        self.map_result(|tt| tt.attr(attr), |win| win.attr(attr))
    }

    fn reset(&mut self) -> term::Result<()> {
        self.map_result(|tt| tt.reset(), |win| win.reset())
    }

    // Cursor and line control.

    fn cursor_up(&mut self) -> term::Result<()> {
        self.map_result(|tt| tt.cursor_up(), |win| win.cursor_up())
    }

    fn delete_line(&mut self) -> term::Result<()> {
        self.map_result(|tt| tt.delete_line(), |win| win.delete_line())
    }

    fn carriage_return(&mut self) -> term::Result<()> {
        self.map_result(
            |tt| tt.carriage_return(),
            |win| win.carriage_return(),
        )
    }

    // Access to the underlying byte buffer, which every variant has.

    fn get_ref(&self) -> &Vec<u8> {
        match *self {
            OutBuffer::NoColor(ref bytes) => bytes,
            OutBuffer::Windows(ref win) => win.get_ref(),
            OutBuffer::Colored(ref tt) => tt.get_ref(),
        }
    }

    fn get_mut(&mut self) -> &mut Vec<u8> {
        match *self {
            OutBuffer::NoColor(ref mut bytes) => bytes,
            OutBuffer::Windows(ref mut win) => win.get_mut(),
            OutBuffer::Colored(ref mut tt) => tt.get_mut(),
        }
    }

    fn into_inner(self) -> Vec<u8> {
        match self {
            OutBuffer::NoColor(bytes) => bytes,
            OutBuffer::Windows(win) => win.into_inner(),
            OutBuffer::Colored(tt) => tt.into_inner(),
        }
    }
}
impl WindowsBuffer {
    /// Record that `opt` takes effect at the current write position.
    fn push(&mut self, opt: WindowsOption) {
        let change = WindowsColor { pos: self.pos, opt: opt };
        self.colors.push(change);
    }
}
impl WindowsBuffer {
    /// Replay the buffered output on the given terminal.
    ///
    /// When the terminal supports color, the bytes are written in pieces and
    /// each recorded color change is applied at the position where it was
    /// recorded. Otherwise the bytes are written in one go and the recorded
    /// color changes are not used.
    ///
    /// All write, color and flush errors are ignored.
    pub fn print_stdout(&self, tt: &mut StdoutTerminal) {
        if tt.supports_color() {
            let mut written = 0;
            for change in self.colors.iter() {
                // Emit everything that precedes this color change first.
                let _ = tt.write_all(&self.buf[written..change.pos]);
                let _ = match change.opt {
                    WindowsOption::Foreground(c) => tt.fg(c),
                    WindowsOption::Background(c) => tt.bg(c),
                    WindowsOption::Reset => tt.reset(),
                };
                written = change.pos;
            }
            // Whatever follows the last color change.
            let _ = tt.write_all(&self.buf[written..]);
        } else {
            let _ = tt.write_all(&self.buf);
        }
        let _ = tt.flush();
    }
}
impl io::Write for WindowsBuffer {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
let n = try!(self.buf.write(buf));
self.pos += n;
Ok(n)
}
fn flush(&mut self) -> io::Result<()> {
Ok(())
}
}
/// Foreground, background and reset requests are recorded (see `push`) and
/// always succeed. Attributes and cursor/line control are not supported.
impl term::Terminal for WindowsBuffer {
    type Output = Vec<u8>;

    // Capability queries.

    fn supports_color(&self) -> bool {
        true
    }

    fn supports_reset(&self) -> bool {
        true
    }

    fn supports_attr(&self, _attr: term::Attr) -> bool {
        false
    }

    // Recorded operations.

    fn fg(&mut self, fg: term::color::Color) -> term::Result<()> {
        let opt = WindowsOption::Foreground(fg);
        self.push(opt);
        Ok(())
    }

    fn bg(&mut self, bg: term::color::Color) -> term::Result<()> {
        let opt = WindowsOption::Background(bg);
        self.push(opt);
        Ok(())
    }

    fn reset(&mut self) -> term::Result<()> {
        let opt = WindowsOption::Reset;
        self.push(opt);
        Ok(())
    }

    // Unsupported operations.

    fn attr(&mut self, _attr: term::Attr) -> term::Result<()> {
        Err(term::Error::NotSupported)
    }

    fn cursor_up(&mut self) -> term::Result<()> {
        Err(term::Error::NotSupported)
    }

    fn delete_line(&mut self) -> term::Result<()> {
        Err(term::Error::NotSupported)
    }

    fn carriage_return(&mut self) -> term::Result<()> {
        Err(term::Error::NotSupported)
    }

    // Access to the buffered bytes.

    fn get_ref(&self) -> &Vec<u8> {
        &self.buf
    }

    fn get_mut(&mut self) -> &mut Vec<u8> {
        &mut self.buf
    }

    fn into_inner(self) -> Vec<u8> {
        self.buf
    }
}
/// NoColorTerminal adapts a plain writer to the Terminal interface without
/// supporting any coloring.
///
/// It's useful when an API requires a Terminal, but coloring isn't needed.
pub struct NoColorTerminal<W> {
    /// The wrapped writer; all output is passed straight through to it.
    wtr: W,
}

impl<W: Send + io::Write> NoColorTerminal<W> {
    /// Wrap the given writer in a Terminal interface.
    pub fn new(wtr: W) -> NoColorTerminal<W> {
        NoColorTerminal { wtr: wtr }
    }
}

/// Writes and flushes are handed to the wrapped writer unchanged.
impl<W: Send + io::Write> io::Write for NoColorTerminal<W> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let inner = &mut self.wtr;
        inner.write(buf)
    }

    fn flush(&mut self) -> io::Result<()> {
        let inner = &mut self.wtr;
        inner.flush()
    }
}
impl<W: Send + io::Write> term::Terminal for NoColorTerminal<W> {
type Output = W;
fn fg(&mut self, fg: term::color::Color) -> term::Result<()> {
Err(term::Error::NotSupported)
}
fn bg(&mut self, bg: term::color::Color) -> term::Result<()> {
Err(term::Error::NotSupported)
}
fn attr(&mut self, attr: term::Attr) -> term::Result<()> {
Err(term::Error::NotSupported)
}
fn supports_attr(&self, attr: term::Attr) -> bool {
false
}
fn reset(&mut self) -> term::Result<()> {
Err(term::Error::NotSupported)
}
fn supports_reset(&self) -> bool {
false
}
fn supports_color(&self) -> bool {
false
}
fn cursor_up(&mut self) -> term::Result<()> {
Err(term::Error::NotSupported)
}
fn delete_line(&mut self) -> term::Result<()> {
Err(term::Error::NotSupported)
}
fn carriage_return(&mut self) -> term::Result<()> {
Err(term::Error::NotSupported)
}
fn get_ref(&self) -> &W {
&self.wtr
}
fn get_mut(&mut self) -> &mut W {
&mut self.wtr
}
fn into_inner(self) -> W {
self.wtr
}
}

View File

@@ -1,17 +1,11 @@
use std::io::{self, Write};
use std::path::Path;
use std::sync::Arc;
use regex::bytes::Regex;
use term::{self, Terminal};
use term::color::*;
use term::terminfo::TermInfo;
use term::{Attr, Terminal};
use term::color;
use terminal::TerminfoTerminal;
use types::FileTypeDef;
use self::Writer::*;
/// Printer encapsulates all output logic for searching.
///
/// Note that we currently ignore all write errors. It's probably worthwhile
@@ -19,9 +13,11 @@ use self::Writer::*;
/// writes to memory, neither of which commonly fail.
pub struct Printer<W> {
/// The underlying writer.
wtr: Writer<W>,
wtr: W,
/// Whether anything has been printed to wtr yet.
has_printed: bool,
/// Whether to show column numbers for the first match or not.
column: bool,
/// The string to use to separate non-contiguous runs of context lines.
context_separator: Vec<u8>,
/// The end-of-line terminator used by the printer. In general, eols are
@@ -40,14 +36,13 @@ pub struct Printer<W> {
with_filename: bool,
}
impl<W: Send + io::Write> Printer<W> {
impl<W: Send + Terminal> Printer<W> {
/// Create a new printer that writes to wtr.
///
/// `color` should be true if the printer should try to use coloring.
pub fn new(wtr: W, color: bool) -> Printer<W> {
pub fn new(wtr: W) -> Printer<W> {
Printer {
wtr: Writer::new(wtr, color),
wtr: wtr,
has_printed: false,
column: false,
context_separator: "--".to_string().into_bytes(),
eol: b'\n',
heading: false,
@@ -57,6 +52,13 @@ impl<W: Send + io::Write> Printer<W> {
}
}
/// When set, column numbers will be printed for the first match on each
/// line.
pub fn column(mut self, yes: bool) -> Printer<W> {
self.column = yes;
self
}
/// Set the context separator. The default is `--`.
pub fn context_separator(mut self, sep: Vec<u8>) -> Printer<W> {
self.context_separator = sep;
@@ -107,7 +109,7 @@ impl<W: Send + io::Write> Printer<W> {
/// Flushes the underlying writer and returns it.
pub fn into_inner(mut self) -> W {
let _ = self.wtr.flush();
self.wtr.into_inner()
self.wtr
}
/// Prints a type definition.
@@ -173,6 +175,11 @@ impl<W: Send + io::Write> Printer<W> {
if let Some(line_number) = line_number {
self.line_number(line_number, b':');
}
if self.column {
let c = re.find(&buf[start..end]).map(|(s, _)| s + 1).unwrap_or(0);
self.write(c.to_string().as_bytes());
self.write(b":");
}
if self.replace.is_some() {
let line = re.replace_all(
&buf[start..end], &**self.replace.as_ref().unwrap());
@@ -186,15 +193,15 @@ impl<W: Send + io::Write> Printer<W> {
}
pub fn write_match(&mut self, re: &Regex, buf: &[u8]) {
if !self.wtr.is_color() {
if !self.wtr.supports_color() {
self.write(buf);
return;
}
let mut last_written = 0;
for (s, e) in re.find_iter(buf) {
self.write(&buf[last_written..s]);
let _ = self.wtr.fg(BRIGHT_RED);
let _ = self.wtr.attr(term::Attr::Bold);
let _ = self.wtr.fg(color::BRIGHT_RED);
let _ = self.wtr.attr(Attr::Bold);
self.write(&buf[s..e]);
let _ = self.wtr.reset();
last_written = e;
@@ -226,23 +233,24 @@ impl<W: Send + io::Write> Printer<W> {
}
fn write_heading<P: AsRef<Path>>(&mut self, path: P) {
if self.wtr.is_color() {
let _ = self.wtr.fg(GREEN);
if self.wtr.supports_color() {
let _ = self.wtr.fg(color::BRIGHT_GREEN);
let _ = self.wtr.attr(Attr::Bold);
}
self.write(path.as_ref().to_string_lossy().as_bytes());
self.write_eol();
if self.wtr.is_color() {
if self.wtr.supports_color() {
let _ = self.wtr.reset();
}
}
fn line_number(&mut self, n: u64, sep: u8) {
if self.wtr.is_color() {
let _ = self.wtr.fg(YELLOW);
let _ = self.wtr.attr(term::Attr::Bold);
if self.wtr.supports_color() {
let _ = self.wtr.fg(color::BRIGHT_BLUE);
let _ = self.wtr.attr(Attr::Bold);
}
self.write(n.to_string().as_bytes());
if self.wtr.is_color() {
if self.wtr.supports_color() {
let _ = self.wtr.reset();
}
self.write(&[sep]);
@@ -261,148 +269,3 @@ impl<W: Send + io::Write> Printer<W> {
self.write(&[eol]);
}
}
/// Writer is the output sink used by the printer: either a terminfo-backed
/// terminal that can emit color, or the bare writer when coloring is not
/// wanted or not available.
enum Writer<W> {
/// Output goes through a `TerminfoTerminal` wrapping the writer.
Colored(TerminfoTerminal<W>),
/// Output goes directly to the writer; no color operations are supported.
NoColor(W),
}
lazy_static! {
    /// The terminfo entry described by the environment, loaded on first
    /// use. It is `None` when loading failed; the failure is reported
    /// through `debug!`.
    static ref TERMINFO: Option<Arc<TermInfo>> =
        term::terminfo::TermInfo::from_env()
            .map(Arc::new)
            .map_err(|err| {
                debug!("error loading terminfo for coloring: {}", err);
            })
            .ok();
}
impl<W: Send + io::Write> Writer<W> {
    /// Wrap `wtr`, using a color-capable terminal only when `color` is
    /// requested, a terminfo entry is available and that entry supports
    /// color. In every other case the writer is kept as is.
    fn new(wtr: W, color: bool) -> Writer<W> {
        if !color {
            return NoColor(wtr);
        }
        let info = match TERMINFO.clone() {
            Some(info) => info,
            None => return NoColor(wtr),
        };
        // The terminal takes ownership of the writer. If coloring turns out
        // to be unavailable, take the writer back out instead of losing it.
        let tt = TerminfoTerminal::new_with_terminfo(wtr, info);
        if tt.supports_color() {
            return Colored(tt);
        }
        debug!("environment doesn't support coloring");
        NoColor(tt.into_inner())
    }

    /// Returns true if and only if this is the `Colored` variant.
    fn is_color(&self) -> bool {
        if let Colored(_) = *self {
            true
        } else {
            false
        }
    }

    /// Run a fallible terminal operation on the colored terminal. The
    /// `NoColor` variant always reports `term::Error::NotSupported`.
    fn map_result<F>(
        &mut self,
        mut op: F,
    ) -> term::Result<()>
    where F: FnMut(&mut TerminfoTerminal<W>) -> term::Result<()>
    {
        match *self {
            NoColor(_) => Err(term::Error::NotSupported),
            Colored(ref mut tt) => op(tt),
        }
    }

    /// Ask a capability question of the colored terminal. The `NoColor`
    /// variant always answers `false`.
    fn map_bool<F>(
        &self,
        mut query: F,
    ) -> bool
    where F: FnMut(&TerminfoTerminal<W>) -> bool
    {
        match *self {
            NoColor(_) => false,
            Colored(ref tt) => query(tt),
        }
    }
}
/// Both writing and flushing are forwarded to whichever variant is active.
impl<W: Send + io::Write> io::Write for Writer<W> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        match *self {
            NoColor(ref mut plain) => plain.write(buf),
            Colored(ref mut tt) => tt.write(buf),
        }
    }

    fn flush(&mut self) -> io::Result<()> {
        match *self {
            NoColor(ref mut plain) => plain.flush(),
            Colored(ref mut tt) => tt.flush(),
        }
    }
}
/// Terminal operations are delegated to the colored terminal through
/// `map_result`/`map_bool`. For the `NoColor` variant that means every
/// operation fails with `NotSupported` and every capability query answers
/// `false`.
impl<W: Send + io::Write> term::Terminal for Writer<W> {
    type Output = W;

    // Capability queries.

    fn supports_color(&self) -> bool {
        self.map_bool(|tt| tt.supports_color())
    }

    fn supports_reset(&self) -> bool {
        self.map_bool(|tt| tt.supports_reset())
    }

    fn supports_attr(&self, attr: term::Attr) -> bool {
        self.map_bool(|tt| tt.supports_attr(attr))
    }

    // Styling.

    fn fg(&mut self, fg: term::color::Color) -> term::Result<()> {
        self.map_result(|tt| tt.fg(fg))
    }

    fn bg(&mut self, bg: term::color::Color) -> term::Result<()> {
        self.map_result(|tt| tt.bg(bg))
    }

    fn attr(&mut self, attr: term::Attr) -> term::Result<()> {
        self.map_result(|tt| tt.attr(attr))
    }

    fn reset(&mut self) -> term::Result<()> {
        self.map_result(|tt| tt.reset())
    }

    // Cursor and line control.

    fn cursor_up(&mut self) -> term::Result<()> {
        self.map_result(|tt| tt.cursor_up())
    }

    fn delete_line(&mut self) -> term::Result<()> {
        self.map_result(|tt| tt.delete_line())
    }

    fn carriage_return(&mut self) -> term::Result<()> {
        self.map_result(|tt| tt.carriage_return())
    }

    // Access to the wrapped writer, which both variants have.

    fn get_ref(&self) -> &W {
        match *self {
            NoColor(ref plain) => plain,
            Colored(ref tt) => tt.get_ref(),
        }
    }

    fn get_mut(&mut self) -> &mut W {
        match *self {
            NoColor(ref mut plain) => plain,
            Colored(ref mut tt) => tt.get_mut(),
        }
    }

    fn into_inner(self) -> W {
        match self {
            NoColor(plain) => plain,
            Colored(tt) => tt.into_inner(),
        }
    }
}

294
src/search_buffer.rs Normal file
View File

@@ -0,0 +1,294 @@
/*!
The search_buffer module is responsible for searching a single file all in a
single buffer. Typically, the source of the buffer is a memory map. This can
be useful for when memory maps are faster than streaming search.
Note that this module doesn't quite support everything that search_stream does.
Notably, showing contexts.
*/
use std::cmp;
use std::path::Path;
use grep::Grep;
use term::Terminal;
use printer::Printer;
use search_stream::{IterLines, Options, count_lines, is_binary};
/// BufferSearcher searches a single in-memory buffer and reports matching
/// lines (or a count of them) through a printer.
pub struct BufferSearcher<'a, W: 'a> {
/// Search configuration; starts out as `Options::default()`.
opts: Options,
/// Where matches and counts are reported.
printer: &'a mut Printer<W>,
/// The matcher used to find matching lines.
grep: &'a Grep,
/// The path that is passed to the printer along with each result.
path: &'a Path,
/// The entire haystack.
buf: &'a [u8],
/// The number of matches found so far.
match_count: u64,
/// The running line number; `None` unless line numbers are enabled.
line_count: Option<u64>,
/// The offset in `buf` up to which lines have already been counted.
last_line: usize,
}
impl<'a, W: Send + Terminal> BufferSearcher<'a, W> {
    /// Create a searcher over `buf` with default options.
    ///
    /// Results are reported to `printer`, matching is done with `grep`, and
    /// `path` is handed to the printer along with each result.
    pub fn new(
        printer: &'a mut Printer<W>,
        grep: &'a Grep,
        path: &'a Path,
        buf: &'a [u8],
    ) -> BufferSearcher<'a, W> {
        BufferSearcher {
            buf: buf,
            path: path,
            grep: grep,
            printer: printer,
            opts: Options::default(),
            last_line: 0,
            line_count: None,
            match_count: 0,
        }
    }

    /// When enabled, only the number of matches is printed rather than the
    /// matches themselves.
    ///
    /// Disabled by default.
    pub fn count(mut self, yes: bool) -> Self {
        self.opts.count = yes;
        self
    }

    /// Set the byte that terminates a line.
    pub fn eol(mut self, eol: u8) -> Self {
        self.opts.eol = eol;
        self
    }

    /// When enabled, the lines that do *not* match the pattern are the ones
    /// treated as matches.
    pub fn invert_match(mut self, yes: bool) -> Self {
        self.opts.invert_match = yes;
        self
    }

    /// When enabled, line numbers are computed and passed to the printer
    /// with every match.
    pub fn line_number(mut self, yes: bool) -> Self {
        self.opts.line_number = yes;
        self
    }

    /// When enabled, binary detection is skipped and the buffer is always
    /// searched.
    pub fn text(mut self, yes: bool) -> Self {
        self.opts.text = yes;
        self
    }

    /// Execute the search and return the number of matches.
    ///
    /// Unless the `text` option is set, at most the first 4096 bytes are
    /// checked with `is_binary`; if that check fires, nothing is searched
    /// and 0 is returned. In count mode, the count is only printed when it
    /// is greater than zero.
    #[inline(never)]
    pub fn run(mut self) -> u64 {
        let sniff_len = cmp::min(4096, self.buf.len());
        if !self.opts.text && is_binary(&self.buf[..sniff_len]) {
            return 0;
        }

        self.match_count = 0;
        self.line_count = if self.opts.line_number { Some(0) } else { None };

        let inverted = self.opts.invert_match;
        let mut prev_end = 0;
        for m in self.grep.iter(self.buf) {
            let (start, end) = (m.start(), m.end());
            if inverted {
                // Everything between the previous match and this one is a
                // run of lines that did not match.
                self.print_inverted_matches(prev_end, start);
            } else {
                self.print_match(start, end);
            }
            prev_end = end;
        }
        if inverted {
            // The lines after the final match didn't match either.
            let total = self.buf.len();
            self.print_inverted_matches(prev_end, total);
        }
        if self.opts.count && self.match_count > 0 {
            self.printer.path_count(self.path, self.match_count);
        }
        self.match_count
    }

    /// Record one match for the line at `start..end` and, unless only
    /// counting, hand it to the printer.
    #[inline(always)]
    pub fn print_match(&mut self, start: usize, end: usize) {
        self.match_count += 1;
        if self.opts.count {
            return;
        }
        // Bring the line counter up to the start of this line, then account
        // for the line itself.
        self.count_lines(start);
        self.add_line(end);
        let regex = self.grep.regex();
        self.printer.matched(
            regex, self.path, self.buf, start, end, self.line_count);
    }

    /// Treat every line in `start..end` as a match.
    #[inline(always)]
    fn print_inverted_matches(&mut self, start: usize, end: usize) {
        debug_assert!(self.opts.invert_match);
        let haystack = self.buf;
        let mut lines = IterLines::new(self.opts.eol, start);
        loop {
            match lines.next(&haystack[..end]) {
                Some((line_start, line_end)) => {
                    self.print_match(line_start, line_end);
                }
                None => break,
            }
        }
    }

    /// Add the line terminators found between the last counted offset and
    /// `upto` to the line counter. Does nothing when line numbers are
    /// disabled.
    #[inline(always)]
    fn count_lines(&mut self, upto: usize) {
        if let Some(seen) = self.line_count {
            let more = count_lines(
                &self.buf[self.last_line..upto], self.opts.eol);
            self.line_count = Some(seen + more);
            self.last_line = upto;
        }
    }

    /// Count one more line, which ends at `line_end`. Does nothing when line
    /// numbers are disabled.
    #[inline(always)]
    fn add_line(&mut self, line_end: usize) {
        if let Some(seen) = self.line_count {
            self.line_count = Some(seen + 1);
            self.last_line = line_end;
        }
    }
}
// Unit tests for BufferSearcher. Each test runs a search over an in-memory
// haystack and compares the match count and the exact printed output.
#[cfg(test)]
mod tests {
use std::path::Path;
use grep::{Grep, GrepBuilder};
use term::Terminal;
use out::OutBuffer;
use printer::Printer;
use super::BufferSearcher;
// Six lines of prose; note that the final line has no trailing line
// terminator.
const SHERLOCK: &'static str = "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.\
";
// A small source-code haystack. NOTE(review): not referenced by any test
// visible in this module.
const CODE: &'static str = "\
extern crate snap;
use std::io;
fn main() {
let stdin = io::stdin();
let stdout = io::stdout();
// Wrap the stdin reader in a Snappy reader.
let mut rdr = snap::Reader::new(stdin.lock());
let mut wtr = stdout.lock();
io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\");
}
";
// Build a matcher for `pat`, panicking if the pattern is invalid.
// NOTE(review): not referenced by any test visible in this module.
fn matcher(pat: &str) -> Grep {
GrepBuilder::new(pat).build().unwrap()
}
// The path that every test search reports its results under.
fn test_path() -> &'static Path {
&Path::new("/baz.rs")
}
type TestSearcher<'a> = BufferSearcher<'a, OutBuffer>;
// Search `haystack` for `pat` and return the match count together with
// everything that was printed. `map` lets a test set options on the
// searcher before it runs. Output goes to a NoColor buffer through a
// printer that has with_filename enabled.
fn search<F: FnMut(TestSearcher) -> TestSearcher>(
pat: &str,
haystack: &str,
mut map: F,
) -> (u64, String) {
let outbuf = OutBuffer::NoColor(vec![]);
let mut pp = Printer::new(outbuf).with_filename(true);
let grep = GrepBuilder::new(pat).build().unwrap();
// The searcher borrows the printer mutably, so keep it in its own scope
// in order to get the printer back afterwards.
let count = {
let searcher = BufferSearcher::new(
&mut pp, &grep, test_path(), haystack.as_bytes());
map(searcher).run()
};
// First into_inner yields the OutBuffer, the second its bytes.
(count, String::from_utf8(pp.into_inner().into_inner()).unwrap())
}
// Default options: matching lines are printed with the path as prefix.
#[test]
fn basic_search() {
let (count, out) = search("Sherlock", SHERLOCK, |s|s);
assert_eq!(2, count);
assert_eq!(out, "\
/baz.rs:For the Doctor Watsons of this world, as opposed to the Sherlock
/baz.rs:be, to a very large extent, the result of luck. Sherlock Holmes
");
}
// A haystack containing a NUL byte yields no matches and no output by
// default.
#[test]
fn binary() {
let text = "Sherlock\n\x00Holmes\n";
let (count, out) = search("Sherlock|Holmes", text, |s|s);
assert_eq!(0, count);
assert_eq!(out, "");
}
// With the text option, the same haystack is searched like any other.
#[test]
fn binary_text() {
let text = "Sherlock\n\x00Holmes\n";
let (count, out) = search("Sherlock|Holmes", text, |s| s.text(true));
assert_eq!(2, count);
assert_eq!(out, "/baz.rs:Sherlock\n/baz.rs:\x00Holmes\n");
}
// Line numbers are 1-based and printed between the path and the line.
#[test]
fn line_numbers() {
let (count, out) = search(
"Sherlock", SHERLOCK, |s| s.line_number(true));
assert_eq!(2, count);
assert_eq!(out, "\
/baz.rs:1:For the Doctor Watsons of this world, as opposed to the Sherlock
/baz.rs:3:be, to a very large extent, the result of luck. Sherlock Holmes
");
}
// Count mode prints only the path and the number of matches.
#[test]
fn count() {
let (count, out) = search(
"Sherlock", SHERLOCK, |s| s.count(true));
assert_eq!(2, count);
assert_eq!(out, "/baz.rs:2\n");
}
// Inverted matching reports the four lines without "Sherlock"; the final
// line is printed with a line terminator even though the haystack lacks
// one.
#[test]
fn invert_match() {
let (count, out) = search(
"Sherlock", SHERLOCK, |s| s.invert_match(true));
assert_eq!(4, count);
assert_eq!(out, "\
/baz.rs:Holmeses, success in the province of detective work must always
/baz.rs:can extract a clew from a wisp of straw or a flake of cigar ash;
/baz.rs:but Doctor Watson has to have it taken out for him and dusted,
/baz.rs:and exhibited clearly, with a label attached.
");
}
// Inverted matching combined with line numbers.
#[test]
fn invert_match_line_numbers() {
let (count, out) = search("Sherlock", SHERLOCK, |s| {
s.invert_match(true).line_number(true)
});
assert_eq!(4, count);
assert_eq!(out, "\
/baz.rs:2:Holmeses, success in the province of detective work must always
/baz.rs:4:can extract a clew from a wisp of straw or a flake of cigar ash;
/baz.rs:5:but Doctor Watson has to have it taken out for him and dusted,
/baz.rs:6:and exhibited clearly, with a label attached.
");
}
// Inverted matching combined with count mode.
#[test]
fn invert_match_count() {
let (count, out) = search("Sherlock", SHERLOCK, |s| {
s.invert_match(true).count(true)
});
assert_eq!(4, count);
assert_eq!(out, "/baz.rs:4\n");
}
}

View File

@@ -1,6 +1,7 @@
/*!
The search module is responsible for searching a single file and printing
matches.
The search_stream module is responsible for searching a single file and
printing matches. In particular, it searches the file in a streaming fashion
using `read` calls and a (roughly) fixed size buffer.
*/
use std::cmp;
@@ -11,6 +12,7 @@ use std::path::{Path, PathBuf};
use grep::{Grep, Match};
use memchr::{memchr, memrchr};
use term::Terminal;
use printer::Printer;
@@ -74,14 +76,14 @@ pub struct Searcher<'a, R, W: 'a> {
/// Options for configuring search.
#[derive(Clone)]
struct Options {
after_context: usize,
before_context: usize,
count: bool,
eol: u8,
invert_match: bool,
line_number: bool,
text: bool,
pub struct Options {
pub after_context: usize,
pub before_context: usize,
pub count: bool,
pub eol: u8,
pub invert_match: bool,
pub line_number: bool,
pub text: bool,
}
impl Default for Options {
@@ -98,7 +100,7 @@ impl Default for Options {
}
}
impl<'a, R: io::Read, W: Send + io::Write> Searcher<'a, R, W> {
impl<'a, R: io::Read, W: Send + Terminal> Searcher<'a, R, W> {
/// Create a new searcher.
///
/// `inp` is a reusable input buffer that is used as scratch space by this
@@ -219,14 +221,11 @@ impl<'a, R: io::Read, W: Send + io::Write> Searcher<'a, R, W> {
self.print_inverted_matches(upto);
}
} else if matched {
self.match_count += 1;
if !self.opts.count {
let start = self.last_match.start();
let end = self.last_match.end();
self.print_after_context(start);
self.print_before_context(start);
self.print_match(start, end);
}
let start = self.last_match.start();
let end = self.last_match.end();
self.print_after_context(start);
self.print_before_context(start);
self.print_match(start, end);
}
if matched {
self.inp.pos = self.last_match.end();
@@ -275,11 +274,8 @@ impl<'a, R: io::Read, W: Send + io::Write> Searcher<'a, R, W> {
debug_assert!(self.opts.invert_match);
let mut it = IterLines::new(self.opts.eol, self.inp.pos);
while let Some((start, end)) = it.next(&self.inp.buf[..upto]) {
if !self.opts.count {
self.print_match(start, end);
}
self.print_match(start, end);
self.inp.pos = end;
self.match_count += 1;
}
}
@@ -325,11 +321,15 @@ impl<'a, R: io::Read, W: Send + io::Write> Searcher<'a, R, W> {
#[inline(always)]
fn print_match(&mut self, start: usize, end: usize) {
self.match_count += 1;
if self.opts.count {
return;
}
self.print_separator(start);
self.count_lines(start);
self.add_line(end);
self.printer.matched(
self.grep.regex(), &self.path,
self.grep.regex(), self.path,
&self.inp.buf, start, end, self.line_count);
self.last_printed = end;
self.after_context_remaining = self.opts.after_context;
@@ -535,7 +535,7 @@ impl InputBuffer {
///
/// Note that this may return both false positives and false negatives.
#[inline(always)]
fn is_binary(buf: &[u8]) -> bool {
pub fn is_binary(buf: &[u8]) -> bool {
if buf.len() >= 4 && &buf[0..4] == b"%PDF" {
return true;
}
@@ -543,13 +543,88 @@ fn is_binary(buf: &[u8]) -> bool {
}
/// Count the number of lines in the given buffer.
#[inline(always)]
fn count_lines(mut buf: &[u8], eol: u8) -> u64 {
let mut count = 0;
while let Some(pos) = memchr(eol, buf) {
count += 1;
buf = &buf[pos + 1..];
#[inline(never)]
#[inline(never)]
/// Count the number of occurrences of `eol` in `buf`.
///
/// The bulk of the buffer is processed a machine word at a time: each word
/// is XORed with a word in which every byte is `eol`, which turns every EOL
/// byte into a zero byte, and the zero bytes are then counted without a
/// branch. The bytes before the first word-aligned address and the bytes
/// left over at the end are counted one at a time.
///
/// The zero-byte test used here is exact for every byte of the word. The
/// cheaper `(x - 0x01..01) & !x & 0x80..80` test must not be used for
/// counting: its subtraction borrows across byte boundaries, so a byte equal
/// to `eol ^ 0x01` (for example `\x0b` when `eol` is `\n`) that directly
/// follows an EOL byte would be counted as an additional line.
pub fn count_lines(buf: &[u8], eol: u8) -> u64 {
    // The word-at-a-time structure was adapted from the memchr crate; the
    // zero-byte test is the exact variant described at
    // https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
    const LO_U64: u64 = 0x0101010101010101;
    const LOW7_U64: u64 = 0x7F7F7F7F7F7F7F7F;

    // use truncation
    const LO_USIZE: usize = LO_U64 as usize;
    const LOW7_USIZE: usize = LOW7_U64 as usize;

    #[cfg(target_pointer_width = "32")]
    const USIZE_BYTES: usize = 4;
    #[cfg(target_pointer_width = "64")]
    const USIZE_BYTES: usize = 8;

    /// Count the bytes of `word` that are zero.
    fn count_zero_bytes(word: usize) -> u64 {
        // Per byte, the addition sets the high bit iff any of the low seven
        // bits is set. It never carries into the neighbouring byte because
        // 0x7F + 0x7F == 0xFE, and therefore it cannot overflow either.
        let low_nonzero = (word & LOW7_USIZE) + LOW7_USIZE;
        // After OR-ing in the original high bits, the high bit of a byte is
        // clear iff the byte was zero. Inverting leaves exactly one set bit
        // per zero byte.
        (!(low_nonzero | word | LOW7_USIZE)).count_ones() as u64
    }

    /// Count occurrences of `eol` one byte at a time.
    fn count_bytewise(buf: &[u8], eol: u8) -> u64 {
        buf.iter().filter(|&&b| b == eol).count() as u64
    }

    let len = buf.len();
    let ptr = buf.as_ptr();
    let mut count = 0;

    // Search up to an aligned boundary...
    let misalign = (ptr as usize) & (USIZE_BYTES - 1);
    let mut i = 0;
    if misalign > 0 {
        i = cmp::min(USIZE_BYTES - misalign, len);
        count += count_bytewise(&buf[..i], eol);
    }

    // ... and search the rest, two words per iteration. The multiplication
    // copies `eol` into every byte and cannot overflow (0x01..01 * 0xFF is
    // 0xFF..FF).
    let repeated_eol = LO_USIZE * (eol as usize);
    if len >= 2 * USIZE_BYTES {
        while i <= len - (2 * USIZE_BYTES) {
            // SAFETY: `i + 2 * USIZE_BYTES <= len`, so both reads stay
            // inside `buf`. `ptr + i` is word aligned: either `ptr` was
            // aligned and `i` is a multiple of the word size, or `i` started
            // at `USIZE_BYTES - misalign` (the `min` above can only pick
            // `len` when `len < USIZE_BYTES`, in which case this loop is
            // never entered) and has since grown by multiples of the word
            // size.
            unsafe {
                let u = *(ptr.offset(i as isize) as *const usize);
                let v = *(ptr.offset((i + USIZE_BYTES) as isize)
                          as *const usize);
                count += count_zero_bytes(u ^ repeated_eol);
                count += count_zero_bytes(v ^ repeated_eol);
            }
            i += USIZE_BYTES * 2;
        }
    }
    count += count_bytewise(&buf[i..], eol);
    count
}
@@ -575,7 +650,7 @@ fn replace_buf(buf: &mut [u8], a: u8, b: u8) {
/// advance over the positions of each line. We neglect that approach to avoid
/// the borrow in the search code. (Because the borrow prevents composition
/// through other mutable methods.)
struct IterLines {
pub struct IterLines {
eol: u8,
pos: usize,
}
@@ -585,7 +660,7 @@ impl IterLines {
///
/// The buffer is passed to the `next` method.
#[inline(always)]
fn new(eol: u8, start: usize) -> IterLines {
pub fn new(eol: u8, start: usize) -> IterLines {
IterLines {
eol: eol,
pos: start,
@@ -597,7 +672,7 @@ impl IterLines {
///
/// The range returned includes the new line.
#[inline(always)]
fn next(&mut self, buf: &[u8]) -> Option<(usize, usize)> {
pub fn next(&mut self, buf: &[u8]) -> Option<(usize, usize)> {
match memchr(self.eol, &buf[self.pos..]) {
None => {
if self.pos < buf.len() {
@@ -689,13 +764,14 @@ mod tests {
use std::path::Path;
use grep::{Grep, GrepBuilder};
use term::Terminal;
use out::OutBuffer;
use printer::Printer;
use super::{InputBuffer, Searcher, start_of_previous_lines};
lazy_static! {
static ref SHERLOCK: &'static str = "\
const SHERLOCK: &'static str = "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
@@ -703,7 +779,8 @@ can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.\
";
static ref CODE: &'static str = "\
const CODE: &'static str = "\
extern crate snap;
use std::io;
@@ -718,7 +795,6 @@ fn main() {
io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\");
}
";
}
fn hay(s: &str) -> io::Cursor<Vec<u8>> {
io::Cursor::new(s.to_string().into_bytes())
@@ -732,7 +808,7 @@ fn main() {
&Path::new("/baz.rs")
}
type TestSearcher<'a> = Searcher<'a, io::Cursor<Vec<u8>>, Vec<u8>>;
type TestSearcher<'a> = Searcher<'a, io::Cursor<Vec<u8>>, OutBuffer>;
fn search_smallcap<F: FnMut(TestSearcher) -> TestSearcher>(
pat: &str,
@@ -740,14 +816,15 @@ fn main() {
mut map: F,
) -> (u64, String) {
let mut inp = InputBuffer::with_capacity(1);
let mut pp = Printer::new(vec![], false).with_filename(true);
let outbuf = OutBuffer::NoColor(vec![]);
let mut pp = Printer::new(outbuf).with_filename(true);
let grep = GrepBuilder::new(pat).build().unwrap();
let count = {
let searcher = Searcher::new(
&mut inp, &mut pp, &grep, test_path(), hay(haystack));
map(searcher).run().unwrap()
};
(count, String::from_utf8(pp.into_inner()).unwrap())
(count, String::from_utf8(pp.into_inner().into_inner()).unwrap())
}
fn search<F: FnMut(TestSearcher) -> TestSearcher>(
@@ -756,14 +833,15 @@ fn main() {
mut map: F,
) -> (u64, String) {
let mut inp = InputBuffer::with_capacity(4096);
let mut pp = Printer::new(vec![], false).with_filename(true);
let outbuf = OutBuffer::NoColor(vec![]);
let mut pp = Printer::new(outbuf).with_filename(true);
let grep = GrepBuilder::new(pat).build().unwrap();
let count = {
let searcher = Searcher::new(
&mut inp, &mut pp, &grep, test_path(), hay(haystack));
map(searcher).run().unwrap()
};
(count, String::from_utf8(pp.into_inner()).unwrap())
(count, String::from_utf8(pp.into_inner().into_inner()).unwrap())
}
#[test]
@@ -870,8 +948,8 @@ fn main() {
}
#[test]
fn basic_search() {
let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s|s);
fn basic_search1() {
let (count, out) = search_smallcap("Sherlock", SHERLOCK, |s|s);
assert_eq!(2, count);
assert_eq!(out, "\
/baz.rs:For the Doctor Watsons of this world, as opposed to the Sherlock
@@ -887,7 +965,6 @@ fn main() {
assert_eq!(out, "");
}
#[test]
fn binary_text() {
let text = "Sherlock\n\x00Holmes\n";
@@ -899,7 +976,7 @@ fn main() {
#[test]
fn line_numbers() {
let (count, out) = search_smallcap(
"Sherlock", &*SHERLOCK, |s| s.line_number(true));
"Sherlock", SHERLOCK, |s| s.line_number(true));
assert_eq!(2, count);
assert_eq!(out, "\
/baz.rs:1:For the Doctor Watsons of this world, as opposed to the Sherlock
@@ -910,7 +987,7 @@ fn main() {
#[test]
fn count() {
let (count, out) = search_smallcap(
"Sherlock", &*SHERLOCK, |s| s.count(true));
"Sherlock", SHERLOCK, |s| s.count(true));
assert_eq!(2, count);
assert_eq!(out, "/baz.rs:2\n");
}
@@ -918,7 +995,7 @@ fn main() {
#[test]
fn invert_match() {
let (count, out) = search_smallcap(
"Sherlock", &*SHERLOCK, |s| s.invert_match(true));
"Sherlock", SHERLOCK, |s| s.invert_match(true));
assert_eq!(4, count);
assert_eq!(out, "\
/baz.rs:Holmeses, success in the province of detective work must always
@@ -930,7 +1007,7 @@ fn main() {
#[test]
fn invert_match_line_numbers() {
let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s| {
let (count, out) = search_smallcap("Sherlock", SHERLOCK, |s| {
s.invert_match(true).line_number(true)
});
assert_eq!(4, count);
@@ -944,7 +1021,7 @@ fn main() {
#[test]
fn invert_match_count() {
let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s| {
let (count, out) = search_smallcap("Sherlock", SHERLOCK, |s| {
s.invert_match(true).count(true)
});
assert_eq!(4, count);
@@ -953,7 +1030,7 @@ fn main() {
#[test]
fn before_context_one1() {
let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s| {
let (count, out) = search_smallcap("Sherlock", SHERLOCK, |s| {
s.line_number(true).before_context(1)
});
assert_eq!(2, count);
@@ -966,7 +1043,7 @@ fn main() {
#[test]
fn before_context_invert_one1() {
let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s| {
let (count, out) = search_smallcap("Sherlock", SHERLOCK, |s| {
s.line_number(true).before_context(1).invert_match(true)
});
assert_eq!(4, count);
@@ -982,7 +1059,7 @@ fn main() {
#[test]
fn before_context_invert_one2() {
let (count, out) = search_smallcap(" a ", &*SHERLOCK, |s| {
let (count, out) = search_smallcap(" a ", SHERLOCK, |s| {
s.line_number(true).before_context(1).invert_match(true)
});
assert_eq!(3, count);
@@ -997,7 +1074,7 @@ fn main() {
#[test]
fn before_context_two1() {
let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s| {
let (count, out) = search_smallcap("Sherlock", SHERLOCK, |s| {
s.line_number(true).before_context(2)
});
assert_eq!(2, count);
@@ -1010,7 +1087,7 @@ fn main() {
#[test]
fn before_context_two2() {
let (count, out) = search_smallcap("dusted", &*SHERLOCK, |s| {
let (count, out) = search_smallcap("dusted", SHERLOCK, |s| {
s.line_number(true).before_context(2)
});
assert_eq!(1, count);
@@ -1024,7 +1101,7 @@ fn main() {
#[test]
fn before_context_two3() {
let (count, out) = search_smallcap(
"success|attached", &*SHERLOCK, |s| {
"success|attached", SHERLOCK, |s| {
s.line_number(true).before_context(2)
});
assert_eq!(2, count);
@@ -1040,7 +1117,7 @@ fn main() {
#[test]
fn before_context_two4() {
let (count, out) = search("stdin", &*CODE, |s| {
let (count, out) = search("stdin", CODE, |s| {
s.line_number(true).before_context(2)
});
assert_eq!(3, count);
@@ -1057,7 +1134,7 @@ fn main() {
#[test]
fn before_context_two5() {
let (count, out) = search("stdout", &*CODE, |s| {
let (count, out) = search("stdout", CODE, |s| {
s.line_number(true).before_context(2)
});
assert_eq!(2, count);
@@ -1074,7 +1151,7 @@ fn main() {
#[test]
fn before_context_three1() {
let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s| {
let (count, out) = search_smallcap("Sherlock", SHERLOCK, |s| {
s.line_number(true).before_context(3)
});
assert_eq!(2, count);
@@ -1087,7 +1164,7 @@ fn main() {
#[test]
fn after_context_one1() {
let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s| {
let (count, out) = search_smallcap("Sherlock", SHERLOCK, |s| {
s.line_number(true).after_context(1)
});
assert_eq!(2, count);
@@ -1101,7 +1178,7 @@ fn main() {
#[test]
fn after_context_invert_one1() {
let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s| {
let (count, out) = search_smallcap("Sherlock", SHERLOCK, |s| {
s.line_number(true).after_context(1).invert_match(true)
});
assert_eq!(4, count);
@@ -1116,7 +1193,7 @@ fn main() {
#[test]
fn after_context_invert_one2() {
let (count, out) = search_smallcap(" a ", &*SHERLOCK, |s| {
let (count, out) = search_smallcap(" a ", SHERLOCK, |s| {
s.line_number(true).after_context(1).invert_match(true)
});
assert_eq!(3, count);
@@ -1132,7 +1209,7 @@ fn main() {
#[test]
fn after_context_two1() {
let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s| {
let (count, out) = search_smallcap("Sherlock", SHERLOCK, |s| {
s.line_number(true).after_context(2)
});
assert_eq!(2, count);
@@ -1147,7 +1224,7 @@ fn main() {
#[test]
fn after_context_two2() {
let (count, out) = search_smallcap("dusted", &*SHERLOCK, |s| {
let (count, out) = search_smallcap("dusted", SHERLOCK, |s| {
s.line_number(true).after_context(2)
});
assert_eq!(1, count);
@@ -1160,7 +1237,7 @@ fn main() {
#[test]
fn after_context_two3() {
let (count, out) = search_smallcap(
"success|attached", &*SHERLOCK, |s| {
"success|attached", SHERLOCK, |s| {
s.line_number(true).after_context(2)
});
assert_eq!(2, count);
@@ -1175,7 +1252,7 @@ fn main() {
#[test]
fn after_context_three1() {
let (count, out) = search_smallcap("Sherlock", &*SHERLOCK, |s| {
let (count, out) = search_smallcap("Sherlock", SHERLOCK, |s| {
s.line_number(true).after_context(3)
});
assert_eq!(2, count);
@@ -1192,7 +1269,7 @@ fn main() {
#[test]
fn before_after_context_two1() {
let (count, out) = search(
r"fn main|let mut rdr", &*CODE, |s| {
r"fn main|let mut rdr", CODE, |s| {
s.line_number(true).after_context(2).before_context(2)
});
assert_eq!(2, count);

24
tests/hay.rs Normal file
View File

@@ -0,0 +1,24 @@
/// A short prose excerpt used as search input ("haystack") by the tests.
///
/// The integration tests write this text into a file and assert on the exact
/// lines printed by the search, so the contents must not be altered without
/// updating every expectation that depends on it.
pub const SHERLOCK: &'static str = "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.
";
/// The text of a small Rust program, used as search input ("haystack") by
/// tests that want source-code-like data rather than prose.
///
/// The embedded `\"` sequences are escapes for double quotes inside this
/// string literal; they are part of the haystack as plain `"` characters.
pub const CODE: &'static str = "\
extern crate snap;
use std::io;
fn main() {
let stdin = io::stdin();
let stdout = io::stdout();
// Wrap the stdin reader in a Snappy reader.
let mut rdr = snap::Reader::new(stdin.lock());
let mut wtr = stdout.lock();
io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\");
}
";

577
tests/tests.rs Normal file
View File

@@ -0,0 +1,577 @@
/*!
This module contains *integration* tests. Their purpose is to test the CLI
interface. Namely, that passing a flag does what it says on the tin.
Tests for more fine grained behavior (like the search or the globber) should be
unit tests in their respective modules.
*/
#![allow(dead_code, unused_imports)]
use std::process::Command;
use workdir::WorkDir;
mod hay;
mod workdir;
// Generates a `#[test]` function named `$name` that runs a command against
// the Sherlock fixture.
//
// The generated test creates a `WorkDir` named after the test, writes
// `hay::SHERLOCK` into a file called `sherlock`, builds a command whose first
// two arguments are `$query` and `$path`, and then calls `$fun` with the work
// directory and the command. `$fun` is expected to add any extra flags and to
// perform the assertions.
//
// The shorter forms fill in defaults: the query defaults to "Sherlock" and
// the path defaults to "sherlock".
macro_rules! sherlock {
// Two-argument form: default query and default path.
($name:ident, $fun:expr) => {
sherlock!($name, "Sherlock", $fun);
};
// Three-argument form: explicit query, default path.
($name:ident, $query:expr, $fun:expr) => {
sherlock!($name, $query, "sherlock", $fun);
};
// Full form; the other arms forward here.
($name:ident, $query:expr, $path:expr, $fun:expr) => {
#[test]
fn $name() {
let wd = WorkDir::new(stringify!($name));
wd.create("sherlock", hay::SHERLOCK);
let mut cmd = wd.command();
cmd.arg($query).arg($path);
$fun(wd, cmd);
}
};
}
// Searching the single file `sherlock` with the default query: only the
// matching lines are expected, with no file name prefix.
sherlock!(single_file, |wd: WorkDir, mut cmd: Command| {
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
For the Doctor Watsons of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Sherlock Holmes
");
});
// Searching the directory `.` instead of a named file: every matching line
// is expected to be prefixed with the name of the file it came from.
sherlock!(dir, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
sherlock:For the Doctor Watsons of this world, as opposed to the Sherlock
sherlock:be, to a very large extent, the result of luck. Sherlock Holmes
");
});
// With `-n`, each matching line is expected to carry its line number.
sherlock!(line_numbers, |wd: WorkDir, mut cmd: Command| {
    cmd.arg("-n");
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
1:For the Doctor Watsons of this world, as opposed to the Sherlock
3:be, to a very large extent, the result of luck. Sherlock Holmes
");
});
// With `--column`, each matching line is expected to be prefixed with a
// column number.
sherlock!(columns, |wd: WorkDir, mut cmd: Command| {
    cmd.arg("--column");
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
57:For the Doctor Watsons of this world, as opposed to the Sherlock
49:be, to a very large extent, the result of luck. Sherlock Holmes
");
});
// With `-H`, the file name prefix is expected even though only one file is
// being searched.
sherlock!(with_filename, |wd: WorkDir, mut cmd: Command| {
    cmd.arg("-H");
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
sherlock:For the Doctor Watsons of this world, as opposed to the Sherlock
sherlock:be, to a very large extent, the result of luck. Sherlock Holmes
");
});
// With `--heading`, the file name is expected once, on its own line, ahead
// of the matching lines.
sherlock!(with_heading, |wd: WorkDir, mut cmd: Command| {
    // --with-filename is passed explicitly because it is disabled by default
    // when searching one file.
    cmd.args(&["--with-filename", "--heading"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
sherlock
For the Doctor Watsons of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Sherlock Holmes
");
});
// Searching a directory containing two files with `--heading`: file names
// are expected as headings without passing --with-filename explicitly.
// Either ordering of the two files is accepted.
sherlock!(with_heading_default, "Sherlock", ".",
|wd: WorkDir, mut cmd: Command| {
    // A second file alongside `sherlock` means two or more files are
    // searched, which enables --with-filename by default.
    wd.create("foo", "Sherlock Holmes lives on Baker Street.");
    // -j1 is used to get deterministic results.
    cmd.args(&["-j1", "--heading"]);
    let lines: String = wd.stdout(&mut cmd);

    // `foo` reported first.
    let expected1 = "\
foo
Sherlock Holmes lives on Baker Street.
sherlock
For the Doctor Watsons of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Sherlock Holmes
";
    // `sherlock` reported first.
    let expected2 = "\
sherlock
For the Doctor Watsons of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Sherlock Holmes
foo
Sherlock Holmes lives on Baker Street.
";
    assert!(lines == expected1 || lines == expected2);
});
// With `-v`, the lines that do NOT contain the query are expected.
sherlock!(inverted, |wd: WorkDir, mut cmd: Command| {
    cmd.arg("-v");
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
Holmeses, success in the province of detective work must always
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.
");
});
// `-v` combined with `-n`: the non-matching lines are expected, each with
// its line number.
sherlock!(inverted_line_numbers, |wd: WorkDir, mut cmd: Command| {
    cmd.args(&["-n", "-v"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
2:Holmeses, success in the province of detective work must always
4:can extract a clew from a wisp of straw or a flake of cigar ash;
5:but Doctor Watson has to have it taken out for him and dusted,
6:and exhibited clearly, with a label attached.
");
});
// With `-i`, the lowercase query "sherlock" is expected to match the
// capitalised occurrences in the haystack.
sherlock!(case_insensitive, "sherlock", |wd: WorkDir, mut cmd: Command| {
    cmd.arg("-i");
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
For the Doctor Watsons of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Sherlock Holmes
");
});
// With `-w`, the query "as" is expected to match only the one line where it
// appears as a whole word.
sherlock!(word, "as", |wd: WorkDir, mut cmd: Command| {
    cmd.arg("-w");
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
For the Doctor Watsons of this world, as opposed to the Sherlock
");
});
// With `-Q`, the query "()" is expected to match the literal text `()` in
// the file.
sherlock!(literal, "()", "file", |wd: WorkDir, mut cmd: Command| {
    let haystack = "blib\n()\nblab\n";
    wd.create("file", haystack);
    cmd.arg("-Q");
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "()\n");
});
// With `-q`, the command is expected to succeed (`wd.stdout` panics if the
// command fails) while printing nothing to stdout.
sherlock!(quiet, |wd: WorkDir, mut cmd: Command| {
cmd.arg("-q");
let lines: String = wd.stdout(&mut cmd);
assert!(lines.is_empty());
});
// With `-r FooBar`, each match of the query is expected to be replaced by
// "FooBar" in the printed lines.
sherlock!(replace, |wd: WorkDir, mut cmd: Command| {
    cmd.args(&["-r", "FooBar"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
For the Doctor Watsons of this world, as opposed to the FooBar
be, to a very large extent, the result of luck. FooBar Holmes
");
});
// Replacement with numbered capture groups: "$2, $1" is expected to swap the
// two capitalised words captured by the query.
sherlock!(replace_groups, "([A-Z][a-z]+) ([A-Z][a-z]+)",
|wd: WorkDir, mut cmd: Command| {
    cmd.args(&["-r", "$2, $1"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
For the Watsons, Doctor of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Holmes, Sherlock
but Watson, Doctor has to have it taken out for him and dusted,
");
});
// Replacement with named capture groups: "$last, $first" is expected to swap
// the two capitalised words captured by the query.
sherlock!(replace_named_groups, "(?P<first>[A-Z][a-z]+) (?P<last>[A-Z][a-z]+)",
|wd: WorkDir, mut cmd: Command| {
    cmd.args(&["-r", "$last, $first"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
For the Watsons, Doctor of this world, as opposed to the Sherlock
be, to a very large extent, the result of luck. Holmes, Sherlock
but Watson, Doctor has to have it taken out for him and dusted,
");
});
// With `-t rust`, only the `.rs` file is expected to be searched.
sherlock!(file_types, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    wd.create("file.py", "Sherlock");
    wd.create("file.rs", "Sherlock");
    cmd.args(&["-t", "rust"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "file.rs:Sherlock\n");
});
// With `-T rust`, the `.rs` file is expected to be excluded, leaving only the
// `.py` file (the default `sherlock` fixture is removed first).
sherlock!(file_types_negate, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    wd.remove("sherlock");
    wd.create("file.py", "Sherlock");
    wd.create("file.rs", "Sherlock");
    cmd.args(&["-T", "rust"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "file.py:Sherlock\n");
});
// After `--type-clear rust`, selecting `-t rust` is expected to make the
// command exit unsuccessfully.
sherlock!(file_type_clear, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    wd.create("file.py", "Sherlock");
    wd.create("file.rs", "Sherlock");
    cmd.args(&["--type-clear", "rust", "-t", "rust"]);
    wd.assert_err(&mut cmd);
});
// A type defined on the command line with `--type-add wat:*.wat` is expected
// to be selectable with `-t wat`.
sherlock!(file_type_add, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    wd.create("file.py", "Sherlock");
    wd.create("file.rs", "Sherlock");
    wd.create("file.wat", "Sherlock");
    cmd.args(&["--type-add", "wat:*.wat", "-t", "wat"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "file.wat:Sherlock\n");
});
// With `-g *.rs`, only the file matching the glob is expected in the output.
sherlock!(glob, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    wd.create("file.py", "Sherlock");
    wd.create("file.rs", "Sherlock");
    cmd.args(&["-g", "*.rs"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "file.rs:Sherlock\n");
});
// With the negated glob `-g !*.rs`, the `.rs` file is expected to be excluded,
// leaving only the `.py` file (the default `sherlock` fixture is removed
// first).
sherlock!(glob_negate, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    wd.remove("sherlock");
    wd.create("file.py", "Sherlock");
    wd.create("file.rs", "Sherlock");
    cmd.args(&["-g", "!*.rs"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "file.py:Sherlock\n");
});
// With `-A 1`, each match is expected to be followed by one line of context.
sherlock!(after_context, |wd: WorkDir, mut cmd: Command| {
    cmd.args(&["-A", "1"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
");
});
// `-A 1` with `-n`: matching lines are expected to use `:` after the line
// number and context lines `-`.
sherlock!(after_context_line_numbers, |wd: WorkDir, mut cmd: Command| {
    cmd.args(&["-A", "1", "-n"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
1:For the Doctor Watsons of this world, as opposed to the Sherlock
2-Holmeses, success in the province of detective work must always
3:be, to a very large extent, the result of luck. Sherlock Holmes
4-can extract a clew from a wisp of straw or a flake of cigar ash;
");
});
// With `-B 1`, each match is expected to be preceded by one line of context
// where such a line exists.
sherlock!(before_context, |wd: WorkDir, mut cmd: Command| {
    cmd.args(&["-B", "1"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
");
});
// `-B 1` with `-n`: matching lines are expected to use `:` after the line
// number and context lines `-`.
sherlock!(before_context_line_numbers, |wd: WorkDir, mut cmd: Command| {
    cmd.args(&["-B", "1", "-n"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
1:For the Doctor Watsons of this world, as opposed to the Sherlock
2-Holmeses, success in the province of detective work must always
3:be, to a very large extent, the result of luck. Sherlock Holmes
");
});
// With `-C 1` and two matches that are far apart, the two groups of lines
// are expected to be separated by a `--` line.
sherlock!(context, "world|attached", |wd: WorkDir, mut cmd: Command| {
    cmd.args(&["-C", "1"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
--
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.
");
});
// `-C 1` with `-n`: like the `context` test, but every printed line is
// expected to carry its line number (`:` for matches, `-` for context).
sherlock!(context_line_numbers, "world|attached",
|wd: WorkDir, mut cmd: Command| {
    cmd.args(&["-C", "1", "-n"]);
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
1:For the Doctor Watsons of this world, as opposed to the Sherlock
2-Holmeses, success in the province of detective work must always
--
5-but Doctor Watson has to have it taken out for him and dusted,
6:and exhibited clearly, with a label attached.
");
});
// When the only file containing the query is the hidden file `.sherlock`,
// the command is expected to exit unsuccessfully.
sherlock!(ignore_hidden, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    // Replace the visible fixture with a hidden copy.
    wd.remove("sherlock");
    let hidden_name = ".sherlock";
    wd.create(hidden_name, hay::SHERLOCK);
    wd.assert_err(&mut cmd);
});
// With `--hidden`, the hidden file `.sherlock` is expected to be searched.
sherlock!(no_ignore_hidden, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    // Replace the visible fixture with a hidden copy.
    wd.remove("sherlock");
    wd.create(".sherlock", hay::SHERLOCK);
    cmd.arg("--hidden");
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
.sherlock:For the Doctor Watsons of this world, as opposed to the Sherlock
.sherlock:be, to a very large extent, the result of luck. Sherlock Holmes
");
});
// A `.gitignore` listing `sherlock` is expected to hide the only file that
// contains the query, so the command exits unsuccessfully.
sherlock!(ignore_git, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    let ignore_rules = "sherlock\n";
    wd.create(".gitignore", ignore_rules);
    wd.assert_err(&mut cmd);
});
// A `.rgignore` listing `sherlock` is expected to hide the only file that
// contains the query, so the command exits unsuccessfully.
sherlock!(ignore_ripgrep, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    let ignore_rules = "sherlock\n";
    wd.create(".rgignore", ignore_rules);
    wd.assert_err(&mut cmd);
});
// With `--no-ignore`, a `.gitignore` listing `sherlock` is expected to have
// no effect: the file is still searched.
sherlock!(no_ignore, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    wd.create(".gitignore", "sherlock\n");
    cmd.arg("--no-ignore");
    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
sherlock:For the Doctor Watsons of this world, as opposed to the Sherlock
sherlock:be, to a very large extent, the result of luck. Sherlock Holmes
");
});
// A `.gitignore` in a parent directory is expected to apply when searching
// from inside a child directory.
sherlock!(ignore_git_parent, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    wd.remove("sherlock");
    wd.create(".gitignore", "sherlock\n");
    wd.create_dir(".git");
    wd.create_dir("foo");
    wd.create("foo/sherlock", hay::SHERLOCK);

    // Even though we search in foo/, which has no .gitignore, ripgrep will
    // search parent directories and respect the gitignore files found.
    let search_root = wd.path().join("foo");
    cmd.current_dir(search_root);
    wd.assert_err(&mut cmd);
});
// Checks that the upward search for .gitignore files stops once a .git
// directory has been seen.
//
// Directory hierarchy used:
//
//     .gitignore (contains `sherlock`)
//     foo/
//       .git
//       bar/
//         sherlock
//
// The search runs inside `foo/bar/`. ripgrep is expected to stop looking for
// .gitignore files after it sees `foo/.git/`, and therefore not to respect
// the top-level `.gitignore` containing `sherlock`.
sherlock!(ignore_git_parent_stop, "Sherlock", ".",
|wd: WorkDir, mut cmd: Command| {
    wd.remove("sherlock");
    wd.create(".gitignore", "sherlock\n");
    wd.create_dir("foo");
    wd.create_dir("foo/.git");
    wd.create_dir("foo/bar");
    wd.create("foo/bar/sherlock", hay::SHERLOCK);

    let search_root = wd.path().join("foo").join("bar");
    cmd.current_dir(search_root);

    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
sherlock:For the Doctor Watsons of this world, as opposed to the Sherlock
sherlock:be, to a very large extent, the result of luck. Sherlock Holmes
");
});
// Counterpart of the `ignore_git_parent_stop` test: uses the same directory
// layout with a top-level `.rgignore` instead of a `.gitignore`, and checks
// that ripgrep *doesn't* stop looking for .rgignore files at `foo/.git/`.
sherlock!(ignore_ripgrep_parent_no_stop, "Sherlock", ".",
|wd: WorkDir, mut cmd: Command| {
    wd.remove("sherlock");
    wd.create(".rgignore", "sherlock\n");
    wd.create_dir("foo");
    wd.create_dir("foo/.git");
    wd.create_dir("foo/bar");
    wd.create("foo/bar/sherlock", hay::SHERLOCK);

    let search_root = wd.path().join("foo").join("bar");
    cmd.current_dir(search_root);

    // The top-level .rgignore applies, so nothing is found.
    wd.assert_err(&mut cmd);
});
// Checks `--no-ignore-parent`.
//
// Directory hierarchy used:
//
//     .gitignore          (contains `sherlock`)
//     foo/
//       .gitignore        (contains `watson`)
//       sherlock
//       watson
//
// The search runs from the `foo` directory. By default, ripgrep will search
// parent directories for .gitignore files; the --no-ignore-parent flag should
// prevent that. At the same time, `foo/.gitignore` is still expected to be
// respected, since the search is happening in `foo/`.
//
// In other words, results should come from `sherlock` only, not from
// `watson`.
sherlock!(no_parent_ignore_git, "Sherlock", ".",
|wd: WorkDir, mut cmd: Command| {
    wd.remove("sherlock");
    wd.create(".gitignore", "sherlock\n");
    wd.create_dir("foo");
    wd.create("foo/.gitignore", "watson\n");
    wd.create("foo/sherlock", hay::SHERLOCK);
    wd.create("foo/watson", hay::SHERLOCK);

    let search_root = wd.path().join("foo");
    cmd.current_dir(search_root);
    cmd.arg("--no-ignore-parent");

    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "\
sherlock:For the Doctor Watsons of this world, as opposed to the Sherlock
sherlock:be, to a very large extent, the result of luck. Sherlock Holmes
");
});
// Without `-L`, a directory symlink inside the search root is expected not
// to be followed, so the file behind it is not found.
sherlock!(symlink_nofollow, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    wd.remove("sherlock");
    wd.create_dir("foo");
    wd.create_dir("foo/bar");
    // foo/bar/baz -> foo/baz; the link is made before its source exists.
    wd.link("foo/baz", "foo/bar/baz");
    wd.create_dir("foo/baz");
    wd.create("foo/baz/sherlock", hay::SHERLOCK);

    let search_root = wd.path().join("foo/bar");
    cmd.current_dir(search_root);
    wd.assert_err(&mut cmd);
});
// With `-L`, a directory symlink inside the search root is expected to be
// followed, so the file behind it is found. The expected path separator
// depends on the platform.
sherlock!(symlink_follow, "Sherlock", ".", |wd: WorkDir, mut cmd: Command| {
    wd.remove("sherlock");
    wd.create_dir("foo");
    wd.create_dir("foo/bar");
    wd.create_dir("foo/baz");
    wd.create("foo/baz/sherlock", hay::SHERLOCK);
    // foo/bar/baz -> foo/baz
    wd.link("foo/baz", "foo/bar/baz");

    cmd.arg("-L");
    cmd.current_dir(wd.path().join("foo/bar"));

    let got: String = wd.stdout(&mut cmd);
    let want = if cfg!(windows) {
        "\
baz\\sherlock:For the Doctor Watsons of this world, as opposed to the Sherlock
baz\\sherlock:be, to a very large extent, the result of luck. Sherlock Holmes
"
    } else {
        "\
baz/sherlock:For the Doctor Watsons of this world, as opposed to the Sherlock
baz/sherlock:be, to a very large extent, the result of luck. Sherlock Holmes
"
    };
    assert_eq!(got, want);
});
/// A file containing NUL bytes is searched without `-a`; the command is
/// expected to exit unsuccessfully.
#[test]
fn binary_nosearch() {
    let wd = WorkDir::new("binary_nosearch");
    let contents = "foo\x00bar\nfoo\x00baz\n";
    wd.create("file", contents);

    let mut cmd = wd.command();
    cmd.args(&["foo", "file"]);
    wd.assert_err(&mut cmd);
}
// The following two tests show a discrepancy in search results between
// searching with memory mapped files and stream searching. Stream searching
// uses a heuristic (that GNU grep also uses) where NUL bytes are replaced with
// the EOL terminator, which tends to avoid allocating large amounts of memory
// for really long "lines." The memory map searcher has no need to worry about
// such things, and more than that, it would be pretty hard for it to match
// the semantics of streaming search in this case.
//
// Binary files with lots of NULs aren't really part of the use case of ripgrep
// (or any other grep-like tool for that matter), so we shouldn't feel too bad
// about it.
/// With `-a --mmap`, a file containing NUL bytes is searched and the matching
/// lines are expected to be printed with their NUL bytes intact.
#[test]
fn binary_search_mmap() {
    let wd = WorkDir::new("binary_search_mmap");
    wd.create("file", "foo\x00bar\nfoo\x00baz\n");

    let mut cmd = wd.command();
    cmd.args(&["-a", "--mmap", "foo", "file"]);

    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "foo\x00bar\nfoo\x00baz\n");
}
/// With `-a --no-mmap`, a file containing NUL bytes is searched and each
/// printed match is expected to end where the NUL byte was.
#[test]
fn binary_search_no_mmap() {
    let wd = WorkDir::new("binary_search_no_mmap");
    wd.create("file", "foo\x00bar\nfoo\x00baz\n");

    let mut cmd = wd.command();
    cmd.args(&["-a", "--no-mmap", "foo", "file"]);

    let got: String = wd.stdout(&mut cmd);
    assert_eq!(got, "foo\nfoo\n");
}
/// With `--files`, the command is expected to print the path of each file in
/// the working directory instead of searching.
///
/// Either ordering of the two paths is accepted, and the expected separator
/// after `dir` depends on the platform.
#[test]
fn files() {
let wd = WorkDir::new("files");
// One file at the top level and one inside a sub-directory.
wd.create("file", "");
wd.create_dir("dir");
wd.create("dir/file", "");
let mut cmd = wd.command();
cmd.arg("--files");
let lines: String = wd.stdout(&mut cmd);
if cfg!(windows) {
// On Windows the nested path is expected with a backslash after `dir`.
assert!(lines == "./dir\\file\n./file\n"
|| lines == "./file\n./dir\\file\n");
} else {
assert!(lines == "./file\n./dir/file\n"
|| lines == "./dir/file\n./file\n");
}
}
/// With `--type-list`, the command is expected to succeed and print
/// something; the exact contents are deliberately not checked.
#[test]
fn type_list() {
let wd = WorkDir::new("type_list");
let mut cmd = wd.command();
cmd.arg("--type-list");
let lines: String = wd.stdout(&mut cmd);
// This can change over time, so just make sure we print something.
assert!(!lines.is_empty());
}

189
tests/workdir.rs Normal file
View File

@@ -0,0 +1,189 @@
use std::env;
use std::error;
use std::fmt;
use std::fs::{self, File};
use std::io::{self, Write};
use std::path::{Path, PathBuf};
use std::process;
use std::str::FromStr;
use std::sync::atomic::{ATOMIC_USIZE_INIT, AtomicUsize, Ordering};
use std::thread;
use std::time::Duration;
/// Name of the directory, joined onto a `WorkDir`'s root, under which all
/// per-test working directories are created.
static TEST_DIR: &'static str = "ripgrep-tests";
/// Process-wide counter used by `WorkDir::new` to give every working
/// directory a distinct numeric path component.
static NEXT_ID: AtomicUsize = ATOMIC_USIZE_INIT;
/// WorkDir represents a directory in which tests are run.
///
/// Each directory path ends in an id taken from a global atomic counter, so
/// two `WorkDir`s created with the same name do not share a directory.
#[derive(Debug)]
pub struct WorkDir {
/// The directory that `bin` joins the executable name onto, and under
/// which the test directories are created. It is derived from the location
/// of the running test executable.
root: PathBuf,
/// The directory in which the test should run. If a test needs to create
/// files, they should go in here. Commands built by `command` start with
/// this as their current directory.
dir: PathBuf,
}
impl WorkDir {
    /// Create a new test working directory with the given name.
    ///
    /// The name does not need to be distinct for each invocation (a unique
    /// numeric id is appended to the path), but should correspond to a
    /// logical grouping of tests.
    ///
    /// # Panics
    ///
    /// Panics if the location of the running executable cannot be determined
    /// or if the directory cannot be created.
    pub fn new(name: &str) -> WorkDir {
        let id = NEXT_ID.fetch_add(1, Ordering::SeqCst);
        let mut root = env::current_exe().unwrap()
            .parent().expect("executable's directory").to_path_buf();
        // Cargo may place test executables in a `deps` sub-directory of the
        // profile directory, while the binary under test lives one level up.
        // Step out of `deps` so that `bin` resolves in both layouts.
        if root.ends_with("deps") {
            root.pop();
        }
        let dir = root.join(TEST_DIR).join(name).join(id.to_string());
        nice_err(&dir, repeat(|| fs::create_dir_all(&dir)));
        WorkDir {
            root: root,
            dir: dir,
        }
    }

    /// Create a new file with the given name and contents in this directory.
    ///
    /// Panics (mentioning the path) if the file cannot be created or written.
    pub fn create<P: AsRef<Path>>(&self, name: P, contents: &str) {
        let path = self.dir.join(name);
        let mut file = nice_err(&path, File::create(&path));
        nice_err(&path, file.write_all(contents.as_bytes()));
        nice_err(&path, file.flush());
    }

    /// Remove a file with the given name from this directory.
    ///
    /// Panics (mentioning the path) if the file cannot be removed.
    pub fn remove<P: AsRef<Path>>(&self, name: P) {
        let path = self.dir.join(name);
        nice_err(&path, fs::remove_file(&path));
    }

    /// Create a new directory with the given path (and any directories above
    /// it) inside this directory.
    ///
    /// Creation is retried a number of times before giving up and panicking.
    pub fn create_dir<P: AsRef<Path>>(&self, path: P) {
        let path = self.dir.join(path);
        nice_err(&path, repeat(|| fs::create_dir_all(&path)));
    }

    /// Creates a new command that is set to use the ripgrep executable in
    /// this working directory.
    pub fn command(&self) -> process::Command {
        let mut cmd = process::Command::new(self.bin());
        cmd.current_dir(&self.dir);
        cmd
    }

    /// Returns the path to the ripgrep executable.
    pub fn bin(&self) -> PathBuf {
        self.root.join("rg")
    }

    /// Returns the path to this directory.
    pub fn path(&self) -> &Path {
        &self.dir
    }

    /// Creates a directory symlink to the src with the given target name
    /// in this directory.
    ///
    /// Both paths are interpreted relative to this directory. Anything
    /// removable as a file at `target` is removed first; a failure to remove
    /// it is ignored.
    #[cfg(not(windows))]
    pub fn link<S: AsRef<Path>, T: AsRef<Path>>(&self, src: S, target: T) {
        use std::os::unix::fs::symlink;

        let src = self.dir.join(src);
        let target = self.dir.join(target);
        let _ = fs::remove_file(&target);
        nice_err(&target, symlink(&src, &target));
    }

    /// Creates a directory symlink to the src with the given target name
    /// in this directory.
    ///
    /// Both paths are interpreted relative to this directory. Anything
    /// removable as a directory at `target` is removed first; a failure to
    /// remove it is ignored.
    #[cfg(windows)]
    pub fn link<S: AsRef<Path>, T: AsRef<Path>>(&self, src: S, target: T) {
        use std::os::windows::fs::symlink_dir;

        let src = self.dir.join(src);
        let target = self.dir.join(target);
        let _ = fs::remove_dir(&target);
        nice_err(&target, symlink_dir(&src, &target));
    }

    /// Runs and captures the stdout of the given command.
    ///
    /// The captured bytes are converted to a string lossily and then parsed
    /// into `T`. If the return type could not be created from a string, then
    /// this panics. It also panics if the command itself fails (see
    /// `output`).
    pub fn stdout<E: fmt::Debug, T: FromStr<Err=E>>(
        &self,
        cmd: &mut process::Command,
    ) -> T {
        let o = self.output(cmd);
        let stdout = String::from_utf8_lossy(&o.stdout);
        match stdout.parse() {
            Ok(t) => t,
            Err(err) => {
                panic!("could not convert from string: {:?}\n\n{}", err, stdout);
            }
        }
    }

    /// Gets the output of a command. If the command failed, then this panics.
    ///
    /// The panic message includes the command, the working directory, the
    /// exit status and everything the command wrote to stdout and stderr.
    pub fn output(&self, cmd: &mut process::Command) -> process::Output {
        let o = cmd.output().unwrap();
        if !o.status.success() {
            // A failure with nothing on stderr gets an extra hint, since that
            // is what a search without results looks like.
            let suggest =
                if o.stderr.is_empty() {
                    "\n\nDid your search end up with no results?"
                } else {
                    ""
                };
            panic!("\n\n==========\n\
                    command failed but expected success!\
                    {}\
                    \n\ncommand: {:?}\
                    \ncwd: {}\
                    \n\nstatus: {}\
                    \n\nstdout: {}\
                    \n\nstderr: {}\
                    \n\n==========\n",
                   suggest, cmd, self.dir.display(), o.status,
                   String::from_utf8_lossy(&o.stdout),
                   String::from_utf8_lossy(&o.stderr));
        }
        o
    }

    /// Runs the given command and asserts that it resulted in an error exit
    /// code.
    ///
    /// Panics if the command could not be run at all, or if it exited
    /// successfully.
    pub fn assert_err(&self, cmd: &mut process::Command) {
        let o = cmd.output().unwrap();
        if o.status.success() {
            panic!("\n\n===== {:?} =====\n\
                    command succeeded but expected failure!\
                    \n\ncwd: {}\
                    \n\nstatus: {}\
                    \n\nstdout: {}\n\nstderr: {}\
                    \n\n=====\n",
                   cmd, self.dir.display(), o.status,
                   String::from_utf8_lossy(&o.stdout),
                   String::from_utf8_lossy(&o.stderr));
        }
    }
}
/// Unwraps `res`, returning the success value.
///
/// On error this panics with a message made of the displayed `path` followed
/// by the `Debug` rendering of the error, so a failing filesystem operation
/// in a test names the path it was working on.
fn nice_err<P: AsRef<Path>, T, E: error::Error>(
    path: P,
    res: Result<T, E>,
) -> T {
    res.unwrap_or_else(|failure| {
        panic!("{}: {:?}", path.as_ref().display(), failure)
    })
}
/// Calls `f` until it succeeds, giving up after ten attempts.
///
/// Returns `Ok(())` as soon as one call succeeds. After a failed attempt the
/// thread sleeps for 500ms before trying again; no sleep happens after the
/// final attempt. If every attempt fails, the error from the last attempt is
/// returned.
fn repeat<F: FnMut() -> io::Result<()>>(mut f: F) -> io::Result<()> {
    const MAX_ATTEMPTS: usize = 10;

    let mut attempt = 1;
    loop {
        match f() {
            Ok(()) => return Ok(()),
            Err(err) => {
                // Out of attempts: hand back the most recent error without
                // waiting any further.
                if attempt >= MAX_ATTEMPTS {
                    return Err(err);
                }
            }
        }
        attempt += 1;
        thread::sleep(Duration::from_millis(500));
    }
}