benchsuite: remove sift, pt and ucg

None of these tools got particularly popular (except for pt, briefly), and none of them appear to be active projects nowadays. While ucg was fast, sift and pt were excruciatingly slow in a number of cases, which required special care in the benchmarks. This also fixes the ordering of benchmark output to reflect the ordering in the source of the benchsuite script.
2025-05-19 09:40:22 -07:00 · 2020-10-14 15:01:15 -04:00 · 2020-10-14 15:01:15 -04:00 · 5ebb3ad039
commit 5ebb3ad039
parent b0066274cb
1 changed files with 45 additions and 118 deletions
--- a/benchsuite/benchsuite
+++ b/benchsuite/benchsuite
@ -71,15 +71,8 @@ def bench_linux_literal_default(suite_dir):
    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', pat]),
        mkcmd('ag', ['ag', pat]),
        # ucg reports the exact same matches as ag and rg even though it
        # doesn't read gitignore files. Instead, it has a file whitelist
        # that happens to match up exactly with the gitignores for this search.
        mkcmd('ucg', ['ucg', pat]),
        # I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the
        # default, but I'd guess it to be on most desktop systems.
        mkcmd('pt', ['pt', pat]),
        # sift reports an extra line here for a binary file matched.
        mkcmd('sift', ['sift', pat]),
        mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}),
    ])
@ -102,16 +95,12 @@ def bench_linux_literal(suite_dir):
        return Command(*args, **kwargs)
    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', pat]),
+        mkcmd('rg', ['rg', '-n', pat]),
-        mkcmd('rg (ignore) (mmap)', ['rg', '-n', '--mmap', pat]),
+        mkcmd('rg (mmap)', ['rg', '-n', '--mmap', pat]),
-        mkcmd('ag (ignore) (mmap)', ['ag', '-s', pat]),
+        mkcmd('ag (mmap)', ['ag', '-s', pat]),
-        mkcmd('pt (ignore)', ['pt', pat]),
+        mkcmd('git grep', [
        mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
        mkcmd('git grep (ignore)', [
            'git', 'grep', '-I', '-n', pat,
        ], env={'LC_ALL': 'C'}),
        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
    ])
@ -131,31 +120,22 @@ def bench_linux_literal_casei(suite_dir):
        return Command(*args, **kwargs)
    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
+        mkcmd('rg', ['rg', '-n', '-i', pat]),
-        mkcmd('rg (ignore) (mmap)', ['rg', '-n', '-i', '--mmap', pat]),
+        mkcmd('rg (mmap)', ['rg', '-n', '-i', '--mmap', pat]),
-        mkcmd('ag (ignore) (mmap)', ['ag', '-i', pat]),
+        mkcmd('ag (mmap)', ['ag', '-i', pat]),
        mkcmd('pt (ignore)', ['pt', '-i', pat]),
        mkcmd('sift (ignore)', SIFT + ['-n', '-i', '--git', pat]),
        # It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here,
        # since that is certainly what ripgrep is doing, but this is for an
        # ASCII literal, so we should give `git grep` all the opportunity to
        # do its best.
-        mkcmd('git grep (ignore)', [
+        mkcmd('git grep', [
            'git', 'grep', '-I', '-n', '-i', pat,
        ], env={'LC_ALL': 'C'}),
        mkcmd('rg (whitelist)', [
            'rg', '-n', '-i', '--no-ignore', '-tall', pat,
        ]),
        mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
    ])
 def bench_linux_re_literal_suffix(suite_dir):
    '''
    Benchmark the speed of a literal inside a regex.
    This, for example, inhibits a prefix byte optimization used
    inside of Go's regex engine (relevant for sift and pt).
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
@ -166,26 +146,19 @@ def bench_linux_re_literal_suffix(suite_dir):
        return Command(*args, **kwargs)
    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', pat]),
+        mkcmd('rg', ['rg', '-n', pat]),
-        mkcmd('ag (ignore)', ['ag', '-s', pat]),
+        mkcmd('ag', ['ag', '-s', pat]),
        mkcmd('pt (ignore)', ['pt', '-e', pat]),
        mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
        mkcmd(
-            'git grep (ignore)',
+            'git grep',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
    ])
 def bench_linux_word(suite_dir):
    '''
    Benchmark use of the -w ("match word") flag in each tool.
    sift has a lot of trouble with this because it forces it into Go's
    regex engine by surrounding the pattern with \b assertions.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
@ -196,28 +169,19 @@ def bench_linux_word(suite_dir):
        return Command(*args, **kwargs)
    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', '-w', pat]),
+        mkcmd('rg', ['rg', '-n', '-w', pat]),
-        mkcmd('ag (ignore)', ['ag', '-s', '-w', pat]),
+        mkcmd('ag', ['ag', '-s', '-w', pat]),
        mkcmd('pt (ignore)', ['pt', '-w', pat]),
        mkcmd('sift (ignore)', SIFT + ['-n', '-w', '--git', pat]),
        mkcmd(
-            'git grep (ignore)',
+            'git grep',
            ['git', 'grep', '-E', '-I', '-n', '-w', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('rg (whitelist)', [
            'rg', '-n', '-w', '--no-ignore', '-tall', pat,
        ]),
        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', '-w', pat]),
    ])
 def bench_linux_unicode_greek(suite_dir):
    '''
    Benchmark matching of a Unicode category.
    Only three tools (ripgrep, sift and pt) support this. We omit
    pt because it is too slow.
    '''
    require(suite_dir, 'linux')
    cwd = path.join(suite_dir, LINUX_DIR)
@ -229,8 +193,6 @@ def bench_linux_unicode_greek(suite_dir):
    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', '-n', pat]),
        mkcmd('pt', ['pt', '-e', pat]),
        mkcmd('sift', SIFT + ['-n', '--git', pat]),
    ])
@ -250,8 +212,6 @@ def bench_linux_unicode_greek_casei(suite_dir):
    return Benchmark(pattern=pat, commands=[
        mkcmd('rg', ['rg', '-n', '-i', pat]),
        mkcmd('pt', ['pt', '-i', '-e', pat]),
        mkcmd('sift', SIFT + ['-n', '-i', '--git', pat]),
    ])
@ -272,26 +232,19 @@ def bench_linux_unicode_word(suite_dir):
        return Command(*args, **kwargs)
    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', pat]),
+        mkcmd('rg', ['rg', '-n', pat]),
-        mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
+        mkcmd('rg (ASCII)', ['rg', '-n', '(?-u)' + pat]),
-        mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
+        mkcmd('ag (ASCII)', ['ag', '-s', pat]),
        mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]),
        mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]),
        mkcmd(
-            'git grep (ignore)',
+            'git grep',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'en_US.UTF-8'},
        ),
        mkcmd(
-            'git grep (ignore) (ASCII)',
+            'git grep (ASCII)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
        mkcmd('rg (whitelist) (ASCII)', [
            'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
        ]),
        mkcmd('ucg (ASCII)', ['ucg', '--nosmart-case', pat]),
    ])
@ -313,26 +266,19 @@ def bench_linux_no_literal(suite_dir):
        return Command(*args, **kwargs)
    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', pat]),
+        mkcmd('rg', ['rg', '-n', pat]),
-        mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]),
+        mkcmd('rg (ASCII)', ['rg', '-n', '(?-u)' + pat]),
-        mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]),
+        mkcmd('ag (ASCII)', ['ag', '-s', pat]),
        mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]),
        mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]),
        mkcmd(
-            'git grep (ignore)',
+            'git grep',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'en_US.UTF-8'},
        ),
        mkcmd(
-            'git grep (ignore) (ASCII)',
+            'git grep (ASCII)',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]),
        mkcmd('rg (whitelist) (ASCII)', [
            'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat,
        ]),
        mkcmd('ucg (whitelist) (ASCII)', ['ucg', '--nosmart-case', pat]),
    ])
@ -354,15 +300,13 @@ def bench_linux_alternates(suite_dir):
        return Command(*args, **kwargs)
    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', pat]),
+        mkcmd('rg', ['rg', '-n', pat]),
-        mkcmd('ag (ignore)', ['ag', '-s', pat]),
+        mkcmd('ag', ['ag', '-s', pat]),
        mkcmd(
-            'git grep (ignore)',
+            'git grep',
            ['git', 'grep', '-E', '-I', '-n', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', pat]),
        mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]),
    ])
@ -377,15 +321,13 @@ def bench_linux_alternates_casei(suite_dir):
        return Command(*args, **kwargs)
    return Benchmark(pattern=pat, commands=[
-        mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]),
+        mkcmd('rg', ['rg', '-n', '-i', pat]),
-        mkcmd('ag (ignore)', ['ag', '-i', pat]),
+        mkcmd('ag', ['ag', '-i', pat]),
        mkcmd(
-            'git grep (ignore)',
+            'git grep',
            ['git', 'grep', '-E', '-I', '-n', '-i', pat],
            env={'LC_ALL': 'C'},
        ),
        mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', '-i', pat]),
        mkcmd('ucg (whitelist)', ['ucg', '-i', pat]),
    ])
@ -400,15 +342,10 @@ def bench_subtitles_en_literal(suite_dir):
    return Benchmark(pattern=pat, commands=[
        Command('rg', ['rg', pat, en]),
        Command('rg (no mmap)', ['rg', '--no-mmap', pat, en]),
-        Command('pt', ['pt', '-N', pat, en]),
+        Command('grep', ['grep', pat, en], env=GREP_ASCII),
        Command('sift', ['sift', pat, en]),
        Command('grep', ['grep', '-a', pat, en], env=GREP_ASCII),
        Command('rg (lines)', ['rg', '-n', pat, en]),
        Command('ag (lines)', ['ag', '-s', pat, en]),
-        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
+        Command('grep (lines)', ['grep', '-n', pat, en], env=GREP_ASCII),
        Command('pt (lines)', ['pt', pat, en]),
        Command('sift (lines)', ['sift', '-n', pat, en]),
        Command('grep (lines)', ['grep', '-an', pat, en], env=GREP_ASCII),
    ])
@ -428,7 +365,6 @@ def bench_subtitles_en_literal_casei(suite_dir):
        ], env=GREP_ASCII),
        Command('rg (lines)', ['rg', '-n', '-i', pat, en]),
        Command('ag (lines) (ASCII)', ['ag', '-i', pat, en]),
        Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, en]),
    ])
@ -445,7 +381,6 @@ def bench_subtitles_en_literal_word(suite_dir):
            'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', en,
        ]),
        Command('ag (ASCII)', ['ag', '-sw', pat, en]),
        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
        Command('grep (ASCII)', [
            'grep', '-anw', pat, en,
        ], env=GREP_ASCII),
@ -471,7 +406,6 @@ def bench_subtitles_en_alternate(suite_dir):
    return Benchmark(pattern=pat, commands=[
        Command('rg (lines)', ['rg', '-n', pat, en]),
        Command('ag (lines)', ['ag', '-s', pat, en]),
        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]),
        Command('grep (lines)', [
            'grep', '-E', '-an', pat, en,
        ], env=GREP_ASCII),
@ -498,7 +432,6 @@ def bench_subtitles_en_alternate_casei(suite_dir):
    return Benchmark(pattern=pat, commands=[
        Command('ag (ASCII)', ['ag', '-s', '-i', pat, en]),
        Command('ucg (ASCII)', ['ucg', '-i', pat, en]),
        Command('grep (ASCII)', [
            'grep', '-E', '-ani', pat, en,
        ], env=GREP_ASCII),
@ -520,7 +453,6 @@ def bench_subtitles_en_surrounding_words(suite_dir):
        Command('grep', ['grep', '-E', '-an', pat, en], env=GREP_UNICODE),
        Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
        Command('ag (ASCII)', ['ag', '-s', pat, en]),
        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
        Command('grep (ASCII)', [
            'grep', '-E', '-an', pat, en,
        ], env=GREP_ASCII),
@ -544,7 +476,6 @@ def bench_subtitles_en_no_literal(suite_dir):
        Command('rg', ['rg', '-n', pat, en]),
        Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
        Command('ag (ASCII)', ['ag', '-s', pat, en]),
        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]),
        Command('grep (ASCII)', [
            'grep', '-E', '-an', pat, en,
        ], env=GREP_ASCII),
@ -562,14 +493,9 @@ def bench_subtitles_ru_literal(suite_dir):
    return Benchmark(pattern=pat, commands=[
        Command('rg', ['rg', pat, ru]),
        Command('rg (no mmap)', ['rg', '--no-mmap', pat, ru]),
        Command('pt', ['pt', '-N', pat, ru]),
        Command('sift', ['sift', pat, ru]),
        Command('grep', ['grep', '-a', pat, ru], env=GREP_ASCII),
        Command('rg (lines)', ['rg', '-n', pat, ru]),
        Command('ag (lines)', ['ag', '-s', pat, ru]),
        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
        Command('pt (lines)', ['pt', pat, ru]),
        Command('sift (lines)', ['sift', '-n', pat, ru]),
        Command('grep (lines)', ['grep', '-an', pat, ru], env=GREP_ASCII),
    ])
@ -590,7 +516,6 @@ def bench_subtitles_ru_literal_casei(suite_dir):
        ], env=GREP_ASCII),
        Command('rg (lines)', ['rg', '-n', '-i', pat, ru]),
        Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]),
        Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, ru]),
    ])
@ -607,7 +532,6 @@ def bench_subtitles_ru_literal_word(suite_dir):
            'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru,
        ]),
        Command('ag (ASCII)', ['ag', '-sw', pat, ru]),
        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
        Command('grep (ASCII)', [
            'grep', '-anw', pat, ru,
        ], env=GREP_ASCII),
@ -633,7 +557,6 @@ def bench_subtitles_ru_alternate(suite_dir):
    return Benchmark(pattern=pat, commands=[
        Command('rg (lines)', ['rg', '-n', pat, ru]),
        Command('ag (lines)', ['ag', '-s', pat, ru]),
        Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]),
        Command('grep (lines)', [
            'grep', '-E', '-an', pat, ru,
        ], env=GREP_ASCII),
@ -660,7 +583,6 @@ def bench_subtitles_ru_alternate_casei(suite_dir):
    return Benchmark(pattern=pat, commands=[
        Command('ag (ASCII)', ['ag', '-s', '-i', pat, ru]),
        Command('ucg (ASCII)', ['ucg', '-i', pat, ru]),
        Command('grep (ASCII)', [
            'grep', '-E', '-ani', pat, ru,
        ], env=GREP_ASCII),
@ -681,7 +603,6 @@ def bench_subtitles_ru_surrounding_words(suite_dir):
        Command('rg', ['rg', '-n', pat, ru]),
        Command('grep', ['grep', '-E', '-an', pat, ru], env=GREP_UNICODE),
        Command('ag (ASCII)', ['ag', '-s', pat, ru]),
        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
        Command('grep (ASCII)', [
            'grep', '-E', '-an', pat, ru,
        ], env=GREP_ASCII),
@ -705,7 +626,6 @@ def bench_subtitles_ru_no_literal(suite_dir):
        Command('rg', ['rg', '-n', pat, ru]),
        Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]),
        Command('ag (ASCII)', ['ag', '-s', pat, ru]),
        Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
        Command('grep (ASCII)', [
            'grep', '-E', '-an', pat, ru,
        ], env=GREP_ASCII),
@ -758,7 +678,7 @@ class Benchmark(object):
    def __init__(self, name=None, pattern=None, commands=None,
                 warmup_count=1, count=3, line_count=True,
                 allow_missing_commands=False,
-                 disabled_cmds=None):
+                 disabled_cmds=None, order=0):
        '''
        Create a single benchmark.
@ -794,6 +714,8 @@ class Benchmark(object):
            will simply skip it.
        :param list(str) disabled_cmds:
            A list of commands to skip.
        :param int order:
            An integer indicating the sequence number of this benchmark.
        '''
        self.name = name
        self.pattern = pattern
@ -803,6 +725,7 @@ class Benchmark(object):
        self.line_count = line_count
        self.allow_missing_commands = allow_missing_commands
        self.disabled_cmds = set(disabled_cmds or [])
        self.order = order
    def raise_if_missing(self):
        '''
@ -1165,19 +1088,22 @@ def collect_benchmarks(suite_dir, filter_pat=None,
        requires corpora that are missing, then a log message is
        emitted to stderr and it is not yielded.
    '''
-    for fun in sorted(globals()):
+    benchmarks = []
-        if not fun.startswith('bench_'):
+    for global_name in globals():
        if not global_name.startswith('bench_'):
            continue
-        name = re.sub('^bench_', '', fun)
+        name = re.sub('^bench_', '', global_name)
        if filter_pat is not None and not re.search(filter_pat, name):
            continue
        try:
-            benchmark = globals()[fun](suite_dir)
+            fun = globals()[global_name]
            benchmark = fun(suite_dir)
            benchmark.name = name
            benchmark.warmup_count = warmup_iter
            benchmark.count = bench_iter
            benchmark.allow_missing_commands = allow_missing_commands
            benchmark.disabled_cmds = disabled_cmds
            benchmark.order = fun.__code__.co_firstlineno
            benchmark.raise_if_missing()
        except MissingDependencies as e:
            eprint(
@ -1192,7 +1118,8 @@ def collect_benchmarks(suite_dir, filter_pat=None,
                  '(run with --allow-missing to run incomplete benchmarks)'
            eprint(fmt % (', '.join(e.missing_names), name))
            continue
-        yield benchmark
+        benchmarks.append(benchmark)
    return sorted(benchmarks, key=lambda b: b.order)
 def main():