mirror of
https://github.com/BurntSushi/ripgrep.git
synced 2025-05-19 09:40:22 -07:00
More benchmarks for subtitle corpus.
This commit is contained in:
parent
954fbeb1d8
commit
466cd70a8e
168
benchsuite
168
benchsuite
@ -23,6 +23,7 @@ import time
|
|||||||
|
|
||||||
SUBTITLES_DIR = 'subtitles'
|
SUBTITLES_DIR = 'subtitles'
|
||||||
SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en'
|
SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en'
|
||||||
|
SUBTITLES_EN_NAME_SAMPLE = 'OpenSubtitles2016.raw.sample.en'
|
||||||
SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME
|
SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME
|
||||||
SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz'
|
SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz'
|
||||||
SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru'
|
SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru'
|
||||||
@ -32,6 +33,12 @@ SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitl
|
|||||||
LINUX_DIR = 'linux'
|
LINUX_DIR = 'linux'
|
||||||
LINUX_CLONE = 'git://github.com/BurntSushi/linux'
|
LINUX_CLONE = 'git://github.com/BurntSushi/linux'
|
||||||
|
|
||||||
|
# Grep takes locale settings from the environment. There is a *substantial*
|
||||||
|
# performance impact for enabling Unicode, so we need to handle this explicitly
|
||||||
|
# in our benchmarks.
|
||||||
|
GREP_ASCII = {'LC_ALL': 'C'}
|
||||||
|
GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'}
|
||||||
|
|
||||||
|
|
||||||
def bench_linux_literal_default(suite_dir):
|
def bench_linux_literal_default(suite_dir):
|
||||||
'''
|
'''
|
||||||
@ -320,10 +327,10 @@ def bench_linux_no_literal(suite_dir):
|
|||||||
|
|
||||||
return Benchmark(pattern=pat, commands=[
|
return Benchmark(pattern=pat, commands=[
|
||||||
mkcmd('rg', ['rg', '-n', pat]),
|
mkcmd('rg', ['rg', '-n', pat]),
|
||||||
mkcmd('rg-novcs', ['rg', '--no-ignore', '-n', pat]),
|
mkcmd('rg-whitelist', ['rg', '-tall', '--no-ignore', '-n', pat]),
|
||||||
mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]),
|
mkcmd('rg (no Unicode)', ['rg', '-n', '(?-u)' + pat]),
|
||||||
mkcmd('rg-novcs (no Unicode)', [
|
mkcmd('rg-whitelist (no Unicode)', [
|
||||||
'rg', '--no-ignore', '-n', '(?-u)' + pat,
|
'rg', '-tall', '--no-ignore', '-n', '(?-u)' + pat,
|
||||||
]),
|
]),
|
||||||
mkcmd('ag (no Unicode)', ['ag', '-s', pat]),
|
mkcmd('ag (no Unicode)', ['ag', '-s', pat]),
|
||||||
mkcmd('ag-novcs (no Unicode)', [
|
mkcmd('ag-novcs (no Unicode)', [
|
||||||
@ -411,18 +418,141 @@ def bench_linux_alternates_casei(suite_dir):
|
|||||||
])
|
])
|
||||||
|
|
||||||
|
|
||||||
# BREADCRUMBS(burntsushi): We should benchmark an alternation for `linux` as
|
def bench_subtitles_en_literal(suite_dir):
|
||||||
# well.
|
'''
|
||||||
|
Benchmark the speed of an ASCII string literal.
|
||||||
def bench_sherlock(suite_dir):
|
'''
|
||||||
'TODO: Fix this and add more single file benchmarks.'
|
|
||||||
require(suite_dir, 'subtitles-en')
|
require(suite_dir, 'subtitles-en')
|
||||||
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME)
|
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
|
||||||
pat = 'Sherlock'
|
pat = 'Sherlock Holmes'
|
||||||
|
|
||||||
return Benchmark(pattern=pat, commands=[
|
return Benchmark(pattern=pat, commands=[
|
||||||
Command('rg', ['rg', pat, en]),
|
Command('rg', ['rg', '-n', pat, ru]),
|
||||||
Command('grep', ['grep', '-a', pat, en])
|
Command('rg (no line numbers)', ['rg', pat, ru]),
|
||||||
|
Command('ag', ['ag', '-s', pat, ru]),
|
||||||
|
Command('ucg', ['ucg', '--nosmart-case', pat, ru]),
|
||||||
|
Command('grep', ['grep', '-an', pat, ru], env=GREP_ASCII),
|
||||||
|
Command('grep (no line numbers)', [
|
||||||
|
'grep', '-a', pat, ru,
|
||||||
|
], env=GREP_ASCII),
|
||||||
|
Command('pt', ['pt', pat, ru]),
|
||||||
|
Command('pt (no line numbers)', ['pt', '-N', pat, ru]),
|
||||||
|
Command('sift', ['sift', '-n', pat, ru]),
|
||||||
|
Command('sift (no line numbers)', ['sift', pat, ru]),
|
||||||
|
])
|
||||||
|
|
||||||
|
|
||||||
|
def bench_subtitles_ru_literal(suite_dir):
    '''
    Benchmark a plain Cyrillic literal against the Russian subtitle corpus.

    Requires the 'subtitles-ru' corpus under suite_dir. Every tool gets the
    same phrase and the same corpus file. rg, grep, pt and sift each appear
    twice, once in a variant labelled "no line numbers". Both grep variants
    run with the GREP_ASCII environment.
    '''
    require(suite_dir, 'subtitles-ru')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    needle = 'Шерлок Холмс'  # Sherlock Holmes

    commands = [
        Command('rg', ['rg', '-n', needle, corpus]),
        Command('rg (no line numbers)', ['rg', needle, corpus]),
        Command('ag', ['ag', '-s', needle, corpus]),
        Command('ucg', ['ucg', '--nosmart-case', needle, corpus]),
        Command('grep', ['grep', '-an', needle, corpus], env=GREP_ASCII),
        Command(
            'grep (no line numbers)',
            ['grep', '-a', needle, corpus],
            env=GREP_ASCII,
        ),
        Command('pt', ['pt', needle, corpus]),
        Command('pt (no line numbers)', ['pt', '-N', needle, corpus]),
        Command('sift', ['sift', '-n', needle, corpus]),
        Command('sift (no line numbers)', ['sift', needle, corpus]),
    ]
    return Benchmark(pattern=needle, commands=commands)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_subtitles_ru_literal_casei(suite_dir):
    '''
    Benchmark a case-insensitive search for a Cyrillic literal.

    Requires the 'subtitles-ru' corpus under suite_dir. grep is run twice:
    once with the GREP_UNICODE environment and once, labelled
    "not Unicode", with GREP_ASCII.
    '''
    require(suite_dir, 'subtitles-ru')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    needle = 'Шерлок Холмс'  # Sherlock Holmes

    commands = [
        Command('rg', ['rg', '-n', '-i', needle, corpus]),
        Command('ucg (not Unicode)', ['ucg', '-i', needle, corpus]),
        Command('grep', ['grep', '-ani', needle, corpus], env=GREP_UNICODE),
        Command(
            'grep (not Unicode)',
            ['grep', '-E', '-ani', needle, corpus],
            env=GREP_ASCII,
        ),
    ]
    return Benchmark(pattern=needle, commands=commands)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_subtitles_ru_alternate(suite_dir):
    '''
    Benchmark an alternation of several Cyrillic literals.

    Requires the 'subtitles-ru' corpus under suite_dir. The pattern is five
    names joined with '|'. grep is invoked with -E and the GREP_ASCII
    environment.
    '''
    require(suite_dir, 'subtitles-ru')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    names = (
        'Шерлок Холмс',  # Sherlock Holmes
        'Джон Уотсон',  # John Watson
        'Ирен Адлер',  # Irene Adler
        'инспектор Лестрейд',  # Inspector Lestrade
        'профессор Мориарти',  # Professor Moriarty
    )
    alternation = '|'.join(names)

    commands = [
        Command('rg', ['rg', '-n', alternation, corpus]),
        Command('rg (no line numbers)', ['rg', alternation, corpus]),
        Command('ucg', ['ucg', '--nosmart-case', alternation, corpus]),
        Command(
            'grep',
            ['grep', '-E', '-an', alternation, corpus],
            env=GREP_ASCII,
        ),
        Command(
            'grep (no line numbers)',
            ['grep', '-E', '-a', alternation, corpus],
            env=GREP_ASCII,
        ),
    ]
    return Benchmark(pattern=alternation, commands=commands)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_subtitles_ru_alternate_casei(suite_dir):
    '''
    Benchmark a case-insensitive alternation of several Cyrillic literals.

    Requires the 'subtitles-ru' corpus under suite_dir. The pattern is five
    names joined with '|', and every tool is passed -i. grep is run twice:
    once with the GREP_UNICODE environment and once, labelled
    "not Unicode", with GREP_ASCII.
    '''
    require(suite_dir, 'subtitles-ru')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    names = (
        'Шерлок Холмс',  # Sherlock Holmes
        'Джон Уотсон',  # John Watson
        'Ирен Адлер',  # Irene Adler
        'инспектор Лестрейд',  # Inspector Lestrade
        'профессор Мориарти',  # Professor Moriarty
    )
    alternation = '|'.join(names)

    commands = [
        Command('rg', ['rg', '-n', '-i', alternation, corpus]),
        Command('ucg (not Unicode)', ['ucg', '-i', alternation, corpus]),
        Command(
            'grep',
            ['grep', '-E', '-ani', alternation, corpus],
            env=GREP_UNICODE,
        ),
        Command(
            'grep (not Unicode)',
            ['grep', '-E', '-ani', alternation, corpus],
            env=GREP_ASCII,
        ),
    ]
    return Benchmark(pattern=alternation, commands=commands)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_subtitles_ru_no_literal(suite_dir):
    '''
    Benchmark a regex that contains no literal text at all.

    Requires the 'subtitles-ru' corpus under suite_dir.

    grep is deliberately not run with Unicode support here. It should
    eventually produce the right answer, but a trial run was killed after
    two minutes with no sign of finishing soon, so only the GREP_ASCII
    variant is included.
    '''
    require(suite_dir, 'subtitles-ru')
    corpus = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
    regex = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'

    commands = [
        Command('rg', ['rg', '-n', regex, corpus]),
        Command('rg (no line numbers)', ['rg', regex, corpus]),
        Command('ucg (no Unicode)', ['ucg', '--nosmart-case', regex, corpus]),
        Command(
            'grep (no Unicode)',
            ['grep', '-E', '-an', regex, corpus],
            env=GREP_ASCII,
        ),
    ]
    return Benchmark(pattern=regex, commands=commands)
|
||||||
|
|
||||||
|
|
||||||
@ -723,6 +853,7 @@ def download_subtitles_en(suite_dir):
|
|||||||
subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
|
subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
|
||||||
en_path_gz = path.join(subtitle_dir, SUBTITLES_EN_NAME_GZ)
|
en_path_gz = path.join(subtitle_dir, SUBTITLES_EN_NAME_GZ)
|
||||||
en_path = path.join(subtitle_dir, SUBTITLES_EN_NAME)
|
en_path = path.join(subtitle_dir, SUBTITLES_EN_NAME)
|
||||||
|
en_path_sample = path.join(subtitle_dir, SUBTITLES_EN_NAME_SAMPLE)
|
||||||
|
|
||||||
if not os.path.isdir(subtitle_dir):
|
if not os.path.isdir(subtitle_dir):
|
||||||
os.makedirs(subtitle_dir)
|
os.makedirs(subtitle_dir)
|
||||||
@ -730,12 +861,19 @@ def download_subtitles_en(suite_dir):
|
|||||||
if not os.path.exists(en_path_gz):
|
if not os.path.exists(en_path_gz):
|
||||||
run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir)
|
run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir)
|
||||||
run_cmd(['gunzip', en_path_gz], cwd=subtitle_dir)
|
run_cmd(['gunzip', en_path_gz], cwd=subtitle_dir)
|
||||||
|
if not os.path.exists(en_path_sample):
|
||||||
|
# Get a sample roughly the same size as the Russian corpus so that
|
||||||
|
# benchmarks finish in a reasonable time.
|
||||||
|
with open(path.join(subtitle_dir, en_path_sample), 'wb+') as f:
|
||||||
|
run_cmd(
|
||||||
|
['head', '-n', '32722372', en_path],
|
||||||
|
cwd=subtitle_dir, stdout=f)
|
||||||
|
|
||||||
|
|
||||||
def has_subtitles_en(suite_dir):
|
def has_subtitles_en(suite_dir):
|
||||||
'Returns true if English subtitles have been downloaded.'
|
'Returns true if English subtitles have been downloaded.'
|
||||||
subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
|
subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
|
||||||
return path.exists(path.join(subtitle_dir, SUBTITLES_EN_NAME))
|
return path.exists(path.join(subtitle_dir, SUBTITLES_EN_NAME_SAMPLE))
|
||||||
|
|
||||||
|
|
||||||
def download_subtitles_ru(suite_dir):
|
def download_subtitles_ru(suite_dir):
|
||||||
@ -770,7 +908,7 @@ def download(suite_dir, choices):
|
|||||||
A list of corpora to download. Available choices are:
|
A list of corpora to download. Available choices are:
|
||||||
all, linux, subtitles-en, subtitles-ru.
|
all, linux, subtitles-en, subtitles-ru.
|
||||||
'''
|
'''
|
||||||
for choice in args.download:
|
for choice in choices:
|
||||||
if choice == 'linux':
|
if choice == 'linux':
|
||||||
download_linux(suite_dir)
|
download_linux(suite_dir)
|
||||||
elif choice == 'subtitles-en':
|
elif choice == 'subtitles-en':
|
||||||
@ -849,7 +987,7 @@ def main():
|
|||||||
args = p.parse_args()
|
args = p.parse_args()
|
||||||
|
|
||||||
if args.download is not None and len(args.download) > 0:
|
if args.download is not None and len(args.download) > 0:
|
||||||
download(args.dir, args.choices)
|
download(args.dir, args.download)
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
if not path.isdir(args.dir):
|
if not path.isdir(args.dir):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user