globset: remove use of unsafe

This commit removes, in retrospect, a silly use of `unsafe`. In particular,
to extract a file name extension (distinct from how `std` implements it),
we were transmuting an OsStr to its underlying WTF-8 byte representation
and then searching that. This required `unsafe` and relied on an
undocumented std API, so it was a bad choice to make, but everything gets
sacrificed at the Alter of Performance.

The thing I didn't seem to realize at the time was that:

  1. On Unix, you can already get the raw byte representation in a manner
     that has zero cost.
  2. On Windows, paths are already being encoded and copied every which
     way. So doing a UTF-8 check and, in rare cases (for invalid UTF-8),
     an extra copy, doesn't seem like that much more of an added expense.

Thus, rewrite the extension extraction using safe APIs. On Unix, this
should have identical performance characteristics as the previous
implementation. On Windows, we do pay a higher cost in the UTF-8
check, but Windows is already paying a similar cost a few times over
anyway.
This commit is contained in:
Andrew Gallant
2018-02-10 21:37:13 -05:00
parent 3effea0b7c
commit 96ee4482cd
3 changed files with 54 additions and 60 deletions

View File

@@ -108,7 +108,7 @@ extern crate regex;
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
use std::error::Error as StdError;
use std::ffi::{OsStr, OsString};
use std::ffi::OsStr;
use std::fmt;
use std::hash;
use std::path::Path;
@@ -458,7 +458,7 @@ impl GlobSetBuilder {
pub struct Candidate<'a> {
path: Cow<'a, [u8]>,
basename: Cow<'a, [u8]>,
ext: &'a OsStr,
ext: Cow<'a, [u8]>,
}
impl<'a> Candidate<'a> {
@@ -469,7 +469,7 @@ impl<'a> Candidate<'a> {
Candidate {
path: normalize_path(path_bytes(path)),
basename: os_str_bytes(basename),
ext: file_name_ext(basename).unwrap_or(OsStr::new("")),
ext: file_name_ext(basename).unwrap_or(Cow::Borrowed(b"")),
}
}
@@ -584,22 +584,22 @@ impl BasenameLiteralStrategy {
}
#[derive(Clone, Debug)]
struct ExtensionStrategy(HashMap<OsString, Vec<usize>, Fnv>);
struct ExtensionStrategy(HashMap<Vec<u8>, Vec<usize>, Fnv>);
impl ExtensionStrategy {
fn new() -> ExtensionStrategy {
ExtensionStrategy(HashMap::with_hasher(Fnv::default()))
}
fn add(&mut self, global_index: usize, ext: OsString) {
self.0.entry(ext).or_insert(vec![]).push(global_index);
fn add(&mut self, global_index: usize, ext: String) {
self.0.entry(ext.into_bytes()).or_insert(vec![]).push(global_index);
}
fn is_match(&self, candidate: &Candidate) -> bool {
if candidate.ext.is_empty() {
return false;
}
self.0.contains_key(candidate.ext)
self.0.contains_key(&*candidate.ext)
}
#[inline(never)]
@@ -607,7 +607,7 @@ impl ExtensionStrategy {
if candidate.ext.is_empty() {
return;
}
if let Some(hits) = self.0.get(candidate.ext) {
if let Some(hits) = self.0.get(&*candidate.ext) {
matches.extend(hits);
}
}
@@ -670,14 +670,14 @@ impl SuffixStrategy {
}
#[derive(Clone, Debug)]
struct RequiredExtensionStrategy(HashMap<OsString, Vec<(usize, Regex)>, Fnv>);
struct RequiredExtensionStrategy(HashMap<Vec<u8>, Vec<(usize, Regex)>, Fnv>);
impl RequiredExtensionStrategy {
fn is_match(&self, candidate: &Candidate) -> bool {
if candidate.ext.is_empty() {
return false;
}
match self.0.get(candidate.ext) {
match self.0.get(&*candidate.ext) {
None => false,
Some(regexes) => {
for &(_, ref re) in regexes {
@@ -695,7 +695,7 @@ impl RequiredExtensionStrategy {
if candidate.ext.is_empty() {
return;
}
if let Some(regexes) = self.0.get(candidate.ext) {
if let Some(regexes) = self.0.get(&*candidate.ext) {
for &(global_index, ref re) in regexes {
if re.is_match(&*candidate.path) {
matches.push(global_index);
@@ -775,7 +775,7 @@ impl MultiStrategyBuilder {
#[derive(Clone, Debug)]
struct RequiredExtensionStrategyBuilder(
HashMap<OsString, Vec<(usize, String)>>,
HashMap<Vec<u8>, Vec<(usize, String)>>,
);
impl RequiredExtensionStrategyBuilder {
@@ -783,8 +783,11 @@ impl RequiredExtensionStrategyBuilder {
RequiredExtensionStrategyBuilder(HashMap::new())
}
fn add(&mut self, global_index: usize, ext: OsString, regex: String) {
self.0.entry(ext).or_insert(vec![]).push((global_index, regex));
fn add(&mut self, global_index: usize, ext: String, regex: String) {
self.0
.entry(ext.into_bytes())
.or_insert(vec![])
.push((global_index, regex));
}
fn build(self) -> Result<RequiredExtensionStrategy, Error> {