// This module defines the types we use for JSON serialization. We specifically // omit deserialization, partially because there isn't a clear use case for // them at this time, but also because deserialization will complicate things. // Namely, the types below are designed in a way that permits JSON // serialization with little or no allocation. Allocation is often quite // convenient for deserialization however, so these types would become a bit // more complex. use std::{borrow::Cow, path::Path}; pub(crate) enum Message<'a> { Begin(Begin<'a>), End(End<'a>), Match(Match<'a>), Context(Context<'a>), } impl<'a> serde::Serialize for Message<'a> { fn serialize( &self, s: S, ) -> Result { use serde::ser::SerializeStruct; let mut state = s.serialize_struct("Message", 2)?; match *self { Message::Begin(ref msg) => { state.serialize_field("type", &"begin")?; state.serialize_field("data", msg)?; } Message::End(ref msg) => { state.serialize_field("type", &"end")?; state.serialize_field("data", msg)?; } Message::Match(ref msg) => { state.serialize_field("type", &"match")?; state.serialize_field("data", msg)?; } Message::Context(ref msg) => { state.serialize_field("type", &"context")?; state.serialize_field("data", msg)?; } } state.end() } } pub(crate) struct Begin<'a> { pub(crate) path: Option<&'a Path>, } impl<'a> serde::Serialize for Begin<'a> { fn serialize( &self, s: S, ) -> Result { use serde::ser::SerializeStruct; let mut state = s.serialize_struct("Begin", 1)?; state.serialize_field("path", &self.path.map(Data::from_path))?; state.end() } } pub(crate) struct End<'a> { pub(crate) path: Option<&'a Path>, pub(crate) binary_offset: Option, pub(crate) stats: crate::stats::Stats, } impl<'a> serde::Serialize for End<'a> { fn serialize( &self, s: S, ) -> Result { use serde::ser::SerializeStruct; let mut state = s.serialize_struct("End", 3)?; state.serialize_field("path", &self.path.map(Data::from_path))?; state.serialize_field("binary_offset", &self.binary_offset)?; 
state.serialize_field("stats", &self.stats)?; state.end() } } pub(crate) struct Match<'a> { pub(crate) path: Option<&'a Path>, pub(crate) lines: &'a [u8], pub(crate) line_number: Option, pub(crate) absolute_offset: u64, pub(crate) submatches: &'a [SubMatch<'a>], } impl<'a> serde::Serialize for Match<'a> { fn serialize( &self, s: S, ) -> Result { use serde::ser::SerializeStruct; let mut state = s.serialize_struct("Match", 5)?; state.serialize_field("path", &self.path.map(Data::from_path))?; state.serialize_field("lines", &Data::from_bytes(self.lines))?; state.serialize_field("line_number", &self.line_number)?; state.serialize_field("absolute_offset", &self.absolute_offset)?; state.serialize_field("submatches", &self.submatches)?; state.end() } } pub(crate) struct Context<'a> { pub(crate) path: Option<&'a Path>, pub(crate) lines: &'a [u8], pub(crate) line_number: Option, pub(crate) absolute_offset: u64, pub(crate) submatches: &'a [SubMatch<'a>], } impl<'a> serde::Serialize for Context<'a> { fn serialize( &self, s: S, ) -> Result { use serde::ser::SerializeStruct; let mut state = s.serialize_struct("Context", 5)?; state.serialize_field("path", &self.path.map(Data::from_path))?; state.serialize_field("lines", &Data::from_bytes(self.lines))?; state.serialize_field("line_number", &self.line_number)?; state.serialize_field("absolute_offset", &self.absolute_offset)?; state.serialize_field("submatches", &self.submatches)?; state.end() } } pub(crate) struct SubMatch<'a> { pub(crate) m: &'a [u8], pub(crate) start: usize, pub(crate) end: usize, } impl<'a> serde::Serialize for SubMatch<'a> { fn serialize( &self, s: S, ) -> Result { use serde::ser::SerializeStruct; let mut state = s.serialize_struct("SubMatch", 3)?; state.serialize_field("match", &Data::from_bytes(self.m))?; state.serialize_field("start", &self.start)?; state.serialize_field("end", &self.end)?; state.end() } } /// Data represents things that look like strings, but may actually not be /// valid UTF-8. 
To handle this, `Data` is serialized as an object with one /// of two keys: `text` (for valid UTF-8) or `bytes` (for invalid UTF-8). /// /// The happy path is valid UTF-8, which streams right through as-is, since /// it is natively supported by JSON. When invalid UTF-8 is found, then it is /// represented as arbitrary bytes and base64 encoded. #[derive(Clone, Debug, Hash, PartialEq, Eq)] enum Data<'a> { Text { text: Cow<'a, str> }, Bytes { bytes: &'a [u8] }, } impl<'a> Data<'a> { fn from_bytes(bytes: &[u8]) -> Data<'_> { match std::str::from_utf8(bytes) { Ok(text) => Data::Text { text: Cow::Borrowed(text) }, Err(_) => Data::Bytes { bytes }, } } #[cfg(unix)] fn from_path(path: &Path) -> Data<'_> { use std::os::unix::ffi::OsStrExt; match path.to_str() { Some(text) => Data::Text { text: Cow::Borrowed(text) }, None => Data::Bytes { bytes: path.as_os_str().as_bytes() }, } } #[cfg(not(unix))] fn from_path(path: &Path) -> Data { // Using lossy conversion means some paths won't round trip precisely, // but it's not clear what we should actually do. Serde rejects // non-UTF-8 paths, and OsStr's are serialized as a sequence of UTF-16 // code units on Windows. Neither seem appropriate for this use case, // so we do the easy thing for now. Data::Text { text: path.to_string_lossy() } } } impl<'a> serde::Serialize for Data<'a> { fn serialize( &self, s: S, ) -> Result { use serde::ser::SerializeStruct; let mut state = s.serialize_struct("Data", 1)?; match *self { Data::Text { ref text } => state.serialize_field("text", text)?, Data::Bytes { bytes } => { // use base64::engine::{general_purpose::STANDARD, Engine}; // let encoded = STANDARD.encode(bytes); state.serialize_field("bytes", &base64_standard(bytes))?; } } state.end() } } /// Implements "standard" base64 encoding as described in RFC 3548[1]. /// /// We roll our own here instead of bringing in something heavier weight like /// the `base64` crate. 
/// In particular, we really don't care about perf much
/// here, since this is only used for data or file paths that are not valid
/// UTF-8.
///
/// The output uses the `A-Za-z0-9+/` alphabet and is padded with `=` so that
/// its length is always a multiple of four. Empty input yields an empty
/// string.
///
/// [1]: https://tools.ietf.org/html/rfc3548#section-3
fn base64_standard(bytes: &[u8]) -> String {
    const ALPHABET: &[u8] =
        b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    // Maps the low six bits of `index` to its alphabet character. Masking
    // here keeps every lookup in bounds (the alphabet has 64 entries).
    let sextet = |index: usize| char::from(ALPHABET[index & 0b111_111]);

    // Every group of up to three input bytes produces exactly four output
    // characters, so the final length is known up front.
    let mut out = String::with_capacity((bytes.len() + 2) / 3 * 4);

    let mut it = bytes.chunks_exact(3);
    // `by_ref` keeps `it` usable afterwards so the remainder can be read.
    for chunk in it.by_ref() {
        let group24 = (usize::from(chunk[0]) << 16)
            | (usize::from(chunk[1]) << 8)
            | usize::from(chunk[2]);
        out.push(sextet(group24 >> 18));
        out.push(sextet(group24 >> 12));
        out.push(sextet(group24 >> 6));
        out.push(sextet(group24));
    }
    match *it.remainder() {
        [] => {}
        [byte0] => {
            // 8 bits -> one full sextet plus 2 bits left-aligned in a second
            // sextet, then two padding characters.
            let group8 = usize::from(byte0);
            out.push(sextet(group8 >> 2));
            out.push(sextet(group8 << 4));
            out.push('=');
            out.push('=');
        }
        [byte0, byte1] => {
            // 16 bits -> two full sextets plus 4 bits left-aligned in a
            // third sextet, then one padding character.
            let group16 = (usize::from(byte0) << 8) | usize::from(byte1);
            out.push(sextet(group16 >> 10));
            out.push(sextet(group16 >> 4));
            out.push(sextet(group16 << 2));
            out.push('=');
        }
        _ => unreachable!("remainder must have length < 3"),
    }
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    // Tests taken from RFC 4648[1].
    //
    // [1]: https://datatracker.ietf.org/doc/html/rfc4648#section-10
    #[test]
    fn base64_basic() {
        let b64 = |s: &str| base64_standard(s.as_bytes());
        assert_eq!(b64(""), "");
        assert_eq!(b64("f"), "Zg==");
        assert_eq!(b64("fo"), "Zm8=");
        assert_eq!(b64("foo"), "Zm9v");
        assert_eq!(b64("foob"), "Zm9vYg==");
        assert_eq!(b64("fooba"), "Zm9vYmE=");
        assert_eq!(b64("foobar"), "Zm9vYmFy");
    }
}