// This module defines the types we use for JSON serialization. We specifically // omit deserialization, partially because there isn't a clear use case for // them at this time, but also because deserialization will complicate things. // Namely, the types below are designed in a way that permits JSON // serialization with little or no allocation. Allocation is often quite // convenient for deserialization however, so these types would become a bit // more complex. use std::borrow::Cow; use std::path::Path; use std::str; use base64; use serde::{Serialize, Serializer}; use crate::stats::Stats; #[derive(Serialize)] #[serde(tag = "type", content = "data")] #[serde(rename_all = "snake_case")] pub enum Message<'a> { Begin(Begin<'a>), End(End<'a>), Match(Match<'a>), Context(Context<'a>), } #[derive(Serialize)] pub struct Begin<'a> { #[serde(serialize_with = "ser_path")] pub path: Option<&'a Path>, } #[derive(Serialize)] pub struct End<'a> { #[serde(serialize_with = "ser_path")] pub path: Option<&'a Path>, pub binary_offset: Option, pub stats: Stats, } #[derive(Serialize)] pub struct Match<'a> { #[serde(serialize_with = "ser_path")] pub path: Option<&'a Path>, #[serde(serialize_with = "ser_bytes")] pub lines: &'a [u8], pub line_number: Option, pub absolute_offset: u64, pub submatches: &'a [SubMatch<'a>], } #[derive(Serialize)] pub struct Context<'a> { #[serde(serialize_with = "ser_path")] pub path: Option<&'a Path>, #[serde(serialize_with = "ser_bytes")] pub lines: &'a [u8], pub line_number: Option, pub absolute_offset: u64, pub submatches: &'a [SubMatch<'a>], } #[derive(Serialize)] pub struct SubMatch<'a> { #[serde(rename = "match")] #[serde(serialize_with = "ser_bytes")] pub m: &'a [u8], pub start: usize, pub end: usize, } /// Data represents things that look like strings, but may actually not be /// valid UTF-8. To handle this, `Data` is serialized as an object with one /// of two keys: `text` (for valid UTF-8) or `bytes` (for invalid UTF-8). /// /// The happy path is valid UTF-8, which streams right through as-is, since /// it is natively supported by JSON. When invalid UTF-8 is found, then it is /// represented as arbitrary bytes and base64 encoded. #[derive(Clone, Debug, Hash, PartialEq, Eq, Serialize)] #[serde(untagged)] enum Data<'a> { Text { text: Cow<'a, str>, }, Bytes { #[serde(serialize_with = "to_base64")] bytes: &'a [u8], }, } impl<'a> Data<'a> { fn from_bytes(bytes: &[u8]) -> Data<'_> { match str::from_utf8(bytes) { Ok(text) => Data::Text { text: Cow::Borrowed(text) }, Err(_) => Data::Bytes { bytes }, } } #[cfg(unix)] fn from_path(path: &Path) -> Data<'_> { use std::os::unix::ffi::OsStrExt; match path.to_str() { Some(text) => Data::Text { text: Cow::Borrowed(text) }, None => Data::Bytes { bytes: path.as_os_str().as_bytes() }, } } #[cfg(not(unix))] fn from_path(path: &Path) -> Data { // Using lossy conversion means some paths won't round trip precisely, // but it's not clear what we should actually do. Serde rejects // non-UTF-8 paths, and OsStr's are serialized as a sequence of UTF-16 // code units on Windows. Neither seem appropriate for this use case, // so we do the easy thing for now. Data::Text { text: path.to_string_lossy() } } } fn to_base64(bytes: T, ser: S) -> Result where T: AsRef<[u8]>, S: Serializer, { ser.serialize_str(&base64::encode(&bytes)) } fn ser_bytes(bytes: T, ser: S) -> Result where T: AsRef<[u8]>, S: Serializer, { Data::from_bytes(bytes.as_ref()).serialize(ser) } fn ser_path(path: &Option

, ser: S) -> Result where P: AsRef, S: Serializer, { path.as_ref().map(|p| Data::from_path(p.as_ref())).serialize(ser) }