Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Zero Copy String Deserialization #88

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ publish = false

[dependencies]
bytes = "0.5.6"
pb-jelly = "0.0.5"
pb-jelly = { path = "../pb-jelly" }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Per https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#multiple-locations - you can actually specify both path and version

Actually, in this case, since examples aren't being posted to crates.io, we probably should keep it as only a path dependency.

proto_box_it = { path = "gen/rust/proto/proto_box_it" }
proto_custom_type = { path = "gen/rust/proto/proto_custom_type" }
proto_linked_list = { path = "gen/rust/proto/proto_linked_list" }
Expand Down
2 changes: 1 addition & 1 deletion examples/protos/zero_copy/basic.proto
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ import "rust/extensions.proto";

message BytesMessage {
optional bytes data = 1 [(rust.zero_copy)=true];
optional string name = 2;
optional string name = 2 [(rust.zero_copy)=true];
}
2 changes: 1 addition & 1 deletion examples/src/zero_copy/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ fn main() -> std::io::Result<()> {
// Create our Proto Struct
let mut proto = BytesMessage::default();
proto.set_data(data);
proto.set_name("Parker".to_owned());
proto.set_name("Parker".into());

// Serialize our proto
let ser_bytes: Vec<u8> = proto.serialize_to_vec();
Expand Down
32 changes: 25 additions & 7 deletions pb-jelly-gen/codegen/codegen.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
BLOB_TYPE = "::pb_jelly::Lazy<::blob_pb::WrappedBlob>"
VEC_SLICE_TYPE = "::pb_jelly::Lazy<::blob_pb::VecSlice>"
LAZY_BYTES_TYPE = "::pb_jelly::Lazy<::bytes::Bytes>"
ZERO_COPY_STRING_TYPE = "::pb_jelly::StrBytes"
# pull out `x` from every instance of `::x::y::z`, but not `y` or `z`
CRATE_NAME_REGEX = re.compile(r"(?:^|\W)::(\w+)(?:::\w+)*")

Expand Down Expand Up @@ -256,6 +257,12 @@ def is_lazy_bytes(self) -> bool:
and self.field.options.Extensions[extensions_pb2.zero_copy]
)

def is_zero_copy_string(self) -> bool:
return (
self.field.type == FieldDescriptorProto.TYPE_STRING
and self.field.options.Extensions[extensions_pb2.zero_copy]
)

def is_boxed(self) -> bool:
return (
self.field.type == FieldDescriptorProto.TYPE_MESSAGE
Expand Down Expand Up @@ -327,7 +334,10 @@ def set_method(self) -> Tuple[Text, Text]:
elif self.field.type == FieldDescriptorProto.TYPE_BOOL:
return "bool", "v"
elif self.field.type == FieldDescriptorProto.TYPE_STRING:
return "::std::string::String", "v"
if self.is_zero_copy_string():
return ZERO_COPY_STRING_TYPE, "v"
else:
return "::std::string::String", "v"
elif self.field.type == FieldDescriptorProto.TYPE_BYTES:
if self.is_blob():
return BLOB_TYPE, "v"
Expand Down Expand Up @@ -360,7 +370,10 @@ def take_method(self) -> Tuple[Optional[Text], Optional[Text]]:
expr = "self.%s.take().unwrap_or_default()" % self.field.name

if self.field.type == FieldDescriptorProto.TYPE_STRING:
return "::std::string::String", expr
if self.is_zero_copy_string():
return ZERO_COPY_STRING_TYPE, expr
else:
return "::std::string::String", expr
elif self.field.type == FieldDescriptorProto.TYPE_BYTES:
if self.is_blob():
return BLOB_TYPE, expr
Expand Down Expand Up @@ -410,10 +423,12 @@ def get_method(self) -> Tuple[Text, Text]:
elif self.field.type == FieldDescriptorProto.TYPE_BOOL:
return "bool", "self.%s.unwrap_or(false)" % name
elif self.field.type == FieldDescriptorProto.TYPE_STRING:
return (
"&str",
'self.%s.as_ref().map(|ref s| s.as_str()).unwrap_or("")' % name,
)
return_type = "&str"
if self.is_zero_copy_string():
return_expr = 'self.%s.as_ref().map(|s| &**s).unwrap_or("")' % name
else:
return_expr = 'self.%s.as_ref().map(|ref s| s.as_str()).unwrap_or("")' % name
return (return_type, return_expr)
elif self.field.type == FieldDescriptorProto.TYPE_BYTES:
assert not (
self.is_blob() or self.is_grpc_slices() or self.is_lazy_bytes()
Expand Down Expand Up @@ -446,6 +461,9 @@ def rust_type(self) -> Text:

if self.is_lazy_bytes():
return LAZY_BYTES_TYPE

if self.is_zero_copy_string():
return ZERO_COPY_STRING_TYPE

if typ in PRIMITIVE_TYPES:
return PRIMITIVE_TYPES[typ][0]
Expand Down Expand Up @@ -1561,7 +1579,7 @@ def get_cargo_toml_file(self, derive_serde: bool) -> Iterator[Tuple[Text, Text]]
features = {u"serde": u' features = ["serde_derive"]'}
versions = {
u"lazy_static": u' version = "1.4.0" ',
u"pb-jelly": u' version = "0.0.5" ',
u"pb-jelly": u' path = "../../../../../pb-jelly" ',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmmm, we gotta find a better way to do the right thing here =\

Bazel and Spec.toml totally sidestep this issue - it's actually the entire point of Spec.toml - to template the location/version of the dependency.

This seems like a place where we'd accidentally have the wrong version provided - and it seems like our test suite will not actually be using the right pb-jelly dependency here, unless you make this change.

Maybe one idea - is that the version of pb-jelly-gen could get passed in as an argument to codegen.py - where our testsuite could inject ../../../../../pb-jelly. Then we're not hard coding a version number here.

Perhaps an idea for a separate issue?

u"serde": u' version = "1.0.114" ',
u"bytes": u' version = "0.5.6" '
}
Expand Down
10 changes: 10 additions & 0 deletions pb-jelly/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,13 @@ categories = ["encoding", "parsing", "web-programming"]
byteorder = "1.3.4"
bytes = "0.5.6"
serde = { version = "1.0.114", features = ["derive"] }

[features]
# Skips UTF-8 validation when using zero-copy Strings
#
# The (protobuf specification)[https://developers.google.com/protocol-buffers/docs/proto3#scalar]
# states that Strings must contain UTF-8 encoded text, so theoretically, validating a chunk
# of bytes is UTF-8 on deserialization shouldn't be needed. As a precaution though, we do this
# validation step. But validation can take a relatively large amount of time, so for users
# who wish to skip this validation, we provide the `zero_copy_string_no_utf8_check` feature flag.
zero_copy_string_no_utf8_check = []
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible for cargo bench to test with and w/o this feature flag?
Seems like it might not be possible w/ just how cargo bench works.

49 changes: 48 additions & 1 deletion pb-jelly/src/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ use std::any::{
type_name,
Any,
};
use std::convert::TryFrom;
use std::default::Default;
use std::fmt::{
self,
Expand All @@ -87,7 +88,10 @@ use bytes::{
Bytes,
};

use super::Message;
use super::{
Message,
StrBytes,
};

/// A stand-in trait for any backing buffer store. Required to be object-safe for lazy evaluation.
/// PbBuffers are expected to own references to the data they reference, and should be cheap
Expand Down Expand Up @@ -196,6 +200,32 @@ impl<B: PbBuffer + PartialEq> Message for Lazy<B> {
}
}

impl Message for StrBytes {
fn compute_size(&self) -> usize {
match self.bytes() {
None => 0,
Some(bytes) => bytes.len(),
}
}

fn compute_grpc_slices_size(&self) -> usize {
self.compute_size()
}

fn serialize<W: PbBufferWriter>(&self, w: &mut W) -> std::io::Result<()> {
if let Some(bytes) = self.bytes() {
w.write_buffer(bytes)?;
}
Ok(())
}

fn deserialize<R: PbBufferReader>(&mut self, r: &mut R) -> std::io::Result<()> {
self.replace_bytes(Bytes::from_reader(r)?)
.map_err(|e| Error::new(ErrorKind::InvalidData, e.to_string()))?;
Ok(())
}
}

impl<'a> PbBufferReader for Cursor<&'a [u8]> {
fn split(&mut self, at: usize) -> Self {
let pos = self.position() as usize;
Expand Down Expand Up @@ -229,6 +259,23 @@ impl PbBufferReader for Cursor<Bytes> {
}
}

impl PbBuffer for StrBytes {
type Reader = Cursor<Bytes>;

fn len(&self) -> usize {
self.len()
}

fn into_reader(self) -> Self::Reader {
Cursor::new(self.into_bytes())
}

fn from_reader<R: PbBufferReader>(reader: &mut R) -> Result<Self> {
let bytes: Bytes = reader.as_buffer()?;
StrBytes::try_from(bytes).map_err(|e| Error::new(ErrorKind::InvalidData, e.to_string()))
}
}

impl<'a> PbBufferWriter for Cursor<&'a mut Vec<u8>> {
/// Note: this implementation freely copies the data out of `buf`.
#[inline]
Expand Down
3 changes: 3 additions & 0 deletions pb-jelly/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ pub use crate::base_types::{
mod descriptor;
pub use crate::descriptor::MessageDescriptor;

mod str_bytes;
pub use crate::str_bytes::StrBytes;

#[cfg(test)]
mod tests;

Expand Down
117 changes: 117 additions & 0 deletions pb-jelly/src/str_bytes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
use bytes::Bytes;
use std::{
convert::TryFrom,
fmt,
ops::Deref,
};

#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, Ord, PartialOrd)]
pub struct StrBytes {
bytes: Option<Bytes>,
}

impl StrBytes {
pub fn new() -> StrBytes {
StrBytes { bytes: None }
}

/// Length of the underlying byte store
pub fn len(&self) -> usize {
match self.bytes.as_ref() {
None => 0,
Some(bytes) => bytes.len(),
}
}

pub fn bytes(&self) -> Option<&Bytes> {
self.bytes.as_ref()
}

pub fn into_bytes(self) -> Bytes {
match self.bytes {
None => Bytes::new(),
Some(bytes) => bytes,
}
}

pub fn replace_bytes(&mut self, bytes: Bytes) -> Result<Option<Bytes>, std::str::Utf8Error> {
if cfg!(feature = "zero_copy_string_no_utf8_check") {
// If we have UTF-8 validation turned off, simply replace the bytes
return Ok(self.bytes.replace(bytes));
}

std::str::from_utf8(&bytes)?;
Ok(self.bytes.replace(bytes))
}
}

impl Deref for StrBytes {
type Target = str;

fn deref(&self) -> &Self::Target {
match self.bytes.as_ref() {
// SAFTEY: We validate on creation the underlying bytes are valid UTF8
Some(bytes) => unsafe { std::str::from_utf8_unchecked(&bytes) },
None => "",
}
}
}

#[cfg(not(feature = "zero_copy_string_no_utf8_check"))]
impl TryFrom<Bytes> for StrBytes {
type Error = std::str::Utf8Error;

fn try_from(bytes: Bytes) -> Result<StrBytes, Self::Error> {
std::str::from_utf8(&bytes)?;
Ok(StrBytes { bytes: Some(bytes) })
}
}

#[cfg(not(feature = "zero_copy_string_no_utf8_check"))]
impl TryFrom<&[u8]> for StrBytes {
type Error = std::str::Utf8Error;

fn try_from(slice: &[u8]) -> Result<StrBytes, Self::Error> {
std::str::from_utf8(&slice)?;
Ok(StrBytes {
bytes: Some(Bytes::copy_from_slice(slice)),
})
}
}

impl From<&str> for StrBytes {
fn from(string: &str) -> StrBytes {
StrBytes {
bytes: Some(Bytes::copy_from_slice(string.as_bytes())),
}
}
}

impl From<String> for StrBytes {
fn from(string: String) -> StrBytes {
StrBytes {
bytes: Some(Bytes::from(string)),
}
}
}

impl fmt::Display for StrBytes {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Display::fmt(&**self, f)
}
}

#[cfg(feature = "zero_copy_string_no_utf8_check")]
impl From<Bytes> for StrBytes {
fn from(bytes: Bytes) -> StrBytes {
StrBytes { bytes: Some(bytes) }
}
}

#[cfg(feature = "zero_copy_string_no_utf8_check")]
impl From<&[u8]> for StrBytes {
fn from(slice: &[u8]) -> StrBytes {
StrBytes { bytes: Some(Bytes::copy_from_slice(slice)) }
}
}
3 changes: 2 additions & 1 deletion pb-test/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ edition = "2018"

[dependencies]
bytes = "0.5.6"
pb-jelly = "0.0.5"
# pb-jelly = "0.0.5"
pb-jelly = { path = "../pb-jelly" }
pretty_assertions = "0.6.1"
proto_pbtest = { path = "gen/pb-jelly/proto_pbtest" }
walkdir = "2.3.1"
Expand Down
1 change: 1 addition & 0 deletions pb-test/pb_test_gen/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ fn main() -> std::io::Result<()> {
.include("../proto/packages")
.customize(Customize {
carllerche_bytes_for_bytes: Some(true),
carllerche_bytes_for_string: Some(true),
..Default::default()
})
.run()
Expand Down
5 changes: 5 additions & 0 deletions pb-test/proto/packages/pbtest/bench.proto
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ message VecData {
optional bytes data = 1;
}

message ZeroCopyStringMessage {
// Using the `zero_copy` option, the `string` type maps to `pb_jelly::StrBytes`
optional string data = 1 [(rust.zero_copy)=true];
}

message StringMessage {
optional string data = 1;
}
Loading