-
Notifications
You must be signed in to change notification settings - Fork 25
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP] Zero Copy String Deserialization #88
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -66,6 +66,7 @@ | |
BLOB_TYPE = "::pb_jelly::Lazy<::blob_pb::WrappedBlob>" | ||
VEC_SLICE_TYPE = "::pb_jelly::Lazy<::blob_pb::VecSlice>" | ||
LAZY_BYTES_TYPE = "::pb_jelly::Lazy<::bytes::Bytes>" | ||
ZERO_COPY_STRING_TYPE = "::pb_jelly::StrBytes" | ||
# pull out `x` from every instance of `::x::y::z`, but not `y` or `z` | ||
CRATE_NAME_REGEX = re.compile(r"(?:^|\W)::(\w+)(?:::\w+)*") | ||
|
||
|
@@ -256,6 +257,12 @@ def is_lazy_bytes(self) -> bool: | |
and self.field.options.Extensions[extensions_pb2.zero_copy] | ||
) | ||
|
||
def is_zero_copy_string(self) -> bool: | ||
return ( | ||
self.field.type == FieldDescriptorProto.TYPE_STRING | ||
and self.field.options.Extensions[extensions_pb2.zero_copy] | ||
) | ||
|
||
def is_boxed(self) -> bool: | ||
return ( | ||
self.field.type == FieldDescriptorProto.TYPE_MESSAGE | ||
|
@@ -327,7 +334,10 @@ def set_method(self) -> Tuple[Text, Text]: | |
elif self.field.type == FieldDescriptorProto.TYPE_BOOL: | ||
return "bool", "v" | ||
elif self.field.type == FieldDescriptorProto.TYPE_STRING: | ||
return "::std::string::String", "v" | ||
if self.is_zero_copy_string(): | ||
return ZERO_COPY_STRING_TYPE, "v" | ||
else: | ||
return "::std::string::String", "v" | ||
elif self.field.type == FieldDescriptorProto.TYPE_BYTES: | ||
if self.is_blob(): | ||
return BLOB_TYPE, "v" | ||
|
@@ -360,7 +370,10 @@ def take_method(self) -> Tuple[Optional[Text], Optional[Text]]: | |
expr = "self.%s.take().unwrap_or_default()" % self.field.name | ||
|
||
if self.field.type == FieldDescriptorProto.TYPE_STRING: | ||
return "::std::string::String", expr | ||
if self.is_zero_copy_string(): | ||
return ZERO_COPY_STRING_TYPE, expr | ||
else: | ||
return "::std::string::String", expr | ||
elif self.field.type == FieldDescriptorProto.TYPE_BYTES: | ||
if self.is_blob(): | ||
return BLOB_TYPE, expr | ||
|
@@ -410,10 +423,12 @@ def get_method(self) -> Tuple[Text, Text]: | |
elif self.field.type == FieldDescriptorProto.TYPE_BOOL: | ||
return "bool", "self.%s.unwrap_or(false)" % name | ||
elif self.field.type == FieldDescriptorProto.TYPE_STRING: | ||
return ( | ||
"&str", | ||
'self.%s.as_ref().map(|ref s| s.as_str()).unwrap_or("")' % name, | ||
) | ||
return_type = "&str" | ||
if self.is_zero_copy_string(): | ||
return_expr = 'self.%s.as_ref().map(|s| &**s).unwrap_or("")' % name | ||
else: | ||
return_expr = 'self.%s.as_ref().map(|ref s| s.as_str()).unwrap_or("")' % name | ||
return (return_type, return_expr) | ||
elif self.field.type == FieldDescriptorProto.TYPE_BYTES: | ||
assert not ( | ||
self.is_blob() or self.is_grpc_slices() or self.is_lazy_bytes() | ||
|
@@ -446,6 +461,9 @@ def rust_type(self) -> Text: | |
|
||
if self.is_lazy_bytes(): | ||
return LAZY_BYTES_TYPE | ||
|
||
if self.is_zero_copy_string(): | ||
return ZERO_COPY_STRING_TYPE | ||
|
||
if typ in PRIMITIVE_TYPES: | ||
return PRIMITIVE_TYPES[typ][0] | ||
|
@@ -1561,7 +1579,7 @@ def get_cargo_toml_file(self, derive_serde: bool) -> Iterator[Tuple[Text, Text]] | |
features = {u"serde": u' features = ["serde_derive"]'} | ||
versions = { | ||
u"lazy_static": u' version = "1.4.0" ', | ||
u"pb-jelly": u' version = "0.0.5" ', | ||
u"pb-jelly": u' path = "../../../../../pb-jelly" ', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmmm, we gotta find a better way to do the right thing here =\ Bazel and Spec.toml totally sidestep this issue - it's actually the entire point of Spec.toml - to template the location/version of the dependency. This seems like a place where we'd accidentally have the wrong version provided - and it seems like our test suite will not actually be using the right pb-jelly dependency here, unless you make this change. Maybe one idea - is that the version of pb-jelly-gen could get passed in as an argument to codegen.py - where our testsuite could inject ../../../../../pb-jelly. Then we're not hard coding a version number here. Perhaps an idea for a separate issue? |
||
u"serde": u' version = "1.0.114" ', | ||
u"bytes": u' version = "0.5.6" ' | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,3 +17,13 @@ categories = ["encoding", "parsing", "web-programming"] | |
byteorder = "1.3.4" | ||
bytes = "0.5.6" | ||
serde = { version = "1.0.114", features = ["derive"] } | ||
|
||
[features] | ||
# Skips UTF-8 validation when using zero-copy Strings | ||
# | ||
# The (protobuf specification)[https://developers.google.com/protocol-buffers/docs/proto3#scalar] | ||
# states that Strings must contain UTF-8 encoded text, so theoretically, validating a chunk | ||
# of bytes is UTF-8 on deserialization shouldn't be needed. As a precaution though, we do this | ||
# validation step. But validation can take a relatively large amount of time, so for users | ||
# who wish to skip this validation, we provide the `zero_copy_string_no_utf8_check` feature flag. | ||
zero_copy_string_no_utf8_check = [] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it possible for cargo bench to test with and w/o this feature flag? |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
use bytes::Bytes; | ||
use std::{ | ||
convert::TryFrom, | ||
fmt, | ||
ops::Deref, | ||
}; | ||
|
||
#[derive(Clone, Debug, Default, PartialEq, Eq, Hash, Ord, PartialOrd)] | ||
pub struct StrBytes { | ||
bytes: Option<Bytes>, | ||
} | ||
|
||
impl StrBytes { | ||
pub fn new() -> StrBytes { | ||
StrBytes { bytes: None } | ||
} | ||
|
||
/// Length of the underlying byte store | ||
pub fn len(&self) -> usize { | ||
match self.bytes.as_ref() { | ||
None => 0, | ||
Some(bytes) => bytes.len(), | ||
} | ||
} | ||
|
||
pub fn bytes(&self) -> Option<&Bytes> { | ||
self.bytes.as_ref() | ||
} | ||
|
||
pub fn into_bytes(self) -> Bytes { | ||
match self.bytes { | ||
None => Bytes::new(), | ||
Some(bytes) => bytes, | ||
} | ||
} | ||
|
||
pub fn replace_bytes(&mut self, bytes: Bytes) -> Result<Option<Bytes>, std::str::Utf8Error> { | ||
if cfg!(feature = "zero_copy_string_no_utf8_check") { | ||
// If we have UTF-8 validation turned off, simply replace the bytes | ||
return Ok(self.bytes.replace(bytes)); | ||
} | ||
|
||
std::str::from_utf8(&bytes)?; | ||
Ok(self.bytes.replace(bytes)) | ||
} | ||
} | ||
|
||
impl Deref for StrBytes { | ||
type Target = str; | ||
|
||
fn deref(&self) -> &Self::Target { | ||
match self.bytes.as_ref() { | ||
// SAFTEY: We validate on creation the underlying bytes are valid UTF8 | ||
Some(bytes) => unsafe { std::str::from_utf8_unchecked(&bytes) }, | ||
None => "", | ||
} | ||
} | ||
} | ||
|
||
#[cfg(not(feature = "zero_copy_string_no_utf8_check"))] | ||
impl TryFrom<Bytes> for StrBytes { | ||
type Error = std::str::Utf8Error; | ||
|
||
fn try_from(bytes: Bytes) -> Result<StrBytes, Self::Error> { | ||
std::str::from_utf8(&bytes)?; | ||
Ok(StrBytes { bytes: Some(bytes) }) | ||
} | ||
} | ||
|
||
#[cfg(not(feature = "zero_copy_string_no_utf8_check"))] | ||
impl TryFrom<&[u8]> for StrBytes { | ||
type Error = std::str::Utf8Error; | ||
|
||
fn try_from(slice: &[u8]) -> Result<StrBytes, Self::Error> { | ||
std::str::from_utf8(&slice)?; | ||
Ok(StrBytes { | ||
bytes: Some(Bytes::copy_from_slice(slice)), | ||
}) | ||
} | ||
} | ||
|
||
impl From<&str> for StrBytes { | ||
fn from(string: &str) -> StrBytes { | ||
StrBytes { | ||
bytes: Some(Bytes::copy_from_slice(string.as_bytes())), | ||
} | ||
} | ||
} | ||
|
||
impl From<String> for StrBytes { | ||
fn from(string: String) -> StrBytes { | ||
StrBytes { | ||
bytes: Some(Bytes::from(string)), | ||
} | ||
} | ||
} | ||
|
||
impl fmt::Display for StrBytes { | ||
#[inline] | ||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||
fmt::Display::fmt(&**self, f) | ||
} | ||
} | ||
|
||
#[cfg(feature = "zero_copy_string_no_utf8_check")] | ||
impl From<Bytes> for StrBytes { | ||
fn from(bytes: Bytes) -> StrBytes { | ||
StrBytes { bytes: Some(bytes) } | ||
} | ||
} | ||
|
||
#[cfg(feature = "zero_copy_string_no_utf8_check")] | ||
impl From<&[u8]> for StrBytes { | ||
fn from(slice: &[u8]) -> StrBytes { | ||
StrBytes { bytes: Some(Bytes::copy_from_slice(slice)) } | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Per https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#multiple-locations - you can actually specify both path and version
Actually, in this case, since examples aren't being posted to crates.io, we probably should keep it as only a path dependency.