Skip to content

Commit

Permalink
feat: add new string view type [part 1] (#14662)
Browse files Browse the repository at this point in the history
* add new string view type

* add tests for string_view type
  • Loading branch information
ariesdevil authored Mar 1, 2024
1 parent 1b8e8c7 commit a8ab8bc
Show file tree
Hide file tree
Showing 83 changed files with 4,531 additions and 233 deletions.
112 changes: 67 additions & 45 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/common/arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ arrow-format = { workspace = true }
bitpacking = "0.8.0"
byteorder = { workspace = true }
bytes = "^1"
indexmap = "2.2.3"
log = { workspace = true }
num = { version = "0.4", default-features = false, features = ["std"] }
ordered-float = "3.7.0"
Expand Down
4 changes: 2 additions & 2 deletions src/common/arrow/src/arrow/array/binary/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ unsafe impl<O: Offset> ToFfi for BinaryArray<O> {
/// Collects the raw buffer pointers handed to the FFI layer: the validity
/// bitmap pointer first (`None` when the array has no validity), followed by
/// the offsets and values pointers, each cast to `*const u8`.
// NOTE(review): this is a rendered diff hunk ("2 additions & 2 deletions"), so
// the two `as_ptr()` lines are the removed versions and the two `data_ptr()`
// lines are their replacements — confirm against the real file before editing.
fn buffers(&self) -> Vec<Option<*const u8>> {
vec![
self.validity.as_ref().map(|x| x.as_ptr()),
Some(self.offsets.buffer().as_ptr().cast::<u8>()),
Some(self.values.as_ptr().cast::<u8>()),
Some(self.offsets.buffer().data_ptr().cast::<u8>()),
Some(self.values.data_ptr().cast::<u8>()),
]
}

Expand Down
118 changes: 118 additions & 0 deletions src/common/arrow/src/arrow/array/binview/ffi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
// Copyright (c) 2020 Ritchie Vink
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::Arc;

use crate::arrow::array::binview::BinaryViewArrayGeneric;
use crate::arrow::array::binview::View;
use crate::arrow::array::binview::ViewType;
use crate::arrow::array::FromFfi;
use crate::arrow::array::ToFfi;
use crate::arrow::bitmap::align;
use crate::arrow::error::Result;
use crate::arrow::ffi;

unsafe impl<T: ViewType + ?Sized> ToFfi for BinaryViewArrayGeneric<T> {
    /// Lists the buffer pointers in export order: validity (or `None`), the
    /// views buffer, then one pointer per data buffer.
    fn buffers(&self) -> Vec<Option<*const u8>> {
        let validity_ptr = self.validity.as_ref().map(|bitmap| bitmap.as_ptr());
        let views_ptr = Some(self.views.data_ptr().cast::<u8>());

        // Two fixed slots (validity + views) precede the data buffers.
        let mut pointers = Vec::with_capacity(self.buffers.len() + 2);
        pointers.extend([validity_ptr, views_ptr]);
        pointers.extend(self.buffers.iter().map(|buffer| Some(buffer.data_ptr())));
        pointers
    }

    /// Returns the offset of the views buffer, or `None` when a validity
    /// bitmap is present and its offset differs from the views offset.
    fn offset(&self) -> Option<usize> {
        let views_offset = self.views.offset();
        match &self.validity {
            Some(bitmap) if bitmap.offset() != views_offset => None,
            _ => Some(views_offset),
        }
    }

    /// Returns a copy of the array whose validity bitmap (if any) has the same
    /// offset as the views buffer; all other fields are cloned unchanged.
    fn to_ffi_aligned(&self) -> Self {
        let views_offset = self.views.offset();

        // Only re-align the bitmap when its offset disagrees with the views.
        let validity = match self.validity.as_ref() {
            None => None,
            Some(bitmap) if bitmap.offset() == views_offset => Some(bitmap.clone()),
            Some(bitmap) => Some(align(bitmap, views_offset)),
        };

        // Snapshot the cached byte length into a fresh atomic for the copy.
        let cached_bytes_len = self.total_bytes_len.load(Ordering::Relaxed);

        Self {
            data_type: self.data_type.clone(),
            validity,
            views: self.views.clone(),
            buffers: self.buffers.clone(),
            raw_buffers: self.raw_buffers.clone(),
            phantom: Default::default(),
            total_bytes_len: AtomicU64::new(cached_bytes_len),
            total_buffer_len: self.total_buffer_len,
        }
    }
}

impl<T: ViewType + ?Sized, A: ffi::ArrowArrayRef> FromFfi<A> for BinaryViewArrayGeneric<T> {
    /// Rebuilds a view array from an FFI array.
    ///
    /// Buffers are read in this order: the validity, the views (buffer 1), the
    /// variadic data buffers (starting at buffer 2), and finally the last
    /// buffer, which holds one `i64` length per variadic data buffer.
    ///
    /// When at most one buffer follows the views, the array is built without
    /// any data buffers.
    ///
    /// # Errors
    ///
    /// Propagates any error returned while reading the validity, the views,
    /// the sizes buffer or one of the data buffers.
    ///
    /// # Safety
    ///
    /// The buffers are read through unsafe accessors using the lengths that
    /// `array` reports, so the caller must uphold the contract of
    /// `FromFfi::try_from_ffi` for `array`.
    unsafe fn try_from_ffi(array: A) -> Result<Self> {
        let data_type = array.data_type().clone();

        let validity = unsafe { array.validity() }?;
        let views = unsafe { array.buffer::<View>(1) }?;

        // The first two slots are validity + views. `saturating_sub` prevents a
        // `usize` underflow if a producer reports fewer than two buffers.
        let n_buffers = array.n_buffers();
        let trailing_buffers = n_buffers.saturating_sub(2);
        if trailing_buffers <= 1 {
            return Ok(Self::new_unchecked_unknown_md(
                data_type,
                views,
                Arc::from([]),
                validity,
                None,
            ));
        }

        // The last buffer stores the sizes; everything between the views and
        // that buffer is variadic data.
        let n_variadic_buffers = trailing_buffers - 1;
        let variadic_buffer_offset = n_buffers - 1;

        let variadic_buffer_sizes = unsafe {
            array.buffer_known_len::<i64>(variadic_buffer_offset, n_variadic_buffers)
        }?;

        let first_data_buffer = 2;
        let variadic_buffers = (first_data_buffer..first_data_buffer + n_variadic_buffers)
            .zip(variadic_buffer_sizes.iter())
            .map(|(i, &size)| {
                // NOTE(review): `size` comes from the producer; a negative value
                // would wrap in this cast — consider validating it upstream.
                unsafe { array.buffer_known_len::<u8>(i, size as usize) }
            })
            .collect::<Result<Vec<_>>>()?;

        Ok(Self::new_unchecked_unknown_md(
            data_type,
            views,
            Arc::from(variadic_buffers),
            validity,
            None,
        ))
    }
}
56 changes: 56 additions & 0 deletions src/common/arrow/src/arrow/array/binview/fmt.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright (c) 2020 Ritchie Vink
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::fmt::Debug;
use std::fmt::Formatter;
use std::fmt::Result;
use std::fmt::Write;

use crate::arrow::array::binview::BinaryViewArray;
use crate::arrow::array::binview::BinaryViewArrayGeneric;
use crate::arrow::array::binview::Utf8ViewArray;
use crate::arrow::array::binview::ViewType;
use crate::arrow::array::fmt::write_vec;
use crate::arrow::array::Array;

/// Writes the value at `index` of `array` to `f` as a list of its bytes,
/// delegating the list layout to `write_vec`.
pub fn write_value<'a, T, W>(
    array: &'a BinaryViewArrayGeneric<T>,
    index: usize,
    f: &mut W,
) -> Result
where
    T: ViewType + ?Sized,
    W: Write,
    &'a T: Debug,
{
    let raw = array.value(index).to_bytes();
    let byte_count = raw.len();

    // Each element is rendered with the `Display` of a single byte.
    write_vec(
        f,
        |out: &mut W, position| write!(out, "{}", raw[position]),
        None,
        byte_count,
        "None",
        false,
    )
}

impl Debug for BinaryViewArray {
    /// Prints the `BinaryViewArray` tag followed by every slot, with each
    /// value rendered through `write_value`.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        f.write_str("BinaryViewArray")?;

        let render_slot = |out: &mut Formatter, row| write_value(self, row, out);
        write_vec(f, render_slot, self.validity(), self.len(), "None", false)
    }
}

impl Debug for Utf8ViewArray {
    /// Prints the `Utf8ViewArray` tag followed by every slot, with each value
    /// rendered through its `Display` form.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        f.write_str("Utf8ViewArray")?;

        let render_slot = |out: &mut Formatter, row| write!(out, "{}", self.value(row));
        write_vec(f, render_slot, self.validity(), self.len(), "None", false)
    }
}
24 changes: 24 additions & 0 deletions src/common/arrow/src/arrow/array/binview/from.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::arrow::array::BinaryViewArrayGeneric;
use crate::arrow::array::MutableBinaryViewArray;
use crate::arrow::array::ViewType;

impl<T, P> FromIterator<Option<P>> for BinaryViewArrayGeneric<T>
where
    T: ViewType + ?Sized,
    P: AsRef<T>,
{
    /// Builds the array by first collecting the optional items into a
    /// [`MutableBinaryViewArray`] and then converting it.
    #[inline]
    fn from_iter<I: IntoIterator<Item = Option<P>>>(iter: I) -> Self {
        let builder = MutableBinaryViewArray::<T>::from_iter(iter);
        builder.into()
    }
}
64 changes: 64 additions & 0 deletions src/common/arrow/src/arrow/array/binview/iterator.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// Copyright 2021 Datafuse Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::arrow::array::binview::mutable::MutableBinaryViewArray;
use crate::arrow::array::binview::BinaryViewArrayGeneric;
use crate::arrow::array::binview::ViewType;
use crate::arrow::array::ArrayAccessor;
use crate::arrow::array::ArrayValuesIter;
use crate::arrow::bitmap::utils::BitmapIter;
use crate::arrow::bitmap::utils::ZipValidity;

unsafe impl<'a, T> ArrayAccessor<'a> for BinaryViewArrayGeneric<T>
where
    T: ViewType + ?Sized,
{
    type Item = &'a T;

    /// Returns the value at `index` without bounds checking.
    ///
    /// # Safety
    ///
    /// Forwards to the array's own `value_unchecked`; the caller must uphold
    /// that method's requirements for `index`.
    #[inline]
    unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item {
        // SAFETY: the caller of this method guarantees the preconditions of
        // the forwarded call.
        unsafe { self.value_unchecked(index) }
    }

    /// Number of slots, taken from the length of the views buffer.
    #[inline]
    fn len(&self) -> usize {
        let views = &self.views;
        views.len()
    }
}

/// Iterator of values of a [`BinaryViewArrayGeneric`].
pub type BinaryViewValueIter<'a, T> = ArrayValuesIter<'a, BinaryViewArrayGeneric<T>>;

impl<'a, T> IntoIterator for &'a BinaryViewArrayGeneric<T>
where
    T: ViewType + ?Sized,
{
    type IntoIter = ZipValidity<&'a T, BinaryViewValueIter<'a, T>, BitmapIter<'a>>;
    type Item = Option<&'a T>;

    /// Iterates over the array by reference; identical to calling `iter()`.
    fn into_iter(self) -> Self::IntoIter {
        let slots = self.iter();
        slots
    }
}

unsafe impl<'a, T> ArrayAccessor<'a> for MutableBinaryViewArray<T>
where
    T: ViewType + ?Sized,
{
    type Item = &'a T;

    /// Returns the value at `index` without bounds checking.
    ///
    /// # Safety
    ///
    /// Forwards to the array's own `value_unchecked`; the caller must uphold
    /// that method's requirements for `index`.
    #[inline]
    unsafe fn value_unchecked(&'a self, index: usize) -> Self::Item {
        // SAFETY: the caller of this method guarantees the preconditions of
        // the forwarded call.
        unsafe { self.value_unchecked(index) }
    }

    /// Number of slots, taken from the length of the views.
    #[inline]
    fn len(&self) -> usize {
        let views = self.views();
        views.len()
    }
}

/// Iterator of values of a [`MutableBinaryViewArray`].
pub type MutableBinaryViewValueIter<'a, T> = ArrayValuesIter<'a, MutableBinaryViewArray<T>>;
Loading

0 comments on commit a8ab8bc

Please sign in to comment.