Skip to content

Commit

Permalink
Add better documentation, examples and builer-style API to ByteView (
Browse files Browse the repository at this point in the history
…apache#6479)

* Add better documentation, examples and builer-style API to `ByteView`

* simplify tests

* Apply suggestions from code review

Co-authored-by: Raphael Taylor-Davies <[email protected]>

* Moar inline

* Update arrow-data/src/byte_view.rs

Co-authored-by: Xiangpeng Hao <[email protected]>

* fmt

* Update test so it fails at the right place

---------

Co-authored-by: Raphael Taylor-Davies <[email protected]>
Co-authored-by: Xiangpeng Hao <[email protected]>
  • Loading branch information
3 people authored Oct 1, 2024
1 parent 2d67c4c commit 1e9e5a2
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 17 deletions.
29 changes: 14 additions & 15 deletions arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,11 @@ use super::ByteArrayType;
///
/// # See Also
///
/// See [`StringViewArray`] for storing utf8 encoded string data and
/// [`BinaryViewArray`] for storing bytes.
/// * [`StringViewArray`] for storing utf8 encoded string data
/// * [`BinaryViewArray`] for storing bytes
/// * [`ByteView`] to interpret `u128`s layout of the views.
///
/// [`ByteView`]: arrow_data::ByteView
///
/// # Notes
///
Expand Down Expand Up @@ -872,12 +875,9 @@ mod tests {
#[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
fn new_with_invalid_view_data() {
let v = "large payload over 12 bytes";
let view = ByteView {
length: 13,
prefix: u32::from_le_bytes(v.as_bytes()[0..4].try_into().unwrap()),
buffer_index: 3,
offset: 1,
};
let view = ByteView::new(13, &v.as_bytes()[0..4])
.with_buffer_index(3)
.with_offset(1);
let views = ScalarBuffer::from(vec![view.into()]);
let buffers = vec![Buffer::from_slice_ref(v)];
StringViewArray::new(views, buffers, None);
Expand All @@ -888,13 +888,12 @@ mod tests {
expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
)]
fn new_with_invalid_utf8_data() {
let v: Vec<u8> = vec![0xf0, 0x80, 0x80, 0x80];
let view = ByteView {
length: v.len() as u32,
prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
buffer_index: 0,
offset: 0,
};
let v: Vec<u8> = vec![
// invalid UTF8
0xf0, 0x80, 0x80, 0x80, // more bytes to make it larger than 12
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
let view = ByteView::new(v.len() as u32, &v[0..4]);
let views = ScalarBuffer::from(vec![view.into()]);
let buffers = vec![Buffer::from_slice_ref(v)];
StringViewArray::new(views, buffers, None);
Expand Down
72 changes: 70 additions & 2 deletions arrow-data/src/byte_view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,40 @@ use arrow_schema::ArrowError;
/// Helper to access views of [`GenericByteViewArray`] (`StringViewArray` and
/// `BinaryViewArray`) where the length is greater than 12 bytes.
///
/// See the documentation on [`GenericByteViewArray`] for more information on
/// the layout of the views.
/// See Also:
/// * [`GenericByteViewArray`] for more information on the layout of the views.
/// * [`validate_binary_view`] and [`validate_string_view`] to validate
///
/// # Example: Create a new u128 view
///
/// ```rust
/// # use arrow_data::ByteView;;
/// // Create a view for a string of length 20
/// // first four bytes are "Rust"
/// // stored in buffer 3
/// // at offset 42
/// let prefix = "Rust";
/// let view = ByteView::new(20, prefix.as_bytes())
/// .with_buffer_index(3)
/// .with_offset(42);
///
/// // create the final u128
/// let v = view.as_u128();
/// assert_eq!(v, 0x2a000000037473755200000014);
/// ```
///
/// # Example: decode a `u128` into its constituent fields
/// ```rust
/// # use arrow_data::ByteView;
/// // Convert a u128 to a ByteView
/// // See validate_{string,binary}_view functions to validate
/// let v = ByteView::from(0x2a000000037473755200000014);
///
/// assert_eq!(v.length, 20);
/// assert_eq!(v.prefix, 0x74737552);
/// assert_eq!(v.buffer_index, 3);
/// assert_eq!(v.offset, 42);
/// ```
///
/// [`GenericByteViewArray`]: https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html
#[derive(Debug, Copy, Clone, Default)]
Expand All @@ -39,6 +71,42 @@ pub struct ByteView {
}

impl ByteView {
/// Construct a [`ByteView`] for data `length` of bytes with the specified prefix.
///
/// See example on [`ByteView`] docs
///
/// Notes:
/// * the length should always be greater than 12 (Data less than 12
/// bytes is stored as an inline view)
/// * buffer and offset are set to `0`
///
/// # Panics
/// If the prefix is not exactly 4 bytes
#[inline]
pub fn new(length: u32, prefix: &[u8]) -> Self {
debug_assert!(length > 12);
Self {
length,
prefix: u32::from_le_bytes(prefix.try_into().unwrap()),
buffer_index: 0,
offset: 0,
}
}

/// Set the [`Self::buffer_index`] field
#[inline]
pub fn with_buffer_index(mut self, buffer_index: u32) -> Self {
self.buffer_index = buffer_index;
self
}

/// Set the [`Self::offset`] field
#[inline]
pub fn with_offset(mut self, offset: u32) -> Self {
self.offset = offset;
self
}

#[inline(always)]
/// Convert `ByteView` to `u128` by concatenating the fields
pub fn as_u128(self) -> u128 {
Expand Down

0 comments on commit 1e9e5a2

Please sign in to comment.