Skip to content

Commit

Permalink
feat(Rust): Implement utf16 to utf8 conversion algorithm (#1730)
Browse files Browse the repository at this point in the history
## What does this PR do?
Implement a utility function in Rust that converts a UTF-16 encoded
string into a UTF-8 encoded string.


## Related issues
#1547 


## Does this PR introduce any user-facing change?
- [x] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?


## Benchmark
  • Loading branch information
urlyy authored Jul 18, 2024
1 parent 219cdc7 commit 180311d
Show file tree
Hide file tree
Showing 3 changed files with 228 additions and 0 deletions.
2 changes: 2 additions & 0 deletions rust/fury/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@ mod meta;
mod row;
mod serializer;
mod types;
mod util;

pub use deserializer::from_buffer;
pub use error::Error;
pub use fury_derive::*;
pub use meta::{Encoding, MetaStringDecoder, MetaStringEncoder};
pub use row::{from_row, to_row};
pub use serializer::to_buffer;
pub use util::to_utf8;

pub mod __derive {
pub use crate::buffer::{Reader, Writer};
Expand Down
113 changes: 113 additions & 0 deletions rust/fury/src/util.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use std::ptr;

/// Returns `value` with its high and low bytes exchanged.
fn swap_endian(value: u16) -> u16 {
    value.swap_bytes()
}

/// Converts UTF-16 code units into a UTF-8 byte vector.
///
/// Decoding works on the numeric value of each code unit. When
/// `is_little_endian` is `true`, the two bytes of every input unit are swapped
/// with `swap_endian` before decoding; when it is `false`, the units are used
/// exactly as given.
///
/// # Errors
///
/// Returns an `Err` carrying a message when the input is not well-formed
/// UTF-16:
/// - a high surrogate (`0xD800..=0xDBFF`) is the last unit of the input,
/// - a high surrogate is followed by anything other than a low surrogate
///   (`0xDC00..=0xDFFF`),
/// - a low surrogate appears without a preceding high surrogate (encoding it
///   would produce bytes that are not valid UTF-8).
pub fn to_utf8(utf16: &[u16], is_little_endian: bool) -> Result<Vec<u8>, String> {
    // Pre-allocate for the worst case so the buffer never has to grow:
    // a single unit expands to at most 3 bytes, and a surrogate pair turns
    // 2 units into 4 bytes, so 3 bytes per input unit is always enough.
    let mut utf8_bytes: Vec<u8> = Vec::with_capacity(utf16.len() * 3);
    // Bytes are written directly into the spare capacity through `dst`;
    // `offset` counts how many bytes have been initialised so far.
    let dst = utf8_bytes.as_mut_ptr();
    let mut offset = 0;
    let mut iter = utf16.iter();
    while let Some(&unit) = iter.next() {
        let wc = if is_little_endian {
            swap_endian(unit)
        } else {
            unit
        };
        match wc {
            0x0000..=0x007f => {
                // 1-byte UTF-8
                // [0000|0000|0ccc|cccc] => [0ccc|cccc]
                // SAFETY: at most 3 bytes are written per consumed unit, so
                // `offset + 1 <= 3 * utf16.len() <= capacity`.
                unsafe {
                    dst.add(offset).write(wc as u8);
                }
                offset += 1;
            }
            0x0080..=0x07ff => {
                // 2-byte UTF-8
                // [0000|0bbb|bbcc|cccc] => [110|bbbbb], [10|cccccc]
                let bytes = [
                    ((wc >> 6) & 0b1_1111) as u8 | 0b1100_0000,
                    (wc & 0b11_1111) as u8 | 0b1000_0000,
                ];
                // SAFETY: at most 3 bytes are written per consumed unit, so
                // `offset + 2 <= 3 * utf16.len() <= capacity`; `bytes` is a
                // local array and cannot overlap the heap buffer.
                unsafe {
                    ptr::copy_nonoverlapping(bytes.as_ptr(), dst.add(offset), 2);
                }
                offset += 2;
            }
            0xd800..=0xdbff => {
                // High surrogate: must be followed by a low surrogate.
                // Together the 2 units become one 4-byte UTF-8 sequence.
                let wc2 = match iter.next() {
                    Some(&next) => {
                        if is_little_endian {
                            swap_endian(next)
                        } else {
                            next
                        }
                    }
                    None => {
                        return Err("Invalid UTF-16 string: missing surrogate pair".to_string());
                    }
                };
                if !(0xdc00..=0xdfff).contains(&wc2) {
                    return Err("Invalid UTF-16 string: wrong surrogate pair".to_string());
                }
                // Combine the two 10-bit payloads and add the 0x10000 offset
                // to recover the Unicode scalar value.
                let code_point =
                    ((((wc as u32) - 0xd800) << 10) | ((wc2 as u32) - 0xdc00)) + 0x10000;
                // 11110??? 10?????? 10?????? 10??????
                // Only the low 21 bits of `code_point` are needed.
                let bytes = [
                    ((code_point >> 18) & 0b111) as u8 | 0b1111_0000,
                    ((code_point >> 12) & 0b11_1111) as u8 | 0b1000_0000,
                    ((code_point >> 6) & 0b11_1111) as u8 | 0b1000_0000,
                    (code_point & 0b11_1111) as u8 | 0b1000_0000,
                ];
                // SAFETY: two units were consumed for these 4 bytes, so the
                // running total still satisfies
                // `offset + 4 <= 3 * utf16.len() <= capacity`; `bytes` is a
                // local array and cannot overlap the heap buffer.
                unsafe {
                    ptr::copy_nonoverlapping(bytes.as_ptr(), dst.add(offset), 4);
                }
                offset += 4;
            }
            0xdc00..=0xdfff => {
                // A low surrogate that does not follow a high surrogate is
                // malformed UTF-16; encoding it as 3 bytes would yield bytes
                // that are not valid UTF-8, so reject it instead.
                return Err("Invalid UTF-16 string: unpaired low surrogate".to_string());
            }
            _ => {
                // 3-byte UTF-8, 1 u16 -> 3 u8
                // [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10|bbbbbb], [10|cccccc]
                let bytes = [
                    (wc >> 12) as u8 | 0b1110_0000,
                    ((wc >> 6) & 0b11_1111) as u8 | 0b1000_0000,
                    (wc & 0b11_1111) as u8 | 0b1000_0000,
                ];
                // SAFETY: at most 3 bytes are written per consumed unit, so
                // `offset + 3 <= 3 * utf16.len() <= capacity`; `bytes` is a
                // local array and cannot overlap the heap buffer.
                unsafe {
                    ptr::copy_nonoverlapping(bytes.as_ptr(), dst.add(offset), 3);
                }
                offset += 3;
            }
        }
    }
    // SAFETY: the raw writes above do not update the vector's length. Exactly
    // `offset` bytes starting at index 0 have been initialised, and
    // `offset <= capacity` by the per-unit bound.
    unsafe {
        utf8_bytes.set_len(offset);
    }
    Ok(utf8_bytes)
}
113 changes: 113 additions & 0 deletions rust/tests/tests/test_util.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use fury::to_utf8;

// Round-trips a string through `encode_utf16` and `to_utf8` (units passed
// unswapped), printing the intermediate code units and bytes as hex.
#[test]
fn test_to_utf8() {
    let expected = "Hé€lo, 世界!😀";
    let units: Vec<u16> = expected.encode_utf16().collect();

    println!("==========init utf16:");
    let units_hex = units
        .iter()
        .map(|unit| format!("0x{:04x}", unit))
        .collect::<Vec<String>>()
        .join(",");
    println!("{}", units_hex);

    let encoded = to_utf8(&units, false).unwrap();

    println!("==========utf8:");
    let encoded_hex = encoded
        .iter()
        .map(|byte| format!("0x{:02x}", byte))
        .collect::<Vec<String>>()
        .join(",");
    println!("{}", encoded_hex);

    // The produced bytes must decode back to the original text.
    let decoded = String::from_utf8(encoded).unwrap();
    println!("final string: {}", decoded);
    assert_eq!(expected, decoded);
}

// Test helper: exchanges the two bytes of a 16-bit value.
fn swap_endian(value: u16) -> u16 {
    let low_byte = value & 0x00ff;
    let high_byte = value >> 8;
    (low_byte << 8) | high_byte
}

// Round-trips a short string through `to_utf8`, once with the units as
// produced by `encode_utf16` and once with every unit byte-swapped.
#[test]
fn test_to_utf8_3byte() {
    let expected = "é₫l₪₮";

    // Units passed unswapped.
    let units: Vec<u16> = expected.encode_utf16().collect();
    let encoded = to_utf8(&units, false).unwrap();
    assert_eq!(String::from_utf8(encoded).unwrap(), expected);

    // Byte-swapped units, with the flag set so `to_utf8` swaps them back.
    let swapped_units: Vec<u16> = expected.encode_utf16().map(swap_endian).collect();
    let encoded_from_swapped = to_utf8(&swapped_units, true).unwrap();
    assert_eq!(String::from_utf8(encoded_from_swapped).unwrap(), expected);
}

// Checks that "ab" is decoded correctly for both values of the endian flag.
#[test]
fn test_to_utf8_endian() {
    // (input units, is_little_endian, message shown on failure)
    let cases: [([u16; 2], bool, &str); 2] = [
        // 'ab' in UTF-16 little endian
        ([0x6100, 0x6200], true, "Little endian test failed"),
        // 'ab' in UTF-16 big endian
        ([0x0061, 0x0062], false, "Big endian test failed"),
    ];
    for (units, is_little_endian, message) in cases.iter() {
        let result = to_utf8(units, *is_little_endian).unwrap();
        assert_eq!(result, b"ab", "{}", message);
    }
}

// Round-trips characters that `encode_utf16` represents as surrogate pairs,
// with the units passed both unswapped and byte-swapped.
#[test]
fn test_to_utf8_surrogate_pair() {
    let expected = "𝄞💡😀🎻";

    let units: Vec<u16> = expected.encode_utf16().collect();
    let converted = to_utf8(&units, false);
    assert!(converted.is_ok());
    assert_eq!(String::from_utf8(converted.unwrap()).unwrap(), expected);

    // test little endian
    let swapped_units: Vec<u16> = expected.encode_utf16().map(swap_endian).collect();
    let converted_from_swapped = to_utf8(&swapped_units, true);
    assert!(converted_from_swapped.is_ok());
    assert_eq!(
        String::from_utf8(converted_from_swapped.unwrap()).unwrap(),
        expected
    );
}

// Verifies the error messages returned for a truncated surrogate pair and
// for a high surrogate followed by a unit that is not a low surrogate.
#[test]
fn test_to_utf8_missing_surrogate_pair() {
    // Inputs are byte-swapped, so they are passed with the endian flag set.
    let expect_error = |units: &[u16], message: &str| {
        let outcome = to_utf8(units, true);
        assert!(outcome.is_err());
        assert_eq!(outcome.unwrap_err(), message);
    };

    // Missing second surrogate
    expect_error(&[0x00D8], "Invalid UTF-16 string: missing surrogate pair");
    // Wrong second surrogate
    expect_error(
        &[0x00D8, 0x00DA],
        "Invalid UTF-16 string: wrong surrogate pair",
    );
}

0 comments on commit 180311d

Please sign in to comment.