feat(Rust): Implement utf16 to utf8 conversion algorithm (#1730)

## What does this PR do? Implement a utility function that converts a UTF-16 encoded string to a UTF-8 string in Rust. ## Related issues #1547 ## Does this PR introduce any user-facing change? - [x] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark
apache · Jul 18, 2024 · 180311d · 180311d
1 parent 219cdc7
commit 180311d
Show file tree

Hide file tree

Showing 3 changed files with 228 additions and 0 deletions.
diff --git a/rust/fury/src/lib.rs b/rust/fury/src/lib.rs
@@ -22,13 +22,15 @@ mod meta;
 mod row;
 mod serializer;
 mod types;
+mod util;
 
 pub use deserializer::from_buffer;
 pub use error::Error;
 pub use fury_derive::*;
 pub use meta::{Encoding, MetaStringDecoder, MetaStringEncoder};
 pub use row::{from_row, to_row};
 pub use serializer::to_buffer;
+pub use util::to_utf8;
 
 pub mod __derive {
     pub use crate::buffer::{Reader, Writer};

diff --git a/rust/fury/src/util.rs b/rust/fury/src/util.rs
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::ptr;
+
// Swaps the high 8 bits and the low 8 bits of a 16-bit value.
fn swap_endian(value: u16) -> u16 {
    value.swap_bytes()
}

/// Converts a sequence of UTF-16 code units into UTF-8 encoded bytes.
///
/// When `is_little_endian` is `true`, every code unit is byte-swapped before it
/// is decoded; when it is `false`, the code units are decoded exactly as given.
///
/// # Errors
///
/// Returns a descriptive `Err` when the input is not well-formed UTF-16:
///
/// * a high surrogate (`0xD800..=0xDBFF`) is the last code unit,
/// * a high surrogate is followed by something other than a low surrogate,
/// * a low surrogate (`0xDC00..=0xDFFF`) appears without a preceding high
///   surrogate.
pub fn to_utf8(utf16: &[u16], is_little_endian: bool) -> Result<Vec<u8>, String> {
    // Pre-allocate for the worst case so the buffer never has to grow:
    // a single code unit yields at most 3 bytes, and a surrogate pair
    // (2 code units, budget of 6 bytes) yields 4.
    let mut utf8_bytes: Vec<u8> = Vec::with_capacity(utf16.len() * 3);
    // Raw write cursor into the spare capacity; the length is set once at the end.
    let dst = utf8_bytes.as_mut_ptr();
    let mut offset = 0;
    // A plain `for` loop cannot be used here because the surrogate branch
    // pulls a second code unit from the same iterator.
    let mut iter = utf16.iter();
    while let Some(&unit) = iter.next() {
        let wc = if is_little_endian {
            swap_endian(unit)
        } else {
            unit
        };
        match wc {
            0x0000..=0x007f => {
                // 1-byte UTF-8
                // [0000|0000|0ccc|cccc] => [0ccc|cccc]
                // SAFETY: at most 3 bytes are written per consumed code unit, so
                // `offset + 1 <= utf16.len() * 3`, which is the reserved capacity.
                unsafe {
                    dst.add(offset).write(wc as u8);
                }
                offset += 1;
            }
            0x0080..=0x07ff => {
                // 2-byte UTF-8
                // [0000|0bbb|bbcc|cccc] => [110|bbbbb], [10|cccccc]
                let bytes = [
                    ((wc >> 6) & 0b1_1111) as u8 | 0b1100_0000,
                    (wc & 0b11_1111) as u8 | 0b1000_0000,
                ];
                // SAFETY: 2 bytes for 1 consumed code unit stays within the
                // 3-bytes-per-unit capacity; `bytes` is a local array, so the
                // source and destination cannot overlap.
                unsafe {
                    ptr::copy_nonoverlapping(bytes.as_ptr(), dst.add(offset), 2);
                }
                offset += 2;
            }
            0xd800..=0xdbff => {
                // High surrogate: it must be followed by a low surrogate, and the
                // pair together encodes one code point (4-byte UTF-8).
                let wc2 = match iter.next() {
                    Some(&next) if is_little_endian => swap_endian(next),
                    Some(&next) => next,
                    None => {
                        return Err("Invalid UTF-16 string: missing surrogate pair".to_string());
                    }
                };
                if !(0xdc00..=0xdfff).contains(&wc2) {
                    return Err("Invalid UTF-16 string: wrong surrogate pair".to_string());
                }
                // Surrogate pair to Unicode scalar value.
                let code_point =
                    ((((wc as u32) - 0xd800) << 10) | ((wc2 as u32) - 0xdc00)) + 0x10000;
                // 11110??? 10?????? 10?????? 10??????
                // Uses the low 21 bits of code_point.
                let bytes = [
                    ((code_point >> 18) & 0b111) as u8 | 0b1111_0000,
                    ((code_point >> 12) & 0b11_1111) as u8 | 0b1000_0000,
                    ((code_point >> 6) & 0b11_1111) as u8 | 0b1000_0000,
                    (code_point & 0b11_1111) as u8 | 0b1000_0000,
                ];
                // SAFETY: 4 bytes for 2 consumed code units (budget of 6) stays
                // within the reserved capacity; `bytes` is a local array.
                unsafe {
                    ptr::copy_nonoverlapping(bytes.as_ptr(), dst.add(offset), 4);
                }
                offset += 4;
            }
            0xdc00..=0xdfff => {
                // A low surrogate that does not follow a high surrogate is not
                // valid UTF-16; encoding it as 3 bytes would produce invalid UTF-8.
                return Err("Invalid UTF-16 string: unexpected low surrogate".to_string());
            }
            _ => {
                // 3-byte UTF-8, 1 u16 -> 3 u8
                // [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10|bbbbbb], [10|cccccc]
                let bytes = [
                    (wc >> 12) as u8 | 0b1110_0000,
                    ((wc >> 6) & 0b11_1111) as u8 | 0b1000_0000,
                    (wc & 0b11_1111) as u8 | 0b1000_0000,
                ];
                // SAFETY: 3 bytes for 1 consumed code unit is exactly the
                // per-unit capacity budget; `bytes` is a local array.
                unsafe {
                    ptr::copy_nonoverlapping(bytes.as_ptr(), dst.add(offset), 3);
                }
                offset += 3;
            }
        }
    }
    // SAFETY: the first `offset` bytes were initialized by the writes above and
    // `offset <= utf16.len() * 3`, the capacity reserved at the top. Raw pointer
    // writes do not update the length, so it has to be set explicitly.
    unsafe {
        utf8_bytes.set_len(offset);
    }
    Ok(utf8_bytes)
}
diff --git a/rust/tests/tests/test_util.rs b/rust/tests/tests/test_util.rs
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use fury::to_utf8;
+
// Round-trips a string mixing 1-, 2- and 3-byte characters with a surrogate
// pair through `to_utf8` without any byte swapping.
#[test]
fn test_to_utf8() {
    let s = "Hé€lo, 世界!😀";
    let is_little_endian = false;
    let utf16_units: Vec<u16> = s.encode_utf16().collect();
    let utf8_bytes = to_utf8(&utf16_units, is_little_endian).unwrap();
    // The output must be valid UTF-8 and decode back to the original text.
    let final_string = String::from_utf8(utf8_bytes).unwrap();
    assert_eq!(s, final_string);
}
+
// Test helper: swaps the two bytes of a UTF-16 code unit.
fn swap_endian(value: u16) -> u16 {
    value.swap_bytes()
}
+
// Round-trips a string containing 3-byte UTF-8 characters, first with the
// code units as produced by `encode_utf16`, then with every unit byte-swapped
// and the swap flag enabled.
#[test]
fn test_to_utf8_3byte() {
    let s = "é₫l₪₮";
    let utf16_bytes: Vec<u16> = s.encode_utf16().collect();
    let utf8_bytes = to_utf8(&utf16_bytes, false).unwrap();
    assert_eq!(String::from_utf8(utf8_bytes).unwrap(), s);
    // Swap each unit up front so that `to_utf8` swapping it back restores it.
    let utf16_bytes_le: Vec<u16> = s.encode_utf16().map(swap_endian).collect();
    let utf8_bytes_le = to_utf8(&utf16_bytes_le, true).unwrap();
    assert_eq!(String::from_utf8(utf8_bytes_le).unwrap(), s);
}
+
// Checks that "ab" is produced both from byte-swapped code units (swap flag
// on) and from plain code units (swap flag off).
#[test]
fn test_to_utf8_endian() {
    let expected = b"ab";

    // 'ab' in UTF-16 little endian
    let swapped_units: [u16; 2] = [0x6100, 0x6200];
    let converted = to_utf8(&swapped_units, true).unwrap();
    assert_eq!(converted, expected, "Little endian test failed");

    // 'ab' in UTF-16 big endian
    let plain_units: [u16; 2] = [0x0061, 0x0062];
    let converted = to_utf8(&plain_units, false).unwrap();
    assert_eq!(converted, expected, "Big endian test failed");
}
+
// Round-trips a string made only of characters that need surrogate pairs,
// with and without byte swapping.
#[test]
fn test_to_utf8_surrogate_pair() {
    let s = "𝄞💡😀🎻";
    let utf16_bytes: Vec<u16> = s.encode_utf16().collect();
    // `unwrap` already fails the test on `Err`, so no separate `is_ok` check.
    let utf8_be = to_utf8(&utf16_bytes, false).unwrap();
    assert_eq!(String::from_utf8(utf8_be).unwrap(), s);
    // test little endian
    let utf16_bytes_le: Vec<u16> = s.encode_utf16().map(swap_endian).collect();
    let utf8_le = to_utf8(&utf16_bytes_le, true).unwrap();
    assert_eq!(String::from_utf8(utf8_le).unwrap(), s);
}
+
// Malformed surrogate sequences must be reported as errors with the
// matching message.
#[test]
fn test_to_utf8_missing_surrogate_pair() {
    // With the swap flag on, 0x00D8 is decoded as the high surrogate 0xD800,
    // and nothing follows it.
    let lone_high: [u16; 1] = [0x00D8];
    let outcome = to_utf8(&lone_high, true);
    assert!(outcome.is_err());
    assert_eq!(
        outcome.unwrap_err(),
        "Invalid UTF-16 string: missing surrogate pair"
    );

    // 0x00DA swaps to 0xDA00, which is a second high surrogate rather than
    // the required low surrogate.
    let bad_pair: [u16; 2] = [0x00D8, 0x00DA];
    let outcome = to_utf8(&bad_pair, true);
    assert!(outcome.is_err());
    assert_eq!(
        outcome.unwrap_err(),
        "Invalid UTF-16 string: wrong surrogate pair"
    );
}