diff --git a/rust/fury/src/lib.rs b/rust/fury/src/lib.rs index 6a58e504e1..20a152705b 100644 --- a/rust/fury/src/lib.rs +++ b/rust/fury/src/lib.rs @@ -18,6 +18,7 @@ mod buffer; mod deserializer; mod error; +mod meta; mod row; mod serializer; mod types; @@ -25,6 +26,7 @@ mod types; pub use deserializer::from_buffer; pub use error::Error; pub use fury_derive::*; +pub use meta::{Encoding, MetaStringDecoder, MetaStringEncoder}; pub use row::{from_row, to_row}; pub use serializer::to_buffer; diff --git a/rust/fury/src/meta/meta_string.rs b/rust/fury/src/meta/meta_string.rs new file mode 100644 index 0000000000..3b46265e8d --- /dev/null +++ b/rust/fury/src/meta/meta_string.rs @@ -0,0 +1,435 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[derive(Debug, PartialEq)] +pub enum Encoding { + Utf8 = 0x00, + LowerSpecial = 0x01, + LowerUpperDigitSpecial = 0x02, + FirstToLowerSpecial = 0x03, + AllToLowerSpecial = 0x04, +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("encoded_data cannot be empty")] + EncodedDataEmpty, + + #[error("Long meta string than 32767 is not allowed")] + LengthExceed, + + #[error("Non-ASCII characters in meta string are not allowed")] + OnlyAllowASCII, + + #[error("Unsupported character for LOWER_SPECIAL encoding: {ch:?}")] + UnsupportedLowerSpecialCharacter { ch: char }, + + #[error("Unsupported character for LOWER_UPPER_DIGIT_SPECIAL encoding: {ch:?}")] + UnsupportedLowerUpperDigitSpecialCharacter { ch: char }, + + #[error("Invalid character value for LOWER_SPECIAL decoding: {value:?}")] + InvalidLowerSpecialValue { value: u8 }, + + #[error("Invalid character value for LOWER_UPPER_DIGIT_SPECIAL decoding: {value:?}")] + InvalidLowerUpperDigitSpecialValue { value: u8 }, +} + +#[derive(Debug, PartialEq)] +pub struct MetaString { + pub original: String, + pub encoding: Encoding, + pub bytes: Vec, + pub strip_last_char: bool, +} + +impl MetaString { + pub fn new(original: String, encoding: Encoding, bytes: Vec) -> Result { + let mut strip_last_char = false; + if encoding != Encoding::Utf8 { + if bytes.is_empty() { + return Err(Error::EncodedDataEmpty); + } + strip_last_char = (bytes[0] & 0x80) != 0; + } + Ok(MetaString { + original, + encoding, + bytes, + strip_last_char, + }) + } +} + +pub struct MetaStringDecoder {} +impl Default for MetaStringDecoder { + fn default() -> Self { + Self::new() + } +} + +pub struct MetaStringEncoder {} +impl Default for MetaStringEncoder { + fn default() -> Self { + Self::new() + } +} + +#[derive(Debug)] +struct StringStatistics { + digit_count: usize, + upper_count: usize, + can_lower_upper_digit_special_encoded: bool, + can_lower_special_encoded: bool, +} + +impl MetaStringEncoder { + pub fn new() -> Self { + MetaStringEncoder {} + } + + fn is_latin(&self, s: &str) -> bool { + s.bytes().all(|b| b.is_ascii()) + } + + pub fn encode(&self, input: &str) -> Result { + if input.is_empty() { + return MetaString::new(input.to_string(), Encoding::Utf8, vec![]); + } + // equal to "std::i16::MAX" + const SHORT_MAX_VALUE: usize = 32767; + if input.len() >= SHORT_MAX_VALUE { + return Err(Error::LengthExceed); + } + if !self.is_latin(input) { + return MetaString::new(input.to_string(), Encoding::Utf8, input.as_bytes().to_vec()); + } + let encoding = self.compute_encoding(input); + self.encode_with_encoding(input, encoding) + } + + fn compute_encoding(&self, input: &str) -> Encoding { + let statistics = self.compute_statistics(input); + if statistics.can_lower_special_encoded { + return Encoding::LowerSpecial; + } + if statistics.can_lower_upper_digit_special_encoded { + if statistics.digit_count != 0 { + return Encoding::LowerUpperDigitSpecial; + } + let upper_count: usize = statistics.upper_count; + if upper_count == 1 && input.chars().next().unwrap().is_uppercase() { + return Encoding::FirstToLowerSpecial; + } + if ((input.len() + upper_count) * 5) < (input.len() * 6) { + return Encoding::AllToLowerSpecial; + } + return Encoding::LowerUpperDigitSpecial; + } + Encoding::Utf8 + } + + fn compute_statistics(&self, chars: &str) -> StringStatistics { + let mut can_lower_upper_digit_special_encoded = true; + let mut can_lower_special_encoded = true; + let mut digit_count = 0; + let mut upper_count = 0; + for c in chars.chars() { + if can_lower_upper_digit_special_encoded + && !(c.is_lowercase() + || c.is_uppercase() + || c.is_ascii_digit() + || c == '.' + || c == '_') + { + can_lower_upper_digit_special_encoded = false; + } + if can_lower_special_encoded + && !(c.is_lowercase() || matches!(c, '.' | '_' | '$' | '|')) + { + can_lower_special_encoded = false; + } + if c.is_ascii_digit() { + digit_count += 1; + } + if c.is_uppercase() { + upper_count += 1; + } + } + StringStatistics { + digit_count, + upper_count, + can_lower_upper_digit_special_encoded, + can_lower_special_encoded, + } + } + + pub fn encode_with_encoding( + &self, + input: &str, + encoding: Encoding, + ) -> Result { + if input.is_empty() { + return MetaString::new(input.to_string(), Encoding::Utf8, vec![]); + } + // equal to "std::i16::MAX" + const SHORT_MAX_VALUE: usize = 32767; + if input.len() >= SHORT_MAX_VALUE { + return Err(Error::LengthExceed); + } + if encoding != Encoding::Utf8 && !self.is_latin(input) { + return Err(Error::OnlyAllowASCII); + }; + if input.is_empty() { + return MetaString::new(input.to_string(), Encoding::Utf8, vec![]); + }; + + match encoding { + Encoding::LowerSpecial => { + let encoded_data = self.encode_lower_special(input)?; + MetaString::new(input.to_string(), encoding, encoded_data) + } + Encoding::LowerUpperDigitSpecial => { + let encoded_data = self.encode_lower_upper_digit_special(input)?; + MetaString::new(input.to_string(), encoding, encoded_data) + } + Encoding::FirstToLowerSpecial => { + let encoded_data = self.encode_first_to_lower_special(input)?; + MetaString::new(input.to_string(), encoding, encoded_data) + } + Encoding::AllToLowerSpecial => { + let upper_count = input.chars().filter(|c| c.is_uppercase()).count(); + let encoded_data = self.encode_all_to_lower_special(input, upper_count)?; + MetaString::new(input.to_string(), encoding, encoded_data) + } + Encoding::Utf8 => { + let encoded_data = input.as_bytes().to_vec(); + MetaString::new(input.to_string(), Encoding::Utf8, encoded_data) + } + } + } + + fn encode_generic(&self, input: &str, bits_per_char: u8) -> Result, Error> { + let total_bits: usize = input.len() * bits_per_char as usize + 1; + let byte_length: usize = (total_bits + 7) / 8; + let mut bytes = vec![0; byte_length]; + let mut current_bit = 1; + for c in input.chars() { + let value = self.char_to_value(c, bits_per_char)?; + for i in (0..bits_per_char).rev() { + if (value & (1 << i)) != 0 { + let byte_pos: usize = current_bit / 8; + let bit_pos: usize = current_bit % 8; + bytes[byte_pos] |= 1 << (7 - bit_pos); + } + current_bit += 1; + } + } + if byte_length * 8 >= total_bits + bits_per_char as usize { + bytes[0] |= 0x80; + } + Ok(bytes) + } + pub fn encode_lower_special(&self, input: &str) -> Result, Error> { + self.encode_generic(input, 5) + } + + pub fn encode_lower_upper_digit_special(&self, input: &str) -> Result, Error> { + self.encode_generic(input, 6) + } + + pub fn encode_first_to_lower_special(&self, input: &str) -> Result, Error> { + let mut chars: Vec = input.chars().collect(); + chars[0] = chars[0].to_lowercase().next().unwrap(); + self.encode_generic(&chars.iter().collect::(), 5) + } + + pub fn encode_all_to_lower_special( + &self, + input: &str, + upper_count: usize, + ) -> Result, Error> { + let mut new_chars = Vec::with_capacity(input.len() + upper_count); + for c in input.chars() { + if c.is_uppercase() { + new_chars.push('|'); + new_chars.push(c.to_lowercase().next().unwrap()); + } else { + new_chars.push(c); + } + } + self.encode_generic(&new_chars.iter().collect::(), 5) + } + + fn char_to_value(&self, c: char, bits_per_char: u8) -> Result { + match bits_per_char { + 5 => match c { + 'a'..='z' => Ok(c as u8 - b'a'), + '.' => Ok(26), + '_' => Ok(27), + '$' => Ok(28), + '|' => Ok(29), + _ => Err(Error::UnsupportedLowerSpecialCharacter { ch: c }), + }, + 6 => match c { + 'a'..='z' => Ok(c as u8 - b'a'), + 'A'..='Z' => Ok(c as u8 - b'A' + 26), + '0'..='9' => Ok(c as u8 - b'0' + 52), + _ => { + if c == '.' { + Ok(62) + } else if c == '_' { + Ok(63) + } else { + Err(Error::UnsupportedLowerUpperDigitSpecialCharacter { ch: c }) + } + } + }, + _ => unreachable!(), + } + } +} + +impl MetaStringDecoder { + pub fn new() -> Self { + MetaStringDecoder {} + } + + pub fn decode(&self, encoded_data: &[u8], encoding: Encoding) -> Result { + if encoded_data.is_empty() { + return Ok("".to_string()); + } + match encoding { + Encoding::LowerSpecial => self.decode_lower_special(encoded_data), + Encoding::LowerUpperDigitSpecial => self.decode_lower_upper_digit_special(encoded_data), + Encoding::FirstToLowerSpecial => self.decode_rep_first_lower_special(encoded_data), + Encoding::AllToLowerSpecial => self.decode_rep_all_to_lower_special(encoded_data), + Encoding::Utf8 => Ok(String::from_utf8_lossy(encoded_data).into_owned()), + } + } + + fn decode_lower_special(&self, data: &[u8]) -> Result { + let mut decoded = String::new(); + let total_bits: usize = data.len() * 8; + let strip_last_char = (data[0] & 0x80) != 0; + let bit_mask: usize = 0b11111; + let mut bit_index = 1; + while bit_index + 5 <= total_bits && !(strip_last_char && (bit_index + 2 * 5 > total_bits)) + { + let byte_index = bit_index / 8; + let intra_byte_index = bit_index % 8; + let char_value: usize = if intra_byte_index > 3 { + ((data[byte_index] as usize) << 8 + | if byte_index + 1 < data.len() { + data.get(byte_index + 1).cloned().unwrap() as usize & 0xFF + } else { + 0 + }) + >> (11 - intra_byte_index) + & bit_mask + } else { + (data[byte_index] as usize) >> (3 - intra_byte_index) & bit_mask + }; + bit_index += 5; + decoded.push(self.decode_lower_special_char(char_value as u8)?); + } + Ok(decoded) + } + + fn decode_lower_upper_digit_special(&self, data: &[u8]) -> Result { + let mut decoded = String::new(); + let num_bits = data.len() * 8; + let strip_last_char = (data[0] & 0x80) != 0; + let mut bit_index = 1; + let bit_mask: usize = 0b111111; + while bit_index + 6 <= num_bits && !(strip_last_char && (bit_index + 2 * 6 > num_bits)) { + let byte_index = bit_index / 8; + let intra_byte_index = bit_index % 8; + let char_value: usize = if intra_byte_index > 2 { + ((data[byte_index] as usize) << 8 + | if byte_index + 1 < data.len() { + data.get(byte_index + 1).cloned().unwrap() as usize & 0xFF + } else { + 0 + }) + >> (10 - intra_byte_index) + & bit_mask + } else { + (data[byte_index] as usize) >> (2 - intra_byte_index) & bit_mask + }; + bit_index += 6; + decoded.push(self.decode_lower_upper_digit_special_char(char_value as u8)?); + } + Ok(decoded) + } + + fn decode_lower_special_char(&self, char_value: u8) -> Result { + match char_value { + 0..=25 => Ok((b'a' + char_value) as char), + 26 => Ok('.'), + 27 => Ok('_'), + 28 => Ok('$'), + 29 => Ok('|'), + _ => Err(Error::InvalidLowerSpecialValue { value: char_value }), + } + } + + fn decode_lower_upper_digit_special_char(&self, char_value: u8) -> Result { + match char_value { + 0..=25 => Ok((b'a' + char_value) as char), + 26..=51 => Ok((b'A' + char_value - 26) as char), + 52..=61 => Ok((b'0' + char_value - 52) as char), + 62 => Ok('.'), + 63 => Ok('_'), + _ => Err(Error::InvalidLowerUpperDigitSpecialValue { value: char_value }), + } + } + + fn decode_rep_first_lower_special(&self, data: &[u8]) -> Result { + let decoded_str = self.decode_lower_special(data)?; + let mut chars = decoded_str.chars(); + match chars.next() { + Some(first_char) => { + let mut result = first_char.to_ascii_uppercase().to_string(); + result.extend(chars); + Ok(result) + } + None => Ok(decoded_str), + } + } + fn decode_rep_all_to_lower_special(&self, data: &[u8]) -> Result { + let decoded_str = self.decode_lower_special(data)?; + let mut result = String::new(); + let mut skip = false; + for (i, char) in decoded_str.chars().enumerate() { + if skip { + skip = false; + continue; + } + // Encounter a '|', capitalize the next character + // and skip the following character. + if char == '|' { + if let Some(next_char) = decoded_str.chars().nth(i + 1) { + result.push(next_char.to_ascii_uppercase()); + } + skip = true; + } else { + result.push(char); + } + } + Ok(result) + } +} diff --git a/rust/fury/src/meta/mod.rs b/rust/fury/src/meta/mod.rs new file mode 100644 index 0000000000..4e4d40b29a --- /dev/null +++ b/rust/fury/src/meta/mod.rs @@ -0,0 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod meta_string; +pub use meta_string::{Encoding, MetaStringDecoder, MetaStringEncoder}; diff --git a/rust/tests/tests/test_meta_string.rs b/rust/tests/tests/test_meta_string.rs new file mode 100644 index 0000000000..a1065f39be --- /dev/null +++ b/rust/tests/tests/test_meta_string.rs @@ -0,0 +1,243 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::iter; + +use fury::{Encoding, MetaStringDecoder, MetaStringEncoder}; + +#[test] +fn test_encode_meta_string_lower_special() { + let encoder = MetaStringEncoder::new(); + let bytes1 = encoder.encode_lower_special("abc_def").unwrap(); + assert_eq!(bytes1.len(), 5); + let bytes2 = encoder + .encode("org.apache.fury.benchmark.data") + .unwrap() + .bytes; + assert_eq!(bytes2.len(), 19); + let bytes3 = encoder.encode("MediaContent").unwrap().bytes; + assert_eq!(bytes3.len(), 9); + let decoder = MetaStringDecoder::new(); + assert_eq!( + decoder.decode(&bytes1, Encoding::LowerSpecial).unwrap(), + "abc_def" + ); + for i in 0..128 { + let origin_string: String = iter::repeat_with(|| { + let char_a = b'a'; + ((char_a + i as u8) % 26 + char_a) as char + }) + .take(i) + .collect(); + let encoded = encoder.encode_lower_special(&origin_string).unwrap(); + let decoded = decoder.decode(&encoded, Encoding::LowerSpecial).unwrap(); + assert_eq!(decoded, origin_string); + } +} + +fn create_string(length: usize) -> String { + (0..length) + .map(|j| { + let n = j % 64; + match n { + 0..=25 => (b'a' + n as u8) as char, + 26..=51 => (b'A' + (n - 26) as u8) as char, + 52..=61 => (b'0' + (n - 52) as u8) as char, + 62 => '.', + 63 => '_', + _ => unreachable!(), + } + }) + .collect() +} + +#[test] +fn test_encode_meta_string_lower_upper_digit_special() { + let encoder = MetaStringEncoder::new(); + let encoded = encoder + .encode_lower_upper_digit_special("ExampleInput123") + .unwrap(); + assert_eq!(encoded.len(), 12); + + let decoder = MetaStringDecoder::new(); + let decoded = decoder + .decode(&encoded, Encoding::LowerUpperDigitSpecial) + .unwrap(); + assert_eq!(decoded, "ExampleInput123"); + + for i in 1..128 { + let origin_string = create_string(i); + let encoded = encoder + .encode_lower_upper_digit_special(&origin_string) + .unwrap(); + let decoded = decoder + .decode(&encoded, Encoding::LowerUpperDigitSpecial) + .unwrap(); + assert_eq!(decoded, origin_string); + } +} + +#[test] +fn test_meta_string() { + let encoder = MetaStringEncoder::new(); + + for i in 1..=127 { + let origin_string = create_string(i); + + let meta_string = encoder.encode(&origin_string).unwrap(); + assert_ne!(meta_string.encoding, Encoding::Utf8); + assert_eq!(meta_string.original, origin_string); + + let decoder = MetaStringDecoder::new(); + let new_string = decoder + .decode(&meta_string.bytes, meta_string.encoding) + .unwrap(); + assert_eq!(new_string, origin_string); + } +} + +#[test] +fn test_encode_empty_string() { + let encoder = MetaStringEncoder::new(); + let decoder = MetaStringDecoder::new(); + for encoding in [ + Encoding::LowerSpecial, + Encoding::LowerUpperDigitSpecial, + Encoding::FirstToLowerSpecial, + Encoding::AllToLowerSpecial, + Encoding::Utf8, + ] { + let meta_string = encoder.encode_with_encoding("", encoding).unwrap(); + assert_eq!(meta_string.bytes.len(), 0); + let decoded = decoder + .decode(&meta_string.bytes, meta_string.encoding) + .unwrap(); + assert_eq!(decoded, ""); + } +} + +#[test] +fn test_encode_characters_outside_of_lower_special() { + let encoder = MetaStringEncoder::new(); + let test_string = "abcdefABCDEF1234!@#"; + let meta_string = encoder.encode(test_string).unwrap(); + assert_eq!(meta_string.encoding, Encoding::Utf8); +} + +#[test] +fn test_all_to_upper_special_encoding() { + let encoder = MetaStringEncoder::new(); + let decoder = MetaStringDecoder::new(); + let test_string = "ABC_DEF"; + let meta_string = encoder.encode(test_string).unwrap(); + assert_eq!(meta_string.encoding, Encoding::LowerUpperDigitSpecial); + let decoded_string = decoder + .decode(&meta_string.bytes, meta_string.encoding) + .unwrap(); + assert_eq!(decoded_string, test_string); +} + +#[test] +fn test_first_to_lower_special_encoding() { + let encoder = MetaStringEncoder::new(); + let decoder = MetaStringDecoder::new(); + let test_string = "Aabcdef"; + let meta_string = encoder.encode(test_string).unwrap(); + assert_eq!(meta_string.encoding, Encoding::FirstToLowerSpecial); + let decoded_string = decoder + .decode(&meta_string.bytes, meta_string.encoding) + .unwrap(); + assert_eq!(decoded_string, test_string); +} + +#[test] +fn test_utf8_encoding() { + let encoder = MetaStringEncoder::new(); + let test_string = "你好,世界"; + let meta_string = encoder.encode(test_string).unwrap(); + assert_eq!(meta_string.encoding, Encoding::Utf8); + let decoder = MetaStringDecoder::new(); + let decoded_string = decoder + .decode(&meta_string.bytes, meta_string.encoding) + .unwrap(); + assert_eq!(decoded_string, test_string); + let test_string = "aA$"; + let meta_string = encoder.encode(test_string).unwrap(); + assert_eq!(meta_string.encoding, Encoding::Utf8); + let decoder = MetaStringDecoder::new(); + let decoded_string = decoder + .decode(&meta_string.bytes, meta_string.encoding) + .unwrap(); + assert_eq!(decoded_string, test_string); +} + +#[test] +fn test_strip_last_char() { + let encoder = MetaStringEncoder::new(); + let test_string = "abc"; + let encoded_meta_string = encoder.encode(test_string).unwrap(); + assert!(!encoded_meta_string.strip_last_char); + + let test_string = "abcde"; + let encoded_meta_string = encoder.encode(test_string).unwrap(); + assert!(encoded_meta_string.strip_last_char); +} + +#[test] +fn test_empty_string() { + let encoder = MetaStringEncoder::new(); + let decoder = MetaStringDecoder::new(); + let meta_string = encoder.encode("").unwrap(); + assert!(meta_string.bytes.is_empty()); + let decoded = decoder + .decode(&meta_string.bytes, meta_string.encoding) + .unwrap(); + assert_eq!(decoded, ""); +} + +#[test] +fn test_ascii_encoding() { + let encoder = MetaStringEncoder::new(); + let test_string = "asciiOnly"; + let encoded_meta_string = encoder.encode(test_string).unwrap(); + assert_ne!(encoded_meta_string.encoding, Encoding::Utf8); + assert_eq!(encoded_meta_string.encoding, Encoding::AllToLowerSpecial); +} + +#[test] +fn test_non_ascii_encoding() { + let encoder = MetaStringEncoder::new(); + let test_string = "こんにちは"; + let encoded_meta_string = encoder.encode(test_string).unwrap(); + assert_eq!(encoded_meta_string.encoding, Encoding::Utf8); +} + +#[test] +fn test_non_ascii_encoding_and_non_utf8() { + let encoder = MetaStringEncoder::new(); + let non_ascii_string = "こんにちは"; + + match encoder.encode_with_encoding(non_ascii_string, Encoding::LowerSpecial) { + Err(err) => { + assert_eq!( + err.to_string(), + "Non-ASCII characters in meta string are not allowed" + ); + } + Ok(_) => panic!("Expected an error due to non-ASCII character with non-UTF8 encoding"), + } +}