From 219cdc78c5b83f2e84c75b6e61b637860ff239fe Mon Sep 17 00:00:00 2001 From: urlyy Date: Wed, 17 Jul 2024 22:26:31 +0800 Subject: [PATCH] feat(Rust): add meta string encoding algorithm in Rust (#1712) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What does this PR do? This PR implements the meta string encoding algorithm in Rust; all 13 tests pass. The code is in `rust/fury/src/meta/*` and `rust/tests/tests/test_meta_string.rs` ![Snipaste_2024-06-29_17-07-41](https://github.com/pandalee99/fury/assets/61675635/e00dce79-5763-4bdd-9a23-5f190199ed5c) I am not familiar with Rust, so I implemented this feature by referring to the Java implementation and with the help of TONGYI Lingma. Therefore, there might be some problems. Please don't review it immediately. As this version is essentially the same as the Java version, I have a question about the following statement: `But note that the meta string encoding algorithm is used for encode field name only, so the special charater can't be . or $, thus the implementation will be simpler`. Does this imply that I should remove the checks in the Rust version for whether a character is equal to `.` or `$`? In that case, I would also need to modify the test cases. ## Related issues [#1544](https://github.com/apache/fury/issues/1544) ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? 
## 本改动借助了通义灵码进行辅助编程 ![image](https://github.com/pandalee99/fury/assets/61675635/7fb84046-80b5-4c44-b1d1-4bc7665224b1) --- rust/fury/src/lib.rs | 2 + rust/fury/src/meta/meta_string.rs | 435 +++++++++++++++++++++++++++ rust/fury/src/meta/mod.rs | 19 ++ rust/tests/tests/test_meta_string.rs | 243 +++++++++++++++ 4 files changed, 699 insertions(+) create mode 100644 rust/fury/src/meta/meta_string.rs create mode 100644 rust/fury/src/meta/mod.rs create mode 100644 rust/tests/tests/test_meta_string.rs diff --git a/rust/fury/src/lib.rs b/rust/fury/src/lib.rs index 6a58e504e1..20a152705b 100644 --- a/rust/fury/src/lib.rs +++ b/rust/fury/src/lib.rs @@ -18,6 +18,7 @@ mod buffer; mod deserializer; mod error; +mod meta; mod row; mod serializer; mod types; @@ -25,6 +26,7 @@ mod types; pub use deserializer::from_buffer; pub use error::Error; pub use fury_derive::*; +pub use meta::{Encoding, MetaStringDecoder, MetaStringEncoder}; pub use row::{from_row, to_row}; pub use serializer::to_buffer; diff --git a/rust/fury/src/meta/meta_string.rs b/rust/fury/src/meta/meta_string.rs new file mode 100644 index 0000000000..3b46265e8d --- /dev/null +++ b/rust/fury/src/meta/meta_string.rs @@ -0,0 +1,435 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#[derive(Debug, PartialEq)] +pub enum Encoding { + Utf8 = 0x00, + LowerSpecial = 0x01, + LowerUpperDigitSpecial = 0x02, + FirstToLowerSpecial = 0x03, + AllToLowerSpecial = 0x04, +} + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("encoded_data cannot be empty")] + EncodedDataEmpty, + + #[error("Long meta string than 32767 is not allowed")] + LengthExceed, + + #[error("Non-ASCII characters in meta string are not allowed")] + OnlyAllowASCII, + + #[error("Unsupported character for LOWER_SPECIAL encoding: {ch:?}")] + UnsupportedLowerSpecialCharacter { ch: char }, + + #[error("Unsupported character for LOWER_UPPER_DIGIT_SPECIAL encoding: {ch:?}")] + UnsupportedLowerUpperDigitSpecialCharacter { ch: char }, + + #[error("Invalid character value for LOWER_SPECIAL decoding: {value:?}")] + InvalidLowerSpecialValue { value: u8 }, + + #[error("Invalid character value for LOWER_UPPER_DIGIT_SPECIAL decoding: {value:?}")] + InvalidLowerUpperDigitSpecialValue { value: u8 }, +} + +#[derive(Debug, PartialEq)] +pub struct MetaString { + pub original: String, + pub encoding: Encoding, + pub bytes: Vec, + pub strip_last_char: bool, +} + +impl MetaString { + pub fn new(original: String, encoding: Encoding, bytes: Vec) -> Result { + let mut strip_last_char = false; + if encoding != Encoding::Utf8 { + if bytes.is_empty() { + return Err(Error::EncodedDataEmpty); + } + strip_last_char = (bytes[0] & 0x80) != 0; + } + Ok(MetaString { + original, + encoding, + bytes, + strip_last_char, + }) + } +} + +pub struct MetaStringDecoder {} +impl Default for MetaStringDecoder { + fn default() -> Self { + Self::new() + } +} + +pub struct MetaStringEncoder {} +impl Default for MetaStringEncoder { + fn default() -> Self { + Self::new() + } +} + +#[derive(Debug)] +struct StringStatistics { + digit_count: usize, + upper_count: usize, + can_lower_upper_digit_special_encoded: bool, + 
can_lower_special_encoded: bool, +} + +impl MetaStringEncoder { + pub fn new() -> Self { + MetaStringEncoder {} + } + + fn is_latin(&self, s: &str) -> bool { + s.bytes().all(|b| b.is_ascii()) + } + + pub fn encode(&self, input: &str) -> Result { + if input.is_empty() { + return MetaString::new(input.to_string(), Encoding::Utf8, vec![]); + } + // equal to "std::i16::MAX" + const SHORT_MAX_VALUE: usize = 32767; + if input.len() >= SHORT_MAX_VALUE { + return Err(Error::LengthExceed); + } + if !self.is_latin(input) { + return MetaString::new(input.to_string(), Encoding::Utf8, input.as_bytes().to_vec()); + } + let encoding = self.compute_encoding(input); + self.encode_with_encoding(input, encoding) + } + + fn compute_encoding(&self, input: &str) -> Encoding { + let statistics = self.compute_statistics(input); + if statistics.can_lower_special_encoded { + return Encoding::LowerSpecial; + } + if statistics.can_lower_upper_digit_special_encoded { + if statistics.digit_count != 0 { + return Encoding::LowerUpperDigitSpecial; + } + let upper_count: usize = statistics.upper_count; + if upper_count == 1 && input.chars().next().unwrap().is_uppercase() { + return Encoding::FirstToLowerSpecial; + } + if ((input.len() + upper_count) * 5) < (input.len() * 6) { + return Encoding::AllToLowerSpecial; + } + return Encoding::LowerUpperDigitSpecial; + } + Encoding::Utf8 + } + + fn compute_statistics(&self, chars: &str) -> StringStatistics { + let mut can_lower_upper_digit_special_encoded = true; + let mut can_lower_special_encoded = true; + let mut digit_count = 0; + let mut upper_count = 0; + for c in chars.chars() { + if can_lower_upper_digit_special_encoded + && !(c.is_lowercase() + || c.is_uppercase() + || c.is_ascii_digit() + || c == '.' + || c == '_') + { + can_lower_upper_digit_special_encoded = false; + } + if can_lower_special_encoded + && !(c.is_lowercase() || matches!(c, '.' 
| '_' | '$' | '|')) + { + can_lower_special_encoded = false; + } + if c.is_ascii_digit() { + digit_count += 1; + } + if c.is_uppercase() { + upper_count += 1; + } + } + StringStatistics { + digit_count, + upper_count, + can_lower_upper_digit_special_encoded, + can_lower_special_encoded, + } + } + + pub fn encode_with_encoding( + &self, + input: &str, + encoding: Encoding, + ) -> Result { + if input.is_empty() { + return MetaString::new(input.to_string(), Encoding::Utf8, vec![]); + } + // equal to "std::i16::MAX" + const SHORT_MAX_VALUE: usize = 32767; + if input.len() >= SHORT_MAX_VALUE { + return Err(Error::LengthExceed); + } + if encoding != Encoding::Utf8 && !self.is_latin(input) { + return Err(Error::OnlyAllowASCII); + }; + if input.is_empty() { + return MetaString::new(input.to_string(), Encoding::Utf8, vec![]); + }; + + match encoding { + Encoding::LowerSpecial => { + let encoded_data = self.encode_lower_special(input)?; + MetaString::new(input.to_string(), encoding, encoded_data) + } + Encoding::LowerUpperDigitSpecial => { + let encoded_data = self.encode_lower_upper_digit_special(input)?; + MetaString::new(input.to_string(), encoding, encoded_data) + } + Encoding::FirstToLowerSpecial => { + let encoded_data = self.encode_first_to_lower_special(input)?; + MetaString::new(input.to_string(), encoding, encoded_data) + } + Encoding::AllToLowerSpecial => { + let upper_count = input.chars().filter(|c| c.is_uppercase()).count(); + let encoded_data = self.encode_all_to_lower_special(input, upper_count)?; + MetaString::new(input.to_string(), encoding, encoded_data) + } + Encoding::Utf8 => { + let encoded_data = input.as_bytes().to_vec(); + MetaString::new(input.to_string(), Encoding::Utf8, encoded_data) + } + } + } + + fn encode_generic(&self, input: &str, bits_per_char: u8) -> Result, Error> { + let total_bits: usize = input.len() * bits_per_char as usize + 1; + let byte_length: usize = (total_bits + 7) / 8; + let mut bytes = vec![0; byte_length]; + let mut 
current_bit = 1; + for c in input.chars() { + let value = self.char_to_value(c, bits_per_char)?; + for i in (0..bits_per_char).rev() { + if (value & (1 << i)) != 0 { + let byte_pos: usize = current_bit / 8; + let bit_pos: usize = current_bit % 8; + bytes[byte_pos] |= 1 << (7 - bit_pos); + } + current_bit += 1; + } + } + if byte_length * 8 >= total_bits + bits_per_char as usize { + bytes[0] |= 0x80; + } + Ok(bytes) + } + pub fn encode_lower_special(&self, input: &str) -> Result, Error> { + self.encode_generic(input, 5) + } + + pub fn encode_lower_upper_digit_special(&self, input: &str) -> Result, Error> { + self.encode_generic(input, 6) + } + + pub fn encode_first_to_lower_special(&self, input: &str) -> Result, Error> { + let mut chars: Vec = input.chars().collect(); + chars[0] = chars[0].to_lowercase().next().unwrap(); + self.encode_generic(&chars.iter().collect::(), 5) + } + + pub fn encode_all_to_lower_special( + &self, + input: &str, + upper_count: usize, + ) -> Result, Error> { + let mut new_chars = Vec::with_capacity(input.len() + upper_count); + for c in input.chars() { + if c.is_uppercase() { + new_chars.push('|'); + new_chars.push(c.to_lowercase().next().unwrap()); + } else { + new_chars.push(c); + } + } + self.encode_generic(&new_chars.iter().collect::(), 5) + } + + fn char_to_value(&self, c: char, bits_per_char: u8) -> Result { + match bits_per_char { + 5 => match c { + 'a'..='z' => Ok(c as u8 - b'a'), + '.' => Ok(26), + '_' => Ok(27), + '$' => Ok(28), + '|' => Ok(29), + _ => Err(Error::UnsupportedLowerSpecialCharacter { ch: c }), + }, + 6 => match c { + 'a'..='z' => Ok(c as u8 - b'a'), + 'A'..='Z' => Ok(c as u8 - b'A' + 26), + '0'..='9' => Ok(c as u8 - b'0' + 52), + _ => { + if c == '.' 
{ + Ok(62) + } else if c == '_' { + Ok(63) + } else { + Err(Error::UnsupportedLowerUpperDigitSpecialCharacter { ch: c }) + } + } + }, + _ => unreachable!(), + } + } +} + +impl MetaStringDecoder { + pub fn new() -> Self { + MetaStringDecoder {} + } + + pub fn decode(&self, encoded_data: &[u8], encoding: Encoding) -> Result { + if encoded_data.is_empty() { + return Ok("".to_string()); + } + match encoding { + Encoding::LowerSpecial => self.decode_lower_special(encoded_data), + Encoding::LowerUpperDigitSpecial => self.decode_lower_upper_digit_special(encoded_data), + Encoding::FirstToLowerSpecial => self.decode_rep_first_lower_special(encoded_data), + Encoding::AllToLowerSpecial => self.decode_rep_all_to_lower_special(encoded_data), + Encoding::Utf8 => Ok(String::from_utf8_lossy(encoded_data).into_owned()), + } + } + + fn decode_lower_special(&self, data: &[u8]) -> Result { + let mut decoded = String::new(); + let total_bits: usize = data.len() * 8; + let strip_last_char = (data[0] & 0x80) != 0; + let bit_mask: usize = 0b11111; + let mut bit_index = 1; + while bit_index + 5 <= total_bits && !(strip_last_char && (bit_index + 2 * 5 > total_bits)) + { + let byte_index = bit_index / 8; + let intra_byte_index = bit_index % 8; + let char_value: usize = if intra_byte_index > 3 { + ((data[byte_index] as usize) << 8 + | if byte_index + 1 < data.len() { + data.get(byte_index + 1).cloned().unwrap() as usize & 0xFF + } else { + 0 + }) + >> (11 - intra_byte_index) + & bit_mask + } else { + (data[byte_index] as usize) >> (3 - intra_byte_index) & bit_mask + }; + bit_index += 5; + decoded.push(self.decode_lower_special_char(char_value as u8)?); + } + Ok(decoded) + } + + fn decode_lower_upper_digit_special(&self, data: &[u8]) -> Result { + let mut decoded = String::new(); + let num_bits = data.len() * 8; + let strip_last_char = (data[0] & 0x80) != 0; + let mut bit_index = 1; + let bit_mask: usize = 0b111111; + while bit_index + 6 <= num_bits && !(strip_last_char && (bit_index + 2 * 6 
> num_bits)) { + let byte_index = bit_index / 8; + let intra_byte_index = bit_index % 8; + let char_value: usize = if intra_byte_index > 2 { + ((data[byte_index] as usize) << 8 + | if byte_index + 1 < data.len() { + data.get(byte_index + 1).cloned().unwrap() as usize & 0xFF + } else { + 0 + }) + >> (10 - intra_byte_index) + & bit_mask + } else { + (data[byte_index] as usize) >> (2 - intra_byte_index) & bit_mask + }; + bit_index += 6; + decoded.push(self.decode_lower_upper_digit_special_char(char_value as u8)?); + } + Ok(decoded) + } + + fn decode_lower_special_char(&self, char_value: u8) -> Result { + match char_value { + 0..=25 => Ok((b'a' + char_value) as char), + 26 => Ok('.'), + 27 => Ok('_'), + 28 => Ok('$'), + 29 => Ok('|'), + _ => Err(Error::InvalidLowerSpecialValue { value: char_value }), + } + } + + fn decode_lower_upper_digit_special_char(&self, char_value: u8) -> Result { + match char_value { + 0..=25 => Ok((b'a' + char_value) as char), + 26..=51 => Ok((b'A' + char_value - 26) as char), + 52..=61 => Ok((b'0' + char_value - 52) as char), + 62 => Ok('.'), + 63 => Ok('_'), + _ => Err(Error::InvalidLowerUpperDigitSpecialValue { value: char_value }), + } + } + + fn decode_rep_first_lower_special(&self, data: &[u8]) -> Result { + let decoded_str = self.decode_lower_special(data)?; + let mut chars = decoded_str.chars(); + match chars.next() { + Some(first_char) => { + let mut result = first_char.to_ascii_uppercase().to_string(); + result.extend(chars); + Ok(result) + } + None => Ok(decoded_str), + } + } + fn decode_rep_all_to_lower_special(&self, data: &[u8]) -> Result { + let decoded_str = self.decode_lower_special(data)?; + let mut result = String::new(); + let mut skip = false; + for (i, char) in decoded_str.chars().enumerate() { + if skip { + skip = false; + continue; + } + // Encounter a '|', capitalize the next character + // and skip the following character. 
+ if char == '|' { + if let Some(next_char) = decoded_str.chars().nth(i + 1) { + result.push(next_char.to_ascii_uppercase()); + } + skip = true; + } else { + result.push(char); + } + } + Ok(result) + } +} diff --git a/rust/fury/src/meta/mod.rs b/rust/fury/src/meta/mod.rs new file mode 100644 index 0000000000..4e4d40b29a --- /dev/null +++ b/rust/fury/src/meta/mod.rs @@ -0,0 +1,19 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod meta_string; +pub use meta_string::{Encoding, MetaStringDecoder, MetaStringEncoder}; diff --git a/rust/tests/tests/test_meta_string.rs b/rust/tests/tests/test_meta_string.rs new file mode 100644 index 0000000000..a1065f39be --- /dev/null +++ b/rust/tests/tests/test_meta_string.rs @@ -0,0 +1,243 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::iter; + +use fury::{Encoding, MetaStringDecoder, MetaStringEncoder}; + +#[test] +fn test_encode_meta_string_lower_special() { + let encoder = MetaStringEncoder::new(); + let bytes1 = encoder.encode_lower_special("abc_def").unwrap(); + assert_eq!(bytes1.len(), 5); + let bytes2 = encoder + .encode("org.apache.fury.benchmark.data") + .unwrap() + .bytes; + assert_eq!(bytes2.len(), 19); + let bytes3 = encoder.encode("MediaContent").unwrap().bytes; + assert_eq!(bytes3.len(), 9); + let decoder = MetaStringDecoder::new(); + assert_eq!( + decoder.decode(&bytes1, Encoding::LowerSpecial).unwrap(), + "abc_def" + ); + for i in 0..128 { + let origin_string: String = iter::repeat_with(|| { + let char_a = b'a'; + ((char_a + i as u8) % 26 + char_a) as char + }) + .take(i) + .collect(); + let encoded = encoder.encode_lower_special(&origin_string).unwrap(); + let decoded = decoder.decode(&encoded, Encoding::LowerSpecial).unwrap(); + assert_eq!(decoded, origin_string); + } +} + +fn create_string(length: usize) -> String { + (0..length) + .map(|j| { + let n = j % 64; + match n { + 0..=25 => (b'a' + n as u8) as char, + 26..=51 => (b'A' + (n - 26) as u8) as char, + 52..=61 => (b'0' + (n - 52) as u8) as char, + 62 => '.', + 63 => '_', + _ => unreachable!(), + } + }) + .collect() +} + +#[test] +fn test_encode_meta_string_lower_upper_digit_special() { + let encoder = MetaStringEncoder::new(); + let encoded = encoder + .encode_lower_upper_digit_special("ExampleInput123") + .unwrap(); + assert_eq!(encoded.len(), 12); + + let decoder = MetaStringDecoder::new(); + 
let decoded = decoder + .decode(&encoded, Encoding::LowerUpperDigitSpecial) + .unwrap(); + assert_eq!(decoded, "ExampleInput123"); + + for i in 1..128 { + let origin_string = create_string(i); + let encoded = encoder + .encode_lower_upper_digit_special(&origin_string) + .unwrap(); + let decoded = decoder + .decode(&encoded, Encoding::LowerUpperDigitSpecial) + .unwrap(); + assert_eq!(decoded, origin_string); + } +} + +#[test] +fn test_meta_string() { + let encoder = MetaStringEncoder::new(); + + for i in 1..=127 { + let origin_string = create_string(i); + + let meta_string = encoder.encode(&origin_string).unwrap(); + assert_ne!(meta_string.encoding, Encoding::Utf8); + assert_eq!(meta_string.original, origin_string); + + let decoder = MetaStringDecoder::new(); + let new_string = decoder + .decode(&meta_string.bytes, meta_string.encoding) + .unwrap(); + assert_eq!(new_string, origin_string); + } +} + +#[test] +fn test_encode_empty_string() { + let encoder = MetaStringEncoder::new(); + let decoder = MetaStringDecoder::new(); + for encoding in [ + Encoding::LowerSpecial, + Encoding::LowerUpperDigitSpecial, + Encoding::FirstToLowerSpecial, + Encoding::AllToLowerSpecial, + Encoding::Utf8, + ] { + let meta_string = encoder.encode_with_encoding("", encoding).unwrap(); + assert_eq!(meta_string.bytes.len(), 0); + let decoded = decoder + .decode(&meta_string.bytes, meta_string.encoding) + .unwrap(); + assert_eq!(decoded, ""); + } +} + +#[test] +fn test_encode_characters_outside_of_lower_special() { + let encoder = MetaStringEncoder::new(); + let test_string = "abcdefABCDEF1234!@#"; + let meta_string = encoder.encode(test_string).unwrap(); + assert_eq!(meta_string.encoding, Encoding::Utf8); +} + +#[test] +fn test_all_to_upper_special_encoding() { + let encoder = MetaStringEncoder::new(); + let decoder = MetaStringDecoder::new(); + let test_string = "ABC_DEF"; + let meta_string = encoder.encode(test_string).unwrap(); + assert_eq!(meta_string.encoding, 
Encoding::LowerUpperDigitSpecial); + let decoded_string = decoder + .decode(&meta_string.bytes, meta_string.encoding) + .unwrap(); + assert_eq!(decoded_string, test_string); +} + +#[test] +fn test_first_to_lower_special_encoding() { + let encoder = MetaStringEncoder::new(); + let decoder = MetaStringDecoder::new(); + let test_string = "Aabcdef"; + let meta_string = encoder.encode(test_string).unwrap(); + assert_eq!(meta_string.encoding, Encoding::FirstToLowerSpecial); + let decoded_string = decoder + .decode(&meta_string.bytes, meta_string.encoding) + .unwrap(); + assert_eq!(decoded_string, test_string); +} + +#[test] +fn test_utf8_encoding() { + let encoder = MetaStringEncoder::new(); + let test_string = "你好,世界"; + let meta_string = encoder.encode(test_string).unwrap(); + assert_eq!(meta_string.encoding, Encoding::Utf8); + let decoder = MetaStringDecoder::new(); + let decoded_string = decoder + .decode(&meta_string.bytes, meta_string.encoding) + .unwrap(); + assert_eq!(decoded_string, test_string); + let test_string = "aA$"; + let meta_string = encoder.encode(test_string).unwrap(); + assert_eq!(meta_string.encoding, Encoding::Utf8); + let decoder = MetaStringDecoder::new(); + let decoded_string = decoder + .decode(&meta_string.bytes, meta_string.encoding) + .unwrap(); + assert_eq!(decoded_string, test_string); +} + +#[test] +fn test_strip_last_char() { + let encoder = MetaStringEncoder::new(); + let test_string = "abc"; + let encoded_meta_string = encoder.encode(test_string).unwrap(); + assert!(!encoded_meta_string.strip_last_char); + + let test_string = "abcde"; + let encoded_meta_string = encoder.encode(test_string).unwrap(); + assert!(encoded_meta_string.strip_last_char); +} + +#[test] +fn test_empty_string() { + let encoder = MetaStringEncoder::new(); + let decoder = MetaStringDecoder::new(); + let meta_string = encoder.encode("").unwrap(); + assert!(meta_string.bytes.is_empty()); + let decoded = decoder + .decode(&meta_string.bytes, meta_string.encoding) + 
.unwrap(); + assert_eq!(decoded, ""); +} + +#[test] +fn test_ascii_encoding() { + let encoder = MetaStringEncoder::new(); + let test_string = "asciiOnly"; + let encoded_meta_string = encoder.encode(test_string).unwrap(); + assert_ne!(encoded_meta_string.encoding, Encoding::Utf8); + assert_eq!(encoded_meta_string.encoding, Encoding::AllToLowerSpecial); +} + +#[test] +fn test_non_ascii_encoding() { + let encoder = MetaStringEncoder::new(); + let test_string = "こんにちは"; + let encoded_meta_string = encoder.encode(test_string).unwrap(); + assert_eq!(encoded_meta_string.encoding, Encoding::Utf8); +} + +#[test] +fn test_non_ascii_encoding_and_non_utf8() { + let encoder = MetaStringEncoder::new(); + let non_ascii_string = "こんにちは"; + + match encoder.encode_with_encoding(non_ascii_string, Encoding::LowerSpecial) { + Err(err) => { + assert_eq!( + err.to_string(), + "Non-ASCII characters in meta string are not allowed" + ); + } + Ok(_) => panic!("Expected an error due to non-ASCII character with non-UTF8 encoding"), + } +}