From 09fda94ab7f476da4dc3b6752825b7249bda6ac2 Mon Sep 17 00:00:00 2001 From: Anagh Mehran Date: Sat, 29 Jun 2024 06:59:56 -0400 Subject: [PATCH] refactor(java): move latin language checker method from string serializer to string util (#1708) ## What does this PR do? This PR decouples and moves the `isLatin([])` method from `StringSerializer` class to `StringUtils`. ## Related issues #1703 ## Does this PR introduce any user-facing change? - [ ] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark --------- Co-authored-by: Shawn Yang --- .../fury/benchmark/CompressStringSuite.java | 3 +- .../apache/fury/meta/MetaStringEncoder.java | 6 +- .../apache/fury/serializer/Serializers.java | 3 +- .../fury/serializer/StringSerializer.java | 42 +-------- .../org/apache/fury/util/StringUtils.java | 42 +++++++++ .../fury-core/native-image.properties | 3 +- .../fury/serializer/StringSerializerTest.java | 83 ------------------ .../org/apache/fury/util/StringUtilsTest.java | 85 ++++++++++++++++++- 8 files changed, 137 insertions(+), 130 deletions(-) diff --git a/java/benchmark/src/main/java/org/apache/fury/benchmark/CompressStringSuite.java b/java/benchmark/src/main/java/org/apache/fury/benchmark/CompressStringSuite.java index 63979c11b1..bc09fa2073 100644 --- a/java/benchmark/src/main/java/org/apache/fury/benchmark/CompressStringSuite.java +++ b/java/benchmark/src/main/java/org/apache/fury/benchmark/CompressStringSuite.java @@ -22,7 +22,6 @@ import java.nio.ByteBuffer; import org.apache.fury.memory.MemoryBuffer; import org.apache.fury.memory.Platform; -import org.apache.fury.serializer.StringSerializer; import org.apache.fury.util.StringUtils; import org.openjdk.jmh.Main; import org.openjdk.jmh.annotations.Benchmark; @@ -102,7 +101,7 @@ public Object latinScalarCheck() { @Benchmark public Object latinSuperWordCheck() { - return StringSerializer.isLatin(latinStrChars); + return StringUtils.isLatin(latinStrChars); } public static void main(String[] args) throws Exception { diff --git a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java index b6a0a58b44..90298e8e41 100644 --- a/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java +++ b/java/fury-core/src/main/java/org/apache/fury/meta/MetaStringEncoder.java @@ -23,8 +23,8 @@ import java.util.HashSet; import org.apache.fury.collection.Collections; import org.apache.fury.meta.MetaString.Encoding; -import org.apache.fury.serializer.StringSerializer; import org.apache.fury.util.Preconditions; +import org.apache.fury.util.StringUtils; /** Encodes plain text strings into MetaString objects with specified encoding mechanisms. */ public class MetaStringEncoder { @@ -57,7 +57,7 @@ public MetaString encode(String input, Encoding[] encodings) { if (input.isEmpty()) { return new MetaString(input, Encoding.UTF_8, specialChar1, specialChar2, new byte[0]); } - if (!StringSerializer.isLatin(input.toCharArray())) { + if (!StringUtils.isLatin(input.toCharArray())) { return new MetaString( input, Encoding.UTF_8, @@ -79,7 +79,7 @@ public MetaString encode(String input, Encoding[] encodings) { public MetaString encode(String input, Encoding encoding) { Preconditions.checkArgument( input.length() < Short.MAX_VALUE, "Long meta string than 32767 is not allowed"); - if (encoding != Encoding.UTF_8 && !StringSerializer.isLatin(input.toCharArray())) { + if (encoding != Encoding.UTF_8 && !StringUtils.isLatin(input.toCharArray())) { throw new IllegalArgumentException("Non-ASCII characters in meta string are not allowed"); } if (input.isEmpty()) { diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java index 3dbd70ebab..63d0099c5c 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/Serializers.java @@ -51,6 +51,7 @@ import org.apache.fury.type.Type; import org.apache.fury.util.ExceptionUtils; import org.apache.fury.util.GraalvmSupport; +import org.apache.fury.util.StringUtils; import org.apache.fury.util.unsafe._JDKAccess; /** Serialization utils and common serializers. */ @@ -257,7 +258,7 @@ public void write(MemoryBuffer buffer, T value) { buffer.writeBytes(v, 0, bytesLen); } else { char[] v = (char[]) GET_VALUE.apply(value); - if (StringSerializer.isLatin(v)) { + if (StringUtils.isLatin(v)) { stringSerializer.writeCharsLatin(buffer, v, value.length()); } else { stringSerializer.writeCharsUTF16(buffer, v, value.length()); diff --git a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java index 2e26096e3a..0c77bca401 100644 --- a/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java +++ b/java/fury-core/src/main/java/org/apache/fury/serializer/StringSerializer.java @@ -43,6 +43,7 @@ import org.apache.fury.type.Type; import org.apache.fury.util.MathUtils; import org.apache.fury.util.Preconditions; +import org.apache.fury.util.StringUtils; import org.apache.fury.util.unsafe._JDKAccess; /** @@ -63,8 +64,6 @@ public final class StringSerializer extends Serializer { private static final Byte UTF16_BOXED = UTF16; private static final byte UTF8 = 2; private static final int DEFAULT_BUFFER_SIZE = 1024; - // A long mask used to clear all-higher bits of char in a super-word way. - private static final long MULTI_CHARS_NON_LATIN_MASK; // Make offset compatible with graalvm native image. private static final long STRING_VALUE_FIELD_OFFSET; @@ -103,15 +102,6 @@ private static class Offset { Preconditions.checkArgument( ReflectionUtils.getFieldNullable(String.class, "offset") == null, "Current jdk not supported"); - if (Platform.IS_LITTLE_ENDIAN) { - // latin chars will be 0xXX,0x00;0xXX,0x00 in byte order; - // Using 0x00,0xff(0xff00) to clear latin bits. - MULTI_CHARS_NON_LATIN_MASK = 0xff00ff00ff00ff00L; - } else { - // latin chars will be 0x00,0xXX;0x00,0xXX in byte order; - // Using 0x00,0xff(0x00ff) to clear latin bits. - MULTI_CHARS_NON_LATIN_MASK = 0x00ff00ff00ff00ffL; - } } private final boolean compressString; @@ -178,7 +168,7 @@ public Expression writeStringExpr(Expression strSerializer, Expression buffer, E // Invoked by jit public void writeCharsStringCompressed(MemoryBuffer buffer, String value) { final char[] chars = (char[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET); - if (isLatin(chars)) { + if (StringUtils.isLatin(chars)) { writeCharsLatin(buffer, chars, chars.length); } else { writeCharsUTF16(buffer, chars, chars.length); @@ -288,7 +278,7 @@ public void writeJavaString(MemoryBuffer buffer, String value) { assert STRING_VALUE_FIELD_IS_CHARS; final char[] chars = (char[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET); if (compressString) { - if (isLatin(chars)) { + if (StringUtils.isLatin(chars)) { writeCharsLatin(buffer, chars, chars.length); } else { writeCharsUTF16(buffer, chars, chars.length); @@ -300,32 +290,6 @@ public void writeJavaString(MemoryBuffer buffer, String value) { } } - public static boolean isLatin(char[] chars) { - int numChars = chars.length; - int vectorizedLen = numChars >> 2; - int vectorizedChars = vectorizedLen << 2; - int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1); - boolean isLatin = true; - for (int offset = Platform.CHAR_ARRAY_OFFSET; offset < endOffset; offset += 8) { - // check 4 chars in a vectorized way, 4 times faster than scalar check loop. - // See benchmark in CompressStringSuite.latinSuperWordCheck. - long multiChars = Platform.getLong(chars, offset); - if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) != 0) { - isLatin = false; - break; - } - } - if (isLatin) { - for (int i = vectorizedChars; i < numChars; i++) { - if (chars[i] > 0xFF) { - isLatin = false; - break; - } - } - } - return isLatin; - } - // Invoked by fury JIT public String readJavaString(MemoryBuffer buffer) { if (STRING_VALUE_FIELD_IS_BYTES) { diff --git a/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java b/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java index 338073fc51..cc892bef11 100644 --- a/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java +++ b/java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java @@ -22,12 +22,28 @@ import java.util.HashMap; import java.util.Map; import java.util.Random; +import org.apache.fury.memory.Platform; public class StringUtils { + // A long mask used to clear all-higher bits of char in a super-word way. + private static final long MULTI_CHARS_NON_LATIN_MASK; + private static final char[] BASE16_CHARS2 = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; + static { + if (Platform.IS_LITTLE_ENDIAN) { + // latin chars will be 0xXX,0x00;0xXX,0x00 in byte order; + // Using 0x00,0xff(0xff00) to clear latin bits. + MULTI_CHARS_NON_LATIN_MASK = 0xff00ff00ff00ff00L; + } else { + // latin chars will be 0x00,0xXX;0x00,0xXX in byte order; + // Using 0x00,0xff(0x00ff) to clear latin bits. + MULTI_CHARS_NON_LATIN_MASK = 0x00ff00ff00ff00ffL; + } + } + /** Converts a bytes array into a hexadecimal string. */ public static String encodeHexString(final byte[] data) { StringBuilder result = new StringBuilder(data.length * 2); @@ -249,4 +265,30 @@ public static String lowerCamelToLowerUnderscore(String lowerCamel) { return builder.toString(); } + + public static boolean isLatin(char[] chars) { + int numChars = chars.length; + int vectorizedLen = numChars >> 2; + int vectorizedChars = vectorizedLen << 2; + int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1); + boolean isLatin = true; + for (int offset = Platform.CHAR_ARRAY_OFFSET; offset < endOffset; offset += 8) { + // check 4 chars in a vectorized way, 4 times faster than scalar check loop. + // See benchmark in CompressStringSuite.latinSuperWordCheck. + long multiChars = Platform.getLong(chars, offset); + if ((multiChars & MULTI_CHARS_NON_LATIN_MASK) != 0) { + isLatin = false; + break; + } + } + if (isLatin) { + for (int i = vectorizedChars; i < numChars; i++) { + if (chars[i] > 0xFF) { + isLatin = false; + break; + } + } + } + return isLatin; + } } diff --git a/java/fury-core/src/main/resources/META-INF/native-image/org.apache.fury/fury-core/native-image.properties b/java/fury-core/src/main/resources/META-INF/native-image/org.apache.fury/fury-core/native-image.properties index 80986a2b99..3b6e47deb1 100644 --- a/java/fury-core/src/main/resources/META-INF/native-image/org.apache.fury/fury-core/native-image.properties +++ b/java/fury-core/src/main/resources/META-INF/native-image/org.apache.fury/fury-core/native-image.properties @@ -172,4 +172,5 @@ Args=--initialize-at-build-time=org.apache.fury.memory.MemoryBuffer,\ org.apache.fury.reflect.Types$ClassOwnership,\ org.apache.fury.reflect.Types$ClassOwnership$1,\ org.apache.fury.reflect.Types$ClassOwnership$2,\ - org.apache.fury.resolver.DisallowedList + org.apache.fury.resolver.DisallowedList,\ + org.apache.fury.util.StringUtils diff --git a/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java b/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java index 51779c8ca1..84cc69ed0d 100644 --- a/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/serializer/StringSerializerTest.java @@ -21,8 +21,6 @@ import static org.apache.fury.serializer.StringSerializer.newBytesStringZeroCopy; import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; import java.lang.reflect.Field; import java.nio.ByteBuffer; @@ -290,87 +288,6 @@ public void testCompressJava8String() { } } - @Test(dataProvider = "endian") - public void testVectorizedLatinCheckAlgorithm(boolean endian) { - // assertTrue(isLatin("Fury".toCharArray(), endian)); - // assertTrue(isLatin(StringUtils.random(8 * 10).toCharArray(), endian)); - // test unaligned - assertTrue(isLatin((StringUtils.random(8 * 10) + "1").toCharArray(), endian)); - assertTrue(isLatin((StringUtils.random(8 * 10) + "12").toCharArray(), endian)); - assertTrue(isLatin((StringUtils.random(8 * 10) + "123").toCharArray(), endian)); - assertFalse(isLatin("你好, Fury".toCharArray(), endian)); - assertFalse(isLatin((StringUtils.random(8 * 10) + "你好").toCharArray(), endian)); - assertFalse(isLatin((StringUtils.random(8 * 10) + "1你好").toCharArray(), endian)); - } - - private boolean isLatin(char[] chars, boolean isLittle) { - boolean reverseBytes = - (Platform.IS_LITTLE_ENDIAN && !isLittle) || (!Platform.IS_LITTLE_ENDIAN && !isLittle); - if (reverseBytes) { - for (int i = 0; i < chars.length; i++) { - chars[i] = Character.reverseBytes(chars[i]); - } - } - long mask; - if (isLittle) { - // latin chars will be 0xXX,0x00;0xXX,0x00 in byte order; - // Using 0x00,0xff(0xff00) to clear latin bits. - mask = 0xff00ff00ff00ff00L; - } else { - // latin chars will be 0x00,0xXX;0x00,0xXX in byte order; - // Using 0x00,0xff(0x00ff) to clear latin bits. - mask = 0x00ff00ff00ff00ffL; - } - int numChars = chars.length; - int vectorizedLen = numChars >> 2; - int vectorizedChars = vectorizedLen << 2; - int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1); - boolean isLatin = true; - for (int offset = Platform.CHAR_ARRAY_OFFSET; offset < endOffset; offset += 8) { - // check 4 chars in a vectorized way, 4 times faster than scalar check loop. - long multiChars = Platform.getLong(chars, offset); - if ((multiChars & mask) != 0) { - isLatin = false; - break; - } - } - if (isLatin) { - for (int i = vectorizedChars; i < numChars; i++) { - char c = chars[i]; - if (reverseBytes) { - c = Character.reverseBytes(c); - } - if (c > 0xFF) { - isLatin = false; - break; - } - } - } - return isLatin; - } - - @Test - public void testLatinCheck() { - assertTrue(StringSerializer.isLatin("Fury".toCharArray())); - assertTrue(StringSerializer.isLatin(StringUtils.random(8 * 10).toCharArray())); - // test unaligned - assertTrue(StringSerializer.isLatin((StringUtils.random(8 * 10) + "1").toCharArray())); - assertTrue(StringSerializer.isLatin((StringUtils.random(8 * 10) + "12").toCharArray())); - assertTrue(StringSerializer.isLatin((StringUtils.random(8 * 10) + "123").toCharArray())); - assertFalse(StringSerializer.isLatin("你好, Fury".toCharArray())); - assertFalse(StringSerializer.isLatin((StringUtils.random(8 * 10) + "你好").toCharArray())); - assertFalse(StringSerializer.isLatin((StringUtils.random(8 * 10) + "1你好").toCharArray())); - assertFalse(StringSerializer.isLatin((StringUtils.random(11) + "你").toCharArray())); - assertFalse(StringSerializer.isLatin((StringUtils.random(10) + "你好").toCharArray())); - assertFalse(StringSerializer.isLatin((StringUtils.random(9) + "性能好").toCharArray())); - assertFalse(StringSerializer.isLatin("\u1234".toCharArray())); - assertFalse(StringSerializer.isLatin("a\u1234".toCharArray())); - assertFalse(StringSerializer.isLatin("ab\u1234".toCharArray())); - assertFalse(StringSerializer.isLatin("abc\u1234".toCharArray())); - assertFalse(StringSerializer.isLatin("abcd\u1234".toCharArray())); - assertFalse(StringSerializer.isLatin("Javaone Keynote\u1234".toCharArray())); - } - @Test public void testReadUtf8String() { Fury fury = getJavaFury(); diff --git a/java/fury-core/src/test/java/org/apache/fury/util/StringUtilsTest.java b/java/fury-core/src/test/java/org/apache/fury/util/StringUtilsTest.java index dea417888d..dec17d63a6 100644 --- a/java/fury-core/src/test/java/org/apache/fury/util/StringUtilsTest.java +++ b/java/fury-core/src/test/java/org/apache/fury/util/StringUtilsTest.java @@ -23,9 +23,11 @@ import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; +import org.apache.fury.FuryTestBase; +import org.apache.fury.memory.Platform; import org.testng.annotations.Test; -public class StringUtilsTest { +public class StringUtilsTest extends FuryTestBase { @Test public void testEncodeHexString() { @@ -87,4 +89,85 @@ public void testLowerCamelToLowerUnderscore() { assertEquals(StringUtils.lowerCamelToLowerUnderscore("some123variable"), "some123variable"); assertEquals(StringUtils.lowerCamelToLowerUnderscore("someVariable123"), "some_variable123"); } + + @Test(dataProvider = "endian") + public void testVectorizedLatinCheckAlgorithm(boolean endian) { + // assertTrue(isLatin("Fury".toCharArray(), endian)); + // assertTrue(isLatin(StringUtils.random(8 * 10).toCharArray(), endian)); + // test unaligned + assertTrue(isLatin((StringUtils.random(8 * 10) + "1").toCharArray(), endian)); + assertTrue(isLatin((StringUtils.random(8 * 10) + "12").toCharArray(), endian)); + assertTrue(isLatin((StringUtils.random(8 * 10) + "123").toCharArray(), endian)); + assertFalse(isLatin("你好, Fury".toCharArray(), endian)); + assertFalse(isLatin((StringUtils.random(8 * 10) + "你好").toCharArray(), endian)); + assertFalse(isLatin((StringUtils.random(8 * 10) + "1你好").toCharArray(), endian)); + } + + private boolean isLatin(char[] chars, boolean isLittle) { + boolean reverseBytes = + (Platform.IS_LITTLE_ENDIAN && !isLittle) || (!Platform.IS_LITTLE_ENDIAN && !isLittle); + if (reverseBytes) { + for (int i = 0; i < chars.length; i++) { + chars[i] = Character.reverseBytes(chars[i]); + } + } + long mask; + if (isLittle) { + // latin chars will be 0xXX,0x00;0xXX,0x00 in byte order; + // Using 0x00,0xff(0xff00) to clear latin bits. + mask = 0xff00ff00ff00ff00L; + } else { + // latin chars will be 0x00,0xXX;0x00,0xXX in byte order; + // Using 0x00,0xff(0x00ff) to clear latin bits. + mask = 0x00ff00ff00ff00ffL; + } + int numChars = chars.length; + int vectorizedLen = numChars >> 2; + int vectorizedChars = vectorizedLen << 2; + int endOffset = Platform.CHAR_ARRAY_OFFSET + (vectorizedChars << 1); + boolean isLatin = true; + for (int offset = Platform.CHAR_ARRAY_OFFSET; offset < endOffset; offset += 8) { + // check 4 chars in a vectorized way, 4 times faster than scalar check loop. + long multiChars = Platform.getLong(chars, offset); + if ((multiChars & mask) != 0) { + isLatin = false; + break; + } + } + if (isLatin) { + for (int i = vectorizedChars; i < numChars; i++) { + char c = chars[i]; + if (reverseBytes) { + c = Character.reverseBytes(c); + } + if (c > 0xFF) { + isLatin = false; + break; + } + } + } + return isLatin; + } + + @Test + public void testLatinCheck() { + assertTrue(StringUtils.isLatin("Fury".toCharArray())); + assertTrue(StringUtils.isLatin(StringUtils.random(8 * 10).toCharArray())); + // test unaligned + assertTrue(StringUtils.isLatin((StringUtils.random(8 * 10) + "1").toCharArray())); + assertTrue(StringUtils.isLatin((StringUtils.random(8 * 10) + "12").toCharArray())); + assertTrue(StringUtils.isLatin((StringUtils.random(8 * 10) + "123").toCharArray())); + assertFalse(StringUtils.isLatin("你好, Fury".toCharArray())); + assertFalse(StringUtils.isLatin((StringUtils.random(8 * 10) + "你好").toCharArray())); + assertFalse(StringUtils.isLatin((StringUtils.random(8 * 10) + "1你好").toCharArray())); + assertFalse(StringUtils.isLatin((StringUtils.random(11) + "你").toCharArray())); + assertFalse(StringUtils.isLatin((StringUtils.random(10) + "你好").toCharArray())); + assertFalse(StringUtils.isLatin((StringUtils.random(9) + "性能好").toCharArray())); + assertFalse(StringUtils.isLatin("\u1234".toCharArray())); + assertFalse(StringUtils.isLatin("a\u1234".toCharArray())); + assertFalse(StringUtils.isLatin("ab\u1234".toCharArray())); + assertFalse(StringUtils.isLatin("abc\u1234".toCharArray())); + assertFalse(StringUtils.isLatin("abcd\u1234".toCharArray())); + assertFalse(StringUtils.isLatin("Javaone Keynote\u1234".toCharArray())); + } }