Skip to content

Commit

Permalink
refactor(java): move latin language checker method from string serial…
Browse files Browse the repository at this point in the history
…izer to string util (#1708)



## What does this PR do?

<!-- Describe the purpose of this PR. -->
This PR decouples the `isLatin(char[])` method from the `StringSerializer` class
and moves it to `StringUtils`.


## Related issues

<!--
Is there any related issue? Please attach here.

- #1703
- #xxxx1
- #xxxx2
-->
#1703


## Does this PR introduce any user-facing change?

<!--
If any user-facing interface changes, please [open an
issue](https://github.com/apache/fury/issues/new/choose) describing the
need to do so and update the document if necessary.
-->

- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?


## Benchmark

<!--
When the PR has an impact on performance (if you don't know whether the
PR will have an impact on performance, you can submit the PR first, and
if it does have an impact on performance, the code reviewer will explain
it), be sure to attach benchmark data here.
-->

---------

Co-authored-by: Shawn Yang <[email protected]>
  • Loading branch information
anagh07 and chaokunyang authored Jun 29, 2024
1 parent d450d7d commit 09fda94
Show file tree
Hide file tree
Showing 8 changed files with 137 additions and 130 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import java.nio.ByteBuffer;
import org.apache.fury.memory.MemoryBuffer;
import org.apache.fury.memory.Platform;
import org.apache.fury.serializer.StringSerializer;
import org.apache.fury.util.StringUtils;
import org.openjdk.jmh.Main;
import org.openjdk.jmh.annotations.Benchmark;
Expand Down Expand Up @@ -102,7 +101,7 @@ public Object latinScalarCheck() {

@Benchmark
public Object latinSuperWordCheck() {
return StringSerializer.isLatin(latinStrChars);
return StringUtils.isLatin(latinStrChars);
}

public static void main(String[] args) throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
import java.util.HashSet;
import org.apache.fury.collection.Collections;
import org.apache.fury.meta.MetaString.Encoding;
import org.apache.fury.serializer.StringSerializer;
import org.apache.fury.util.Preconditions;
import org.apache.fury.util.StringUtils;

/** Encodes plain text strings into MetaString objects with specified encoding mechanisms. */
public class MetaStringEncoder {
Expand Down Expand Up @@ -57,7 +57,7 @@ public MetaString encode(String input, Encoding[] encodings) {
if (input.isEmpty()) {
return new MetaString(input, Encoding.UTF_8, specialChar1, specialChar2, new byte[0]);
}
if (!StringSerializer.isLatin(input.toCharArray())) {
if (!StringUtils.isLatin(input.toCharArray())) {
return new MetaString(
input,
Encoding.UTF_8,
Expand All @@ -79,7 +79,7 @@ public MetaString encode(String input, Encoding[] encodings) {
public MetaString encode(String input, Encoding encoding) {
Preconditions.checkArgument(
input.length() < Short.MAX_VALUE, "Long meta string than 32767 is not allowed");
if (encoding != Encoding.UTF_8 && !StringSerializer.isLatin(input.toCharArray())) {
if (encoding != Encoding.UTF_8 && !StringUtils.isLatin(input.toCharArray())) {
throw new IllegalArgumentException("Non-ASCII characters in meta string are not allowed");
}
if (input.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
import org.apache.fury.type.Type;
import org.apache.fury.util.ExceptionUtils;
import org.apache.fury.util.GraalvmSupport;
import org.apache.fury.util.StringUtils;
import org.apache.fury.util.unsafe._JDKAccess;

/** Serialization utils and common serializers. */
Expand Down Expand Up @@ -257,7 +258,7 @@ public void write(MemoryBuffer buffer, T value) {
buffer.writeBytes(v, 0, bytesLen);
} else {
char[] v = (char[]) GET_VALUE.apply(value);
if (StringSerializer.isLatin(v)) {
if (StringUtils.isLatin(v)) {
stringSerializer.writeCharsLatin(buffer, v, value.length());
} else {
stringSerializer.writeCharsUTF16(buffer, v, value.length());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import org.apache.fury.type.Type;
import org.apache.fury.util.MathUtils;
import org.apache.fury.util.Preconditions;
import org.apache.fury.util.StringUtils;
import org.apache.fury.util.unsafe._JDKAccess;

/**
Expand All @@ -63,8 +64,6 @@ public final class StringSerializer extends Serializer<String> {
private static final Byte UTF16_BOXED = UTF16;
private static final byte UTF8 = 2;
private static final int DEFAULT_BUFFER_SIZE = 1024;
// A long mask used to clear all-higher bits of char in a super-word way.
private static final long MULTI_CHARS_NON_LATIN_MASK;

// Make offset compatible with graalvm native image.
private static final long STRING_VALUE_FIELD_OFFSET;
Expand Down Expand Up @@ -103,15 +102,6 @@ private static class Offset {
Preconditions.checkArgument(
ReflectionUtils.getFieldNullable(String.class, "offset") == null,
"Current jdk not supported");
if (Platform.IS_LITTLE_ENDIAN) {
// latin chars will be 0xXX,0x00;0xXX,0x00 in byte order;
// Using 0x00,0xff(0xff00) to clear latin bits.
MULTI_CHARS_NON_LATIN_MASK = 0xff00ff00ff00ff00L;
} else {
// latin chars will be 0x00,0xXX;0x00,0xXX in byte order;
// Using 0x00,0xff(0x00ff) to clear latin bits.
MULTI_CHARS_NON_LATIN_MASK = 0x00ff00ff00ff00ffL;
}
}

private final boolean compressString;
Expand Down Expand Up @@ -178,7 +168,7 @@ public Expression writeStringExpr(Expression strSerializer, Expression buffer, E
// Invoked by jit
public void writeCharsStringCompressed(MemoryBuffer buffer, String value) {
final char[] chars = (char[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET);
if (isLatin(chars)) {
if (StringUtils.isLatin(chars)) {
writeCharsLatin(buffer, chars, chars.length);
} else {
writeCharsUTF16(buffer, chars, chars.length);
Expand Down Expand Up @@ -288,7 +278,7 @@ public void writeJavaString(MemoryBuffer buffer, String value) {
assert STRING_VALUE_FIELD_IS_CHARS;
final char[] chars = (char[]) Platform.getObject(value, STRING_VALUE_FIELD_OFFSET);
if (compressString) {
if (isLatin(chars)) {
if (StringUtils.isLatin(chars)) {
writeCharsLatin(buffer, chars, chars.length);
} else {
writeCharsUTF16(buffer, chars, chars.length);
Expand All @@ -300,32 +290,6 @@ public void writeJavaString(MemoryBuffer buffer, String value) {
}
}

/**
 * Runs the Latin check over {@code chars}: the array is scanned four chars (one {@code long}) at
 * a time against {@code MULTI_CHARS_NON_LATIN_MASK}, and any chars left over after the last full
 * word are compared individually against {@code 0xFF}.
 *
 * @param chars the chars to inspect
 * @return {@code false} as soon as a word has a masked bit set or a leftover char exceeds {@code
 *     0xFF}; {@code true} otherwise (including for an empty array)
 */
public static boolean isLatin(char[] chars) {
  final int length = chars.length;
  // Number of chars covered by whole 4-char words (length rounded down to a multiple of 4).
  final int wordChars = length & ~3;
  final int base = Platform.CHAR_ARRAY_OFFSET;
  // Each char occupies 2 bytes, so the word region ends wordChars * 2 bytes past the base.
  final int limit = base + wordChars * 2;
  int offset = base;
  while (offset < limit) {
    // Check 4 chars per read; the original author notes this is about 4 times faster than a
    // scalar loop, see the benchmark in CompressStringSuite.latinSuperWordCheck.
    if ((Platform.getLong(chars, offset) & MULTI_CHARS_NON_LATIN_MASK) != 0) {
      return false;
    }
    offset += 8;
  }
  // Scalar check for the 0..3 chars that did not fill a whole word.
  for (int i = wordChars; i < length; i++) {
    if (chars[i] > 0xFF) {
      return false;
    }
  }
  return true;
}

// Invoked by fury JIT
public String readJavaString(MemoryBuffer buffer) {
if (STRING_VALUE_FIELD_IS_BYTES) {
Expand Down
42 changes: 42 additions & 0 deletions java/fury-core/src/main/java/org/apache/fury/util/StringUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,28 @@
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
import org.apache.fury.memory.Platform;

public class StringUtils {
// A long mask used to clear all-higher bits of char in a super-word way.
private static final long MULTI_CHARS_NON_LATIN_MASK;

private static final char[] BASE16_CHARS2 = {
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
};

static {
if (Platform.IS_LITTLE_ENDIAN) {
// latin chars will be 0xXX,0x00;0xXX,0x00 in byte order;
// Using 0x00,0xff(0xff00) to clear latin bits.
MULTI_CHARS_NON_LATIN_MASK = 0xff00ff00ff00ff00L;
} else {
// latin chars will be 0x00,0xXX;0x00,0xXX in byte order;
// Using 0x00,0xff(0x00ff) to clear latin bits.
MULTI_CHARS_NON_LATIN_MASK = 0x00ff00ff00ff00ffL;
}
}

/** Converts a bytes array into a hexadecimal string. */
public static String encodeHexString(final byte[] data) {
StringBuilder result = new StringBuilder(data.length * 2);
Expand Down Expand Up @@ -249,4 +265,30 @@ public static String lowerCamelToLowerUnderscore(String lowerCamel) {

return builder.toString();
}

/**
 * Runs the Latin check over {@code chars}: the array is scanned four chars (one {@code long}) at
 * a time against {@code MULTI_CHARS_NON_LATIN_MASK}, and any chars left over after the last full
 * word are compared individually against {@code 0xFF}.
 *
 * @param chars the chars to inspect
 * @return {@code false} as soon as a word has a masked bit set or a leftover char exceeds {@code
 *     0xFF}; {@code true} otherwise (including for an empty array)
 */
public static boolean isLatin(char[] chars) {
  final int length = chars.length;
  // Number of chars covered by whole 4-char words (length rounded down to a multiple of 4).
  final int wordChars = length & ~3;
  final int base = Platform.CHAR_ARRAY_OFFSET;
  // Each char occupies 2 bytes, so the word region ends wordChars * 2 bytes past the base.
  final int limit = base + wordChars * 2;
  // NOTE(review): if Platform.getLong is a plain native-order read, every 16-bit lane of the
  // word already equals the char value on either endianness, which would make the big-endian
  // mask value (0x00ff00ff00ff00ff) suspect -- confirm on a big-endian JVM.
  int offset = base;
  while (offset < limit) {
    // Check 4 chars per read; the original author notes this is about 4 times faster than a
    // scalar loop, see the benchmark in CompressStringSuite.latinSuperWordCheck.
    if ((Platform.getLong(chars, offset) & MULTI_CHARS_NON_LATIN_MASK) != 0) {
      return false;
    }
    offset += 8;
  }
  // Scalar check for the 0..3 chars that did not fill a whole word.
  for (int i = wordChars; i < length; i++) {
    if (chars[i] > 0xFF) {
      return false;
    }
  }
  return true;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -172,4 +172,5 @@ Args=--initialize-at-build-time=org.apache.fury.memory.MemoryBuffer,\
org.apache.fury.reflect.Types$ClassOwnership,\
org.apache.fury.reflect.Types$ClassOwnership$1,\
org.apache.fury.reflect.Types$ClassOwnership$2,\
org.apache.fury.resolver.DisallowedList
org.apache.fury.resolver.DisallowedList,\
org.apache.fury.util.StringUtils
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,6 @@

import static org.apache.fury.serializer.StringSerializer.newBytesStringZeroCopy;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;

import java.lang.reflect.Field;
import java.nio.ByteBuffer;
Expand Down Expand Up @@ -290,87 +288,6 @@ public void testCompressJava8String() {
}
}

/**
 * Exercises the local {@code isLatin(char[], boolean)} helper with the flag supplied by the
 * {@code "endian"} data provider, using inputs whose length is not a multiple of the word size.
 */
@Test(dataProvider = "endian")
public void testVectorizedLatinCheckAlgorithm(boolean endian) {
  // assertTrue(isLatin("Fury".toCharArray(), endian));
  // assertTrue(isLatin(StringUtils.random(8 * 10).toCharArray(), endian));
  final int prefixLen = 8 * 10;
  // test unaligned: a generated prefix followed by a 1..3 char Latin tail.
  for (String tail : new String[] {"1", "12", "123"}) {
    assertTrue(isLatin((StringUtils.random(prefixLen) + tail).toCharArray(), endian));
  }
  // Non-Latin chars at the very start of the input.
  assertFalse(isLatin("你好, Fury".toCharArray(), endian));
  // Non-Latin chars only after the generated prefix.
  for (String tail : new String[] {"你好", "1你好"}) {
    assertFalse(isLatin((StringUtils.random(prefixLen) + tail).toCharArray(), endian));
  }
}

/**
 * Test-local copy of the word-at-a-time Latin check that lets the caller choose which mask is
 * used.
 *
 * <p>When {@code isLittle} is {@code false} every element of {@code chars} is byte-swapped IN
 * PLACE before the word scan, so the caller's array is modified.
 *
 * @param chars the chars to inspect; mutated when {@code isLittle} is {@code false}
 * @param isLittle selects the {@code 0xff00...} mask when {@code true}, the {@code 0x00ff...}
 *     mask (plus byte-swapped input) when {@code false}
 * @return {@code false} if a word has a masked bit set or a leftover char exceeds {@code 0xFF};
 *     {@code true} otherwise
 */
private boolean isLatin(char[] chars, boolean isLittle) {
  // The original condition, (P && !isLittle) || (!P && !isLittle) with
  // P = Platform.IS_LITTLE_ENDIAN, reduces to !isLittle: the platform byte order cancels out.
  final boolean swapBytes = !isLittle;
  if (swapBytes) {
    for (int i = 0; i < chars.length; i++) {
      chars[i] = Character.reverseBytes(chars[i]);
    }
  }
  // isLittle: latin chars will be 0xXX,0x00;0xXX,0x00 in byte order, 0xff00 clears latin bits.
  // otherwise: latin chars will be 0x00,0xXX;0x00,0xXX in byte order, 0x00ff clears latin bits.
  final long nonLatinMask = isLittle ? 0xff00ff00ff00ff00L : 0x00ff00ff00ff00ffL;
  final int length = chars.length;
  // Number of chars covered by whole 4-char words (length rounded down to a multiple of 4).
  final int wordChars = length & ~3;
  final int base = Platform.CHAR_ARRAY_OFFSET;
  final int limit = base + wordChars * 2;
  for (int offset = base; offset < limit; offset += 8) {
    // check 4 chars per read instead of one at a time.
    if ((Platform.getLong(chars, offset) & nonLatinMask) != 0) {
      return false;
    }
  }
  for (int i = wordChars; i < length; i++) {
    // Undo the in-place swap so the leftover chars are compared by their original value.
    final char c = swapBytes ? Character.reverseBytes(chars[i]) : chars[i];
    if (c > 0xFF) {
      return false;
    }
  }
  return true;
}

/**
 * Checks {@code StringSerializer.isLatin} against inputs that should pass, and against inputs
 * containing a non-Latin char at the start, after a generated prefix, and at each position
 * relative to a four-char word boundary.
 */
@Test
public void testLatinCheck() {
  final int prefixLen = 8 * 10;
  assertTrue(StringSerializer.isLatin("Fury".toCharArray()));
  // Generated prefix alone, then with 1..3 extra chars (test unaligned).
  for (String tail : new String[] {"", "1", "12", "123"}) {
    assertTrue(StringSerializer.isLatin((StringUtils.random(prefixLen) + tail).toCharArray()));
  }
  assertFalse(StringSerializer.isLatin("你好, Fury".toCharArray()));
  for (String tail : new String[] {"你好", "1你好"}) {
    assertFalse(StringSerializer.isLatin((StringUtils.random(prefixLen) + tail).toCharArray()));
  }
  // Shorter generated prefixes paired with non-Latin tails of growing length.
  final int[] prefixLens = {11, 10, 9};
  final String[] nonLatinTails = {"你", "你好", "性能好"};
  for (int i = 0; i < prefixLens.length; i++) {
    assertFalse(
        StringSerializer.isLatin(
            (StringUtils.random(prefixLens[i]) + nonLatinTails[i]).toCharArray()));
  }
  // A single non-Latin char placed after 0, 1, 2, 3, 4 and 15 Latin chars.
  for (String head : new String[] {"", "a", "ab", "abc", "abcd", "Javaone Keynote"}) {
    assertFalse(StringSerializer.isLatin((head + "\u1234").toCharArray()));
  }
}

@Test
public void testReadUtf8String() {
Fury fury = getJavaFury();
Expand Down
Loading

0 comments on commit 09fda94

Please sign in to comment.