Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement WriteStringValueSegment defined in Issue 67337 #101356

Open
wants to merge 26 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 24 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
9122ef8
Implement WriteStringValueSegment defined in Issue 67337
ificator Apr 21, 2024
e044b13
Fix some review comments
ificator May 26, 2024
e7abe7f
merge upstream/main
ificator May 26, 2024
b8d578c
Handle split surrogate pair
ificator May 26, 2024
181cef2
Merge remote-tracking branch 'upstream/main' into user/ificator/write…
ificator Dec 6, 2024
65006ce
Commit old changes responding to comments
ificator Dec 6, 2024
1601af8
utf8 and utf16
PranavSenthilnathan Dec 11, 2024
d6b66be
fix build error
PranavSenthilnathan Dec 16, 2024
a46a1cc
Update src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf…
PranavSenthilnathan Dec 16, 2024
b5d0c17
Update src/libraries/System.Text.Json/src/System/Text/Json/Writer/Utf…
PranavSenthilnathan Dec 16, 2024
4a0d1c6
PR comments
PranavSenthilnathan Dec 16, 2024
09d321d
Merge branch 'main' of https://github.com/dotnet/runtime into user/if…
PranavSenthilnathan Dec 16, 2024
96ed922
add encoding flags
PranavSenthilnathan Dec 17, 2024
a078bfd
add test for switching encoding
PranavSenthilnathan Dec 17, 2024
93e6ee9
use CoreLib Rune for polyfill instead of having a separate copy
PranavSenthilnathan Dec 17, 2024
501813f
Merge branch 'main' of https://github.com/dotnet/runtime into user/if…
PranavSenthilnathan Dec 17, 2024
c3b1c3b
move warning disabling to top and fix up tests
PranavSenthilnathan Dec 18, 2024
c9c4884
add fuzzer
PranavSenthilnathan Dec 19, 2024
8482b1c
Fix some tests I missed
PranavSenthilnathan Dec 19, 2024
d50bbca
clean up and add another test to fuzzer
PranavSenthilnathan Dec 19, 2024
55827d9
comment typo
PranavSenthilnathan Dec 20, 2024
a5cd855
pr comments
PranavSenthilnathan Dec 20, 2024
c82b035
Merge branch 'main' of https://github.com/dotnet/runtime into user/if…
PranavSenthilnathan Dec 20, 2024
4f63907
Merge branch 'user/ificator/writestringvaluesegment' of https://githu…
PranavSenthilnathan Dec 20, 2024
b7fd4a5
throw when encodings are mixed
PranavSenthilnathan Dec 24, 2024
c0a700c
update fuzzer to assert that mixing encodings always throws
PranavSenthilnathan Dec 24, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions eng/pipelines/libraries/fuzzing/deploy-to-onefuzz.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,4 +153,12 @@ extends:
onefuzzDropDirectory: $(fuzzerProject)/deployment/UTF8Fuzzer
SYSTEM_ACCESSTOKEN: $(System.AccessToken)
displayName: Send UTF8Fuzzer to OneFuzz

- task: onefuzz-task@0
inputs:
onefuzzOSes: 'Windows'
env:
onefuzzDropDirectory: $(fuzzerProject)/deployment/Utf8JsonWriterFuzzer
SYSTEM_ACCESSTOKEN: $(System.AccessToken)
displayName: Send Utf8JsonWriterFuzzer to OneFuzz
# ONEFUZZ_TASK_WORKAROUND_END
3 changes: 2 additions & 1 deletion src/libraries/Fuzzing/DotnetFuzzing/DotnetFuzzing.csproj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<Project Sdk="Microsoft.NET.Sdk">
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<OutputType>Exe</OutputType>
Expand Down Expand Up @@ -29,6 +29,7 @@
<Compile Include="Fuzzers\TextEncodingFuzzer.cs" />
<Compile Include="Fuzzers\TypeNameFuzzer.cs" />
<Compile Include="Fuzzers\UTF8Fuzzer.cs" />
<Compile Include="Fuzzers\Utf8JsonWriterFuzzer.cs" />
<Compile Include="IFuzzer.cs" />
<Compile Include="PooledBoundedMemory.cs" />
<Compile Include="Program.cs" />
Expand Down
223 changes: 223 additions & 0 deletions src/libraries/Fuzzing/DotnetFuzzing/Fuzzers/Utf8JsonWriterFuzzer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Buffers;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
using System.Text.Encodings.Web;
using System.Text.Json;
using System.Text.Unicode;
using SharpFuzz;

namespace DotnetFuzzing.Fuzzers;

/// <summary>
/// Fuzzer for <c>Utf8JsonWriter.WriteStringValueSegment</c>. The input is written as a single
/// JSON string value, either in one call or split into several segments, and the bytes produced
/// by the writer are compared against the result of running the configured
/// <see cref="JavaScriptEncoder"/> directly over the whole input.
/// </summary>
internal sealed class Utf8JsonWriterFuzzer : IFuzzer
{
    public string[] TargetAssemblies { get; } = ["System.Text.Json"];

    public string[] TargetCoreLibPrefixes => [];

    // One of the bytes in the input is used to set various test options.
    // Each bit in that byte represents a different option as indicated here.

    // Options for JsonWriterOptions
    private const byte IndentFlag = 1;
    private const byte EncoderFlag = 1 << 1;
    private const byte MaxDepthFlag = 1 << 2;
    private const byte NewLineFlag = 1 << 3;
    private const byte SkipValidationFlag = 1 << 4;

    // Options for choosing between UTF-8 and UTF-16 encoding
    private const byte EncodingFlag = 1 << 5;

    /// <summary>
    /// Runs one fuzz iteration.
    /// </summary>
    /// <param name="bytes">
    /// Raw fuzzer input. Layout: two <see cref="int"/> slice indices, one options byte, one padding
    /// byte, then the payload that is written as the string value. Inputs shorter than the 10-byte
    /// header, or whose slice indices are out of order or out of range, are ignored.
    /// </param>
    public void FuzzTarget(ReadOnlySpan<byte> bytes)
    {
        const int minLength = 10; // 2 ints, 1 byte, and 1 padding to align chars
        if (bytes.Length < minLength)
        {
            return;
        }

        // First 2 ints are used as indices to slice the input and the following byte is used for options
        ReadOnlySpan<int> ints = MemoryMarshal.Cast<byte, int>(bytes);
        int slice1 = ints[0];
        int slice2 = ints[1];
        byte optionsByte = bytes[8];
        bytes = bytes.Slice(minLength);
        ReadOnlySpan<char> chars = MemoryMarshal.Cast<byte, char>(bytes);

        // Validate that the indices are within bounds of the input
        bool utf8 = (optionsByte & EncodingFlag) == 0;
        if (!(0 <= slice1 && slice1 <= slice2 && slice2 <= (utf8 ? bytes.Length : chars.Length)))
        {
            return;
        }

        // Set up options based on the options byte.
        // EncoderFlag (not EncodingFlag) selects the encoder so that the escaping mode varies
        // independently of the UTF-8/UTF-16 input choice.
        JsonWriterOptions options = new()
        {
            Encoder = (optionsByte & EncoderFlag) == 0 ? JavaScriptEncoder.Default : JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
            Indented = (optionsByte & IndentFlag) == 0,
            MaxDepth = (optionsByte & MaxDepthFlag) == 0 ? 1 : 0,
            NewLine = (optionsByte & NewLineFlag) == 0 ? "\n" : "\r\n",
            SkipValidation = (optionsByte & SkipValidationFlag) == 0,
        };

        // Compute the expected result by using the encoder directly and the input
        int maxExpandedSizeBytes = 6 * bytes.Length + 2;
        byte[] expectedBuffer = ArrayPool<byte>.Shared.Rent(maxExpandedSizeBytes);
        try
        {
            Span<byte> expected =
                expectedBuffer.AsSpan(0, utf8
                    ? EncodeToUtf8(bytes, expectedBuffer, options.Encoder)
                    : EncodeToUtf8(chars, expectedBuffer, options.Encoder));

            // Compute the actual result by using Utf8JsonWriter. Each iteration is a different slice of the input, but the result should be the same.
            byte[] actualBuffer = new byte[expected.Length];
            foreach (ReadOnlySpan<Range> ranges in new[]
            {
                new[] { 0.. },
                new[] { 0..slice1, slice1.. },
                new[] { 0..slice1, slice1..slice2, slice2.. },
            })
            {
                using MemoryStream stream = new(actualBuffer);
                using Utf8JsonWriter writer = new(stream, options);

                if (utf8)
                {
                    WriteStringValueSegments(writer, bytes, ranges);
                }
                else
                {
                    WriteStringValueSegments(writer, chars, ranges);
                }

                writer.Flush();

                // Compare the expected and actual results
                Assert.SequenceEqual(expected, actualBuffer);
                Assert.Equal(expected.Length, writer.BytesCommitted);
                Assert.Equal(0, writer.BytesPending);

                Array.Clear(actualBuffer);
            }

            // Additional test for mixing UTF-8 and UTF-16 encoding. The alignment math is easier in UTF-16 mode so just run it for that.
            if (!utf8)
            {
                Array.Clear(expectedBuffer);

                // UTF-16 segment first, then a UTF-8 segment.
                {
                    ReadOnlySpan<char> firstSegment = chars[slice1..];
                    ReadOnlySpan<byte> secondSegment = bytes[0..(2 * slice1)];

                    expected = expectedBuffer.AsSpan(0, EncodeToUtf8(firstSegment, secondSegment, expectedBuffer, options.Encoder));

                    actualBuffer = new byte[expected.Length];
                    using MemoryStream stream = new(actualBuffer);
                    using Utf8JsonWriter writer = new(stream, options);

                    writer.WriteStringValueSegment(firstSegment, false);
                    writer.WriteStringValueSegment(secondSegment, true);
                    writer.Flush();

                    Assert.SequenceEqual(expected, actualBuffer);
                    Assert.Equal(expected.Length, writer.BytesCommitted);
                    Assert.Equal(0, writer.BytesPending);
                }

                Array.Clear(expectedBuffer);

                // UTF-8 segment first, then a UTF-16 segment.
                {
                    ReadOnlySpan<byte> firstSegment = bytes[0..(2 * slice1)];
                    ReadOnlySpan<char> secondSegment = chars[slice1..];

                    expected = expectedBuffer.AsSpan(0, EncodeToUtf8(firstSegment, secondSegment, expectedBuffer, options.Encoder));

                    actualBuffer = new byte[expected.Length];
                    using MemoryStream stream = new(actualBuffer);
                    using Utf8JsonWriter writer = new(stream, options);

                    writer.WriteStringValueSegment(firstSegment, false);
                    writer.WriteStringValueSegment(secondSegment, true);
                    writer.Flush();

                    Assert.SequenceEqual(expected, actualBuffer);
                    Assert.Equal(expected.Length, writer.BytesCommitted);
                    Assert.Equal(0, writer.BytesPending);
                }
            }
        }
        finally
        {
            // Hand the rented buffer back even when the writer or an assertion throws.
            ArrayPool<byte>.Shared.Return(expectedBuffer);
        }
    }

    /// <summary>
    /// Writes each range of <paramref name="bytes"/> as a UTF-8 string value segment, marking only the last range as final.
    /// </summary>
    private static void WriteStringValueSegments(Utf8JsonWriter writer, ReadOnlySpan<byte> bytes, ReadOnlySpan<Range> ranges)
    {
        for (int i = 0; i < ranges.Length; i++)
        {
            writer.WriteStringValueSegment(bytes[ranges[i]], i == ranges.Length - 1);
        }
    }

    /// <summary>
    /// Writes each range of <paramref name="chars"/> as a UTF-16 string value segment, marking only the last range as final.
    /// </summary>
    private static void WriteStringValueSegments(Utf8JsonWriter writer, ReadOnlySpan<char> chars, ReadOnlySpan<Range> ranges)
    {
        for (int i = 0; i < ranges.Length; i++)
        {
            writer.WriteStringValueSegment(chars[ranges[i]], i == ranges.Length - 1);
        }
    }

    /// <summary>
    /// Encodes UTF-8 input into <paramref name="destBuffer"/> surrounded by double quotes.
    /// </summary>
    /// <returns>The total number of bytes written, including both quotes.</returns>
    private static int EncodeToUtf8(ReadOnlySpan<byte> bytes, Span<byte> destBuffer, JavaScriptEncoder encoder)
    {
        destBuffer[0] = (byte)'"';
        encoder.EncodeUtf8(bytes, destBuffer[1..], out _, out int encodedLength, isFinalBlock: true);

        int closingQuoteIndex = encodedLength + 1;
        destBuffer[closingQuoteIndex] = (byte)'"';
        return closingQuoteIndex + 1;
    }

    /// <summary>
    /// Encodes UTF-16 input, transcodes it to UTF-8 and stores it in <paramref name="destBuffer"/> surrounded by double quotes.
    /// </summary>
    /// <returns>The total number of bytes written, including both quotes.</returns>
    private static int EncodeToUtf8(ReadOnlySpan<char> chars, Span<byte> destBuffer, JavaScriptEncoder encoder)
    {
        int written = 1;
        destBuffer[0] = (byte)'"';
        written += EncodeTranscode(chars, destBuffer[1..], encoder);
        destBuffer[written] = (byte)'"';
        return written + 1;
    }

    /// <summary>
    /// Encodes a UTF-8 segment followed by a UTF-16 segment into <paramref name="destBuffer"/> as one quoted string.
    /// Each segment is encoded independently as a final block.
    /// </summary>
    /// <returns>The total number of bytes written, including both quotes.</returns>
    private static int EncodeToUtf8(ReadOnlySpan<byte> bytes, ReadOnlySpan<char> chars, Span<byte> destBuffer, JavaScriptEncoder encoder)
    {
        int written = 1;
        destBuffer[0] = (byte)'"';
        encoder.EncodeUtf8(bytes, destBuffer[1..], out _, out int writtenTemp, isFinalBlock: true);
        written += writtenTemp;
        written += EncodeTranscode(chars, destBuffer[written..], encoder, isFinalBlock: true);
        destBuffer[written] = (byte)'"';
        return written + 1;
    }

    /// <summary>
    /// Encodes a UTF-16 segment followed by a UTF-8 segment into <paramref name="destBuffer"/> as one quoted string.
    /// Each segment is encoded independently as a final block.
    /// </summary>
    /// <returns>The total number of bytes written, including both quotes.</returns>
    private static int EncodeToUtf8(ReadOnlySpan<char> chars, ReadOnlySpan<byte> bytes, Span<byte> destBuffer, JavaScriptEncoder encoder)
    {
        int written = 1;
        destBuffer[0] = (byte)'"';
        written += EncodeTranscode(chars, destBuffer[1..], encoder, isFinalBlock: true);
        encoder.EncodeUtf8(bytes, destBuffer[written..], out _, out int writtenTemp, isFinalBlock: true);
        written += writtenTemp;
        destBuffer[written] = (byte)'"';
        return written + 1;
    }

    /// <summary>
    /// Escapes <paramref name="chars"/> with <paramref name="encoder"/> into a temporary UTF-16 buffer and then
    /// transcodes the escaped text to UTF-8 in <paramref name="destBuffer"/>.
    /// </summary>
    /// <param name="isFinalBlock">
    /// Forwarded to <see cref="Utf8.FromUtf16"/> only; the encoder step always runs with <c>isFinalBlock: true</c>.
    /// </param>
    /// <returns>The number of UTF-8 bytes written to <paramref name="destBuffer"/>.</returns>
    private static int EncodeTranscode(ReadOnlySpan<char> chars, Span<byte> destBuffer, JavaScriptEncoder encoder, bool isFinalBlock = true)
    {
        char[] utf16Buffer = ArrayPool<char>.Shared.Rent(6 * chars.Length);
        try
        {
            encoder.Encode(chars, utf16Buffer, out _, out int charsWritten, isFinalBlock: true);

            Utf8.FromUtf16(utf16Buffer.AsSpan(0, charsWritten), destBuffer, out _, out int bytesWritten, isFinalBlock);
            return bytesWritten;
        }
        finally
        {
            // Return the scratch buffer on every exit path.
            ArrayPool<char>.Shared.Return(utf16Buffer);
        }
    }
}
29 changes: 22 additions & 7 deletions src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
using System.Runtime.CompilerServices;
using System.Text.Unicode;

#if !SYSTEM_PRIVATE_CORELIB
#pragma warning disable CS3019 // CLS compliance checking will not be performed because it is not visible from outside this assembly
#endif

namespace System.Text
{
/// <summary>
Expand All @@ -18,7 +22,12 @@ namespace System.Text
/// assuming that the underlying <see cref="Rune"/> instance is well-formed.
/// </remarks>
[DebuggerDisplay("{DebuggerDisplay,nq}")]
public readonly struct Rune : IComparable, IComparable<Rune>, IEquatable<Rune>
#if SYSTEM_PRIVATE_CORELIB
public
#else
internal
#endif
readonly struct Rune : IComparable, IComparable<Rune>, IEquatable<Rune>
#if SYSTEM_PRIVATE_CORELIB
#pragma warning disable SA1001 // Commas should be spaced correctly
, ISpanFormattable
Expand Down Expand Up @@ -141,7 +150,14 @@ private Rune(uint scalarValue, bool _)
public static explicit operator Rune(int value) => new Rune(value);

// Displayed as "U+XXXX '<char>'"; e.g., "U+0065 'e'"
private string DebuggerDisplay => string.Create(CultureInfo.InvariantCulture, $"U+{_value:X4} '{(IsValid(_value) ? ToString() : "\uFFFD")}'");
private string DebuggerDisplay =>
#if SYSTEM_PRIVATE_CORELIB
string.Create(
CultureInfo.InvariantCulture,
#else
FormattableString.Invariant(
#endif
$"U+{_value:X4} '{(IsValid(_value) ? ToString() : "\uFFFD")}'");

/// <summary>
/// Returns true if and only if this scalar value is ASCII ([ U+0000..U+007F ])
Expand Down Expand Up @@ -242,7 +258,6 @@ private static Rune ChangeCaseCultureAware(Rune rune, TextInfo textInfo, bool to
#else
private static Rune ChangeCaseCultureAware(Rune rune, CultureInfo culture, bool toUpper)
{
Debug.Assert(!GlobalizationMode.Invariant, "This should've been checked by the caller.");
eiriktsarpalis marked this conversation as resolved.
Show resolved Hide resolved
Debug.Assert(culture != null, "This should've been checked by the caller.");

Span<char> original = stackalloc char[MaxUtf16CharsPerRune]; // worst case scenario = 2 code units (for a surrogate pair)
Expand Down Expand Up @@ -1375,12 +1390,12 @@ public static Rune ToLower(Rune value, CultureInfo culture)
// ASCII characters differently than the invariant culture (e.g., Turkish I). Instead
// we'll just jump straight to the globalization tables if they're available.

#if SYSTEM_PRIVATE_CORELIB
if (GlobalizationMode.Invariant)
{
return ToLowerInvariant(value);
}

#if SYSTEM_PRIVATE_CORELIB
return ChangeCaseCultureAware(value, culture.TextInfo, toUpper: false);
#else
return ChangeCaseCultureAware(value, culture, toUpper: false);
Expand All @@ -1399,14 +1414,14 @@ public static Rune ToLowerInvariant(Rune value)
return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToLowercase(value._value));
}

#if SYSTEM_PRIVATE_CORELIB
if (GlobalizationMode.Invariant)
{
return UnsafeCreate(CharUnicodeInfo.ToLower(value._value));
}

// Non-ASCII data requires going through the case folding tables.

#if SYSTEM_PRIVATE_CORELIB
return ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: false);
#else
return ChangeCaseCultureAware(value, CultureInfo.InvariantCulture, toUpper: false);
Expand All @@ -1424,12 +1439,12 @@ public static Rune ToUpper(Rune value, CultureInfo culture)
// ASCII characters differently than the invariant culture (e.g., Turkish I). Instead
// we'll just jump straight to the globalization tables if they're available.

#if SYSTEM_PRIVATE_CORELIB
if (GlobalizationMode.Invariant)
{
return ToUpperInvariant(value);
}

#if SYSTEM_PRIVATE_CORELIB
return ChangeCaseCultureAware(value, culture.TextInfo, toUpper: true);
#else
return ChangeCaseCultureAware(value, culture, toUpper: true);
Expand All @@ -1448,14 +1463,14 @@ public static Rune ToUpperInvariant(Rune value)
return UnsafeCreate(Utf16Utility.ConvertAllAsciiCharsInUInt32ToUppercase(value._value));
}

#if SYSTEM_PRIVATE_CORELIB
if (GlobalizationMode.Invariant)
{
return UnsafeCreate(CharUnicodeInfo.ToUpper(value._value));
}

// Non-ASCII data requires going through the case folding tables.

#if SYSTEM_PRIVATE_CORELIB
return ChangeCaseCultureAware(value, TextInfo.Invariant, toUpper: true);
#else
return ChangeCaseCultureAware(value, CultureInfo.InvariantCulture, toUpper: true);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@

using System.Diagnostics;
using System.Runtime.CompilerServices;

#if SYSTEM_PRIVATE_CORELIB
using System.Runtime.Intrinsics;
#endif

namespace System.Text.Unicode
{
Expand Down Expand Up @@ -277,6 +280,7 @@ internal static bool UInt64OrdinalIgnoreCaseAscii(ulong valueA, ulong valueB)
return (differentBits & indicator) == 0;
}

#if SYSTEM_PRIVATE_CORELIB
/// <summary>
/// Returns true iff the TVector represents ASCII UTF-16 characters in machine endianness.
/// </summary>
Expand All @@ -286,5 +290,6 @@ internal static bool AllCharsInVectorAreAscii<TVector>(TVector vec)
{
return (vec & TVector.Create(unchecked((ushort)~0x007F))).Equals(TVector.Zero);
}
#endif
}
}
Loading
Loading