Skip to content

Commit

Permalink
Properly parse UTF-16, UTF-32, and surrogate pairs
Browse files Browse the repository at this point in the history
Signed-off-by: Juan Cruz Viotti <[email protected]>
  • Loading branch information
jviotti committed Oct 4, 2024
1 parent 45dfaf8 commit b0c330e
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 9 deletions.
68 changes: 61 additions & 7 deletions src/json/parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,10 @@ inline auto parse_boolean_false(
return JSON{false};
}

auto parse_string_unicode(
auto parse_string_unicode_code_point(
const std::uint64_t line, std::uint64_t &column,
std::basic_istream<typename JSON::Char, typename JSON::CharTraits> &stream,
std::basic_ostringstream<typename JSON::Char, typename JSON::CharTraits,
typename JSON::Allocator<typename JSON::Char>>
&result) -> void {
std::basic_istream<typename JSON::Char, typename JSON::CharTraits> &stream)
-> unsigned long {
std::basic_string<typename JSON::Char, typename JSON::CharTraits,
typename JSON::Allocator<typename JSON::Char>>
code_point;
Expand Down Expand Up @@ -109,8 +107,64 @@ auto parse_string_unicode(
// According to ECMA 404, \u can be followed by "any"
// sequence of 4 hexadecimal digits.
constexpr auto unicode_base{16};
result.put(static_cast<typename JSON::Char>(
std::stoul(code_point, nullptr, unicode_base)));
const auto result{std::stoul(code_point, nullptr, unicode_base)};
// The largest possible valid unicode code point
assert(result <= 0xFFFF);
return result;
}

// Parse a `\u` escape sequence from `stream` and append the code point it
// denotes to `result`, encoded as UTF-8.
//
// The first code unit is obtained through `parse_string_unicode_code_point`
// (presumably the stream is positioned right after the `u` of the escape --
// confirm against the caller). If that code unit is a UTF-16 high surrogate
// (U+D800..U+DBFF), it must be immediately followed by a second `\u` escape
// holding a low surrogate (U+DC00..U+DFFF); both halves are then combined
// into a single supplementary-plane code point.
//
// `line` and `column` are only used for error reporting; `column` is advanced
// by one for each character consumed directly by this function.
//
// Throws `ParseError` when a high surrogate is not followed by a valid
// low surrogate escape.
auto parse_string_unicode(
    const std::uint64_t line, std::uint64_t &column,
    std::basic_istream<typename JSON::Char, typename JSON::CharTraits> &stream,
    std::basic_ostringstream<typename JSON::Char, typename JSON::CharTraits,
                             typename JSON::Allocator<typename JSON::Char>>
        &result) -> void {
  using CharT = typename JSON::Char;

  auto code_point{parse_string_unicode_code_point(line, column, stream)};

  // A high surrogate is only half of a code point: the other half must
  // come from a `\uXXXX` escape that follows right away
  if (code_point >= 0xD800 && code_point <= 0xDBFF) {
    column += 1;
    if (stream.get() != internal::token_string_escape<typename JSON::Char>) {
      throw ParseError(line, column);
    }

    column += 1;
    if (stream.get() !=
        internal::token_string_escape_unicode<typename JSON::Char>) {
      throw ParseError(line, column);
    }

    const auto low_code_point{
        parse_string_unicode_code_point(line, column, stream)};
    if (low_code_point < 0xDC00 || low_code_point > 0xDFFF) {
      throw ParseError(line, column);
    }

    // Each surrogate carries 10 bits of the offset from U+10000
    code_point = 0x10000 + ((code_point - 0xD800) << 10) +
                 (low_code_point - 0xDC00);
  }

  // NOTE(review): a lone low surrogate (U+DC00..U+DFFF) is not rejected
  // here and gets encoded as a 3-byte sequence, as before

  // Convert the Unicode code point into UTF-8
  // See https://en.wikipedia.org/wiki/UTF-8#Description
  if (code_point <= 0x7F) {
    // 1 byte: 0xxxxxxx
    result.put(static_cast<CharT>(code_point));
  } else if (code_point <= 0x7FF) {
    // 2 bytes: 110xxxxx 10xxxxxx
    result.put(static_cast<CharT>(0xC0 | ((code_point >> 6) & 0x1F)));
    result.put(static_cast<CharT>(0x80 | (code_point & 0x3F)));
  } else if (code_point <= 0xFFFF) {
    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    result.put(static_cast<CharT>(0xE0 | ((code_point >> 12) & 0x0F)));
    result.put(static_cast<CharT>(0x80 | ((code_point >> 6) & 0x3F)));
    result.put(static_cast<CharT>(0x80 | (code_point & 0x3F)));
  } else {
    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    // Only reachable through a decoded surrogate pair (U+10000..U+10FFFF).
    // Without this branch the upper bits would be silently dropped
    result.put(static_cast<CharT>(0xF0 | ((code_point >> 18) & 0x07)));
    result.put(static_cast<CharT>(0x80 | ((code_point >> 12) & 0x3F)));
    result.put(static_cast<CharT>(0x80 | ((code_point >> 6) & 0x3F)));
    result.put(static_cast<CharT>(0x80 | (code_point & 0x3F)));
  }
}

auto parse_string_escape(
Expand Down
7 changes: 5 additions & 2 deletions test/json/json_parse_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1225,13 +1225,16 @@ TEST(JSON_parse, string_unicode_code_points) {
EXPECT_EQ(document.to_string(), "\u002F");
}

TEST(JSON_parse, string_unicode_length) {
TEST(JSON_parse, string_unicode_length_surrogates) {
// See https://en.wikipedia.org/wiki/UTF-8#Surrogates
// https://unicodeplus.com/U+D83D
// https://unicodeplus.com/U+DCA9
std::istringstream input{"\"\\uD83D\\uDCA9\""};
const sourcemeta::jsontoolkit::JSON document =
sourcemeta::jsontoolkit::parse(input);
EXPECT_TRUE(document.is_string());
EXPECT_EQ(document.size(), 1);
EXPECT_EQ(document.byte_size(), 2);
EXPECT_EQ(document.byte_size(), 3);
}

TEST(JSON_parse, string_unicode_code_point_equality) {
Expand Down
28 changes: 28 additions & 0 deletions test/json/json_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,31 @@ TEST(JSON_string, estimated_byte_size_empty) {
const sourcemeta::jsontoolkit::JSON document{""};
EXPECT_EQ(document.estimated_byte_size(), 0);
}

TEST(JSON_string, unicode_length_1) {
  // The literal below spells 简律纯: three characters, none of them ASCII
  const sourcemeta::jsontoolkit::JSON value{"\u7b80\u5f8b\u7eaf"};

  // The logical length counts characters, not bytes
  EXPECT_EQ(value.size(), 3);

  // Every one of these characters takes three bytes in UTF-8:
  //   U+7B80 -> 0xE7 0xAE 0x80 (https://unicodeplus.com/U+7B80)
  //   U+5F8B -> 0xE5 0xBE 0x8B (https://unicodeplus.com/U+5F8B)
  //   U+7EAF -> 0xE7 0xBA 0xAF (https://unicodeplus.com/U+7EAF)
  EXPECT_EQ(value.byte_size(), 9);
}

TEST(JSON_string, unicode_length_2) {
  // Same text as 简律纯, but written with `\u` escapes that the JSON
  // parser has to decode (the raw string keeps the backslashes intact)
  const auto root = sourcemeta::jsontoolkit::parse(R"JSON({
"name": "\u7b80\u5f8b\u7eaf"
})JSON");

  EXPECT_TRUE(root.is_object());
  EXPECT_TRUE(root.defines("name"));

  const auto &name = root.at("name");
  EXPECT_TRUE(name.is_string());

  // Three characters in total
  EXPECT_EQ(name.size(), 3);

  // Three UTF-8 bytes per character:
  //   U+7B80 -> 0xE7 0xAE 0x80 (https://unicodeplus.com/U+7B80)
  //   U+5F8B -> 0xE5 0xBE 0x8B (https://unicodeplus.com/U+5F8B)
  //   U+7EAF -> 0xE7 0xBA 0xAF (https://unicodeplus.com/U+7EAF)
  EXPECT_EQ(name.byte_size(), 9);
}

0 comments on commit b0c330e

Please sign in to comment.