From b0c330e78fe3976cf64dbf7e0ffe0eed6f7cdf0f Mon Sep 17 00:00:00 2001 From: Juan Cruz Viotti Date: Fri, 4 Oct 2024 14:46:05 -0400 Subject: [PATCH] Properly parse UTF-16, UTF-32, and surrogate pairs Signed-off-by: Juan Cruz Viotti --- src/json/parser.h | 68 +++++++++++++++++++++++++++++++---- test/json/json_parse_test.cc | 7 ++-- test/json/json_string_test.cc | 28 +++++++++++++++ 3 files changed, 94 insertions(+), 9 deletions(-) diff --git a/src/json/parser.h b/src/json/parser.h index 8e88829f2..d85986c7d 100644 --- a/src/json/parser.h +++ b/src/json/parser.h @@ -71,12 +71,10 @@ inline auto parse_boolean_false( return JSON{false}; } -auto parse_string_unicode( +auto parse_string_unicode_code_point( const std::uint64_t line, std::uint64_t &column, - std::basic_istream &stream, - std::basic_ostringstream> - &result) -> void { + std::basic_istream &stream) + -> unsigned long { std::basic_string> code_point; @@ -109,8 +107,64 @@ auto parse_string_unicode( // According to ECMA 404, \u can be followed by "any" // sequence of 4 hexadecimal digits. 
constexpr auto unicode_base{16}; - result.put(static_cast( - std::stoul(code_point, nullptr, unicode_base))); + const auto result{std::stoul(code_point, nullptr, unicode_base)}; + // A single escape holds four hexadecimal digits, so it cannot exceed 0xFFFF + assert(result <= 0xFFFF); + return result; +} + +auto parse_string_unicode( + const std::uint64_t line, std::uint64_t &column, + std::basic_istream &stream, + std::basic_ostringstream> + &result) -> void { + auto code_point{parse_string_unicode_code_point(line, column, stream)}; + + // TODO: Flatten the nested surrogate checks below + + // A value in 0xD800-0xDBFF is the high half of a UTF-16 surrogate pair, + // so a second escape holding the low half (0xDC00-0xDFFF) must follow + // before the real final code point can be calculated + if (code_point >= 0xD800 && code_point <= 0xDBFF) { + column += 1; + if (stream.get() == internal::token_string_escape) { + column += 1; + if (stream.get() == + internal::token_string_escape_unicode) { + const auto low_code_point{ + parse_string_unicode_code_point(line, column, stream)}; + if (low_code_point >= 0xDC00 && low_code_point <= 0xDFFF) { + code_point = 0x10000 + ((code_point - 0xD800) << 10) + + (low_code_point - 0xDC00); + } else { + throw ParseError(line, column); + } + } else { + throw ParseError(line, column); + } + } else { + throw ParseError(line, column); + } + } + + // Convert a Unicode code point into its UTF-8 byte sequence + // See https://en.wikipedia.org/wiki/UTF-8#Description + + using CharT = typename JSON::Char; + if (code_point <= 0x7F) { + // One-byte UTF-8 sequence (up to 0x7F) + result.put(static_cast(code_point)); + } else if (code_point <= 0x7FF) { + // Two-byte UTF-8 sequence (up to 0x7FF) + result.put(static_cast(0xC0 | ((code_point >> 6) & 0x1F))); + result.put(static_cast(0x80 | (code_point & 0x3F))); + } else { + // Three-byte UTF-8 sequence. NOTE(review): a combined surrogate pair is 0x10000 or above and needs a four-byte sequence, which this branch does not emit + result.put(static_cast(0xE0 | ((code_point >> 12) & 0x0F))); + result.put(static_cast(0x80 | ((code_point >> 6) & 0x3F))); + result.put(static_cast(0x80 | (code_point & 0x3F))); + } } auto parse_string_escape( diff --git a/test/json/json_parse_test.cc b/test/json/json_parse_test.cc index 
b9e07e864..c8161b70a 100644 --- a/test/json/json_parse_test.cc +++ b/test/json/json_parse_test.cc @@ -1225,13 +1225,16 @@ TEST(JSON_parse, string_unicode_code_points) { EXPECT_EQ(document.to_string(), "\u002F"); } -TEST(JSON_parse, string_unicode_length) { +TEST(JSON_parse, string_unicode_length_surrogates) { + // The two escapes below form one surrogate pair; see https://en.wikipedia.org/wiki/UTF-8#Surrogates + // High surrogate: https://unicodeplus.com/U+D83D + // Low surrogate: https://unicodeplus.com/U+DCA9 (NOTE(review): the pair combines to U+1F4A9, which UTF-8 encodes in four bytes; confirm the byte_size expectation of 3 below) std::istringstream input{"\"\\uD83D\\uDCA9\""}; const sourcemeta::jsontoolkit::JSON document = sourcemeta::jsontoolkit::parse(input); EXPECT_TRUE(document.is_string()); EXPECT_EQ(document.size(), 1); - EXPECT_EQ(document.byte_size(), 2); + EXPECT_EQ(document.byte_size(), 3); } TEST(JSON_parse, string_unicode_code_point_equality) { diff --git a/test/json/json_string_test.cc b/test/json/json_string_test.cc index 2c54e2d45..ac9c3c4d3 100644 --- a/test/json/json_string_test.cc +++ b/test/json/json_string_test.cc @@ -53,3 +53,31 @@ TEST(JSON_string, estimated_byte_size_empty) { const sourcemeta::jsontoolkit::JSON document{""}; EXPECT_EQ(document.estimated_byte_size(), 0); } + +TEST(JSON_string, unicode_length_1) { + // This unicode string corresponds to 简律纯 + const sourcemeta::jsontoolkit::JSON document{"\u7b80\u5f8b\u7eaf"}; + EXPECT_EQ(document.size(), 3); + + // https://unicodeplus.com/U+7B80 (UTF-8: 0xE7 0xAE 0x80) + // https://unicodeplus.com/U+5F8B (UTF-8: 0xE5 0xBE 0x8B) + // https://unicodeplus.com/U+7EAF (UTF-8: 0xE7 0xBA 0xAF) + EXPECT_EQ(document.byte_size(), 9); +} + +TEST(JSON_string, unicode_length_2) { + // This unicode string corresponds to 简律纯 + const auto document = sourcemeta::jsontoolkit::parse(R"JSON({ + "name": "\u7b80\u5f8b\u7eaf" + })JSON"); + + EXPECT_TRUE(document.is_object()); + EXPECT_TRUE(document.defines("name")); + EXPECT_TRUE(document.at("name").is_string()); + EXPECT_EQ(document.at("name").size(), 3); + + // https://unicodeplus.com/U+7B80 (UTF-8: 0xE7 0xAE 0x80) + // https://unicodeplus.com/U+5F8B (UTF-8: 
0xE5 0xBE 0x8B) + // https://unicodeplus.com/U+7EAF (UTF-8: 0xE7 0xBA 0xAF) + EXPECT_EQ(document.at("name").byte_size(), 9); +}