From b0c330e78fe3976cf64dbf7e0ffe0eed6f7cdf0f Mon Sep 17 00:00:00 2001
From: Juan Cruz Viotti <jv@jviotti.com>
Date: Fri, 4 Oct 2024 14:46:05 -0400
Subject: [PATCH] Properly parse UTF-16, UTF-32, and surrogate pairs

Signed-off-by: Juan Cruz Viotti <jv@jviotti.com>
---
 src/json/parser.h             | 68 +++++++++++++++++++++++++++++++----
 test/json/json_parse_test.cc  |  7 ++--
 test/json/json_string_test.cc | 28 +++++++++++++++
 3 files changed, 94 insertions(+), 9 deletions(-)
diff --git a/src/json/parser.h b/src/json/parser.h
index 8e88829f2..d85986c7d 100644
--- a/src/json/parser.h
+++ b/src/json/parser.h
@@ -71,12 +71,10 @@ inline auto parse_boolean_false(
   return JSON{false};
 }
 
-auto parse_string_unicode(
+auto parse_string_unicode_code_point(
     const std::uint64_t line, std::uint64_t &column,
-    std::basic_istream<typename JSON::Char, typename JSON::CharTraits> &stream,
-    std::basic_ostringstream<typename JSON::Char, typename JSON::CharTraits,
-                             typename JSON::Allocator<typename JSON::Char>>
-        &result) -> void {
+    std::basic_istream<typename JSON::Char, typename JSON::CharTraits> &stream)
+    -> unsigned long {
   std::basic_string<typename JSON::Char, typename JSON::CharTraits,
                     typename JSON::Allocator<typename JSON::Char>>
       code_point;
@@ -109,8 +107,64 @@ auto parse_string_unicode(
   // According to ECMA 404, \u can be followed by "any"
   // sequence of 4 hexadecimal digits.
   constexpr auto unicode_base{16};
-  result.put(static_cast<typename JSON::Char>(
-      std::stoul(code_point, nullptr, unicode_base)));
+  const auto result{std::stoul(code_point, nullptr, unicode_base)};
+  // The largest value that four hexadecimal digits can encode
+  assert(result <= 0xFFFF);
+  return result;
+}
+
+// Reads the code point of a `\u` escape, combining a UTF-16 surrogate pair
+// into one code point, and appends it to `result` encoded as UTF-8. Throws
+// `ParseError` when a high surrogate is not followed by a low surrogate.
+auto parse_string_unicode(
+    const std::uint64_t line, std::uint64_t &column,
+    std::basic_istream<typename JSON::Char, typename JSON::CharTraits> &stream,
+    std::basic_ostringstream<typename JSON::Char, typename JSON::CharTraits,
+                             typename JSON::Allocator<typename JSON::Char>>
+        &result) -> void {
+  auto code_point{parse_string_unicode_code_point(line, column, stream)};
+
+  // A high surrogate is only valid when followed by an escaped low surrogate
+  if (code_point >= 0xD800 && code_point <= 0xDBFF) {
+    column += 1;
+    if (stream.get() != internal::token_string_escape<typename JSON::Char>) {
+      throw ParseError(line, column);
+    }
+    column += 1;
+    if (stream.get() !=
+        internal::token_string_escape_unicode<typename JSON::Char>) {
+      throw ParseError(line, column);
+    }
+    const auto low_code_point{
+        parse_string_unicode_code_point(line, column, stream)};
+    if (low_code_point < 0xDC00 || low_code_point > 0xDFFF) {
+      throw ParseError(line, column);
+    }
+    code_point = 0x10000 + ((code_point - 0xD800) << 10) +
+                 (low_code_point - 0xDC00);
+  }
+
+  // UTF-8 encode, see https://en.wikipedia.org/wiki/UTF-8#Description
+  using CharT = typename JSON::Char;
+  if (code_point <= 0x7F) {
+    // U+0000 to U+007F: one byte
+    result.put(static_cast<CharT>(code_point));
+  } else if (code_point <= 0x7FF) {
+    // U+0080 to U+07FF: two bytes
+    result.put(static_cast<CharT>(0xC0 | ((code_point >> 6) & 0x1F)));
+    result.put(static_cast<CharT>(0x80 | (code_point & 0x3F)));
+  } else if (code_point <= 0xFFFF) {
+    // U+0800 to U+FFFF: three bytes
+    result.put(static_cast<CharT>(0xE0 | ((code_point >> 12) & 0x0F)));
+    result.put(static_cast<CharT>(0x80 | ((code_point >> 6) & 0x3F)));
+    result.put(static_cast<CharT>(0x80 | (code_point & 0x3F)));
+  } else {
+    // U+10000 to U+10FFFF (decoded surrogate pairs): four bytes
+    result.put(static_cast<CharT>(0xF0 | ((code_point >> 18) & 0x07)));
+    result.put(static_cast<CharT>(0x80 | ((code_point >> 12) & 0x3F)));
+    result.put(static_cast<CharT>(0x80 | ((code_point >> 6) & 0x3F)));
+    result.put(static_cast<CharT>(0x80 | (code_point & 0x3F)));
+  }
 }
 
 auto parse_string_escape(
diff --git a/test/json/json_parse_test.cc b/test/json/json_parse_test.cc
index b9e07e864..c8161b70a 100644
--- a/test/json/json_parse_test.cc
+++ b/test/json/json_parse_test.cc
@@ -1225,13 +1225,16 @@ TEST(JSON_parse, string_unicode_code_points) {
   EXPECT_EQ(document.to_string(), "\u002F");
 }
 
-TEST(JSON_parse, string_unicode_length) {
+TEST(JSON_parse, string_unicode_length_surrogates) {
+  // U+D83D and U+DCA9 are the UTF-16 surrogates of U+1F4A9, which
+  // takes four bytes in UTF-8 (0xF0 0x9F 0x92 0xA9)
+  // See https://en.wikipedia.org/wiki/UTF-8#Surrogates
   std::istringstream input{"\"\\uD83D\\uDCA9\""};
   const sourcemeta::jsontoolkit::JSON document =
       sourcemeta::jsontoolkit::parse(input);
   EXPECT_TRUE(document.is_string());
   EXPECT_EQ(document.size(), 1);
-  EXPECT_EQ(document.byte_size(), 2);
+  EXPECT_EQ(document.byte_size(), 4);
 }
 
 TEST(JSON_parse, string_unicode_code_point_equality) {
diff --git a/test/json/json_string_test.cc b/test/json/json_string_test.cc
index 2c54e2d45..ac9c3c4d3 100644
--- a/test/json/json_string_test.cc
+++ b/test/json/json_string_test.cc
@@ -53,3 +53,31 @@ TEST(JSON_string, estimated_byte_size_empty) {
   const sourcemeta::jsontoolkit::JSON document{""};
   EXPECT_EQ(document.estimated_byte_size(), 0);
 }
+
+TEST(JSON_string, unicode_length_1) {
+  // The literal below spells 简律纯; each character is three bytes in UTF-8:
+  // - https://unicodeplus.com/U+7B80 (UTF-8: 0xE7 0xAE 0x80)
+  // - https://unicodeplus.com/U+5F8B (UTF-8: 0xE5 0xBE 0x8B)
+  // - https://unicodeplus.com/U+7EAF (UTF-8: 0xE7 0xBA 0xAF)
+  const sourcemeta::jsontoolkit::JSON text{"\u7b80\u5f8b\u7eaf"};
+
+  EXPECT_EQ(text.size(), 3);
+  EXPECT_EQ(text.byte_size(), 9);
+}
+
+TEST(JSON_string, unicode_length_2) {
+  // The escapes below spell 简律纯; each character is three bytes in UTF-8:
+  // - https://unicodeplus.com/U+7B80 (UTF-8: 0xE7 0xAE 0x80)
+  // - https://unicodeplus.com/U+5F8B (UTF-8: 0xE5 0xBE 0x8B)
+  // - https://unicodeplus.com/U+7EAF (UTF-8: 0xE7 0xBA 0xAF)
+  const auto parsed = sourcemeta::jsontoolkit::parse(R"JSON({
+    "name": "\u7b80\u5f8b\u7eaf"
+  })JSON");
+
+  EXPECT_TRUE(parsed.is_object());
+  EXPECT_TRUE(parsed.defines("name"));
+  const auto &name = parsed.at("name");
+  EXPECT_TRUE(name.is_string());
+  EXPECT_EQ(name.size(), 3);
+  EXPECT_EQ(name.byte_size(), 9);
+}