diff --git a/include/spotify/json/codec/string.hpp b/include/spotify/json/codec/string.hpp index d840f603..9edbcdda 100644 --- a/include/spotify/json/codec/string.hpp +++ b/include/spotify/json/codec/string.hpp @@ -115,17 +115,48 @@ class string_t final { detail::fail(context, "\\u must be followed by 4 hex digits"); } - static void decode_unicode_escape(decode_context &context, std::string &out) { + static unsigned decode_hex_number(decode_context &context) { detail::require_bytes<4>(context, "\\u must be followed by 4 hex digits"); const auto a = decode_hex_nibble(context, *(context.position++)); const auto b = decode_hex_nibble(context, *(context.position++)); const auto c = decode_hex_nibble(context, *(context.position++)); const auto d = decode_hex_nibble(context, *(context.position++)); - const auto p = unsigned((a << 12) | (b << 8) | (c << 4) | d); - encode_utf8(context, out, p); + return unsigned((a << 12) | (b << 8) | (c << 4) | d); } - static void encode_utf8(decode_context &context, std::string &out, unsigned p) { + static void decode_unicode_escape( + decode_context &context, + std::string &out) { + const auto p = decode_hex_number(context); + if (json_likely(!handle_surrogate_pair(context, out, p))) { + encode_utf8(out, p); + } + } + + static bool handle_surrogate_pair( + decode_context &context, + std::string &out, + unsigned p) { + if (json_unlikely(is_high_surrogate(p))) { + // Parse low surrogate + if (detail::peek_2(context, '\\', 'u')) { + detail::skip_unchecked_n(context, 2); + const auto n = decode_hex_number(context); + if (json_likely(is_low_surrogate(n))) { + // Any Unicode codepoint encoded by a surrogate pair is 4 bytes in UTF-8 + encode_utf8_4(out, codepoint_from_surrogate_pair(p, n)); + return true; + } else { + // Rewind context to before the escape sequence + context.position -= 6; + } + } + } + + return false; + } + + static void encode_utf8(std::string &out, unsigned p) { if (json_likely(p <= 0x7F)) { encode_utf8_1(out, p); } else if 
(json_likely(p <= 0x07FF)) { @@ -154,6 +185,27 @@ class string_t final { const char cc[] = { c0, c1, c2 }; out.append(&cc[0], 3); } + + static void encode_utf8_4(std::string &out, uint32_t p) { + const char c0 = 0xF0 | ((p >> 18) & 0x07); + const char c1 = 0x80 | ((p >> 12) & 0x3F); + const char c2 = 0x80 | ((p >> 6) & 0x3F); + const char c3 = 0x80 | ((p >> 0) & 0x3F); + const char cc[] = { c0, c1, c2, c3 }; + out.append(&cc[0], 4); + } + + static bool is_high_surrogate(unsigned p) { + return (p & 0xFC00) == 0xD800; + } + + static bool is_low_surrogate(unsigned p) { + return (p & 0xFC00) == 0xDC00; + } + + static uint32_t codepoint_from_surrogate_pair(uint32_t high, uint32_t low) { + return (((high & 0x03FF) << 10) | (low & 0x03FF)) + 0x10000; + } }; inline string_t string() { diff --git a/include/spotify/json/detail/decode_helpers.hpp b/include/spotify/json/detail/decode_helpers.hpp index 3a8f29e2..0dc6b9a3 100644 --- a/include/spotify/json/detail/decode_helpers.hpp +++ b/include/spotify/json/detail/decode_helpers.hpp @@ -55,9 +55,7 @@ json_force_inline void fail_if( } template -json_force_inline void require_bytes( - const decode_context &context, - const string_type &error = "Unexpected end of input") { +json_force_inline void require_bytes(const decode_context &context, const string_type &error) { fail_if(context, context.remaining() < num_required_bytes, error); } @@ -80,6 +78,18 @@ json_force_inline char peek(const decode_context &context) { return (context.remaining() ? peek_unchecked(context) : 0); } +/** + * Returns true if the next two characters are `first` and `second`. + * Returns false if the characters do not match, or if there are fewer than 2 + * characters remaining.
+ */ +json_force_inline bool peek_2(const decode_context &context, const char first, const char second) { + if (context.remaining() < 2) { + return false; + } + return first == *context.position && second == *(context.position + 1); +} + json_force_inline char next_unchecked(decode_context &context) { return *(context.position++); } diff --git a/test/src/test_decode_helpers.cpp b/test/src/test_decode_helpers.cpp index c4f0a4ca..73f7fb5c 100644 --- a/test/src/test_decode_helpers.cpp +++ b/test/src/test_decode_helpers.cpp @@ -74,6 +74,23 @@ BOOST_AUTO_TEST_CASE(json_decode_helpers_peek) { BOOST_CHECK_EQUAL(peek(make_context("ab")), 'a'); } +BOOST_AUTO_TEST_CASE(json_decode_helpers_peek_2) { + BOOST_CHECK(peek_2(make_context("ab"), 'a', 'b')); + BOOST_CHECK(peek_2(make_context("abcd"), 'a', 'b')); +} + +BOOST_AUTO_TEST_CASE(json_decode_helpers_peek_2_nonmatching) { + BOOST_CHECK(!peek_2(make_context("aa"), 'a', 'b')); + BOOST_CHECK(!peek_2(make_context("bb"), 'a', 'b')); + BOOST_CHECK(!peek_2(make_context("aab"), 'a', 'b')); +} + +BOOST_AUTO_TEST_CASE(json_decode_helpers_peek_2_too_short) { + BOOST_CHECK(!peek_2(make_context(""), 'a', 'b')); + BOOST_CHECK(!peek_2(make_context("a"), 'a', 'b')); + BOOST_CHECK(!peek_2(make_context("b"), 'a', 'b')); +} + /* * Next */ diff --git a/test/src/test_string.cpp b/test/src/test_string.cpp index d8f2c89b..a8652392 100644 --- a/test/src/test_string.cpp +++ b/test/src/test_string.cpp @@ -205,6 +205,62 @@ BOOST_AUTO_TEST_CASE(json_codec_string_should_decode_escaped_unicode) { BOOST_CHECK_EQUAL(string_parse("\"\\u20AC\""), "\xE2\x82\xAC"); } +BOOST_AUTO_TEST_CASE(json_codec_string_should_decode_surrogate_pairs) { + // [TWO HEARTS] Emoji (code point 0x1F495) + const std::string two_hearts = "\xf0\x9f\x92\x95"; + BOOST_CHECK_EQUAL(string_parse("\"I \\ud83d\\udc95 Unicode\""), + "I " + two_hearts + " Unicode"); + BOOST_CHECK_EQUAL(string_parse("\"I\\n\\ud83d\\udc95\\nUnicode\""), + "I\n" + two_hearts + "\nUnicode"); + // Extreme values 
of each surrogate + BOOST_CHECK_EQUAL(string_parse("\"\\ud800\\udc00\""), "\xf0\x90\x80\x80"); + BOOST_CHECK_EQUAL(string_parse("\"\\ud800\\udfff\""), "\xf0\x90\x8f\xbf"); + BOOST_CHECK_EQUAL(string_parse("\"\\udbff\\udc00\""), "\xf4\x8f\xb0\x80"); + BOOST_CHECK_EQUAL(string_parse("\"\\udbff\\udfff\""), "\xf4\x8f\xbf\xbf"); +} + +BOOST_AUTO_TEST_CASE(json_codec_string_should_output_code_points_from_broken_surrogate_pairs) { + // [TWO HEARTS] Emoji (code point 0x1F495) + const std::string two_hearts = "\xf0\x9f\x92\x95"; + // UTF-8 representations of code points 0xd83d and 0xdc95, which form the + // surrogate pair for the emoji above. + const std::string high = "\xed\xa0\xbd"; + const std::string low = "\xed\xb2\x95"; + + // Lone high surrogate + BOOST_CHECK_EQUAL(string_parse("\"\\ud83d\""), high); + BOOST_CHECK_EQUAL(string_parse("\"\\n\\ud83d\\n\""), "\n" + high + "\n"); + BOOST_CHECK_EQUAL(string_parse("\"\\\\\\ud83d\\\\\""), "\\" + high + "\\"); + BOOST_CHECK_EQUAL(string_parse("\"Foo\\ud83dFoo\""), "Foo" + high + "Foo"); + BOOST_CHECK_EQUAL(string_parse("\"\\ud83d\\ud83d\\udc95\""), high + two_hearts); + + // Lone low surrogate + BOOST_CHECK_EQUAL(string_parse("\"\\udc95\""), low); + BOOST_CHECK_EQUAL(string_parse("\"\\n\\udc95\\n\""), "\n" + low + "\n"); + BOOST_CHECK_EQUAL(string_parse("\"\\\\\\udc95\\\\\""), "\\" + low + "\\"); + BOOST_CHECK_EQUAL(string_parse("\"Foo\\udc95Foo\""), "Foo" + low + "Foo"); + BOOST_CHECK_EQUAL(string_parse("\"\\udc95\\ud83d\\udc95\""), low + two_hearts); + + // Flipped order surrogates + BOOST_CHECK_EQUAL(string_parse("\"\\udc95\\ud83d\""), low + high); + + // Double high surrogate + BOOST_CHECK_EQUAL(string_parse("\"\\ud83d\\ud83d\""), high + high); + + // Double low surrogate + BOOST_CHECK_EQUAL(string_parse("\"\\udc95\\udc95\""), low + low); + + // Intermingled valid and invalid sequences + BOOST_CHECK_EQUAL(string_parse("\"\\ud83d\\udc95\\udc95\""), two_hearts + low); +
BOOST_CHECK_EQUAL(string_parse("\"\\ud83d\\udc95\\ud83d\""), two_hearts + high); +} + +BOOST_AUTO_TEST_CASE(json_codec_string_should_not_decode_incomplete_low_surrogate) { + string_parse_fail("\"\\ud83d\\\""); + string_parse_fail("\"\\ud83d\\u\""); + string_parse_fail("\"\\ud83d\\udc9\""); +} + BOOST_AUTO_TEST_CASE(json_codec_string_should_not_decode_invalid_escaped_characters) { string_parse_fail("\"\\q\""); // \q is not a valid escape sequence }