Skip to content

Commit

Permalink
Merge pull request #45 from inrustwetrust/surrogate-pairs
Browse files Browse the repository at this point in the history
Fix incorrect decoding of Unicode surrogate pairs
  • Loading branch information
punchfox authored Nov 9, 2017
2 parents 6443095 + 094bc89 commit d55125f
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 7 deletions.
60 changes: 56 additions & 4 deletions include/spotify/json/codec/string.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,17 +115,48 @@ class string_t final {
detail::fail(context, "\\u must be followed by 4 hex digits");
}

static void decode_unicode_escape(decode_context &context, std::string &out) {
static unsigned decode_hex_number(decode_context &context) {
detail::require_bytes<4>(context, "\\u must be followed by 4 hex digits");
const auto a = decode_hex_nibble(context, *(context.position++));
const auto b = decode_hex_nibble(context, *(context.position++));
const auto c = decode_hex_nibble(context, *(context.position++));
const auto d = decode_hex_nibble(context, *(context.position++));
const auto p = unsigned((a << 12) | (b << 8) | (c << 4) | d);
encode_utf8(context, out, p);
return unsigned((a << 12) | (b << 8) | (c << 4) | d);
}

static void encode_utf8(decode_context &context, std::string &out, unsigned p) {
/**
 * Decodes the four hex digits of a `\u` escape and appends the result to
 * `out` as UTF-8. The decoded value is first offered to
 * handle_surrogate_pair(); only if that declines is the value encoded on its
 * own via encode_utf8().
 */
static void decode_unicode_escape(
    decode_context &context,
    std::string &out) {
  const auto code_unit = decode_hex_number(context);
  const bool consumed_as_pair = handle_surrogate_pair(context, out, code_unit);
  if (json_unlikely(consumed_as_pair)) {
    // The pair handler has already appended the combined code point.
    return;
  }
  encode_utf8(out, code_unit);
}

/**
 * Attempts to combine `p` with an immediately following `\uXXXX` escape into
 * one code point.
 *
 * Returns true when `p` is a high surrogate and the next escape is a low
 * surrogate; the combined code point has then been appended to `out` and the
 * second escape has been consumed. Returns false in every other case without
 * appending anything, leaving `context.position` where it was on entry so the
 * caller can encode `p` by itself.
 */
static bool handle_surrogate_pair(
    decode_context &context,
    std::string &out,
    unsigned p) {
  if (json_likely(!is_high_surrogate(p))) {
    return false;
  }

  // A low surrogate can only follow as another "\u" escape.
  if (!detail::peek_2(context, '\\', 'u')) {
    return false;
  }

  detail::skip_unchecked_n(context, 2);
  const auto low = decode_hex_number(context);
  if (json_unlikely(!is_low_surrogate(low))) {
    // Not a pair after all: step back over the "\u" and the four hex digits
    // so the following escape is decoded again on its own.
    context.position -= 6;
    return false;
  }

  // Any Unicode code point encoded by a surrogate pair is 4 bytes in UTF-8.
  encode_utf8_4(out, codepoint_from_surrogate_pair(p, low));
  return true;
}

static void encode_utf8(std::string &out, unsigned p) {
if (json_likely(p <= 0x7F)) {
encode_utf8_1(out, p);
} else if (json_likely(p <= 0x07FF)) {
Expand Down Expand Up @@ -154,6 +185,27 @@ class string_t final {
const char cc[] = { c0, c1, c2 };
out.append(&cc[0], 3);
}

/**
 * Appends the 4-byte UTF-8 form of `p` to `out`: a lead byte carrying bits
 * 18..20 under the 0xF0 marker, followed by three continuation bytes that
 * each carry six bits under the 0x80 marker. No range check is performed on
 * `p`; bits above bit 20 are masked away.
 */
static void encode_utf8_4(std::string &out, uint32_t p) {
  const char bytes[4] = {
    static_cast<char>(0xF0 | ((p >> 18) & 0x07)),
    static_cast<char>(0x80 | ((p >> 12) & 0x3F)),
    static_cast<char>(0x80 | ((p >> 6) & 0x3F)),
    static_cast<char>(0x80 | (p & 0x3F)),
  };
  out.append(bytes, sizeof(bytes));
}

/**
 * Returns true if `p` is a UTF-16 high (leading) surrogate, i.e. lies in the
 * range 0xD800..0xDBFF inclusive.
 *
 * An explicit range comparison is used rather than a 16-bit mask so that
 * values above 0xFFFF (for example 0x1D800) are not mistaken for surrogates.
 */
static bool is_high_surrogate(unsigned p) {
  return p >= 0xD800 && p <= 0xDBFF;
}

/**
 * Returns true if `p` is a UTF-16 low (trailing) surrogate, i.e. lies in the
 * range 0xDC00..0xDFFF inclusive.
 *
 * An explicit range comparison is used rather than a 16-bit mask so that
 * values above 0xFFFF (for example 0x1DC00) are not mistaken for surrogates.
 */
static bool is_low_surrogate(unsigned p) {
  return p >= 0xDC00 && p <= 0xDFFF;
}

/**
 * Combines a surrogate pair into the code point it represents. The low ten
 * bits of `high` become bits 10..19, the low ten bits of `low` become bits
 * 0..9, and 0x10000 is added to the result. The arguments are not validated;
 * only their low ten bits are used.
 */
static uint32_t codepoint_from_surrogate_pair(uint32_t high, uint32_t low) {
  const uint32_t upper_bits = (high & 0x03FF) << 10;
  const uint32_t lower_bits = low & 0x03FF;
  return 0x10000 + (upper_bits | lower_bits);
}
};

inline string_t string() {
Expand Down
16 changes: 13 additions & 3 deletions include/spotify/json/detail/decode_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,7 @@ json_force_inline void fail_if(
}

template <size_t num_required_bytes, typename string_type>
json_force_inline void require_bytes(
const decode_context &context,
const string_type &error = "Unexpected end of input") {
json_force_inline void require_bytes(const decode_context &context, const string_type &error) {
fail_if(context, context.remaining() < num_required_bytes, error);
}

Expand All @@ -80,6 +78,18 @@ json_force_inline char peek(const decode_context &context) {
return (context.remaining() ? peek_unchecked(context) : 0);
}

/**
 * Checks, without advancing the context, whether the next two characters are
 * exactly `first` followed by `second`.
 * Returns false if either character differs, or if fewer than 2 characters
 * remain.
 */
json_force_inline bool peek_2(const decode_context &context, const char first, const char second) {
  const bool has_two_chars = !(context.remaining() < 2);
  // Short-circuiting keeps the reads below from happening on short input.
  return has_two_chars &&
         *context.position == first &&
         *(context.position + 1) == second;
}

/**
 * Returns the character at the current position and advances the position by
 * one. No check is made that any input remains.
 */
json_force_inline char next_unchecked(decode_context &context) {
  const char current = *context.position;
  ++context.position;
  return current;
}
Expand Down
17 changes: 17 additions & 0 deletions test/src/test_decode_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,23 @@ BOOST_AUTO_TEST_CASE(json_decode_helpers_peek) {
BOOST_CHECK_EQUAL(peek(make_context("ab")), 'a');
}

// peek_2 must report a match when the input begins with the two requested
// characters, both when they are the whole input and when more input follows.
BOOST_AUTO_TEST_CASE(json_decode_helpers_peek_2) {
BOOST_CHECK(peek_2(make_context("ab"), 'a', 'b'));
BOOST_CHECK(peek_2(make_context("abcd"), 'a', 'b'));
}

// peek_2 must report no match when at least two characters are available but
// they are not the requested pair.
BOOST_AUTO_TEST_CASE(json_decode_helpers_peek_2_nonmatching) {
// Only the first character matches.
BOOST_CHECK(!peek_2(make_context("aa"), 'a', 'b'));
// Only the second character matches.
BOOST_CHECK(!peek_2(make_context("bb"), 'a', 'b'));
// The pair "ab" appears in the input, but not at the front.
BOOST_CHECK(!peek_2(make_context("aab"), 'a', 'b'));
}

// peek_2 must report no match when fewer than two characters of input are
// available, even if the single available character is one of the pair.
BOOST_AUTO_TEST_CASE(json_decode_helpers_peek_2_too_short) {
BOOST_CHECK(!peek_2(make_context(""), 'a', 'b'));
BOOST_CHECK(!peek_2(make_context("a"), 'a', 'b'));
BOOST_CHECK(!peek_2(make_context("b"), 'a', 'b'));
}

/*
* Next
*/
Expand Down
56 changes: 56 additions & 0 deletions test/src/test_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,62 @@ BOOST_AUTO_TEST_CASE(json_codec_string_should_decode_escaped_unicode) {
BOOST_CHECK_EQUAL(string_parse("\"\\u20AC\""), "\xE2\x82\xAC");
}

// A \uXXXX high surrogate immediately followed by a \uXXXX low surrogate must
// decode to the single 4-byte UTF-8 sequence of the combined code point.
BOOST_AUTO_TEST_CASE(json_codec_string_should_decode_surrogate_pairs) {
// [TWO HEARTS] Emoji (code point 0x1F495)
const std::string two_hearts = "\xf0\x9f\x92\x95";
// Pair surrounded by plain characters.
BOOST_CHECK_EQUAL(string_parse("\"I \\ud83d\\udc95 Unicode\""),
"I " + two_hearts + " Unicode");
// Pair surrounded by other escape sequences.
BOOST_CHECK_EQUAL(string_parse("\"I\\n\\ud83d\\udc95\\nUnicode\""),
"I\n" + two_hearts + "\nUnicode");
// Extreme values of each surrogate
BOOST_CHECK_EQUAL(string_parse("\"\\ud800\\udc00\""), "\xf0\x90\x80\x80");
BOOST_CHECK_EQUAL(string_parse("\"\\ud800\\udfff\""), "\xf0\x90\x8f\xbf");
BOOST_CHECK_EQUAL(string_parse("\"\\udbff\\udc00\""), "\xf4\x8f\xb0\x80");
BOOST_CHECK_EQUAL(string_parse("\"\\udbff\\udfff\""), "\xf4\x8f\xbf\xbf");
}

// Surrogate escapes that do not form a valid high+low pair are expected to be
// emitted individually, each as the 3-byte UTF-8 encoding of the surrogate
// value itself, while any valid pair in the same string is still combined.
BOOST_AUTO_TEST_CASE(json_codec_string_should_output_code_points_from_broken_surrogate_pairs) {
// [TWO HEARTS] Emoji (code point 0x1F495)
const std::string two_hearts = "\xf0\x9f\x92\x95";
// UTF-8 representations of code points 0xd83d and 0xdc95, which form the
// surrogate pair for the emoji above.
const std::string high = "\xed\xa0\xbd";
const std::string low = "\xed\xb2\x95";

// Lone high surrogate
BOOST_CHECK_EQUAL(string_parse("\"\\ud83d\""), high);
BOOST_CHECK_EQUAL(string_parse("\"\\n\\ud83d\\n\""), "\n" + high + "\n");
BOOST_CHECK_EQUAL(string_parse("\"\\\\\\ud83d\\\\\""), "\\" + high + "\\");
BOOST_CHECK_EQUAL(string_parse("\"Foo\\ud83dFoo\""), "Foo" + high + "Foo");
// The second high surrogate pairs with the low surrogate after it.
BOOST_CHECK_EQUAL(string_parse("\"\\ud83d\\ud83d\\udc95\""), high + two_hearts);

// Lone low surrogate
BOOST_CHECK_EQUAL(string_parse("\"\\udc95\""), low);
BOOST_CHECK_EQUAL(string_parse("\"\\n\\udc95\\n\""), "\n" + low + "\n");
BOOST_CHECK_EQUAL(string_parse("\"\\\\\\udc95\\\\\""), "\\" + low + "\\");
BOOST_CHECK_EQUAL(string_parse("\"Foo\\udc95Foo\""), "Foo" + low + "Foo");
// A leading low surrogate does not disturb the valid pair that follows it.
BOOST_CHECK_EQUAL(string_parse("\"\\udc95\\ud83d\\udc95\""), low + two_hearts);

// Flipped order surrogates
BOOST_CHECK_EQUAL(string_parse("\"\\udc95\\ud83d\""), low + high);

// Double high surrogate
BOOST_CHECK_EQUAL(string_parse("\"\\ud83d\\ud83d\""), high + high);

// Double low surrogate
BOOST_CHECK_EQUAL(string_parse("\"\\udc95\\udc95\""), low + low);

// Intermingled valid and invalid sequences
BOOST_CHECK_EQUAL(string_parse("\"\\ud83d\\udc95\\udc95\""), two_hearts + low);
BOOST_CHECK_EQUAL(string_parse("\"\\ud83d\\udc95\\ud83d\""), two_hearts + high);
}

// A high surrogate followed by a truncated escape is expected to be rejected.
// NOTE(review): string_parse_fail is presumably a helper asserting that
// decoding fails - confirm against its definition.
BOOST_AUTO_TEST_CASE(json_codec_string_should_not_decode_incomplete_low_surrogate) {
// High surrogate followed by a backslash and the final quote character.
string_parse_fail("\"\\ud83d\\\"");
// High surrogate followed by a "\u" that has no hex digits.
string_parse_fail("\"\\ud83d\\u\"");
// High surrogate followed by a "\u" with only three hex digits.
string_parse_fail("\"\\ud83d\\udc9\"");
}

// A backslash followed by a character that is not a recognized escape is
// expected to be rejected.
// NOTE(review): string_parse_fail is presumably a helper asserting that
// decoding fails - confirm against its definition.
BOOST_AUTO_TEST_CASE(json_codec_string_should_not_decode_invalid_escaped_characters) {
string_parse_fail("\"\\q\""); // \q is not a valid escape sequence
}
Expand Down

0 comments on commit d55125f

Please sign in to comment.