Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cherry-pick "LibTextCodec: Add SingleByteEncoders" #25122

Merged
merged 1 commit into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions Tests/LibTextCodec/TestTextEncoders.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,22 @@ TEST_CASE(test_gb18030_encoder)
EXPECT(processed_bytes[2] == 0xFE);
EXPECT(processed_bytes[3] == 0xFE);
}

// Encodes a string through the "windows-1252" encoder and checks every produced byte:
// sixteen accented Latin letters, the ASCII run "foo", and the euro sign.
TEST_CASE(test_windows1252_encoder)
{
    auto encoder = TextCodec::encoder_for_exact_name("windows-1252"sv);
    auto test_string = "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏfoo€"sv;
    Vector<u8> processed_bytes;
    MUST(encoder.value().process(
        Utf8View(test_string),
        [&](u8 byte) { return processed_bytes.try_append(byte); },
        // Every code point in the test string is expected to be encodable, so the error callback must never run.
        [&](u32) -> ErrorOr<void> { EXPECT(false); return {}; }));
    EXPECT(processed_bytes.size() == 20);
    // Indices 0..15 hold the sixteen code points U+00C0..U+00CF, expected as bytes 0xC0..0xCF.
    // The bound is 16 (not 15) so that the last letter, at index 15, is verified as well.
    for (u8 i = 0; i < 16; i++) {
        EXPECT(processed_bytes[i] == (0xC0 + i));
    }
    // "foo" is ASCII and is expected to pass through unchanged.
    EXPECT(processed_bytes[16] == 0x66);
    EXPECT(processed_bytes[17] == 0x6F);
    EXPECT(processed_bytes[18] == 0x6F);
    // The euro sign is expected as byte 0x80.
    EXPECT(processed_bytes[19] == 0x80);
}
115 changes: 115 additions & 0 deletions Userland/Libraries/LibTextCodec/Encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,37 @@ EUCJPEncoder s_euc_jp_encoder;
ISO2022JPEncoder s_iso_2022_jp_encoder;
ShiftJISEncoder s_shift_jis_encoder;
EUCKREncoder s_euc_kr_encoder;

// One SingleByteEncoder instance per legacy single-byte encoding. Each instance is
// constructed from the index table of its encoding; encoder_for_exact_name() hands
// out references to these objects.
// s_{encoding}_index is generated from https://encoding.spec.whatwg.org/indexes.json
// Found separately in https://encoding.spec.whatwg.org/index-{encoding}.txt
SingleByteEncoder s_ibm866_encoder { s_ibm866_index };
SingleByteEncoder s_latin2_encoder { s_iso_8859_2_index };
SingleByteEncoder s_latin3_encoder { s_iso_8859_3_index };
SingleByteEncoder s_latin4_encoder { s_iso_8859_4_index };
SingleByteEncoder s_latin_cyrillic_encoder { s_iso_8859_5_index };
SingleByteEncoder s_latin_arabic_encoder { s_iso_8859_6_index };
SingleByteEncoder s_latin_greek_encoder { s_iso_8859_7_index };
SingleByteEncoder s_latin_hebrew_encoder { s_iso_8859_8_index };
SingleByteEncoder s_latin6_encoder { s_iso_8859_10_index };
SingleByteEncoder s_latin7_encoder { s_iso_8859_13_index };
SingleByteEncoder s_latin8_encoder { s_iso_8859_14_index };
SingleByteEncoder s_latin9_encoder { s_iso_8859_15_index };
SingleByteEncoder s_latin10_encoder { s_iso_8859_16_index };
SingleByteEncoder s_centraleurope_encoder { s_windows_1250_index };
SingleByteEncoder s_cyrillic_encoder { s_windows_1251_index };
SingleByteEncoder s_hebrew_encoder { s_windows_1255_index };
SingleByteEncoder s_koi8r_encoder { s_koi8_r_index };
SingleByteEncoder s_koi8u_encoder { s_koi8_u_index };
SingleByteEncoder s_mac_roman_encoder { s_macintosh_index };
SingleByteEncoder s_windows874_encoder { s_windows_874_index };
SingleByteEncoder s_windows1252_encoder { s_windows_1252_index };
SingleByteEncoder s_windows1253_encoder { s_windows_1253_index };
SingleByteEncoder s_turkish_encoder { s_windows_1254_index };
SingleByteEncoder s_windows1256_encoder { s_windows_1256_index };
SingleByteEncoder s_windows1257_encoder { s_windows_1257_index };
SingleByteEncoder s_windows1258_encoder { s_windows_1258_index };
SingleByteEncoder s_mac_cyrillic_encoder { s_x_mac_cyrillic_index };

}

Optional<Encoder&> encoder_for_exact_name(StringView encoding)
Expand All @@ -42,6 +73,60 @@ Optional<Encoder&> encoder_for_exact_name(StringView encoding)
return s_gb18030_encoder;
if (encoding.equals_ignoring_ascii_case("gbk"sv))
return s_gbk_encoder;
// Legacy single-byte encodings: each name below is compared ASCII-case-insensitively
// and maps to the SingleByteEncoder instance built from that encoding's index table.
if (encoding.equals_ignoring_ascii_case("ibm866"sv))
return s_ibm866_encoder;
if (encoding.equals_ignoring_ascii_case("iso-8859-2"sv))
return s_latin2_encoder;
if (encoding.equals_ignoring_ascii_case("iso-8859-3"sv))
return s_latin3_encoder;
if (encoding.equals_ignoring_ascii_case("iso-8859-4"sv))
return s_latin4_encoder;
if (encoding.equals_ignoring_ascii_case("iso-8859-5"sv))
return s_latin_cyrillic_encoder;
if (encoding.equals_ignoring_ascii_case("iso-8859-6"sv))
return s_latin_arabic_encoder;
if (encoding.equals_ignoring_ascii_case("iso-8859-7"sv))
return s_latin_greek_encoder;
// "iso-8859-8" and "iso-8859-8-i" are both served by the same encoder object.
if (encoding.is_one_of_ignoring_ascii_case("iso-8859-8"sv, "iso-8859-8-i"sv))
return s_latin_hebrew_encoder;
if (encoding.equals_ignoring_ascii_case("iso-8859-10"sv))
return s_latin6_encoder;
if (encoding.equals_ignoring_ascii_case("iso-8859-13"sv))
return s_latin7_encoder;
if (encoding.equals_ignoring_ascii_case("iso-8859-14"sv))
return s_latin8_encoder;
if (encoding.equals_ignoring_ascii_case("iso-8859-15"sv))
return s_latin9_encoder;
if (encoding.equals_ignoring_ascii_case("iso-8859-16"sv))
return s_latin10_encoder;
if (encoding.equals_ignoring_ascii_case("koi8-r"sv))
return s_koi8r_encoder;
if (encoding.equals_ignoring_ascii_case("koi8-u"sv))
return s_koi8u_encoder;
if (encoding.equals_ignoring_ascii_case("macintosh"sv))
return s_mac_roman_encoder;
if (encoding.equals_ignoring_ascii_case("windows-874"sv))
return s_windows874_encoder;
if (encoding.equals_ignoring_ascii_case("windows-1250"sv))
return s_centraleurope_encoder;
if (encoding.equals_ignoring_ascii_case("windows-1251"sv))
return s_cyrillic_encoder;
if (encoding.equals_ignoring_ascii_case("windows-1252"sv))
return s_windows1252_encoder;
if (encoding.equals_ignoring_ascii_case("windows-1253"sv))
return s_windows1253_encoder;
if (encoding.equals_ignoring_ascii_case("windows-1254"sv))
return s_turkish_encoder;
if (encoding.equals_ignoring_ascii_case("windows-1255"sv))
return s_hebrew_encoder;
if (encoding.equals_ignoring_ascii_case("windows-1256"sv))
return s_windows1256_encoder;
if (encoding.equals_ignoring_ascii_case("windows-1257"sv))
return s_windows1257_encoder;
if (encoding.equals_ignoring_ascii_case("windows-1258"sv))
return s_windows1258_encoder;
if (encoding.equals_ignoring_ascii_case("x-mac-cyrillic"sv))
return s_mac_cyrillic_encoder;
dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding);
return {};
}
Expand Down Expand Up @@ -551,4 +636,34 @@ ErrorOr<void> GB18030Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)
return {};
}

// https://encoding.spec.whatwg.org/#single-byte-encoder
// Walks input one code point at a time. Code points below 0x80 are emitted unchanged;
// any other code point is looked up in the translation table and emitted as
// (table position + 0x80), or reported through on_error when the table has no match.
// The first error returned by either callback stops processing and is propagated.
template<Integral ArrayType>
ErrorOr<void> SingleByteEncoder<ArrayType>::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
{
    for (u32 const code_point : input) {
        // 2. If code point is an ASCII code point, return a byte whose value is code point.
        if (code_point < 0x80) {
            TRY(on_byte(static_cast<u8>(code_point)));
            continue;
        }

        // 3. Let pointer be the index pointer for code point in index single-byte.
        //    The first matching table position wins.
        Optional<u8> pointer = {};
        for (u8 position = 0; position < m_translation_table.size(); position++) {
            if (m_translation_table[position] != code_point)
                continue;
            pointer = position;
            break;
        }

        // 4. If pointer is null, return error with code point.
        if (!pointer.has_value()) {
            TRY(on_error(code_point));
            continue;
        }

        // 5. Return a byte whose value is pointer + 0x80.
        TRY(on_byte(pointer.value() + 0x80));
    }

    // 1. If code point is end-of-queue, return finished.
    return {};
}

}
13 changes: 13 additions & 0 deletions Userland/Libraries/LibTextCodec/Encoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,19 @@ class GB18030Encoder final : public Encoder {
private:
IsGBK m_is_gbk { IsGBK::No };
};
// Encoder for the legacy single-byte encodings (https://encoding.spec.whatwg.org/#single-byte-encoder).
// All of them share one algorithm and differ only in the 128-entry translation table
// supplied at construction. ArrayType is the integral element type of that table (u32 by default).
template<Integral ArrayType = u32>
class SingleByteEncoder final : public Encoder {
public:
// Takes the table by value and keeps its own copy in m_translation_table.
SingleByteEncoder(Array<ArrayType, 128> translation_table)
: m_translation_table(translation_table)
{
}

// Encodes input: each produced byte is passed to on_byte, and each code point that is
// neither below 0x80 nor present in the translation table is passed to on_error.
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) override;

private:
// Code points for the upper half of the byte range: table position i corresponds to byte 0x80 + i.
Array<ArrayType, 128> m_translation_table;
};

Optional<Encoder&> encoder_for_exact_name(StringView encoding);
Optional<Encoder&> encoder_for(StringView label);
Expand Down
Loading