Skip to content

Commit

Permalink
Support windows-1252 when ICU is disabled
Browse files Browse the repository at this point in the history
Mostly for testing purposes.
  • Loading branch information
Ghabry committed Nov 29, 2023
1 parent 1aeebc1 commit d94eea3
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 12 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ project(liblcf VERSION 0.8 LANGUAGES CXX)

# Compilation options
option(BUILD_SHARED_LIBS "Build shared library, disable for building the static library (default: ON)" ON)
option(LIBLCF_WITH_ICU "ICU encoding handling (disable only for testing purposes, default: ON)" ON)
option(LIBLCF_WITH_ICU "ICU encoding handling (when disabled only windows-1252 is supported, default: ON)" ON)
option(LIBLCF_WITH_XML "XML reading support (expat, default: ON)" ON)
option(LIBLCF_UPDATE_MIMEDB "Whether to run update-mime-database after install (default: ON)" ON)
option(LIBLCF_ENABLE_TOOLS "Whether to build the tools (default: ON)" ON)
Expand Down
2 changes: 1 addition & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ AM_CONDITIONAL(CROSS_COMPILING,[test "x$cross_compiling" = "xyes"])

# Checks for libraries.
AC_SUBST([LCF_SUPPORT_ICU],[0])
AC_ARG_ENABLE([icu],[AS_HELP_STRING([--disable-icu],[Disable ICU encoding detection (only for testing purposes) [default=no]])])
AC_ARG_ENABLE([icu],[AS_HELP_STRING([--disable-icu],[Disable ICU encoding handling (only windows-1252 supported) [default=no]])])
AS_IF([test "x$enable_icu" != "xno"],[
AX_PKG_CHECK_MODULES([ICU],[],[icu-i18n],[LCF_SUPPORT_ICU=1])
])
Expand Down
74 changes: 69 additions & 5 deletions src/encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,7 @@
# include <unicode/ucsdet.h>
# include <unicode/ucnv.h>
#else
# ifdef _MSC_VER
# error MSVC builds require ICU
# endif
# include <cstdint>
#endif

#ifdef _WIN32
Expand Down Expand Up @@ -73,7 +71,6 @@ void Encoder::Decode(std::string& str) {
Convert(str, _conv_storage, _conv_runtime);
}

#if LCF_SUPPORT_ICU
void Encoder::Init() {
if (_encoding.empty()) {
return;
Expand All @@ -84,6 +81,7 @@ void Encoder::Init() {
? ReaderUtil::CodepageToEncoding(code_page)
: _encoding;

#if LCF_SUPPORT_ICU
auto status = U_ZERO_ERROR;
constexpr auto runtime_encoding = "UTF-8";
auto conv_runtime = ucnv_open(runtime_encoding, &status);
Expand All @@ -106,8 +104,17 @@ void Encoder::Init() {

_conv_runtime = conv_runtime;
_conv_storage = conv_storage;
#else
if (storage_encoding != "windows-1252") {
return;
}

_conv_runtime = 65001;
_conv_storage = 1252;
#endif
}

#if LCF_SUPPORT_ICU
void Encoder::Reset() {
if (_conv_runtime) {
ucnv_close(_conv_runtime);
Expand Down Expand Up @@ -143,7 +150,64 @@ void Encoder::Convert(std::string& str, UConverter* conv_dst, UConverter* conv_s

str.assign(_buffer.data(), dst_p);
}
#else
// Fallback converter used when ICU is not available. It only knows how to
// convert between windows-1252 (code page 1252) and UTF-8 (code page 65001).
//
// str       Text to convert; replaced in place by the converted text.
// conv_dst  Target code page. 65001 converts windows-1252 to UTF-8, any
//           other value converts UTF-8 to windows-1252.
// (unnamed) Source code page. Ignored, it is implied by conv_dst.
//
// When converting to windows-1252, code points without a windows-1252
// representation and malformed or truncated UTF-8 sequences are written
// as a single '?'.
void Encoder::Convert(std::string& str, int conv_dst, int) {
	if (str.empty()) {
		return;
	}

	// Unicode code points of the windows-1252 bytes 0x80..0x9F. This is the
	// only range where windows-1252 differs from ISO-8859-1. The five bytes
	// that windows-1252 leaves undefined (0x81, 0x8D, 0x8F, 0x90, 0x9D) map
	// to the C1 control of the same value so that they survive a round trip.
	static constexpr uint16_t cp1252_high[32] = {
		0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
		0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
	};

	size_t buf_idx = 0;

	if (conv_dst == 65001) {
		// From 1252 to UTF-8.
		// The largest code point in the table is U+2122, so a single input
		// byte expands to at most three UTF-8 bytes.
		_buffer.resize(str.size() * 3);

		for (unsigned char ch: str) {
			uint32_t codepoint = ch;
			if (ch >= 0x80 && ch <= 0x9F) {
				codepoint = cp1252_high[ch - 0x80];
			}

			if (codepoint < 0x80) {
				_buffer[buf_idx++] = static_cast<char>(codepoint);
			} else if (codepoint < 0x800) {
				_buffer[buf_idx++] = static_cast<char>(0xC0 | (codepoint >> 6));
				_buffer[buf_idx++] = static_cast<char>(0x80 | (codepoint & 0x3F));
			} else {
				_buffer[buf_idx++] = static_cast<char>(0xE0 | (codepoint >> 12));
				_buffer[buf_idx++] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
				_buffer[buf_idx++] = static_cast<char>(0x80 | (codepoint & 0x3F));
			}
		}
	} else {
		// From UTF-8 to 1252.
		// Every output byte consumes at least one input byte, so the output
		// never grows beyond the input size.
		_buffer.resize(str.size());

		const size_t len = str.size();
		size_t str_idx = 0;

		while (str_idx < len) {
			const unsigned char lead = static_cast<unsigned char>(str[str_idx]);
			++str_idx;

			uint32_t codepoint = 0;
			int pending = 0; // continuation bytes still expected
			bool valid = true;

			if (lead < 0x80) {
				codepoint = lead;
			} else if (lead < 0xC0) {
				// Continuation byte without a preceding lead byte
				valid = false;
			} else if (lead < 0xE0) {
				codepoint = lead & 0x1F;
				pending = 1;
			} else if (lead < 0xF0) {
				codepoint = lead & 0x0F;
				pending = 2;
			} else {
				codepoint = lead & 0x07;
				pending = 3;
			}

			for (; valid && pending > 0; --pending) {
				if (str_idx >= len) {
					// Sequence is cut off by the end of the input
					valid = false;
					break;
				}
				const unsigned char cont = static_cast<unsigned char>(str[str_idx]);
				if ((cont & 0xC0) != 0x80) {
					// Not a continuation byte: leave it for the next
					// iteration so it is decoded as its own sequence.
					valid = false;
					break;
				}
				codepoint = (codepoint << 6) | (cont & 0x3F);
				++str_idx;
			}

			char out = '?';
			if (valid) {
				if (codepoint < 0x80 || (codepoint >= 0xA0 && codepoint <= 0xFF)) {
					// Identical in windows-1252 and Unicode
					out = static_cast<char>(codepoint);
				} else {
					// Either one of the 0x80..0x9F specials or not
					// representable at all.
					for (size_t i = 0; i < 32; ++i) {
						if (cp1252_high[i] == codepoint) {
							out = static_cast<char>(0x80 + i);
							break;
						}
					}
				}
			}

			_buffer[buf_idx++] = out;
		}
	}

	str.assign(_buffer.data(), buf_idx);
}
#endif

} //namespace lcf

11 changes: 6 additions & 5 deletions src/lcf/encoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,15 @@ class Encoder {

UConverter* _conv_storage = nullptr;
UConverter* _conv_runtime = nullptr;
std::vector<char> _buffer;
#else
void Init() {}
void Init();
void Reset() {}
void Convert(std::string&, void*, void*) {}
void* _conv_storage = nullptr;
void* _conv_runtime = nullptr;
void Convert(std::string& str, int conv_dst, int conv_src);

int _conv_storage = 0;
int _conv_runtime = 0;
#endif
std::vector<char> _buffer;
std::string _encoding;
};

Expand Down

0 comments on commit d94eea3

Please sign in to comment.