dtoolutil: improve Unicode encoding/decoding, support non-BMP chars

- Support encoding and decoding four-byte UTF-8 sequences
- E_unicode supports surrogate pairs, renamed to E_utf16be for clarity
- char32_t should be used for storing a Unicode code point
This commit is contained in:
rdb 2018-10-08 22:33:54 +02:00
parent 9061fd9416
commit 29b577971f
6 changed files with 167 additions and 41 deletions

View File

@ -53,5 +53,5 @@ StringUtf8Decoder(const std::string &input) : StringDecoder(input) {
* *
*/ */
INLINE StringUnicodeDecoder:: INLINE StringUnicodeDecoder::
StringUnicodeDecoder(const std::string &input) : StringDecoder(input) { StringUtf16Decoder(const std::string &input) : StringDecoder(input) {
} }

View File

@ -26,7 +26,7 @@ StringDecoder::
/** /**
* Returns the next character in sequence. * Returns the next character in sequence.
*/ */
int StringDecoder:: char32_t StringDecoder::
get_next_character() { get_next_character() {
if (test_eof()) { if (test_eof()) {
return -1; return -1;
@ -57,19 +57,20 @@ get_notify_ptr() {
/* /*
In UTF-8, each Unicode code point is encoded as a sequence of In UTF-8, each Unicode code point is encoded as a sequence of
one, two, or three 8-bit bytes, depending on the value of the one, two, three or four 8-bit bytes, depending on the value of the
character. The following table shows the format of such UTF-8 byte character. The following table shows the format of such UTF-8 byte
sequences (where the "free bits" shown by x's in the table are sequences (where the "free bits" shown by x's in the table are
combined in the order shown, and interpreted from most significant to combined in the order shown, and interpreted from most significant to
least significant): least significant):
Binary format of bytes in sequence: Binary format of bytes in sequence:
Number of Maximum expressible Number of Maximum expressible
1st byte 2nd byte 3rd byte free bits: Unicode value: 1st byte 2nd byte 3rd byte 4th byte free bits: Unicode value:
0xxxxxxx 7 007F hex (127) 0xxxxxxx 7 007F hex (127)
110xxxxx 10xxxxxx (5+6)=11 07FF hex (2047) 110xxxxx 10xxxxxx (5+6)=11 07FF hex (2047)
1110xxxx 10xxxxxx 10xxxxxx (4+6+6)=16 FFFF hex (65535) 1110xxxx 10xxxxxx 10xxxxxx (4+6+6)=16 FFFF hex (65535)
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (3+6*3)=21 10FFFF hex (1114111)
The value of each individual byte indicates its UTF-8 function, as follows: The value of each individual byte indicates its UTF-8 function, as follows:
@ -77,12 +78,13 @@ The value of each individual byte indicates its UTF-8 function, as follows:
80 to BF hex (128 to 191): continuing byte in a multi-byte sequence. 80 to BF hex (128 to 191): continuing byte in a multi-byte sequence.
C2 to DF hex (194 to 223): first byte of a two-byte sequence. C2 to DF hex (194 to 223): first byte of a two-byte sequence.
E0 to EF hex (224 to 239): first byte of a three-byte sequence. E0 to EF hex (224 to 239): first byte of a three-byte sequence.
F0 to F4 hex (240 to 244): first byte of a four-byte sequence (F5 to F7 are also accepted by this decoder, but yield values above 10FFFF).
*/ */
/** /**
* Returns the next character in sequence. * Returns the next character in sequence.
*/ */
int StringUtf8Decoder:: char32_t StringUtf8Decoder::
get_next_character() { get_next_character() {
unsigned int result; unsigned int result;
while (!test_eof()) { while (!test_eof()) {
@ -125,6 +127,35 @@ get_next_character() {
unsigned int three = (unsigned char)_input[_p++]; unsigned int three = (unsigned char)_input[_p++];
result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f); result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
return result; return result;
} else if ((result & 0xf8) == 0xf0) {
// First byte of four.
if (test_eof()) {
if (_notify_ptr != nullptr) {
(*_notify_ptr)
<< "utf-8 encoded string '" << _input << "' ends abruptly.\n";
}
return -1;
}
unsigned int two = (unsigned char)_input[_p++];
if (test_eof()) {
if (_notify_ptr != nullptr) {
(*_notify_ptr)
<< "utf-8 encoded string '" << _input << "' ends abruptly.\n";
}
return -1;
}
unsigned int three = (unsigned char)_input[_p++];
if (test_eof()) {
if (_notify_ptr != nullptr) {
(*_notify_ptr)
<< "utf-8 encoded string '" << _input << "' ends abruptly.\n";
}
return -1;
}
unsigned int four = (unsigned char)_input[_p++];
result = ((result & 0x07) << 18) | ((two & 0x3f) << 12) | ((three & 0x3f) << 6) | (four & 0x3f);
return result;
} }
// Otherwise--the high bit is set but it is not one of the introductory // Otherwise--the high bit is set but it is not one of the introductory
@ -144,7 +175,7 @@ get_next_character() {
/** /**
* Returns the next character in sequence. * Returns the next character in sequence.
*/ */
int StringUnicodeDecoder:: char32_t StringUtf16Decoder::
get_next_character() { get_next_character() {
if (test_eof()) { if (test_eof()) {
return -1; return -1;
@ -159,5 +190,33 @@ get_next_character() {
return -1; return -1;
} }
unsigned int low = (unsigned char)_input[_p++]; unsigned int low = (unsigned char)_input[_p++];
return ((high << 8) | low); int ch = ((high << 8) | low);
/*
using std::swap;
if (ch == 0xfffe) {
// This is a byte-swapped byte-order-marker. That means we need to swap
// the endianness of the rest of the stream.
char *data = (char *)_input.data();
for (size_t p = _p; p < _input.size() - 1; p += 2) {
std::swap(data[p], data[p + 1]);
}
ch = 0xfeff;
}
*/
if (ch >= 0xd800 && ch < 0xdc00 && (_p + 1) < _input.size()) {
// This is a high surrogate. Look for a subsequent low surrogate.
unsigned int high = (unsigned char)_input[_p];
unsigned int low = (unsigned char)_input[_p + 1];
int ch2 = ((high << 8) | low);
if (ch2 >= 0xdc00 && ch2 < 0xe000) {
// Yes, this is a low surrogate.
_p += 2;
return 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
}
}
// No, this is just a regular character, or an unpaired surrogate.
return ch;
} }

View File

@ -26,7 +26,7 @@ public:
INLINE StringDecoder(const std::string &input); INLINE StringDecoder(const std::string &input);
virtual ~StringDecoder(); virtual ~StringDecoder();
virtual int get_next_character(); virtual char32_t get_next_character();
INLINE bool is_eof(); INLINE bool is_eof();
static void set_notify_ptr(std::ostream *ptr); static void set_notify_ptr(std::ostream *ptr);
@ -48,20 +48,23 @@ class StringUtf8Decoder : public StringDecoder {
public: public:
INLINE StringUtf8Decoder(const std::string &input); INLINE StringUtf8Decoder(const std::string &input);
virtual int get_next_character(); virtual char32_t get_next_character();
}; };
/** /**
* This decoder extracts characters two at a time to get a plain wide * This decoder extracts characters two at a time to get a plain wide
* character sequence. * character sequence. It supports surrogate pairs.
*/ */
class StringUnicodeDecoder : public StringDecoder { class StringUtf16Decoder : public StringDecoder {
public: public:
INLINE StringUnicodeDecoder(const std::string &input); INLINE StringUtf16Decoder(const std::string &input);
virtual int get_next_character(); virtual char32_t get_next_character();
}; };
// Deprecated alias of StringUtf16Decoder.
typedef StringUtf16Decoder StringUnicodeDecoder;
#include "stringDecoder.I" #include "stringDecoder.I"
#endif #endif

View File

@ -169,8 +169,23 @@ append_text(const std::string &text) {
* wide character, up to 16 bits in Unicode. * wide character, up to 16 bits in Unicode.
*/ */
INLINE void TextEncoder:: INLINE void TextEncoder::
append_unicode_char(int character) { append_unicode_char(char32_t character) {
#if WCHAR_MAX >= 0x10FFFF
// wchar_t might be UTF-32.
_wtext = get_wtext() + std::wstring(1, (wchar_t)character); _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
#else
if ((character & ~0xffff) == 0) {
_wtext = get_wtext() + std::wstring(1, (wchar_t)character);
} else {
// Encode as a surrogate pair.
uint32_t v = (uint32_t)character - 0x10000u;
wchar_t wstr[2] = {
(wchar_t)((v >> 10u) | 0xd800u),
(wchar_t)((v & 0x3ffu) | 0xdc00u),
};
_wtext = get_wtext() + std::wstring(wstr, 2);
}
#endif
_flags = (_flags | F_got_wtext) & ~F_got_text; _flags = (_flags | F_got_wtext) & ~F_got_text;
text_changed(); text_changed();
} }

View File

@ -21,7 +21,7 @@ using std::ostream;
using std::string; using std::string;
using std::wstring; using std::wstring;
TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859; TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_utf8;
/** /**
* Adjusts the text stored within the encoder to all uppercase letters * Adjusts the text stored within the encoder to all uppercase letters
@ -109,11 +109,11 @@ is_wtext() const {
} }
/** /**
* Encodes a single wide char into a one-, two-, or three-byte string, * Encodes a single Unicode character into a one-, two-, three-, or four-byte
* according to the given encoding system. * string, according to the given encoding system.
*/ */
string TextEncoder:: string TextEncoder::
encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) { encode_wchar(char32_t ch, TextEncoder::Encoding encoding) {
switch (encoding) { switch (encoding) {
case E_iso8859: case E_iso8859:
if ((ch & ~0xff) == 0) { if ((ch & ~0xff) == 0) {
@ -145,17 +145,38 @@ encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
return return
string(1, (char)((ch >> 6) | 0xc0)) + string(1, (char)((ch >> 6) | 0xc0)) +
string(1, (char)((ch & 0x3f) | 0x80)); string(1, (char)((ch & 0x3f) | 0x80));
} else { } else if ((ch & ~0xffff) == 0) {
return return
string(1, (char)((ch >> 12) | 0xe0)) + string(1, (char)((ch >> 12) | 0xe0)) +
string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) + string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
string(1, (char)((ch & 0x3f) | 0x80)); string(1, (char)((ch & 0x3f) | 0x80));
} else {
return
string(1, (char)((ch >> 18) | 0xf0)) +
string(1, (char)(((ch >> 12) & 0x3f) | 0x80)) +
string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
string(1, (char)((ch & 0x3f) | 0x80));
} }
case E_unicode: case E_utf16be:
return if ((ch & ~0xffff) == 0) {
string(1, (char)(ch >> 8)) + // Note that this passes through surrogates and BOMs unharmed.
string(1, (char)(ch & 0xff)); return
string(1, (char)(ch >> 8)) +
string(1, (char)(ch & 0xff));
} else {
// Use a surrogate pair.
uint32_t v = (uint32_t)ch - 0x10000u;
uint16_t hi = (v >> 10u) | 0xd800u;
uint16_t lo = (v & 0x3ffu) | 0xdc00u;
char encoded[4] = {
(char)(hi >> 8),
(char)(hi & 0xff),
(char)(lo >> 8),
(char)(lo & 0xff),
};
return string(encoded, 4);
}
} }
return ""; return "";
@ -169,8 +190,25 @@ string TextEncoder::
encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) { encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
string result; string result;
for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) { for (size_t i = 0; i < wtext.size(); ++i) {
result += encode_wchar(*pi, encoding); wchar_t ch = wtext[i];
// On some systems, wstring may be UTF-16, and contain surrogate pairs.
#if WCHAR_MAX < 0x10FFFF
if (ch >= 0xd800 && ch < 0xdc00 && (i + 1) < wtext.size()) {
// This is a high surrogate. Look for a subsequent low surrogate.
wchar_t ch2 = wtext[i + 1];
if (ch2 >= 0xdc00 && ch2 < 0xe000) {
// Yes, this is a low surrogate.
char32_t code_point = 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
result += encode_wchar(code_point, encoding);
i++;
continue;
}
}
#endif
result += encode_wchar(ch, encoding);
} }
return result; return result;
@ -189,9 +227,9 @@ decode_text(const string &text, TextEncoder::Encoding encoding) {
return decode_text_impl(decoder); return decode_text_impl(decoder);
} }
case E_unicode: case E_utf16be:
{ {
StringUnicodeDecoder decoder(text); StringUtf16Decoder decoder(text);
return decode_text_impl(decoder); return decode_text_impl(decoder);
} }
@ -213,7 +251,7 @@ decode_text_impl(StringDecoder &decoder) {
wstring result; wstring result;
// bool expand_amp = get_expand_amp(); // bool expand_amp = get_expand_amp();
wchar_t character = decoder.get_next_character(); char32_t character = decoder.get_next_character();
while (!decoder.is_eof()) { while (!decoder.is_eof()) {
/* /*
if (character == '&' && expand_amp) { if (character == '&' && expand_amp) {
@ -221,7 +259,14 @@ decode_text_impl(StringDecoder &decoder) {
character = expand_amp_sequence(decoder); character = expand_amp_sequence(decoder);
} }
*/ */
result += character; if (character <= WCHAR_MAX) {
result += character;
} else {
// We need to encode this as a surrogate pair.
uint32_t v = (uint32_t)character - 0x10000u;
result += (wchar_t)((v >> 10u) | 0xd800u);
result += (wchar_t)((v & 0x3ffu) | 0xdc00u);
}
character = decoder.get_next_character(); character = decoder.get_next_character();
} }
@ -335,8 +380,8 @@ operator << (ostream &out, TextEncoder::Encoding encoding) {
case TextEncoder::E_utf8: case TextEncoder::E_utf8:
return out << "utf8"; return out << "utf8";
case TextEncoder::E_unicode: case TextEncoder::E_utf16be:
return out << "unicode"; return out << "utf16be";
}; };
return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**"; return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
@ -354,8 +399,9 @@ operator >> (istream &in, TextEncoder::Encoding &encoding) {
encoding = TextEncoder::E_iso8859; encoding = TextEncoder::E_iso8859;
} else if (word == "utf8" || word == "utf-8") { } else if (word == "utf8" || word == "utf-8") {
encoding = TextEncoder::E_utf8; encoding = TextEncoder::E_utf8;
} else if (word == "unicode") { } else if (word == "unicode" || word == "utf16be" || word == "utf-16be" ||
encoding = TextEncoder::E_unicode; word == "utf16-be" || word == "utf-16-be") {
encoding = TextEncoder::E_utf16be;
} else { } else {
ostream *notify_ptr = StringDecoder::get_notify_ptr(); ostream *notify_ptr = StringDecoder::get_notify_ptr();
if (notify_ptr != nullptr) { if (notify_ptr != nullptr) {

View File

@ -35,7 +35,10 @@ PUBLISHED:
enum Encoding { enum Encoding {
E_iso8859, E_iso8859,
E_utf8, E_utf8,
E_unicode E_utf16be,
// Deprecated alias for E_utf16be
E_unicode = E_utf16be,
}; };
INLINE TextEncoder(); INLINE TextEncoder();
@ -70,7 +73,7 @@ PUBLISHED:
INLINE std::string get_text(Encoding encoding) const; INLINE std::string get_text(Encoding encoding) const;
INLINE void append_text(const std::string &text); INLINE void append_text(const std::string &text);
#endif #endif
INLINE void append_unicode_char(int character); INLINE void append_unicode_char(char32_t character);
INLINE size_t get_num_chars() const; INLINE size_t get_num_chars() const;
INLINE int get_unicode_char(size_t index) const; INLINE int get_unicode_char(size_t index) const;
INLINE void set_unicode_char(size_t index, int character); INLINE void set_unicode_char(size_t index, int character);
@ -103,13 +106,13 @@ PUBLISHED:
bool is_wtext() const; bool is_wtext() const;
#ifdef CPPPARSER #ifdef CPPPARSER
EXTEND static PyObject *encode_wchar(wchar_t ch, Encoding encoding); EXTEND static PyObject *encode_wchar(char32_t ch, Encoding encoding);
EXTEND INLINE PyObject *encode_wtext(const std::wstring &wtext) const; EXTEND INLINE PyObject *encode_wtext(const std::wstring &wtext) const;
EXTEND static PyObject *encode_wtext(const std::wstring &wtext, Encoding encoding); EXTEND static PyObject *encode_wtext(const std::wstring &wtext, Encoding encoding);
EXTEND INLINE PyObject *decode_text(PyObject *text) const; EXTEND INLINE PyObject *decode_text(PyObject *text) const;
EXTEND static PyObject *decode_text(PyObject *text, Encoding encoding); EXTEND static PyObject *decode_text(PyObject *text, Encoding encoding);
#else #else
static std::string encode_wchar(wchar_t ch, Encoding encoding); static std::string encode_wchar(char32_t ch, Encoding encoding);
INLINE std::string encode_wtext(const std::wstring &wtext) const; INLINE std::string encode_wtext(const std::wstring &wtext) const;
static std::string encode_wtext(const std::wstring &wtext, Encoding encoding); static std::string encode_wtext(const std::wstring &wtext, Encoding encoding);
INLINE std::wstring decode_text(const std::string &text) const; INLINE std::wstring decode_text(const std::string &text) const;