dtoolutil: improve Unicode encoding/decoding, support non-BMP chars

- Support encoding and decoding four-byte UTF-8 sequences
- E_unicode supports surrogate pairs, renamed to E_utf16be for clarity
- char32_t should be used for storing a Unicode code point
This commit is contained in:
rdb 2018-10-08 22:33:54 +02:00
parent 9061fd9416
commit 29b577971f
6 changed files with 167 additions and 41 deletions

View File

@ -53,5 +53,5 @@ StringUtf8Decoder(const std::string &input) : StringDecoder(input) {
*
*/
INLINE StringUnicodeDecoder::
StringUnicodeDecoder(const std::string &input) : StringDecoder(input) {
StringUtf16Decoder(const std::string &input) : StringDecoder(input) {
}

View File

@ -26,7 +26,7 @@ StringDecoder::
/**
* Returns the next character in sequence.
*/
int StringDecoder::
char32_t StringDecoder::
get_next_character() {
if (test_eof()) {
return -1;
@ -57,19 +57,20 @@ get_notify_ptr() {
/*
In UTF-8, each Unicode code point (up to 21 bits) is encoded as a sequence of
one, two, or three 8-bit bytes, depending on the value of the
one, two, three or four 8-bit bytes, depending on the value of the
character. The following table shows the format of such UTF-8 byte
sequences (where the "free bits" shown by x's in the table are
combined in the order shown, and interpreted from most significant to
least significant):
Binary format of bytes in sequence:
Number of Maximum expressible
1st byte 2nd byte 3rd byte free bits: Unicode value:
Number of Maximum expressible
1st byte 2nd byte 3rd byte 4th byte free bits: Unicode value:
0xxxxxxx 7 007F hex (127)
110xxxxx 10xxxxxx (5+6)=11 07FF hex (2047)
1110xxxx 10xxxxxx 10xxxxxx (4+6+6)=16 FFFF hex (65535)
0xxxxxxx 7 007F hex (127)
110xxxxx 10xxxxxx (5+6)=11 07FF hex (2047)
1110xxxx 10xxxxxx 10xxxxxx (4+6+6)=16 FFFF hex (65535)
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (3+6+6+6)=21 10FFFF hex (1114111)
The value of each individual byte indicates its UTF-8 function, as follows:
@ -77,12 +78,13 @@ The value of each individual byte indicates its UTF-8 function, as follows:
80 to BF hex (128 to 191): continuing byte in a multi-byte sequence.
C2 to DF hex (194 to 223): first byte of a two-byte sequence.
E0 to EF hex (224 to 239): first byte of a three-byte sequence.
F0 to F7 hex (240 to 247): first byte of a four-byte sequence (only F0 to F4 can encode values up to 10FFFF hex).
*/
/**
* Returns the next character in sequence.
*/
int StringUtf8Decoder::
char32_t StringUtf8Decoder::
get_next_character() {
unsigned int result;
while (!test_eof()) {
@ -125,6 +127,35 @@ get_next_character() {
unsigned int three = (unsigned char)_input[_p++];
result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
return result;
} else if ((result & 0xf8) == 0xf0) {
// First byte of four.
if (test_eof()) {
if (_notify_ptr != nullptr) {
(*_notify_ptr)
<< "utf-8 encoded string '" << _input << "' ends abruptly.\n";
}
return -1;
}
unsigned int two = (unsigned char)_input[_p++];
if (test_eof()) {
if (_notify_ptr != nullptr) {
(*_notify_ptr)
<< "utf-8 encoded string '" << _input << "' ends abruptly.\n";
}
return -1;
}
unsigned int three = (unsigned char)_input[_p++];
if (test_eof()) {
if (_notify_ptr != nullptr) {
(*_notify_ptr)
<< "utf-8 encoded string '" << _input << "' ends abruptly.\n";
}
return -1;
}
unsigned int four = (unsigned char)_input[_p++];
result = ((result & 0x07) << 18) | ((two & 0x3f) << 12) | ((three & 0x3f) << 6) | (four & 0x3f);
return result;
}
// Otherwise--the high bit is set but it is not one of the introductory
@ -144,7 +175,7 @@ get_next_character() {
/**
* Returns the next character in sequence.
*/
int StringUnicodeDecoder::
char32_t StringUtf16Decoder::
get_next_character() {
if (test_eof()) {
return -1;
@ -159,5 +190,33 @@ get_next_character() {
return -1;
}
unsigned int low = (unsigned char)_input[_p++];
return ((high << 8) | low);
int ch = ((high << 8) | low);
/*
using std::swap;
if (ch == 0xfffe) {
// This is a byte-swapped byte-order-marker. That means we need to swap
// the endianness of the rest of the stream.
char *data = (char *)_input.data();
for (size_t p = _p; p < _input.size() - 1; p += 2) {
std::swap(data[p], data[p + 1]);
}
ch = 0xfeff;
}
*/
if (ch >= 0xd800 && ch < 0xdc00 && (_p + 1) < _input.size()) {
// This is a high surrogate. Look for a subsequent low surrogate.
unsigned int high = (unsigned char)_input[_p];
unsigned int low = (unsigned char)_input[_p + 1];
int ch2 = ((high << 8) | low);
if (ch2 >= 0xdc00 && ch2 < 0xe000) {
// Yes, this is a low surrogate.
_p += 2;
return 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
}
}
// No, this is just a regular character, or an unpaired surrogate.
return ch;
}

View File

@ -26,7 +26,7 @@ public:
INLINE StringDecoder(const std::string &input);
virtual ~StringDecoder();
virtual int get_next_character();
virtual char32_t get_next_character();
INLINE bool is_eof();
static void set_notify_ptr(std::ostream *ptr);
@ -48,20 +48,23 @@ class StringUtf8Decoder : public StringDecoder {
public:
INLINE StringUtf8Decoder(const std::string &input);
virtual int get_next_character();
virtual char32_t get_next_character();
};
/**
* This decoder extracts characters two at a time to get a plain wide
* character sequence.
* character sequence. It supports surrogate pairs.
*/
class StringUnicodeDecoder : public StringDecoder {
class StringUtf16Decoder : public StringDecoder {
public:
INLINE StringUnicodeDecoder(const std::string &input);
INLINE StringUtf16Decoder(const std::string &input);
virtual int get_next_character();
virtual char32_t get_next_character();
};
// Deprecated alias of StringUtf16Decoder.
typedef StringUtf16Decoder StringUnicodeDecoder;
#include "stringDecoder.I"
#endif

View File

@ -169,8 +169,23 @@ append_text(const std::string &text) {
* wide character, up to 21 bits in Unicode (code points through U+10FFFF).
*/
INLINE void TextEncoder::
append_unicode_char(int character) {
append_unicode_char(char32_t character) {
#if WCHAR_MAX >= 0x10FFFF
// wchar_t might be UTF-32.
_wtext = get_wtext() + std::wstring(1, (wchar_t)character);
#else
if ((character & ~0xffff) == 0) {
_wtext = get_wtext() + std::wstring(1, (wchar_t)character);
} else {
// Encode as a surrogate pair.
uint32_t v = (uint32_t)character - 0x10000u;
wchar_t wstr[2] = {
(wchar_t)((v >> 10u) | 0xd800u),
(wchar_t)((v & 0x3ffu) | 0xdc00u),
};
_wtext = get_wtext() + std::wstring(wstr, 2);
}
#endif
_flags = (_flags | F_got_wtext) & ~F_got_text;
text_changed();
}

View File

@ -21,7 +21,7 @@ using std::ostream;
using std::string;
using std::wstring;
TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859;
TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_utf8;
/**
* Adjusts the text stored within the encoder to all uppercase letters
@ -109,11 +109,11 @@ is_wtext() const {
}
/**
* Encodes a single wide char into a one-, two-, or three-byte string,
* according to the given encoding system.
* Encodes a single Unicode character into a one-, two-, three-, or four-byte
* string, according to the given encoding system.
*/
string TextEncoder::
encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
encode_wchar(char32_t ch, TextEncoder::Encoding encoding) {
switch (encoding) {
case E_iso8859:
if ((ch & ~0xff) == 0) {
@ -145,17 +145,38 @@ encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
return
string(1, (char)((ch >> 6) | 0xc0)) +
string(1, (char)((ch & 0x3f) | 0x80));
} else {
} else if ((ch & ~0xffff) == 0) {
return
string(1, (char)((ch >> 12) | 0xe0)) +
string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
string(1, (char)((ch & 0x3f) | 0x80));
} else {
return
string(1, (char)((ch >> 18) | 0xf0)) +
string(1, (char)(((ch >> 12) & 0x3f) | 0x80)) +
string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
string(1, (char)((ch & 0x3f) | 0x80));
}
case E_unicode:
return
string(1, (char)(ch >> 8)) +
string(1, (char)(ch & 0xff));
case E_utf16be:
if ((ch & ~0xffff) == 0) {
// Note that this passes through surrogates and BOMs unharmed.
return
string(1, (char)(ch >> 8)) +
string(1, (char)(ch & 0xff));
} else {
// Use a surrogate pair.
uint32_t v = (uint32_t)ch - 0x10000u;
uint16_t hi = (v >> 10u) | 0xd800u;
uint16_t lo = (v & 0x3ffu) | 0xdc00u;
char encoded[4] = {
(char)(hi >> 8),
(char)(hi & 0xff),
(char)(lo >> 8),
(char)(lo & 0xff),
};
return string(encoded, 4);
}
}
return "";
@ -169,8 +190,25 @@ string TextEncoder::
encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
string result;
for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
result += encode_wchar(*pi, encoding);
for (size_t i = 0; i < wtext.size(); ++i) {
wchar_t ch = wtext[i];
// On some systems, wstring may be UTF-16, and contain surrogate pairs.
#if WCHAR_MAX < 0x10FFFF
if (ch >= 0xd800 && ch < 0xdc00 && (i + 1) < wtext.size()) {
// This is a high surrogate. Look for a subsequent low surrogate.
wchar_t ch2 = wtext[i + 1];
if (ch2 >= 0xdc00 && ch2 < 0xe000) {
// Yes, this is a low surrogate.
char32_t code_point = 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
result += encode_wchar(code_point, encoding);
i++;
continue;
}
}
#endif
result += encode_wchar(ch, encoding);
}
return result;
@ -189,9 +227,9 @@ decode_text(const string &text, TextEncoder::Encoding encoding) {
return decode_text_impl(decoder);
}
case E_unicode:
case E_utf16be:
{
StringUnicodeDecoder decoder(text);
StringUtf16Decoder decoder(text);
return decode_text_impl(decoder);
}
@ -213,7 +251,7 @@ decode_text_impl(StringDecoder &decoder) {
wstring result;
// bool expand_amp = get_expand_amp();
wchar_t character = decoder.get_next_character();
char32_t character = decoder.get_next_character();
while (!decoder.is_eof()) {
/*
if (character == '&' && expand_amp) {
@ -221,7 +259,14 @@ decode_text_impl(StringDecoder &decoder) {
character = expand_amp_sequence(decoder);
}
*/
result += character;
if (character <= WCHAR_MAX) {
result += character;
} else {
// We need to encode this as a surrogate pair.
uint32_t v = (uint32_t)character - 0x10000u;
result += (wchar_t)((v >> 10u) | 0xd800u);
result += (wchar_t)((v & 0x3ffu) | 0xdc00u);
}
character = decoder.get_next_character();
}
@ -335,8 +380,8 @@ operator << (ostream &out, TextEncoder::Encoding encoding) {
case TextEncoder::E_utf8:
return out << "utf8";
case TextEncoder::E_unicode:
return out << "unicode";
case TextEncoder::E_utf16be:
return out << "utf16be";
};
return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
@ -354,8 +399,9 @@ operator >> (istream &in, TextEncoder::Encoding &encoding) {
encoding = TextEncoder::E_iso8859;
} else if (word == "utf8" || word == "utf-8") {
encoding = TextEncoder::E_utf8;
} else if (word == "unicode") {
encoding = TextEncoder::E_unicode;
} else if (word == "unicode" || word == "utf16be" || word == "utf-16be" ||
word == "utf16-be" || word == "utf-16-be") {
encoding = TextEncoder::E_utf16be;
} else {
ostream *notify_ptr = StringDecoder::get_notify_ptr();
if (notify_ptr != nullptr) {

View File

@ -35,7 +35,10 @@ PUBLISHED:
enum Encoding {
E_iso8859,
E_utf8,
E_unicode
E_utf16be,
// Deprecated alias for E_utf16be
E_unicode = E_utf16be,
};
INLINE TextEncoder();
@ -70,7 +73,7 @@ PUBLISHED:
INLINE std::string get_text(Encoding encoding) const;
INLINE void append_text(const std::string &text);
#endif
INLINE void append_unicode_char(int character);
INLINE void append_unicode_char(char32_t character);
INLINE size_t get_num_chars() const;
INLINE int get_unicode_char(size_t index) const;
INLINE void set_unicode_char(size_t index, int character);
@ -103,13 +106,13 @@ PUBLISHED:
bool is_wtext() const;
#ifdef CPPPARSER
EXTEND static PyObject *encode_wchar(wchar_t ch, Encoding encoding);
EXTEND static PyObject *encode_wchar(char32_t ch, Encoding encoding);
EXTEND INLINE PyObject *encode_wtext(const std::wstring &wtext) const;
EXTEND static PyObject *encode_wtext(const std::wstring &wtext, Encoding encoding);
EXTEND INLINE PyObject *decode_text(PyObject *text) const;
EXTEND static PyObject *decode_text(PyObject *text, Encoding encoding);
#else
static std::string encode_wchar(wchar_t ch, Encoding encoding);
static std::string encode_wchar(char32_t ch, Encoding encoding);
INLINE std::string encode_wtext(const std::wstring &wtext) const;
static std::string encode_wtext(const std::wstring &wtext, Encoding encoding);
INLINE std::wstring decode_text(const std::string &text) const;