From 29b577971f71c0e6d2dfc07fb8eeeef6712d2f58 Mon Sep 17 00:00:00 2001 From: rdb Date: Mon, 8 Oct 2018 22:33:54 +0200 Subject: [PATCH] dtoolutil: improve Unicode encoding/decoding, support non-BMP chars - Support encoding and decoding four-byte UTF-8 sequences - E_unicode supports surrogate pairs, renamed to E_utf16be for clarity - char32_t should be used for storing a Unicode code point --- dtool/src/dtoolutil/stringDecoder.I | 2 +- dtool/src/dtoolutil/stringDecoder.cxx | 79 +++++++++++++++++++++---- dtool/src/dtoolutil/stringDecoder.h | 15 +++-- dtool/src/dtoolutil/textEncoder.I | 17 +++++- dtool/src/dtoolutil/textEncoder.cxx | 84 +++++++++++++++++++++------ dtool/src/dtoolutil/textEncoder.h | 11 ++-- 6 files changed, 167 insertions(+), 41 deletions(-) diff --git a/dtool/src/dtoolutil/stringDecoder.I b/dtool/src/dtoolutil/stringDecoder.I index f7a3b14701..ce128833d0 100644 --- a/dtool/src/dtoolutil/stringDecoder.I +++ b/dtool/src/dtoolutil/stringDecoder.I @@ -53,5 +53,5 @@ StringUtf8Decoder(const std::string &input) : StringDecoder(input) { * */ INLINE StringUnicodeDecoder:: -StringUnicodeDecoder(const std::string &input) : StringDecoder(input) { +StringUtf16Decoder(const std::string &input) : StringDecoder(input) { } diff --git a/dtool/src/dtoolutil/stringDecoder.cxx b/dtool/src/dtoolutil/stringDecoder.cxx index e77e0c5e13..f9ecfdecd3 100644 --- a/dtool/src/dtoolutil/stringDecoder.cxx +++ b/dtool/src/dtoolutil/stringDecoder.cxx @@ -26,7 +26,7 @@ StringDecoder:: /** * Returns the next character in sequence. */ -int StringDecoder:: +char32_t StringDecoder:: get_next_character() { if (test_eof()) { return -1; @@ -57,19 +57,20 @@ get_notify_ptr() { /* In UTF-8, each 16-bit Unicode character is encoded as a sequence of -one, two, or three 8-bit bytes, depending on the value of the +one, two, three or four 8-bit bytes, depending on the value of the character. 
The following table shows the format of such UTF-8 byte sequences (where the "free bits" shown by x's in the table are combined in the order shown, and interpreted from most significant to least significant): Binary format of bytes in sequence: - Number of Maximum expressible - 1st byte 2nd byte 3rd byte free bits: Unicode value: + Number of Maximum expressible + 1st byte 2nd byte 3rd byte 4th byte free bits: Unicode value: - 0xxxxxxx 7 007F hex (127) - 110xxxxx 10xxxxxx (5+6)=11 07FF hex (2047) - 1110xxxx 10xxxxxx 10xxxxxx (4+6+6)=16 FFFF hex (65535) + 0xxxxxxx 7 007F hex (127) + 110xxxxx 10xxxxxx (5+6)=11 07FF hex (2047) + 1110xxxx 10xxxxxx 10xxxxxx (4+6+6)=16 FFFF hex (65535) + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (3+6*3)=21 10FFFF hex (1114111) The value of each individual byte indicates its UTF-8 function, as follows: @@ -77,12 +78,13 @@ The value of each individual byte indicates its UTF-8 function, as follows: 80 to BF hex (128 to 191): continuing byte in a multi-byte sequence. C2 to DF hex (194 to 223): first byte of a two-byte sequence. E0 to EF hex (224 to 239): first byte of a three-byte sequence. + F0 to F7 hex (240 to 247): first byte of a four-byte sequence. */ /** * Returns the next character in sequence. */ -int StringUtf8Decoder:: +char32_t StringUtf8Decoder:: get_next_character() { unsigned int result; while (!test_eof()) { @@ -125,6 +127,35 @@ get_next_character() { unsigned int three = (unsigned char)_input[_p++]; result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f); return result; + + } else if ((result & 0xf8) == 0xf0) { + // First byte of four. 
+ if (test_eof()) { + if (_notify_ptr != nullptr) { + (*_notify_ptr) + << "utf-8 encoded string '" << _input << "' ends abruptly.\n"; + } + return -1; + } + unsigned int two = (unsigned char)_input[_p++]; + if (test_eof()) { + if (_notify_ptr != nullptr) { + (*_notify_ptr) + << "utf-8 encoded string '" << _input << "' ends abruptly.\n"; + } + return -1; + } + unsigned int three = (unsigned char)_input[_p++]; + if (test_eof()) { + if (_notify_ptr != nullptr) { + (*_notify_ptr) + << "utf-8 encoded string '" << _input << "' ends abruptly.\n"; + } + return -1; + } + unsigned int four = (unsigned char)_input[_p++]; + result = ((result & 0x07) << 18) | ((two & 0x3f) << 12) | ((three & 0x3f) << 6) | (four & 0x3f); + return result; } // Otherwise--the high bit is set but it is not one of the introductory @@ -144,7 +175,7 @@ get_next_character() { /** * Returns the next character in sequence. */ -int StringUnicodeDecoder:: +char32_t StringUtf16Decoder:: get_next_character() { if (test_eof()) { return -1; @@ -159,5 +190,33 @@ get_next_character() { return -1; } unsigned int low = (unsigned char)_input[_p++]; - return ((high << 8) | low); + int ch = ((high << 8) | low); + + /* + using std::swap; + + if (ch == 0xfffe) { + // This is a byte-swapped byte-order-marker. That means we need to swap + // the endianness of the rest of the stream. + char *data = (char *)_input.data(); + for (size_t p = _p; p < _input.size() - 1; p += 2) { + std::swap(data[p], data[p + 1]); + } + ch = 0xfeff; + } + */ + + if (ch >= 0xd800 && ch < 0xdc00 && (_p + 1) < _input.size()) { + // This is a high surrogate. Look for a subsequent low surrogate. + unsigned int high = (unsigned char)_input[_p]; + unsigned int low = (unsigned char)_input[_p + 1]; + int ch2 = ((high << 8) | low); + if (ch2 >= 0xdc00 && ch2 < 0xe000) { + // Yes, this is a low surrogate. + _p += 2; + return 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00); + } + } + // No, this is just a regular character, or an unpaired surrogate. 
+ return ch; } diff --git a/dtool/src/dtoolutil/stringDecoder.h b/dtool/src/dtoolutil/stringDecoder.h index c0b2534ee2..6885f77e08 100644 --- a/dtool/src/dtoolutil/stringDecoder.h +++ b/dtool/src/dtoolutil/stringDecoder.h @@ -26,7 +26,7 @@ public: INLINE StringDecoder(const std::string &input); virtual ~StringDecoder(); - virtual int get_next_character(); + virtual char32_t get_next_character(); INLINE bool is_eof(); static void set_notify_ptr(std::ostream *ptr); @@ -48,20 +48,23 @@ class StringUtf8Decoder : public StringDecoder { public: INLINE StringUtf8Decoder(const std::string &input); - virtual int get_next_character(); + virtual char32_t get_next_character(); }; /** * This decoder extracts characters two at a time to get a plain wide - * character sequence. + * character sequence. It supports surrogate pairs. */ -class StringUnicodeDecoder : public StringDecoder { +class StringUtf16Decoder : public StringDecoder { public: - INLINE StringUnicodeDecoder(const std::string &input); + INLINE StringUtf16Decoder(const std::string &input); - virtual int get_next_character(); + virtual char32_t get_next_character(); }; +// Deprecated alias of StringUtf16Decoder. +typedef StringUtf16Decoder StringUnicodeDecoder; + #include "stringDecoder.I" #endif diff --git a/dtool/src/dtoolutil/textEncoder.I b/dtool/src/dtoolutil/textEncoder.I index 417ef386e2..766319d6da 100644 --- a/dtool/src/dtoolutil/textEncoder.I +++ b/dtool/src/dtoolutil/textEncoder.I @@ -169,8 +169,23 @@ append_text(const std::string &text) { * wide character, up to 16 bits in Unicode. */ INLINE void TextEncoder:: -append_unicode_char(int character) { +append_unicode_char(char32_t character) { +#if WCHAR_MAX >= 0x10FFFF + // wchar_t might be UTF-32. _wtext = get_wtext() + std::wstring(1, (wchar_t)character); +#else + if ((character & ~0xffff) == 0) { + _wtext = get_wtext() + std::wstring(1, (wchar_t)character); + } else { + // Encode as a surrogate pair. 
+ uint32_t v = (uint32_t)character - 0x10000u; + wchar_t wstr[2] = { + (wchar_t)((v >> 10u) | 0xd800u), + (wchar_t)((v & 0x3ffu) | 0xdc00u), + }; + _wtext = get_wtext() + std::wstring(wstr, 2); + } +#endif _flags = (_flags | F_got_wtext) & ~F_got_text; text_changed(); } diff --git a/dtool/src/dtoolutil/textEncoder.cxx b/dtool/src/dtoolutil/textEncoder.cxx index da835b7bfb..1065f21dcb 100644 --- a/dtool/src/dtoolutil/textEncoder.cxx +++ b/dtool/src/dtoolutil/textEncoder.cxx @@ -21,7 +21,7 @@ using std::ostream; using std::string; using std::wstring; -TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859; +TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_utf8; /** * Adjusts the text stored within the encoder to all uppercase letters @@ -109,11 +109,11 @@ is_wtext() const { } /** - * Encodes a single wide char into a one-, two-, or three-byte string, - * according to the given encoding system. + * Encodes a single Unicode character into a one-, two-, three-, or four-byte + * string, according to the given encoding system. 
*/ string TextEncoder:: -encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) { +encode_wchar(char32_t ch, TextEncoder::Encoding encoding) { switch (encoding) { case E_iso8859: if ((ch & ~0xff) == 0) { @@ -145,17 +145,38 @@ encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) { return string(1, (char)((ch >> 6) | 0xc0)) + string(1, (char)((ch & 0x3f) | 0x80)); - } else { + } else if ((ch & ~0xffff) == 0) { return string(1, (char)((ch >> 12) | 0xe0)) + string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) + string(1, (char)((ch & 0x3f) | 0x80)); + } else { + return + string(1, (char)((ch >> 18) | 0xf0)) + + string(1, (char)(((ch >> 12) & 0x3f) | 0x80)) + + string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) + + string(1, (char)((ch & 0x3f) | 0x80)); } - case E_unicode: - return - string(1, (char)(ch >> 8)) + - string(1, (char)(ch & 0xff)); + case E_utf16be: + if ((ch & ~0xffff) == 0) { + // Note that this passes through surrogates and BOMs unharmed. + return + string(1, (char)(ch >> 8)) + + string(1, (char)(ch & 0xff)); + } else { + // Use a surrogate pair. + uint32_t v = (uint32_t)ch - 0x10000u; + uint16_t hi = (v >> 10u) | 0xd800u; + uint16_t lo = (v & 0x3ffu) | 0xdc00u; + char encoded[4] = { + (char)(hi >> 8), + (char)(hi & 0xff), + (char)(lo >> 8), + (char)(lo & 0xff), + }; + return string(encoded, 4); + } } return ""; @@ -169,8 +190,25 @@ string TextEncoder:: encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) { string result; - for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) { - result += encode_wchar(*pi, encoding); + for (size_t i = 0; i < wtext.size(); ++i) { + wchar_t ch = wtext[i]; + + // On some systems, wstring may be UTF-16, and contain surrogate pairs. +#if WCHAR_MAX < 0x10FFFF + if (ch >= 0xd800 && ch < 0xdc00 && (i + 1) < wtext.size()) { + // This is a high surrogate. Look for a subsequent low surrogate. + wchar_t ch2 = wtext[i + 1]; + if (ch2 >= 0xdc00 && ch2 < 0xe000) { + // Yes, this is a low surrogate. 
+ char32_t code_point = 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00); + result += encode_wchar(code_point, encoding); + i++; + continue; + } + } +#endif + + result += encode_wchar(ch, encoding); } return result; @@ -189,9 +227,9 @@ decode_text(const string &text, TextEncoder::Encoding encoding) { return decode_text_impl(decoder); } - case E_unicode: + case E_utf16be: { - StringUnicodeDecoder decoder(text); + StringUtf16Decoder decoder(text); return decode_text_impl(decoder); } @@ -213,7 +251,7 @@ decode_text_impl(StringDecoder &decoder) { wstring result; // bool expand_amp = get_expand_amp(); - wchar_t character = decoder.get_next_character(); + char32_t character = decoder.get_next_character(); while (!decoder.is_eof()) { /* if (character == '&' && expand_amp) { @@ -221,7 +259,14 @@ decode_text_impl(StringDecoder &decoder) { character = expand_amp_sequence(decoder); } */ - result += character; + if (character <= WCHAR_MAX) { + result += character; + } else { + // We need to encode this as a surrogate pair. 
+ uint32_t v = (uint32_t)character - 0x10000u; + result += (wchar_t)((v >> 10u) | 0xd800u); + result += (wchar_t)((v & 0x3ffu) | 0xdc00u); + } character = decoder.get_next_character(); } @@ -335,8 +380,8 @@ operator << (ostream &out, TextEncoder::Encoding encoding) { case TextEncoder::E_utf8: return out << "utf8"; - case TextEncoder::E_unicode: - return out << "unicode"; + case TextEncoder::E_utf16be: + return out << "utf16be"; }; return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**"; @@ -354,8 +399,9 @@ operator >> (istream &in, TextEncoder::Encoding &encoding) { encoding = TextEncoder::E_iso8859; } else if (word == "utf8" || word == "utf-8") { encoding = TextEncoder::E_utf8; - } else if (word == "unicode") { - encoding = TextEncoder::E_unicode; + } else if (word == "unicode" || word == "utf16be" || word == "utf-16be" || + word == "utf16-be" || word == "utf-16-be") { + encoding = TextEncoder::E_utf16be; } else { ostream *notify_ptr = StringDecoder::get_notify_ptr(); if (notify_ptr != nullptr) { diff --git a/dtool/src/dtoolutil/textEncoder.h b/dtool/src/dtoolutil/textEncoder.h index baa0ef9b3e..71d93a71ca 100644 --- a/dtool/src/dtoolutil/textEncoder.h +++ b/dtool/src/dtoolutil/textEncoder.h @@ -35,7 +35,10 @@ PUBLISHED: enum Encoding { E_iso8859, E_utf8, - E_unicode + E_utf16be, + + // Deprecated alias for E_utf16be + E_unicode = E_utf16be, }; INLINE TextEncoder(); @@ -70,7 +73,7 @@ PUBLISHED: INLINE std::string get_text(Encoding encoding) const; INLINE void append_text(const std::string &text); #endif - INLINE void append_unicode_char(int character); + INLINE void append_unicode_char(char32_t character); INLINE size_t get_num_chars() const; INLINE int get_unicode_char(size_t index) const; INLINE void set_unicode_char(size_t index, int character); @@ -103,13 +106,13 @@ PUBLISHED: bool is_wtext() const; #ifdef CPPPARSER - EXTEND static PyObject *encode_wchar(wchar_t ch, Encoding encoding); + EXTEND static PyObject *encode_wchar(char32_t ch, 
Encoding encoding); EXTEND INLINE PyObject *encode_wtext(const std::wstring &wtext) const; EXTEND static PyObject *encode_wtext(const std::wstring &wtext, Encoding encoding); EXTEND INLINE PyObject *decode_text(PyObject *text) const; EXTEND static PyObject *decode_text(PyObject *text, Encoding encoding); #else - static std::string encode_wchar(wchar_t ch, Encoding encoding); + static std::string encode_wchar(char32_t ch, Encoding encoding); INLINE std::string encode_wtext(const std::wstring &wtext) const; static std::string encode_wtext(const std::wstring &wtext, Encoding encoding); INLINE std::wstring decode_text(const std::string &text) const;