dtoolutil: improve Unicode encoding/decoding, support non-BMP chars

- Support encoding and decoding four-byte UTF-8 sequences
- E_unicode supports surrogate pairs, renamed to E_utf16be for clarity
- char32_t should be used for storing a Unicode code point
2025-10-01 01:07:51 -04:00 · 2018-10-08 22:33:54 +02:00 · 2018-10-08 22:33:54 +02:00 · 29b577971f
commit 29b577971f
parent 9061fd9416
6 changed files with 167 additions and 41 deletions
--- a/dtool/src/dtoolutil/stringDecoder.I
+++ b/dtool/src/dtoolutil/stringDecoder.I
@ -53,5 +53,5 @@ StringUtf8Decoder(const std::string &input) : StringDecoder(input) {
 *
 */
 INLINE StringUnicodeDecoder::
-StringUnicodeDecoder(const std::string &input) : StringDecoder(input) {
+StringUtf16Decoder(const std::string &input) : StringDecoder(input) {
 }
--- a/dtool/src/dtoolutil/stringDecoder.cxx
+++ b/dtool/src/dtoolutil/stringDecoder.cxx
@ -26,7 +26,7 @@ StringDecoder::
 /**
 * Returns the next character in sequence.
 */
-int StringDecoder::
+char32_t StringDecoder::
 get_next_character() {
  if (test_eof()) {
    return -1;
@ -57,19 +57,20 @@ get_notify_ptr() {
 /*
 In UTF-8, each Unicode code point is encoded as a sequence of
-one, two, or three 8-bit bytes, depending on the value of the
+one, two, three or four 8-bit bytes, depending on the value of the
 character. The following table shows the format of such UTF-8 byte
 sequences (where the "free bits" shown by x's in the table are
 combined in the order shown, and interpreted from most significant to
 least significant):
 Binary format of bytes in sequence:
-                                        Number of    Maximum expressible
+                                              Number of    Maximum expressible
- 1st byte     2nd byte    3rd byte      free bits:      Unicode value:
+ 1st byte    2nd byte   3rd byte   4th byte   free bits:     Unicode value:
- 0xxxxxxx                                  7           007F hex   (127)
+ 0xxxxxxx                                         7          007F hex   (127)
- 110xxxxx     10xxxxxx                  (5+6)=11       07FF hex  (2047)
+ 110xxxxx    10xxxxxx                          (5+6)=11      07FF hex  (2047)
- 1110xxxx     10xxxxxx    10xxxxxx     (4+6+6)=16      FFFF hex (65535)
+ 1110xxxx    10xxxxxx   10xxxxxx              (4+6+6)=16     FFFF hex (65535)
 11110xxx    10xxxxxx   10xxxxxx   10xxxxxx   (3+6*3)=21   10FFFF hex (1114111)
 The value of each individual byte indicates its UTF-8 function, as follows:
@ -77,12 +78,13 @@ The value of each individual byte indicates its UTF-8 function, as follows:
 80 to BF hex (128 to 191):  continuing byte in a multi-byte sequence.
 C2 to DF hex (194 to 223):  first byte of a two-byte sequence.
 E0 to EF hex (224 to 239):  first byte of a three-byte sequence.
 F0 to F7 hex (240 to 247):  first byte of a four-byte sequence.
 */
 /**
 * Returns the next character in sequence.
 */
-int StringUtf8Decoder::
+char32_t StringUtf8Decoder::
 get_next_character() {
  unsigned int result;
  while (!test_eof()) {
@ -125,6 +127,35 @@ get_next_character() {
      unsigned int three = (unsigned char)_input[_p++];
      result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
      return result;
    } else if ((result & 0xf8) == 0xf0) {
      // First byte of four.
      if (test_eof()) {
        if (_notify_ptr != nullptr) {
          (*_notify_ptr)
            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
        }
        return -1;
      }
      unsigned int two = (unsigned char)_input[_p++];
      if (test_eof()) {
        if (_notify_ptr != nullptr) {
          (*_notify_ptr)
            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
        }
        return -1;
      }
      unsigned int three = (unsigned char)_input[_p++];
      if (test_eof()) {
        if (_notify_ptr != nullptr) {
          (*_notify_ptr)
            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
        }
        return -1;
      }
      unsigned int four = (unsigned char)_input[_p++];
      result = ((result & 0x07) << 18) | ((two & 0x3f) << 12) | ((three & 0x3f) << 6) | (four & 0x3f);
      return result;
    }
    // Otherwise--the high bit is set but it is not one of the introductory
@ -144,7 +175,7 @@ get_next_character() {
 /**
 * Returns the next character in sequence.
 */
-int StringUnicodeDecoder::
+char32_t StringUtf16Decoder::
 get_next_character() {
  if (test_eof()) {
    return -1;
@ -159,5 +190,33 @@ get_next_character() {
    return -1;
  }
  unsigned int low = (unsigned char)_input[_p++];
-  return ((high << 8) | low);
+  int ch = ((high << 8) | low);
  /*
  using std::swap;
  if (ch == 0xfffe) {
    // This is a byte-swapped byte-order-marker.  That means we need to swap
    // the endianness of the rest of the stream.
    char *data = (char *)_input.data();
    for (size_t p = _p; p < _input.size() - 1; p += 2) {
      std::swap(data[p], data[p + 1]);
    }
    ch = 0xfeff;
  }
  */
  if (ch >= 0xd800 && ch < 0xdc00 && (_p + 1) < _input.size()) {
    // This is a high surrogate.  Look for a subsequent low surrogate.
    unsigned int high = (unsigned char)_input[_p];
    unsigned int low = (unsigned char)_input[_p + 1];
    int ch2 = ((high << 8) | low);
    if (ch2 >= 0xdc00 && ch2 < 0xe000) {
      // Yes, this is a low surrogate.
      _p += 2;
      return 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
    }
  }
  // No, this is just a regular character, or an unpaired surrogate.
  return ch;
 }
--- a/dtool/src/dtoolutil/stringDecoder.h
+++ b/dtool/src/dtoolutil/stringDecoder.h
@ -26,7 +26,7 @@ public:
  INLINE StringDecoder(const std::string &input);
  virtual ~StringDecoder();
-  virtual int get_next_character();
+  virtual char32_t get_next_character();
  INLINE bool is_eof();
  static void set_notify_ptr(std::ostream *ptr);
@ -48,20 +48,23 @@ class StringUtf8Decoder : public StringDecoder {
 public:
  INLINE StringUtf8Decoder(const std::string &input);
-  virtual int get_next_character();
+  virtual char32_t get_next_character();
 };
 /**
 * This decoder extracts bytes two at a time to get a plain wide
- * character sequence.
+ * character sequence.  It supports surrogate pairs.
 */
-class StringUnicodeDecoder : public StringDecoder {
+class StringUtf16Decoder : public StringDecoder {
 public:
-  INLINE StringUnicodeDecoder(const std::string &input);
+  INLINE StringUtf16Decoder(const std::string &input);
-  virtual int get_next_character();
+  virtual char32_t get_next_character();
 };
 // Deprecated alias of StringUtf16Decoder.
 typedef StringUtf16Decoder StringUnicodeDecoder;
 #include "stringDecoder.I"
 #endif
--- a/dtool/src/dtoolutil/textEncoder.I
+++ b/dtool/src/dtoolutil/textEncoder.I
@ -169,8 +169,23 @@ append_text(const std::string &text) {
 * wide character, up to 21 bits in Unicode (U+10FFFF).
 */
 INLINE void TextEncoder::
-append_unicode_char(int character) {
+append_unicode_char(char32_t character) {
 #if WCHAR_MAX >= 0x10FFFF
  // wchar_t might be UTF-32.
  _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
 #else
  if ((character & ~0xffff) == 0) {
    _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
  } else {
    // Encode as a surrogate pair.
    uint32_t v = (uint32_t)character - 0x10000u;
    wchar_t wstr[2] = {
      (wchar_t)((v >> 10u) | 0xd800u),
      (wchar_t)((v & 0x3ffu) | 0xdc00u),
    };
    _wtext = get_wtext() + std::wstring(wstr, 2);
  }
 #endif
  _flags = (_flags | F_got_wtext) & ~F_got_text;
  text_changed();
 }
--- a/dtool/src/dtoolutil/textEncoder.cxx
+++ b/dtool/src/dtoolutil/textEncoder.cxx
@ -21,7 +21,7 @@ using std::ostream;
 using std::string;
 using std::wstring;
-TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859;
+TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_utf8;
 /**
 * Adjusts the text stored within the encoder to all uppercase letters
@ -109,11 +109,11 @@ is_wtext() const {
 }
 /**
- * Encodes a single wide char into a one-, two-, or three-byte string,
+ * Encodes a single Unicode character into a one-, two-, three-, or four-byte
- * according to the given encoding system.
+ * string, according to the given encoding system.
 */
 string TextEncoder::
-encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
+encode_wchar(char32_t ch, TextEncoder::Encoding encoding) {
  switch (encoding) {
  case E_iso8859:
    if ((ch & ~0xff) == 0) {
@ -145,17 +145,38 @@ encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
      return
        string(1, (char)((ch >> 6) | 0xc0)) +
        string(1, (char)((ch & 0x3f) | 0x80));
-    } else {
+    } else if ((ch & ~0xffff) == 0) {
      return
        string(1, (char)((ch >> 12) | 0xe0)) +
        string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
        string(1, (char)((ch & 0x3f) | 0x80));
    } else {
      return
        string(1, (char)((ch >> 18) | 0xf0)) +
        string(1, (char)(((ch >> 12) & 0x3f) | 0x80)) +
        string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
        string(1, (char)((ch & 0x3f) | 0x80));
    }
-  case E_unicode:
+  case E_utf16be:
-    return
+    if ((ch & ~0xffff) == 0) {
-      string(1, (char)(ch >> 8)) +
+      // Note that this passes through surrogates and BOMs unharmed.
-      string(1, (char)(ch & 0xff));
+      return
        string(1, (char)(ch >> 8)) +
        string(1, (char)(ch & 0xff));
    } else {
      // Use a surrogate pair.
      uint32_t v = (uint32_t)ch - 0x10000u;
      uint16_t hi = (v >> 10u) | 0xd800u;
      uint16_t lo = (v & 0x3ffu) | 0xdc00u;
      char encoded[4] = {
        (char)(hi >> 8),
        (char)(hi & 0xff),
        (char)(lo >> 8),
        (char)(lo & 0xff),
      };
      return string(encoded, 4);
    }
  }
  return "";
@ -169,8 +190,25 @@ string TextEncoder::
 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
  string result;
-  for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
+  for (size_t i = 0; i < wtext.size(); ++i) {
-    result += encode_wchar(*pi, encoding);
+    wchar_t ch = wtext[i];
    // On some systems, wstring may be UTF-16, and contain surrogate pairs.
 #if WCHAR_MAX < 0x10FFFF
    if (ch >= 0xd800 && ch < 0xdc00 && (i + 1) < wtext.size()) {
      // This is a high surrogate.  Look for a subsequent low surrogate.
      wchar_t ch2 = wtext[i + 1];
      if (ch2 >= 0xdc00 && ch2 < 0xe000) {
        // Yes, this is a low surrogate.
        char32_t code_point = 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
        result += encode_wchar(code_point, encoding);
        i++;
        continue;
      }
    }
 #endif
    result += encode_wchar(ch, encoding);
  }
  return result;
@ -189,9 +227,9 @@ decode_text(const string &text, TextEncoder::Encoding encoding) {
      return decode_text_impl(decoder);
    }
-  case E_unicode:
+  case E_utf16be:
    {
-      StringUnicodeDecoder decoder(text);
+      StringUtf16Decoder decoder(text);
      return decode_text_impl(decoder);
    }
@ -213,7 +251,7 @@ decode_text_impl(StringDecoder &decoder) {
  wstring result;
  // bool expand_amp = get_expand_amp();
-  wchar_t character = decoder.get_next_character();
+  char32_t character = decoder.get_next_character();
  while (!decoder.is_eof()) {
    /*
    if (character == '&' && expand_amp) {
@ -221,7 +259,14 @@ decode_text_impl(StringDecoder &decoder) {
      character = expand_amp_sequence(decoder);
    }
    */
-    result += character;
+    if (character <= WCHAR_MAX) {
      result += character;
    } else {
      // We need to encode this as a surrogate pair.
      uint32_t v = (uint32_t)character - 0x10000u;
      result += (wchar_t)((v >> 10u) | 0xd800u);
      result += (wchar_t)((v & 0x3ffu) | 0xdc00u);
    }
    character = decoder.get_next_character();
  }
@ -335,8 +380,8 @@ operator << (ostream &out, TextEncoder::Encoding encoding) {
  case TextEncoder::E_utf8:
    return out << "utf8";
-  case TextEncoder::E_unicode:
+  case TextEncoder::E_utf16be:
-    return out << "unicode";
+    return out << "utf16be";
  };
  return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
@ -354,8 +399,9 @@ operator >> (istream &in, TextEncoder::Encoding &encoding) {
    encoding = TextEncoder::E_iso8859;
  } else if (word == "utf8" || word == "utf-8") {
    encoding = TextEncoder::E_utf8;
-  } else if (word == "unicode") {
+  } else if (word == "unicode" || word == "utf16be" || word == "utf-16be" ||
-    encoding = TextEncoder::E_unicode;
+                                  word == "utf16-be" || word == "utf-16-be") {
    encoding = TextEncoder::E_utf16be;
  } else {
    ostream *notify_ptr = StringDecoder::get_notify_ptr();
    if (notify_ptr != nullptr) {
--- a/dtool/src/dtoolutil/textEncoder.h
+++ b/dtool/src/dtoolutil/textEncoder.h
@ -35,7 +35,10 @@ PUBLISHED:
  enum Encoding {
    E_iso8859,
    E_utf8,
-    E_unicode
+    E_utf16be,
    // Deprecated alias for E_utf16be
    E_unicode = E_utf16be,
  };
  INLINE TextEncoder();
@ -70,7 +73,7 @@ PUBLISHED:
  INLINE std::string get_text(Encoding encoding) const;
  INLINE void append_text(const std::string &text);
 #endif
-  INLINE void append_unicode_char(int character);
+  INLINE void append_unicode_char(char32_t character);
  INLINE size_t get_num_chars() const;
  INLINE int get_unicode_char(size_t index) const;
  INLINE void set_unicode_char(size_t index, int character);
@ -103,13 +106,13 @@ PUBLISHED:
  bool is_wtext() const;
 #ifdef CPPPARSER
-  EXTEND static PyObject *encode_wchar(wchar_t ch, Encoding encoding);
+  EXTEND static PyObject *encode_wchar(char32_t ch, Encoding encoding);
  EXTEND INLINE PyObject *encode_wtext(const std::wstring &wtext) const;
  EXTEND static PyObject *encode_wtext(const std::wstring &wtext, Encoding encoding);
  EXTEND INLINE PyObject *decode_text(PyObject *text) const;
  EXTEND static PyObject *decode_text(PyObject *text, Encoding encoding);
 #else
-  static std::string encode_wchar(wchar_t ch, Encoding encoding);
+  static std::string encode_wchar(char32_t ch, Encoding encoding);
  INLINE std::string encode_wtext(const std::wstring &wtext) const;
  static std::string encode_wtext(const std::wstring &wtext, Encoding encoding);
  INLINE std::wstring decode_text(const std::string &text) const;