dtoolutil: improve Unicode encoding/decoding, support non-BMP chars

- Support encoding and decoding four-byte UTF-8 sequences
- E_unicode supports surrogate pairs, renamed to E_utf16be for clarity
- char32_t should be used for storing a Unicode code point
2025-09-30 08:44:19 -04:00 · 2018-10-08 22:33:54 +02:00 · 2018-10-08 22:33:54 +02:00 · 29b577971f
commit 29b577971f
parent 9061fd9416
6 changed files with 167 additions and 41 deletions
--- a/dtool/src/dtoolutil/stringDecoder.I
+++ b/dtool/src/dtoolutil/stringDecoder.I
@ -53,5 +53,5 @@ StringUtf8Decoder(const std::string &input) : StringDecoder(input) {
 *
 */
 INLINE StringUnicodeDecoder::
-StringUnicodeDecoder(const std::string &input) : StringDecoder(input) {
+StringUtf16Decoder(const std::string &input) : StringDecoder(input) {
 }
--- a/dtool/src/dtoolutil/stringDecoder.cxx
+++ b/dtool/src/dtoolutil/stringDecoder.cxx
@ -26,7 +26,7 @@ StringDecoder::
 /**
 * Returns the next character in sequence.
 */
-int StringDecoder::
+char32_t StringDecoder::
 get_next_character() {
  if (test_eof()) {
    return -1;
@ -57,19 +57,20 @@ get_notify_ptr() {

 /*
 In UTF-8, each Unicode code point is encoded as a sequence of
-one, two, or three 8-bit bytes, depending on the value of the
+one, two, three or four 8-bit bytes, depending on the value of the
 character. The following table shows the format of such UTF-8 byte
 sequences (where the "free bits" shown by x's in the table are
 combined in the order shown, and interpreted from most significant to
 least significant):

 Binary format of bytes in sequence:
-                                        Number of    Maximum expressible
- 1st byte     2nd byte    3rd byte      free bits:      Unicode value:
+                                              Number of    Maximum expressible
+ 1st byte    2nd byte   3rd byte   4th byte   free bits:     Unicode value:

- 0xxxxxxx                                  7           007F hex   (127)
- 110xxxxx     10xxxxxx                  (5+6)=11       07FF hex  (2047)
- 1110xxxx     10xxxxxx    10xxxxxx     (4+6+6)=16      FFFF hex (65535)
+ 0xxxxxxx                                         7          007F hex   (127)
+ 110xxxxx    10xxxxxx                          (5+6)=11      07FF hex  (2047)
+ 1110xxxx    10xxxxxx   10xxxxxx              (4+6+6)=16     FFFF hex (65535)
+ 11110xxx    10xxxxxx   10xxxxxx   10xxxxxx   (3+6*3)=21   10FFFF hex (1114111)

 The value of each individual byte indicates its UTF-8 function, as follows:

@ -77,12 +78,13 @@ The value of each individual byte indicates its UTF-8 function, as follows:
 80 to BF hex (128 to 191):  continuing byte in a multi-byte sequence.
 C2 to DF hex (194 to 223):  first byte of a two-byte sequence.
 E0 to EF hex (224 to 239):  first byte of a three-byte sequence.
+ F0 to F7 hex (240 to 247):  first byte of a four-byte sequence.
 */

 /**
 * Returns the next character in sequence.
 */
-int StringUtf8Decoder::
+char32_t StringUtf8Decoder::
 get_next_character() {
  unsigned int result;
  while (!test_eof()) {
@ -125,6 +127,35 @@ get_next_character() {
      unsigned int three = (unsigned char)_input[_p++];
      result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
      return result;
+
+    } else if ((result & 0xf8) == 0xf0) {
+      // First byte of four.
+      if (test_eof()) {
+        if (_notify_ptr != nullptr) {
+          (*_notify_ptr)
+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
+        }
+        return -1;
+      }
+      unsigned int two = (unsigned char)_input[_p++];
+      if (test_eof()) {
+        if (_notify_ptr != nullptr) {
+          (*_notify_ptr)
+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
+        }
+        return -1;
+      }
+      unsigned int three = (unsigned char)_input[_p++];
+      if (test_eof()) {
+        if (_notify_ptr != nullptr) {
+          (*_notify_ptr)
+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
+        }
+        return -1;
+      }
+      unsigned int four = (unsigned char)_input[_p++];
+      result = ((result & 0x07) << 18) | ((two & 0x3f) << 12) | ((three & 0x3f) << 6) | (four & 0x3f);
+      return result;
    }

    // Otherwise--the high bit is set but it is not one of the introductory
@ -144,7 +175,7 @@ get_next_character() {
 /**
 * Returns the next character in sequence.
 */
-int StringUnicodeDecoder::
+char32_t StringUtf16Decoder::
 get_next_character() {
  if (test_eof()) {
    return -1;
@ -159,5 +190,33 @@ get_next_character() {
    return -1;
  }
  unsigned int low = (unsigned char)_input[_p++];
-  return ((high << 8) | low);
+  int ch = ((high << 8) | low);
+
+  /*
+  using std::swap;
+
+  if (ch == 0xfffe) {
+    // This is a byte-swapped byte-order-marker.  That means we need to swap
+    // the endianness of the rest of the stream.
+    char *data = (char *)_input.data();
+    for (size_t p = _p; p < _input.size() - 1; p += 2) {
+      std::swap(data[p], data[p + 1]);
+    }
+    ch = 0xfeff;
+  }
+  */
+
+  if (ch >= 0xd800 && ch < 0xdc00 && (_p + 1) < _input.size()) {
+    // This is a high surrogate.  Look for a subsequent low surrogate.
+    unsigned int high = (unsigned char)_input[_p];
+    unsigned int low = (unsigned char)_input[_p + 1];
+    int ch2 = ((high << 8) | low);
+    if (ch2 >= 0xdc00 && ch2 < 0xe000) {
+      // Yes, this is a low surrogate.
+      _p += 2;
+      return 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
+    }
+  }
+  // No, this is just a regular character, or an unpaired surrogate.
+  return ch;
 }
--- a/dtool/src/dtoolutil/stringDecoder.h
+++ b/dtool/src/dtoolutil/stringDecoder.h
@ -26,7 +26,7 @@ public:
  INLINE StringDecoder(const std::string &input);
  virtual ~StringDecoder();

-  virtual int get_next_character();
+  virtual char32_t get_next_character();
  INLINE bool is_eof();

  static void set_notify_ptr(std::ostream *ptr);
@ -48,20 +48,23 @@ class StringUtf8Decoder : public StringDecoder {
 public:
  INLINE StringUtf8Decoder(const std::string &input);

-  virtual int get_next_character();
+  virtual char32_t get_next_character();
 };

 /**
 * This decoder extracts characters two at a time to get a plain wide
- * character sequence.
+ * character sequence.  It supports surrogate pairs.
 */
-class StringUnicodeDecoder : public StringDecoder {
+class StringUtf16Decoder : public StringDecoder {
 public:
-  INLINE StringUnicodeDecoder(const std::string &input);
+  INLINE StringUtf16Decoder(const std::string &input);

-  virtual int get_next_character();
+  virtual char32_t get_next_character();
 };

+// Deprecated alias of StringUtf16Decoder.
+typedef StringUtf16Decoder StringUnicodeDecoder;
+
 #include "stringDecoder.I"

 #endif
--- a/dtool/src/dtoolutil/textEncoder.I
+++ b/dtool/src/dtoolutil/textEncoder.I
@ -169,8 +169,23 @@ append_text(const std::string &text) {
 * wide character, up to 21 bits in Unicode.
 */
 INLINE void TextEncoder::
-append_unicode_char(int character) {
+append_unicode_char(char32_t character) {
+#if WCHAR_MAX >= 0x10FFFF
+  // wchar_t might be UTF-32.
  _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
+#else
+  if ((character & ~0xffff) == 0) {
+    _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
+  } else {
+    // Encode as a surrogate pair.
+    uint32_t v = (uint32_t)character - 0x10000u;
+    wchar_t wstr[2] = {
+      (wchar_t)((v >> 10u) | 0xd800u),
+      (wchar_t)((v & 0x3ffu) | 0xdc00u),
+    };
+    _wtext = get_wtext() + std::wstring(wstr, 2);
+  }
+#endif
  _flags = (_flags | F_got_wtext) & ~F_got_text;
  text_changed();
 }
--- a/dtool/src/dtoolutil/textEncoder.cxx
+++ b/dtool/src/dtoolutil/textEncoder.cxx
@ -21,7 +21,7 @@ using std::ostream;
 using std::string;
 using std::wstring;

-TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859;
+TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_utf8;

 /**
 * Adjusts the text stored within the encoder to all uppercase letters
@ -109,11 +109,11 @@ is_wtext() const {
 }

 /**
- * Encodes a single wide char into a one-, two-, or three-byte string,
- * according to the given encoding system.
+ * Encodes a single Unicode character into a one-, two-, three-, or four-byte
+ * string, according to the given encoding system.
 */
 string TextEncoder::
-encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
+encode_wchar(char32_t ch, TextEncoder::Encoding encoding) {
  switch (encoding) {
  case E_iso8859:
    if ((ch & ~0xff) == 0) {
@ -145,17 +145,38 @@ encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
      return
        string(1, (char)((ch >> 6) | 0xc0)) +
        string(1, (char)((ch & 0x3f) | 0x80));
-    } else {
+    } else if ((ch & ~0xffff) == 0) {
      return
        string(1, (char)((ch >> 12) | 0xe0)) +
        string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
        string(1, (char)((ch & 0x3f) | 0x80));
+    } else {
+      return
+        string(1, (char)((ch >> 18) | 0xf0)) +
+        string(1, (char)(((ch >> 12) & 0x3f) | 0x80)) +
+        string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
+        string(1, (char)((ch & 0x3f) | 0x80));
    }

-  case E_unicode:
-    return
-      string(1, (char)(ch >> 8)) +
-      string(1, (char)(ch & 0xff));
+  case E_utf16be:
+    if ((ch & ~0xffff) == 0) {
+      // Note that this passes through surrogates and BOMs unharmed.
+      return
+        string(1, (char)(ch >> 8)) +
+        string(1, (char)(ch & 0xff));
+    } else {
+      // Use a surrogate pair.
+      uint32_t v = (uint32_t)ch - 0x10000u;
+      uint16_t hi = (v >> 10u) | 0xd800u;
+      uint16_t lo = (v & 0x3ffu) | 0xdc00u;
+      char encoded[4] = {
+        (char)(hi >> 8),
+        (char)(hi & 0xff),
+        (char)(lo >> 8),
+        (char)(lo & 0xff),
+      };
+      return string(encoded, 4);
+    }
  }

  return "";
@ -169,8 +190,25 @@ string TextEncoder::
 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
  string result;

-  for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
-    result += encode_wchar(*pi, encoding);
+  for (size_t i = 0; i < wtext.size(); ++i) {
+    wchar_t ch = wtext[i];
+
+    // On some systems, wstring may be UTF-16, and contain surrogate pairs.
+#if WCHAR_MAX < 0x10FFFF
+    if (ch >= 0xd800 && ch < 0xdc00 && (i + 1) < wtext.size()) {
+      // This is a high surrogate.  Look for a subsequent low surrogate.
+      wchar_t ch2 = wtext[i + 1];
+      if (ch2 >= 0xdc00 && ch2 < 0xe000) {
+        // Yes, this is a low surrogate.
+        char32_t code_point = 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
+        result += encode_wchar(code_point, encoding);
+        i++;
+        continue;
+      }
+    }
+#endif
+
+    result += encode_wchar(ch, encoding);
  }

  return result;
@ -189,9 +227,9 @@ decode_text(const string &text, TextEncoder::Encoding encoding) {
      return decode_text_impl(decoder);
    }

-  case E_unicode:
+  case E_utf16be:
    {
-      StringUnicodeDecoder decoder(text);
+      StringUtf16Decoder decoder(text);
      return decode_text_impl(decoder);
    }

@ -213,7 +251,7 @@ decode_text_impl(StringDecoder &decoder) {
  wstring result;
  // bool expand_amp = get_expand_amp();

-  wchar_t character = decoder.get_next_character();
+  char32_t character = decoder.get_next_character();
  while (!decoder.is_eof()) {
    /*
    if (character == '&' && expand_amp) {
@ -221,7 +259,14 @@ decode_text_impl(StringDecoder &decoder) {
      character = expand_amp_sequence(decoder);
    }
    */
-    result += character;
+    if (character <= WCHAR_MAX) {
+      result += character;
+    } else {
+      // We need to encode this as a surrogate pair.
+      uint32_t v = (uint32_t)character - 0x10000u;
+      result += (wchar_t)((v >> 10u) | 0xd800u);
+      result += (wchar_t)((v & 0x3ffu) | 0xdc00u);
+    }
    character = decoder.get_next_character();
  }

@ -335,8 +380,8 @@ operator << (ostream &out, TextEncoder::Encoding encoding) {
  case TextEncoder::E_utf8:
    return out << "utf8";

-  case TextEncoder::E_unicode:
-    return out << "unicode";
+  case TextEncoder::E_utf16be:
+    return out << "utf16be";
  };

  return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
@ -354,8 +399,9 @@ operator >> (istream &in, TextEncoder::Encoding &encoding) {
    encoding = TextEncoder::E_iso8859;
  } else if (word == "utf8" || word == "utf-8") {
    encoding = TextEncoder::E_utf8;
-  } else if (word == "unicode") {
-    encoding = TextEncoder::E_unicode;
+  } else if (word == "unicode" || word == "utf16be" || word == "utf-16be" ||
+                                  word == "utf16-be" || word == "utf-16-be") {
+    encoding = TextEncoder::E_utf16be;
  } else {
    ostream *notify_ptr = StringDecoder::get_notify_ptr();
    if (notify_ptr != nullptr) {
--- a/dtool/src/dtoolutil/textEncoder.h
+++ b/dtool/src/dtoolutil/textEncoder.h
@ -35,7 +35,10 @@ PUBLISHED:
  enum Encoding {
    E_iso8859,
    E_utf8,
-    E_unicode
+    E_utf16be,
+
+    // Deprecated alias for E_utf16be
+    E_unicode = E_utf16be,
  };

  INLINE TextEncoder();
@ -70,7 +73,7 @@ PUBLISHED:
  INLINE std::string get_text(Encoding encoding) const;
  INLINE void append_text(const std::string &text);
 #endif
-  INLINE void append_unicode_char(int character);
+  INLINE void append_unicode_char(char32_t character);
  INLINE size_t get_num_chars() const;
  INLINE int get_unicode_char(size_t index) const;
  INLINE void set_unicode_char(size_t index, int character);
@ -103,13 +106,13 @@ PUBLISHED:
  bool is_wtext() const;

 #ifdef CPPPARSER
-  EXTEND static PyObject *encode_wchar(wchar_t ch, Encoding encoding);
+  EXTEND static PyObject *encode_wchar(char32_t ch, Encoding encoding);
  EXTEND INLINE PyObject *encode_wtext(const std::wstring &wtext) const;
  EXTEND static PyObject *encode_wtext(const std::wstring &wtext, Encoding encoding);
  EXTEND INLINE PyObject *decode_text(PyObject *text) const;
  EXTEND static PyObject *decode_text(PyObject *text, Encoding encoding);
 #else
-  static std::string encode_wchar(wchar_t ch, Encoding encoding);
+  static std::string encode_wchar(char32_t ch, Encoding encoding);
  INLINE std::string encode_wtext(const std::wstring &wtext) const;
  static std::string encode_wtext(const std::wstring &wtext, Encoding encoding);
  INLINE std::wstring decode_text(const std::string &text) const;