From 29b577971f71c0e6d2dfc07fb8eeeef6712d2f58 Mon Sep 17 00:00:00 2001
From: rdb <git@rdb.name>
Date: Mon, 8 Oct 2018 22:33:54 +0200
Subject: [PATCH] dtoolutil: improve Unicode encoding/decoding, support non-BMP
 chars

- Support encoding and decoding four-byte UTF-8 sequences
- E_unicode supports surrogate pairs, renamed to E_utf16be for clarity
- char32_t should be used for storing a Unicode code point
---
 dtool/src/dtoolutil/stringDecoder.I   |  2 +-
 dtool/src/dtoolutil/stringDecoder.cxx | 79 +++++++++++++++++++++----
 dtool/src/dtoolutil/stringDecoder.h   | 15 +++--
 dtool/src/dtoolutil/textEncoder.I     | 17 +++++-
 dtool/src/dtoolutil/textEncoder.cxx   | 84 +++++++++++++++++++++------
 dtool/src/dtoolutil/textEncoder.h     | 11 ++--
 6 files changed, 167 insertions(+), 41 deletions(-)

diff --git a/dtool/src/dtoolutil/stringDecoder.I b/dtool/src/dtoolutil/stringDecoder.I
index f7a3b14701..ce128833d0 100644
--- a/dtool/src/dtoolutil/stringDecoder.I
+++ b/dtool/src/dtoolutil/stringDecoder.I
@@ -53,5 +53,5 @@ StringUtf8Decoder(const std::string &input) : StringDecoder(input) {
  *
  */
 INLINE StringUnicodeDecoder::
-StringUnicodeDecoder(const std::string &input) : StringDecoder(input) {
+StringUtf16Decoder(const std::string &input) : StringDecoder(input) {
 }
diff --git a/dtool/src/dtoolutil/stringDecoder.cxx b/dtool/src/dtoolutil/stringDecoder.cxx
index e77e0c5e13..f9ecfdecd3 100644
--- a/dtool/src/dtoolutil/stringDecoder.cxx
+++ b/dtool/src/dtoolutil/stringDecoder.cxx
@@ -26,7 +26,7 @@ StringDecoder::
 /**
  * Returns the next character in sequence.
  */
-int StringDecoder::
+char32_t StringDecoder::
 get_next_character() {
   if (test_eof()) {
     return -1;
@@ -57,19 +57,20 @@ get_notify_ptr() {
 
 /*
 In UTF-8, each 16-bit Unicode character is encoded as a sequence of
-one, two, or three 8-bit bytes, depending on the value of the
+one, two, three or four 8-bit bytes, depending on the value of the
 character. The following table shows the format of such UTF-8 byte
 sequences (where the "free bits" shown by x's in the table are
 combined in the order shown, and interpreted from most significant to
 least significant):
 
  Binary format of bytes in sequence:
-                                        Number of    Maximum expressible
- 1st byte     2nd byte    3rd byte      free bits:      Unicode value:
+                                              Number of    Maximum expressible
+ 1st byte    2nd byte   3rd byte   4th byte   free bits:     Unicode value:
 
- 0xxxxxxx                                  7           007F hex   (127)
- 110xxxxx     10xxxxxx                  (5+6)=11       07FF hex  (2047)
- 1110xxxx     10xxxxxx    10xxxxxx     (4+6+6)=16      FFFF hex (65535)
+ 0xxxxxxx                                         7          007F hex   (127)
+ 110xxxxx    10xxxxxx                          (5+6)=11      07FF hex  (2047)
+ 1110xxxx    10xxxxxx   10xxxxxx              (4+6+6)=16     FFFF hex (65535)
+ 11110xxx    10xxxxxx   10xxxxxx   10xxxxxx   (3+6*3)=21   10FFFF hex (1114111)
 
 The value of each individual byte indicates its UTF-8 function, as follows:
 
@@ -77,12 +78,13 @@ The value of each individual byte indicates its UTF-8 function, as follows:
  80 to BF hex (128 to 191):  continuing byte in a multi-byte sequence.
  C2 to DF hex (194 to 223):  first byte of a two-byte sequence.
  E0 to EF hex (224 to 239):  first byte of a three-byte sequence.
+ F0 to F7 hex (240 to 247):  first byte of a four-byte sequence.
 */
 
 /**
  * Returns the next character in sequence.
  */
-int StringUtf8Decoder::
+char32_t StringUtf8Decoder::
 get_next_character() {
   unsigned int result;
   while (!test_eof()) {
@@ -125,6 +127,35 @@ get_next_character() {
       unsigned int three = (unsigned char)_input[_p++];
       result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
       return result;
+
+    } else if ((result & 0xf8) == 0xf0) {
+      // First byte of four.
+      if (test_eof()) {
+        if (_notify_ptr != nullptr) {
+          (*_notify_ptr)
+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
+        }
+        return -1;
+      }
+      unsigned int two = (unsigned char)_input[_p++];
+      if (test_eof()) {
+        if (_notify_ptr != nullptr) {
+          (*_notify_ptr)
+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
+        }
+        return -1;
+      }
+      unsigned int three = (unsigned char)_input[_p++];
+      if (test_eof()) {
+        if (_notify_ptr != nullptr) {
+          (*_notify_ptr)
+            << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
+        }
+        return -1;
+      }
+      unsigned int four = (unsigned char)_input[_p++];
+      result = ((result & 0x07) << 18) | ((two & 0x3f) << 12) | ((three & 0x3f) << 6) | (four & 0x3f);
+      return result;
     }
 
     // Otherwise--the high bit is set but it is not one of the introductory
@@ -144,7 +175,7 @@ get_next_character() {
 /**
  * Returns the next character in sequence.
  */
-int StringUnicodeDecoder::
+char32_t StringUtf16Decoder::
 get_next_character() {
   if (test_eof()) {
     return -1;
@@ -159,5 +190,33 @@ get_next_character() {
     return -1;
   }
   unsigned int low = (unsigned char)_input[_p++];
-  return ((high << 8) | low);
+  int ch = ((high << 8) | low);
+
+  /*
+  using std::swap;
+
+  if (ch == 0xfffe) {
+    // This is a byte-swapped byte-order-marker.  That means we need to swap
+    // the endianness of the rest of the stream.
+    char *data = (char *)_input.data();
+    for (size_t p = _p; p < _input.size() - 1; p += 2) {
+      std::swap(data[p], data[p + 1]);
+    }
+    ch = 0xfeff;
+  }
+  */
+
+  if (ch >= 0xd800 && ch < 0xdc00 && (_p + 1) < _input.size()) {
+    // This is a high surrogate.  Look for a subsequent low surrogate.
+    unsigned int high = (unsigned char)_input[_p];
+    unsigned int low = (unsigned char)_input[_p + 1];
+    int ch2 = ((high << 8) | low);
+    if (ch2 >= 0xdc00 && ch2 < 0xe000) {
+      // Yes, this is a low surrogate.
+      _p += 2;
+      return 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
+    }
+  }
+  // No, this is just a regular character, or an unpaired surrogate.
+  return ch;
 }
diff --git a/dtool/src/dtoolutil/stringDecoder.h b/dtool/src/dtoolutil/stringDecoder.h
index c0b2534ee2..6885f77e08 100644
--- a/dtool/src/dtoolutil/stringDecoder.h
+++ b/dtool/src/dtoolutil/stringDecoder.h
@@ -26,7 +26,7 @@ public:
   INLINE StringDecoder(const std::string &input);
   virtual ~StringDecoder();
 
-  virtual int get_next_character();
+  virtual char32_t get_next_character();
   INLINE bool is_eof();
 
   static void set_notify_ptr(std::ostream *ptr);
@@ -48,20 +48,23 @@ class StringUtf8Decoder : public StringDecoder {
 public:
   INLINE StringUtf8Decoder(const std::string &input);
 
-  virtual int get_next_character();
+  virtual char32_t get_next_character();
 };
 
 /**
  * This decoder extracts characters two at a time to get a plain wide
- * character sequence.
+ * character sequence.  It supports surrogate pairs.
  */
-class StringUnicodeDecoder : public StringDecoder {
+class StringUtf16Decoder : public StringDecoder {
 public:
-  INLINE StringUnicodeDecoder(const std::string &input);
+  INLINE StringUtf16Decoder(const std::string &input);
 
-  virtual int get_next_character();
+  virtual char32_t get_next_character();
 };
 
+// Deprecated alias of StringUtf16Decoder.
+typedef StringUtf16Decoder StringUnicodeDecoder;
+
 #include "stringDecoder.I"
 
 #endif
diff --git a/dtool/src/dtoolutil/textEncoder.I b/dtool/src/dtoolutil/textEncoder.I
index 417ef386e2..766319d6da 100644
--- a/dtool/src/dtoolutil/textEncoder.I
+++ b/dtool/src/dtoolutil/textEncoder.I
@@ -169,8 +169,23 @@ append_text(const std::string &text) {
  * wide character, up to 16 bits in Unicode.
  */
 INLINE void TextEncoder::
-append_unicode_char(int character) {
+append_unicode_char(char32_t character) {
+#if WCHAR_MAX >= 0x10FFFF
+  // wchar_t might be UTF-32.
   _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
+#else
+  if ((character & ~0xffff) == 0) {
+    _wtext = get_wtext() + std::wstring(1, (wchar_t)character);
+  } else {
+    // Encode as a surrogate pair.
+    uint32_t v = (uint32_t)character - 0x10000u;
+    wchar_t wstr[2] = {
+      (wchar_t)((v >> 10u) | 0xd800u),
+      (wchar_t)((v & 0x3ffu) | 0xdc00u),
+    };
+    _wtext = get_wtext() + std::wstring(wstr, 2);
+  }
+#endif
   _flags = (_flags | F_got_wtext) & ~F_got_text;
   text_changed();
 }
diff --git a/dtool/src/dtoolutil/textEncoder.cxx b/dtool/src/dtoolutil/textEncoder.cxx
index da835b7bfb..1065f21dcb 100644
--- a/dtool/src/dtoolutil/textEncoder.cxx
+++ b/dtool/src/dtoolutil/textEncoder.cxx
@@ -21,7 +21,7 @@ using std::ostream;
 using std::string;
 using std::wstring;
 
-TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859;
+TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_utf8;
 
 /**
  * Adjusts the text stored within the encoder to all uppercase letters
@@ -109,11 +109,11 @@ is_wtext() const {
 }
 
 /**
- * Encodes a single wide char into a one-, two-, or three-byte string,
- * according to the given encoding system.
+ * Encodes a single Unicode character into a one-, two-, three-, or four-byte
+ * string, according to the given encoding system.
  */
 string TextEncoder::
-encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
+encode_wchar(char32_t ch, TextEncoder::Encoding encoding) {
   switch (encoding) {
   case E_iso8859:
     if ((ch & ~0xff) == 0) {
@@ -145,17 +145,38 @@ encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
       return
         string(1, (char)((ch >> 6) | 0xc0)) +
         string(1, (char)((ch & 0x3f) | 0x80));
-    } else {
+    } else if ((ch & ~0xffff) == 0) {
       return
         string(1, (char)((ch >> 12) | 0xe0)) +
         string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
         string(1, (char)((ch & 0x3f) | 0x80));
+    } else {
+      return
+        string(1, (char)((ch >> 18) | 0xf0)) +
+        string(1, (char)(((ch >> 12) & 0x3f) | 0x80)) +
+        string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
+        string(1, (char)((ch & 0x3f) | 0x80));
     }
 
-  case E_unicode:
-    return
-      string(1, (char)(ch >> 8)) +
-      string(1, (char)(ch & 0xff));
+  case E_utf16be:
+    if ((ch & ~0xffff) == 0) {
+      // Note that this passes through surrogates and BOMs unharmed.
+      return
+        string(1, (char)(ch >> 8)) +
+        string(1, (char)(ch & 0xff));
+    } else {
+      // Use a surrogate pair.
+      uint32_t v = (uint32_t)ch - 0x10000u;
+      uint16_t hi = (v >> 10u) | 0xd800u;
+      uint16_t lo = (v & 0x3ffu) | 0xdc00u;
+      char encoded[4] = {
+        (char)(hi >> 8),
+        (char)(hi & 0xff),
+        (char)(lo >> 8),
+        (char)(lo & 0xff),
+      };
+      return string(encoded, 4);
+    }
   }
 
   return "";
@@ -169,8 +190,25 @@ string TextEncoder::
 encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
   string result;
 
-  for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
-    result += encode_wchar(*pi, encoding);
+  for (size_t i = 0; i < wtext.size(); ++i) {
+    wchar_t ch = wtext[i];
+
+    // On some systems, wstring may be UTF-16, and contain surrogate pairs.
+#if WCHAR_MAX < 0x10FFFF
+    if (ch >= 0xd800 && ch < 0xdc00 && (i + 1) < wtext.size()) {
+      // This is a high surrogate.  Look for a subsequent low surrogate.
+      wchar_t ch2 = wtext[i + 1];
+      if (ch2 >= 0xdc00 && ch2 < 0xe000) {
+        // Yes, this is a low surrogate.
+        char32_t code_point = 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
+        result += encode_wchar(code_point, encoding);
+        i++;
+        continue;
+      }
+    }
+#endif
+
+    result += encode_wchar(ch, encoding);
   }
 
   return result;
@@ -189,9 +227,9 @@ decode_text(const string &text, TextEncoder::Encoding encoding) {
       return decode_text_impl(decoder);
     }
 
-  case E_unicode:
+  case E_utf16be:
     {
-      StringUnicodeDecoder decoder(text);
+      StringUtf16Decoder decoder(text);
       return decode_text_impl(decoder);
     }
 
@@ -213,7 +251,7 @@ decode_text_impl(StringDecoder &decoder) {
   wstring result;
   // bool expand_amp = get_expand_amp();
 
-  wchar_t character = decoder.get_next_character();
+  char32_t character = decoder.get_next_character();
   while (!decoder.is_eof()) {
     /*
     if (character == '&' && expand_amp) {
@@ -221,7 +259,14 @@ decode_text_impl(StringDecoder &decoder) {
       character = expand_amp_sequence(decoder);
     }
     */
-    result += character;
+    if (character <= WCHAR_MAX) {
+      result += character;
+    } else {
+      // We need to encode this as a surrogate pair.
+      uint32_t v = (uint32_t)character - 0x10000u;
+      result += (wchar_t)((v >> 10u) | 0xd800u);
+      result += (wchar_t)((v & 0x3ffu) | 0xdc00u);
+    }
     character = decoder.get_next_character();
   }
 
@@ -335,8 +380,8 @@ operator << (ostream &out, TextEncoder::Encoding encoding) {
   case TextEncoder::E_utf8:
     return out << "utf8";
 
-  case TextEncoder::E_unicode:
-    return out << "unicode";
+  case TextEncoder::E_utf16be:
+    return out << "utf16be";
   };
 
   return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
@@ -354,8 +399,9 @@ operator >> (istream &in, TextEncoder::Encoding &encoding) {
     encoding = TextEncoder::E_iso8859;
   } else if (word == "utf8" || word == "utf-8") {
     encoding = TextEncoder::E_utf8;
-  } else if (word == "unicode") {
-    encoding = TextEncoder::E_unicode;
+  } else if (word == "unicode" || word == "utf16be" || word == "utf-16be" ||
+                                  word == "utf16-be" || word == "utf-16-be") {
+    encoding = TextEncoder::E_utf16be;
   } else {
     ostream *notify_ptr = StringDecoder::get_notify_ptr();
     if (notify_ptr != nullptr) {
diff --git a/dtool/src/dtoolutil/textEncoder.h b/dtool/src/dtoolutil/textEncoder.h
index baa0ef9b3e..71d93a71ca 100644
--- a/dtool/src/dtoolutil/textEncoder.h
+++ b/dtool/src/dtoolutil/textEncoder.h
@@ -35,7 +35,10 @@ PUBLISHED:
   enum Encoding {
     E_iso8859,
     E_utf8,
-    E_unicode
+    E_utf16be,
+
+    // Deprecated alias for E_utf16be
+    E_unicode = E_utf16be,
   };
 
   INLINE TextEncoder();
@@ -70,7 +73,7 @@ PUBLISHED:
   INLINE std::string get_text(Encoding encoding) const;
   INLINE void append_text(const std::string &text);
 #endif
-  INLINE void append_unicode_char(int character);
+  INLINE void append_unicode_char(char32_t character);
   INLINE size_t get_num_chars() const;
   INLINE int get_unicode_char(size_t index) const;
   INLINE void set_unicode_char(size_t index, int character);
@@ -103,13 +106,13 @@ PUBLISHED:
   bool is_wtext() const;
 
 #ifdef CPPPARSER
-  EXTEND static PyObject *encode_wchar(wchar_t ch, Encoding encoding);
+  EXTEND static PyObject *encode_wchar(char32_t ch, Encoding encoding);
   EXTEND INLINE PyObject *encode_wtext(const std::wstring &wtext) const;
   EXTEND static PyObject *encode_wtext(const std::wstring &wtext, Encoding encoding);
   EXTEND INLINE PyObject *decode_text(PyObject *text) const;
   EXTEND static PyObject *decode_text(PyObject *text, Encoding encoding);
 #else
-  static std::string encode_wchar(wchar_t ch, Encoding encoding);
+  static std::string encode_wchar(char32_t ch, Encoding encoding);
   INLINE std::string encode_wtext(const std::wstring &wtext) const;
   static std::string encode_wtext(const std::wstring &wtext, Encoding encoding);
   INLINE std::wstring decode_text(const std::string &text) const;