mirror of
https://github.com/panda3d/panda3d.git
synced 2025-09-30 08:44:19 -04:00
dtoolutil: improve Unicode encoding/decoding, support non-BMP chars
- Support encoding and decoding four-byte UTF-8 sequences - E_unicode supports surrogate pairs, renamed to E_utf16be for clarity - char32_t should be used for storing a Unicode code point
This commit is contained in:
parent
9061fd9416
commit
29b577971f
@ -53,5 +53,5 @@ StringUtf8Decoder(const std::string &input) : StringDecoder(input) {
|
||||
*
|
||||
*/
|
||||
INLINE StringUnicodeDecoder::
|
||||
StringUnicodeDecoder(const std::string &input) : StringDecoder(input) {
|
||||
StringUtf16Decoder(const std::string &input) : StringDecoder(input) {
|
||||
}
|
||||
|
@ -26,7 +26,7 @@ StringDecoder::
|
||||
/**
|
||||
* Returns the next character in sequence.
|
||||
*/
|
||||
int StringDecoder::
|
||||
char32_t StringDecoder::
|
||||
get_next_character() {
|
||||
if (test_eof()) {
|
||||
return -1;
|
||||
@ -57,19 +57,20 @@ get_notify_ptr() {
|
||||
|
||||
/*
|
||||
In UTF-8, each Unicode character is encoded as a sequence of
|
||||
one, two, or three 8-bit bytes, depending on the value of the
|
||||
one, two, three or four 8-bit bytes, depending on the value of the
|
||||
character. The following table shows the format of such UTF-8 byte
|
||||
sequences (where the "free bits" shown by x's in the table are
|
||||
combined in the order shown, and interpreted from most significant to
|
||||
least significant):
|
||||
|
||||
Binary format of bytes in sequence:
|
||||
Number of Maximum expressible
|
||||
1st byte 2nd byte 3rd byte free bits: Unicode value:
|
||||
Number of Maximum expressible
|
||||
1st byte 2nd byte 3rd byte 4th byte free bits: Unicode value:
|
||||
|
||||
0xxxxxxx 7 007F hex (127)
|
||||
110xxxxx 10xxxxxx (5+6)=11 07FF hex (2047)
|
||||
1110xxxx 10xxxxxx 10xxxxxx (4+6+6)=16 FFFF hex (65535)
|
||||
0xxxxxxx 7 007F hex (127)
|
||||
110xxxxx 10xxxxxx (5+6)=11 07FF hex (2047)
|
||||
1110xxxx 10xxxxxx 10xxxxxx (4+6+6)=16 FFFF hex (65535)
|
||||
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (3+6*3)=21 10FFFF hex (1114111)
|
||||
|
||||
The value of each individual byte indicates its UTF-8 function, as follows:
|
||||
|
||||
@ -77,12 +78,13 @@ The value of each individual byte indicates its UTF-8 function, as follows:
|
||||
80 to BF hex (128 to 191): continuing byte in a multi-byte sequence.
|
||||
C2 to DF hex (194 to 223): first byte of a two-byte sequence.
|
||||
E0 to EF hex (224 to 239): first byte of a three-byte sequence.
|
||||
F0 to F7 hex (240 to 247): first byte of a four-byte sequence.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Returns the next character in sequence.
|
||||
*/
|
||||
int StringUtf8Decoder::
|
||||
char32_t StringUtf8Decoder::
|
||||
get_next_character() {
|
||||
unsigned int result;
|
||||
while (!test_eof()) {
|
||||
@ -125,6 +127,35 @@ get_next_character() {
|
||||
unsigned int three = (unsigned char)_input[_p++];
|
||||
result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
|
||||
return result;
|
||||
|
||||
} else if ((result & 0xf8) == 0xf0) {
|
||||
// First byte of four.
|
||||
if (test_eof()) {
|
||||
if (_notify_ptr != nullptr) {
|
||||
(*_notify_ptr)
|
||||
<< "utf-8 encoded string '" << _input << "' ends abruptly.\n";
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
unsigned int two = (unsigned char)_input[_p++];
|
||||
if (test_eof()) {
|
||||
if (_notify_ptr != nullptr) {
|
||||
(*_notify_ptr)
|
||||
<< "utf-8 encoded string '" << _input << "' ends abruptly.\n";
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
unsigned int three = (unsigned char)_input[_p++];
|
||||
if (test_eof()) {
|
||||
if (_notify_ptr != nullptr) {
|
||||
(*_notify_ptr)
|
||||
<< "utf-8 encoded string '" << _input << "' ends abruptly.\n";
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
unsigned int four = (unsigned char)_input[_p++];
|
||||
result = ((result & 0x07) << 18) | ((two & 0x3f) << 12) | ((three & 0x3f) << 6) | (four & 0x3f);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Otherwise--the high bit is set but it is not one of the introductory
|
||||
@ -144,7 +175,7 @@ get_next_character() {
|
||||
/**
|
||||
* Returns the next character in sequence.
|
||||
*/
|
||||
int StringUnicodeDecoder::
|
||||
char32_t StringUtf16Decoder::
|
||||
get_next_character() {
|
||||
if (test_eof()) {
|
||||
return -1;
|
||||
@ -159,5 +190,33 @@ get_next_character() {
|
||||
return -1;
|
||||
}
|
||||
unsigned int low = (unsigned char)_input[_p++];
|
||||
return ((high << 8) | low);
|
||||
int ch = ((high << 8) | low);
|
||||
|
||||
/*
|
||||
using std::swap;
|
||||
|
||||
if (ch == 0xfffe) {
|
||||
// This is a byte-swapped byte-order-marker. That means we need to swap
|
||||
// the endianness of the rest of the stream.
|
||||
char *data = (char *)_input.data();
|
||||
for (size_t p = _p; p < _input.size() - 1; p += 2) {
|
||||
std::swap(data[p], data[p + 1]);
|
||||
}
|
||||
ch = 0xfeff;
|
||||
}
|
||||
*/
|
||||
|
||||
if (ch >= 0xd800 && ch < 0xdc00 && (_p + 1) < _input.size()) {
|
||||
// This is a high surrogate. Look for a subsequent low surrogate.
|
||||
unsigned int high = (unsigned char)_input[_p];
|
||||
unsigned int low = (unsigned char)_input[_p + 1];
|
||||
int ch2 = ((high << 8) | low);
|
||||
if (ch2 >= 0xdc00 && ch2 < 0xe000) {
|
||||
// Yes, this is a low surrogate.
|
||||
_p += 2;
|
||||
return 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
|
||||
}
|
||||
}
|
||||
// No, this is just a regular character, or an unpaired surrogate.
|
||||
return ch;
|
||||
}
|
||||
|
@ -26,7 +26,7 @@ public:
|
||||
INLINE StringDecoder(const std::string &input);
|
||||
virtual ~StringDecoder();
|
||||
|
||||
virtual int get_next_character();
|
||||
virtual char32_t get_next_character();
|
||||
INLINE bool is_eof();
|
||||
|
||||
static void set_notify_ptr(std::ostream *ptr);
|
||||
@ -48,20 +48,23 @@ class StringUtf8Decoder : public StringDecoder {
|
||||
public:
|
||||
INLINE StringUtf8Decoder(const std::string &input);
|
||||
|
||||
virtual int get_next_character();
|
||||
virtual char32_t get_next_character();
|
||||
};
|
||||
|
||||
/**
|
||||
* This decoder extracts characters two at a time to get a plain wide
|
||||
* character sequence.
|
||||
* character sequence. It supports surrogate pairs.
|
||||
*/
|
||||
class StringUnicodeDecoder : public StringDecoder {
|
||||
class StringUtf16Decoder : public StringDecoder {
|
||||
public:
|
||||
INLINE StringUnicodeDecoder(const std::string &input);
|
||||
INLINE StringUtf16Decoder(const std::string &input);
|
||||
|
||||
virtual int get_next_character();
|
||||
virtual char32_t get_next_character();
|
||||
};
|
||||
|
||||
// Deprecated alias of StringUtf16Decoder.
|
||||
typedef StringUtf16Decoder StringUnicodeDecoder;
|
||||
|
||||
#include "stringDecoder.I"
|
||||
|
||||
#endif
|
||||
|
@ -169,8 +169,23 @@ append_text(const std::string &text) {
|
||||
* wide character, up to 21 bits in Unicode.
|
||||
*/
|
||||
INLINE void TextEncoder::
|
||||
append_unicode_char(int character) {
|
||||
append_unicode_char(char32_t character) {
|
||||
#if WCHAR_MAX >= 0x10FFFF
|
||||
// wchar_t might be UTF-32.
|
||||
_wtext = get_wtext() + std::wstring(1, (wchar_t)character);
|
||||
#else
|
||||
if ((character & ~0xffff) == 0) {
|
||||
_wtext = get_wtext() + std::wstring(1, (wchar_t)character);
|
||||
} else {
|
||||
// Encode as a surrogate pair.
|
||||
uint32_t v = (uint32_t)character - 0x10000u;
|
||||
wchar_t wstr[2] = {
|
||||
(wchar_t)((v >> 10u) | 0xd800u),
|
||||
(wchar_t)((v & 0x3ffu) | 0xdc00u),
|
||||
};
|
||||
_wtext = get_wtext() + std::wstring(wstr, 2);
|
||||
}
|
||||
#endif
|
||||
_flags = (_flags | F_got_wtext) & ~F_got_text;
|
||||
text_changed();
|
||||
}
|
||||
|
@ -21,7 +21,7 @@ using std::ostream;
|
||||
using std::string;
|
||||
using std::wstring;
|
||||
|
||||
TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_iso8859;
|
||||
TextEncoder::Encoding TextEncoder::_default_encoding = TextEncoder::E_utf8;
|
||||
|
||||
/**
|
||||
* Adjusts the text stored within the encoder to all uppercase letters
|
||||
@ -109,11 +109,11 @@ is_wtext() const {
|
||||
}
|
||||
|
||||
/**
|
||||
* Encodes a single wide char into a one-, two-, or three-byte string,
|
||||
* according to the given encoding system.
|
||||
* Encodes a single Unicode character into a one-, two-, three-, or four-byte
|
||||
* string, according to the given encoding system.
|
||||
*/
|
||||
string TextEncoder::
|
||||
encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
|
||||
encode_wchar(char32_t ch, TextEncoder::Encoding encoding) {
|
||||
switch (encoding) {
|
||||
case E_iso8859:
|
||||
if ((ch & ~0xff) == 0) {
|
||||
@ -145,17 +145,38 @@ encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
|
||||
return
|
||||
string(1, (char)((ch >> 6) | 0xc0)) +
|
||||
string(1, (char)((ch & 0x3f) | 0x80));
|
||||
} else {
|
||||
} else if ((ch & ~0xffff) == 0) {
|
||||
return
|
||||
string(1, (char)((ch >> 12) | 0xe0)) +
|
||||
string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
|
||||
string(1, (char)((ch & 0x3f) | 0x80));
|
||||
} else {
|
||||
return
|
||||
string(1, (char)((ch >> 18) | 0xf0)) +
|
||||
string(1, (char)(((ch >> 12) & 0x3f) | 0x80)) +
|
||||
string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
|
||||
string(1, (char)((ch & 0x3f) | 0x80));
|
||||
}
|
||||
|
||||
case E_unicode:
|
||||
return
|
||||
string(1, (char)(ch >> 8)) +
|
||||
string(1, (char)(ch & 0xff));
|
||||
case E_utf16be:
|
||||
if ((ch & ~0xffff) == 0) {
|
||||
// Note that this passes through surrogates and BOMs unharmed.
|
||||
return
|
||||
string(1, (char)(ch >> 8)) +
|
||||
string(1, (char)(ch & 0xff));
|
||||
} else {
|
||||
// Use a surrogate pair.
|
||||
uint32_t v = (uint32_t)ch - 0x10000u;
|
||||
uint16_t hi = (v >> 10u) | 0xd800u;
|
||||
uint16_t lo = (v & 0x3ffu) | 0xdc00u;
|
||||
char encoded[4] = {
|
||||
(char)(hi >> 8),
|
||||
(char)(hi & 0xff),
|
||||
(char)(lo >> 8),
|
||||
(char)(lo & 0xff),
|
||||
};
|
||||
return string(encoded, 4);
|
||||
}
|
||||
}
|
||||
|
||||
return "";
|
||||
@ -169,8 +190,25 @@ string TextEncoder::
|
||||
encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
|
||||
string result;
|
||||
|
||||
for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
|
||||
result += encode_wchar(*pi, encoding);
|
||||
for (size_t i = 0; i < wtext.size(); ++i) {
|
||||
wchar_t ch = wtext[i];
|
||||
|
||||
// On some systems, wstring may be UTF-16, and contain surrogate pairs.
|
||||
#if WCHAR_MAX < 0x10FFFF
|
||||
if (ch >= 0xd800 && ch < 0xdc00 && (i + 1) < wtext.size()) {
|
||||
// This is a high surrogate. Look for a subsequent low surrogate.
|
||||
wchar_t ch2 = wtext[i + 1];
|
||||
if (ch2 >= 0xdc00 && ch2 < 0xe000) {
|
||||
// Yes, this is a low surrogate.
|
||||
char32_t code_point = 0x10000 + ((ch - 0xd800) << 10) + (ch2 - 0xdc00);
|
||||
result += encode_wchar(code_point, encoding);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
result += encode_wchar(ch, encoding);
|
||||
}
|
||||
|
||||
return result;
|
||||
@ -189,9 +227,9 @@ decode_text(const string &text, TextEncoder::Encoding encoding) {
|
||||
return decode_text_impl(decoder);
|
||||
}
|
||||
|
||||
case E_unicode:
|
||||
case E_utf16be:
|
||||
{
|
||||
StringUnicodeDecoder decoder(text);
|
||||
StringUtf16Decoder decoder(text);
|
||||
return decode_text_impl(decoder);
|
||||
}
|
||||
|
||||
@ -213,7 +251,7 @@ decode_text_impl(StringDecoder &decoder) {
|
||||
wstring result;
|
||||
// bool expand_amp = get_expand_amp();
|
||||
|
||||
wchar_t character = decoder.get_next_character();
|
||||
char32_t character = decoder.get_next_character();
|
||||
while (!decoder.is_eof()) {
|
||||
/*
|
||||
if (character == '&' && expand_amp) {
|
||||
@ -221,7 +259,14 @@ decode_text_impl(StringDecoder &decoder) {
|
||||
character = expand_amp_sequence(decoder);
|
||||
}
|
||||
*/
|
||||
result += character;
|
||||
if (character <= WCHAR_MAX) {
|
||||
result += character;
|
||||
} else {
|
||||
// We need to encode this as a surrogate pair.
|
||||
uint32_t v = (uint32_t)character - 0x10000u;
|
||||
result += (wchar_t)((v >> 10u) | 0xd800u);
|
||||
result += (wchar_t)((v & 0x3ffu) | 0xdc00u);
|
||||
}
|
||||
character = decoder.get_next_character();
|
||||
}
|
||||
|
||||
@ -335,8 +380,8 @@ operator << (ostream &out, TextEncoder::Encoding encoding) {
|
||||
case TextEncoder::E_utf8:
|
||||
return out << "utf8";
|
||||
|
||||
case TextEncoder::E_unicode:
|
||||
return out << "unicode";
|
||||
case TextEncoder::E_utf16be:
|
||||
return out << "utf16be";
|
||||
};
|
||||
|
||||
return out << "**invalid TextEncoder::Encoding(" << (int)encoding << ")**";
|
||||
@ -354,8 +399,9 @@ operator >> (istream &in, TextEncoder::Encoding &encoding) {
|
||||
encoding = TextEncoder::E_iso8859;
|
||||
} else if (word == "utf8" || word == "utf-8") {
|
||||
encoding = TextEncoder::E_utf8;
|
||||
} else if (word == "unicode") {
|
||||
encoding = TextEncoder::E_unicode;
|
||||
} else if (word == "unicode" || word == "utf16be" || word == "utf-16be" ||
|
||||
word == "utf16-be" || word == "utf-16-be") {
|
||||
encoding = TextEncoder::E_utf16be;
|
||||
} else {
|
||||
ostream *notify_ptr = StringDecoder::get_notify_ptr();
|
||||
if (notify_ptr != nullptr) {
|
||||
|
@ -35,7 +35,10 @@ PUBLISHED:
|
||||
enum Encoding {
|
||||
E_iso8859,
|
||||
E_utf8,
|
||||
E_unicode
|
||||
E_utf16be,
|
||||
|
||||
// Deprecated alias for E_utf16be
|
||||
E_unicode = E_utf16be,
|
||||
};
|
||||
|
||||
INLINE TextEncoder();
|
||||
@ -70,7 +73,7 @@ PUBLISHED:
|
||||
INLINE std::string get_text(Encoding encoding) const;
|
||||
INLINE void append_text(const std::string &text);
|
||||
#endif
|
||||
INLINE void append_unicode_char(int character);
|
||||
INLINE void append_unicode_char(char32_t character);
|
||||
INLINE size_t get_num_chars() const;
|
||||
INLINE int get_unicode_char(size_t index) const;
|
||||
INLINE void set_unicode_char(size_t index, int character);
|
||||
@ -103,13 +106,13 @@ PUBLISHED:
|
||||
bool is_wtext() const;
|
||||
|
||||
#ifdef CPPPARSER
|
||||
EXTEND static PyObject *encode_wchar(wchar_t ch, Encoding encoding);
|
||||
EXTEND static PyObject *encode_wchar(char32_t ch, Encoding encoding);
|
||||
EXTEND INLINE PyObject *encode_wtext(const std::wstring &wtext) const;
|
||||
EXTEND static PyObject *encode_wtext(const std::wstring &wtext, Encoding encoding);
|
||||
EXTEND INLINE PyObject *decode_text(PyObject *text) const;
|
||||
EXTEND static PyObject *decode_text(PyObject *text, Encoding encoding);
|
||||
#else
|
||||
static std::string encode_wchar(wchar_t ch, Encoding encoding);
|
||||
static std::string encode_wchar(char32_t ch, Encoding encoding);
|
||||
INLINE std::string encode_wtext(const std::wstring &wtext) const;
|
||||
static std::string encode_wtext(const std::wstring &wtext, Encoding encoding);
|
||||
INLINE std::wstring decode_text(const std::string &text) const;
|
||||
|
Loading…
x
Reference in New Issue
Block a user