From 50682a4409a2a9af69f62f1b81ef91574e8f3a0a Mon Sep 17 00:00:00 2001 From: David Rose Date: Mon, 20 Jan 2003 17:58:07 +0000 Subject: [PATCH] display warnings for unexpected characters in utf-8 string --- panda/src/text/stringDecoder.cxx | 77 ++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/panda/src/text/stringDecoder.cxx b/panda/src/text/stringDecoder.cxx index 525d6a1c54..61b1ea9948 100644 --- a/panda/src/text/stringDecoder.cxx +++ b/panda/src/text/stringDecoder.cxx @@ -17,6 +17,7 @@ //////////////////////////////////////////////////////////////////// #include "stringDecoder.h" +#include "config_text.h" //////////////////////////////////////////////////////////////////// // Function: StringDecoder::Destructor @@ -71,33 +72,51 @@ The value of each individual byte indicates its UTF-8 function, as follows: //////////////////////////////////////////////////////////////////// int StringUtf8Decoder:: get_next_character() { - if (test_eof()) { - return -1; + unsigned int result; + while (!test_eof()) { + result = (unsigned char)_input[_p++]; + if ((result & 0x80) == 0) { + // A 7-bit ascii value in one byte. + return result; + + } if ((result & 0xe0) == 0xc0) { + // First byte of two. + unsigned int two = 0; + if (test_eof()) { + text_cat.warning() + << "utf-8 encoded string ends abruptly.\n"; + return -1; + } + two = (unsigned char)_input[_p++]; + result = ((result & 0x1f) << 6) | (two & 0x3f); + return result; + + } else if ((result & 0xf0) == 0xe0) { + // First byte of three. + if (test_eof()) { + text_cat.warning() + << "utf-8 encoded string ends abruptly.\n"; + return -1; + } + unsigned int two = (unsigned char)_input[_p++]; + if (test_eof()) { + text_cat.warning() + << "utf-8 encoded string ends abruptly.\n"; + return -1; + } + unsigned int three = (unsigned char)_input[_p++]; + result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f); + return result; + } + + // Otherwise--the high bit is set but it is not one of the + // introductory utf-8 bytes--we have an error. + text_cat.warning() + << "Non utf-8 byte in string: 0x" << hex << result << dec << "\n"; } - unsigned int result = (unsigned char)_input[_p++]; - if ((result & 0xe0) == 0xc0) { - // First byte of two. - unsigned int two = 0; - if (!test_eof()) { - two = (unsigned char)_input[_p++]; - } - result = ((result & 0x1f) << 6) | (two & 0x3f); - - } else if ((result & 0xf0) == 0xe0) { - // First byte of three. - unsigned int two = 0; - unsigned int three = 0; - if (!test_eof()) { - two = (unsigned char)_input[_p++]; - } - if (!test_eof()) { - three = (unsigned char)_input[_p++]; - } - result = ((result & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f); - } - - return result; + // End of string reached. + return -1; } //////////////////////////////////////////////////////////////////// @@ -112,9 +131,11 @@ get_next_character() { } unsigned int high = (unsigned char)_input[_p++]; - unsigned int low = 0; - if (!test_eof()) { - low = (unsigned char)_input[_p++]; + if (test_eof()) { + text_cat.warning() + << "Unicode-encoded string has odd number of bytes.\n"; + return -1; } + unsigned int low = (unsigned char)_input[_p++]; return ((high << 8) | low); }