wstring utils

2025-10-04 02:42:49 -04:00 · 2006-03-09 22:36:24 +00:00 · 2006-03-09 22:36:24 +00:00 · 56b25dafe6
commit 56b25dafe6
parent 34cd6206a2
4 changed files with 141 additions and 0 deletions
--- a/panda/src/express/textEncoder.I
+++ b/panda/src/express/textEncoder.I
@ -371,6 +371,26 @@ unicode_isupper(int character) {
  return entry->_char_type == UnicodeLatinMap::CT_upper;
 }

+////////////////////////////////////////////////////////////////////
+//     Function: TextEncoder::unicode_isspace
+//       Access: Published, Static
+//  Description: Returns true if the indicated character is a
+//               whitespace character, false otherwise.  This
+//               recognizes the same six characters that ctype's
+//               isspace() accepts in the "C" locale: space, tab,
+//               newline, vertical tab, form feed, and carriage
+//               return.  No non-ASCII Unicode space characters
+//               are recognized.
+////////////////////////////////////////////////////////////////////
+INLINE bool TextEncoder::
+unicode_isspace(int character) {
+  switch (character) {
+  case ' ':
+  case '\t':
+  case '\n':
+  case '\v':
+  case '\f':
+  case '\r':
+    // The carriage return in particular matters for text that uses
+    // "\r\n" line endings; without it a trailing '\r' survives
+    // trimming.
+    return true;
+
+  default:
+    return false;
+  }
+}
+
 ////////////////////////////////////////////////////////////////////
 //     Function: TextEncoder::unicode_islower
 //       Access: Published, Static
--- a/panda/src/express/textEncoder.h
+++ b/panda/src/express/textEncoder.h
@ -79,6 +79,7 @@ PUBLISHED:
  INLINE static bool unicode_ispunct(int character);
  INLINE static bool unicode_islower(int character);
  INLINE static bool unicode_isupper(int character);
+  INLINE static bool unicode_isspace(int character);
  INLINE static int unicode_toupper(int character);
  INLINE static int unicode_tolower(int character);

--- a/panda/src/putil/string_utils.cxx
+++ b/panda/src/putil/string_utils.cxx
@ -17,6 +17,7 @@
 ////////////////////////////////////////////////////////////////////

 #include "string_utils.h"
+#include "textEncoder.h"

 #include <ctype.h>

@ -131,6 +132,40 @@ extract_words(const string &str, vector_string &words) {
  return num_words;
 }

+////////////////////////////////////////////////////////////////////
+//     Function: extract_words
+//  Description: Splits the wide string into whitespace-separated
+//               words, where whitespace is whatever
+//               TextEncoder::unicode_isspace() reports as such.
+//               Leading, trailing, and repeated whitespace never
+//               produces an empty word.
+//
+//               Each word is appended to the end of the words
+//               vector; the vector is not cleared first, so the
+//               caller should empty it beforehand if only the new
+//               words are wanted.
+//
+//               Returns the number of words appended by this call.
+////////////////////////////////////////////////////////////////////
+int
+extract_words(const wstring &str, pvector<wstring> &words) {
+  const size_t length = str.length();
+  int count = 0;
+  size_t cursor = 0;
+
+  for (;;) {
+    // Step over any whitespace that precedes the next word.
+    while (cursor < length && TextEncoder::unicode_isspace(str[cursor])) {
+      ++cursor;
+    }
+    if (cursor >= length) {
+      // Nothing but whitespace (or nothing at all) remained.
+      break;
+    }
+
+    // cursor now rests on the first character of a word; advance
+    // to one past its last character.
+    const size_t first = cursor;
+    do {
+      ++cursor;
+    } while (cursor < length && !TextEncoder::unicode_isspace(str[cursor]));
+
+    words.push_back(str.substr(first, cursor - first));
+    ++count;
+  }
+
+  return count;
+}
+
 ////////////////////////////////////////////////////////////////////
 //     Function: tokenize
 //  Description: Chops the source string up into pieces delimited by
@ -158,6 +193,33 @@ tokenize(const string &str, vector_string &words, const string &delimiters) {
  words.push_back(string());
 }

+////////////////////////////////////////////////////////////////////
+//     Function: tokenize
+//  Description: Chops the source string up into pieces delimited by
+//               any of the characters specified in delimiters.
+//               Repeated delimiter characters represent zero-length
+//               tokens, and a string that ends with a delimiter
+//               produces a final zero-length token.  An empty
+//               source string therefore yields exactly one
+//               zero-length token.
+//
+//               It is the user's responsibility to ensure the output
+//               vector is cleared before calling this function; the
+//               results will simply be appended to the end of the
+//               vector.
+////////////////////////////////////////////////////////////////////
+void
+tokenize(const wstring &str, pvector<wstring> &words, const wstring &delimiters) {
+  size_t p = 0;
+  while (p < str.length()) {
+    size_t q = str.find_first_of(delimiters, p);
+    // Compare against the wide string's own npos, since that is the
+    // type whose find_first_of() produced q.
+    if (q == wstring::npos) {
+      // No further delimiter: the rest of the string is the last
+      // token.
+      words.push_back(str.substr(p));
+      return;
+    }
+    words.push_back(str.substr(p, q - p));
+    p = q + 1;
+  }
+  // We reach this point only when the string is empty or its last
+  // character was a delimiter; either way one zero-length token
+  // remains.
+  words.push_back(wstring());
+}
+
 ////////////////////////////////////////////////////////////////////
 //     Function: trim_left
 //  Description: Returns a new string representing the contents of the
@ -173,6 +235,21 @@ trim_left(const string &str) {
  return str.substr(begin);
 }

+////////////////////////////////////////////////////////////////////
+//     Function: trim_left
+//  Description: Returns a copy of the given wide string with any
+//               leading whitespace stripped off.  Whitespace is
+//               determined by TextEncoder::unicode_isspace().  A
+//               string consisting only of whitespace yields the
+//               empty string.
+////////////////////////////////////////////////////////////////////
+wstring
+trim_left(const wstring &str) {
+  const size_t length = str.size();
+
+  // Find the index of the first non-whitespace character, or length
+  // if there is none.
+  size_t first = 0;
+  for (; first < length; ++first) {
+    if (!TextEncoder::unicode_isspace(str[first])) {
+      break;
+    }
+  }
+
+  return str.substr(first);
+}
+
 ////////////////////////////////////////////////////////////////////
 //     Function: trim_right
 //  Description: Returns a new string representing the contents of the
@ -189,6 +266,22 @@ trim_right(const string &str) {
  return str.substr(begin, end - begin);
 }

+////////////////////////////////////////////////////////////////////
+//     Function: trim_right
+//  Description: Returns a copy of the given wide string with any
+//               trailing whitespace stripped off.  Whitespace is
+//               determined by TextEncoder::unicode_isspace().  A
+//               string consisting only of whitespace yields the
+//               empty string.
+////////////////////////////////////////////////////////////////////
+wstring
+trim_right(const wstring &str) {
+  // keep is the number of leading characters to retain; shrink it
+  // for as long as the last retained character is whitespace.
+  size_t keep = str.size();
+  for (; keep != 0; --keep) {
+    if (!TextEncoder::unicode_isspace(str[keep - 1])) {
+      break;
+    }
+  }
+
+  return str.substr(0, keep);
+}
+
 ////////////////////////////////////////////////////////////////////
 //     Function: trim
 //  Description: Returns a new string representing the contents of the
@ -210,6 +303,27 @@ trim(const string &str) {
  return str.substr(begin, end - begin);
 }

+////////////////////////////////////////////////////////////////////
+//     Function: trim
+//  Description: Returns a copy of the given wide string with the
+//               whitespace at both ends stripped off; interior
+//               whitespace is left alone.  Whitespace is determined
+//               by TextEncoder::unicode_isspace().  A string
+//               consisting only of whitespace yields the empty
+//               string.
+////////////////////////////////////////////////////////////////////
+wstring
+trim(const wstring &str) {
+  const size_t length = str.size();
+
+  // Index of the first character to keep.
+  size_t first = 0;
+  for (; first < length; ++first) {
+    if (!TextEncoder::unicode_isspace(str[first])) {
+      break;
+    }
+  }
+
+  // One past the last character to keep; this never moves back
+  // beyond first, so the length computed below cannot go negative.
+  size_t last = length;
+  for (; last > first; --last) {
+    if (!TextEncoder::unicode_isspace(str[last - 1])) {
+      break;
+    }
+  }
+
+  return str.substr(first, last - first);
+}
+
 ////////////////////////////////////////////////////////////////////
 //     Function: string_to_int
 //  Description: A string-interface wrapper around the C library
--- a/panda/src/putil/string_utils.h
+++ b/panda/src/putil/string_utils.h
@ -39,15 +39,21 @@ EXPCL_PANDA string upcase(const string &s);

 // Separates the string into words according to whitespace.
 EXPCL_PANDA int extract_words(const string &str, vector_string &words);
+EXPCL_PANDA int extract_words(const wstring &str, pvector<wstring> &words);

 // Separates the string into words according to the indicated delimiters.
 EXPCL_PANDA void tokenize(const string &str, vector_string &words,
                          const string &delimiters);
+EXPCL_PANDA void tokenize(const wstring &str, pvector<wstring> &words,
+                          const wstring &delimiters);

 // Trims leading and/or trailing whitespace from the string.
 EXPCL_PANDA string trim_left(const string &str);
+EXPCL_PANDA wstring trim_left(const wstring &str);
 EXPCL_PANDA string trim_right(const string &str);
+EXPCL_PANDA wstring trim_right(const wstring &str);
 EXPCL_PANDA string trim(const string &str);
+EXPCL_PANDA wstring trim(const wstring &str);

 // Functions to parse numeric values out of a string.
 EXPCL_PANDA int string_to_int(const string &str, string &tail);