From 31c6db51ba8ef65cca07be2b9838dee30fda1119 Mon Sep 17 00:00:00 2001
From: Laurent Cheylus <foxy@free.fr>
Date: Wed, 21 May 2025 17:45:01 +0200
Subject: [PATCH] encoding.utf8: add more tests for UTF-8 strings (#24544)

---
 .../utf8/validate/encoding_utf8_test.v        | 78 +++++++++++++++++++
 vlib/encoding/utf8/validate/validate_utf8.v   | 13 ++++
 2 files changed, 91 insertions(+)

diff --git a/vlib/encoding/utf8/validate/encoding_utf8_test.v b/vlib/encoding/utf8/validate/encoding_utf8_test.v
index fe085a5d1f..aa43ea8137 100644
--- a/vlib/encoding/utf8/validate/encoding_utf8_test.v
+++ b/vlib/encoding/utf8/validate/encoding_utf8_test.v
@@ -3,7 +3,85 @@ import encoding.utf8.validate
 fn test_validate_str() {
 	assert validate.utf8_string('añçá') == true
 	assert validate.utf8_string('\x61\xC3\xB1\xC3\xA7\xC3\xA1') == true
+	// Well-formed sequences of 1 to 4 bytes, including range edges such as U+007F, U+0080, U+07FF, U+0800, U+FFFF, U+10000 and U+10FFFF
+	assert validate.utf8_string('\x01') == true
+	assert validate.utf8_string('\x7e') == true
+	assert validate.utf8_string('\x7f') == true
+	assert validate.utf8_string('\xc2\x80') == true
+	assert validate.utf8_string('\xc2\x81') == true
+	assert validate.utf8_string('\xc2\xbf') == true
+	assert validate.utf8_string('\xc3\x80') == true
+	assert validate.utf8_string('\xc3\x81') == true
+	assert validate.utf8_string('\xc3\x88') == true
+	assert validate.utf8_string('\xc3\x90') == true
+	assert validate.utf8_string('\xc3\xa0') == true
+	assert validate.utf8_string('\xc3\xb0') == true
+	assert validate.utf8_string('\xc3\xb8') == true
+	assert validate.utf8_string('\xc3\xbf') == true
+	assert validate.utf8_string('\xc4\x80') == true
+	assert validate.utf8_string('\xdf\xbf') == true
+	assert validate.utf8_string('\xd0\x80') == true
+	assert validate.utf8_string('\xe0\xa0\x80') == true
+	assert validate.utf8_string('\xe0\xa0\x81') == true
+	assert validate.utf8_string('\xe1\x80\x80') == true
+	assert validate.utf8_string('\xed\x80\x80') == true
+	assert validate.utf8_string('\xed\x9f\xbf') == true
+	assert validate.utf8_string('\xee\x80\x80') == true
+	assert validate.utf8_string('\xef\xbf\xbe') == true
+	assert validate.utf8_string('\xef\xbf\xbf') == true
+	assert validate.utf8_string('\xf0\x90\x80\x80') == true
+	assert validate.utf8_string('\xf0\x90\x80\x81') == true
+	assert validate.utf8_string('\xf1\x80\x80\x80') == true
+	assert validate.utf8_string('\xf4\x8f\xbf\xbe') == true
+	assert validate.utf8_string('\xf4\x8f\xbf\xbf') == true
+	assert validate.utf8_string('\xef\xbf\xbd') == true
+}
+
+fn test_validate_invalid_str() {
 	assert validate.utf8_string('\xC0\xC1') == false
 	assert validate.utf8_string('\xF5\xFF') == false
 	assert validate.utf8_string('\xE0\xEF') == false
+
+	// xx: a continuation byte (0x91, in 80..BF) used as the first byte of a sequence
+	assert validate.utf8_string('\x91\x80\x80\x80') == false
+
+	// s1: 2-byte lead (C2 and DF tested) whose second byte is outside 80..BF
+	assert validate.utf8_string('\xC2\x7F\x80\x80') == false
+	assert validate.utf8_string('\xC2\xC0\x80\x80') == false
+	assert validate.utf8_string('\xDF\x7F\x80\x80') == false
+	assert validate.utf8_string('\xDF\xC0\x80\x80') == false
+
+	// s2: lead E0; second byte must be A0..BF and third byte 80..BF
+	assert validate.utf8_string('\xE0\x9F\xBF\x80') == false
+	assert validate.utf8_string('\xE0\xA0\x7F\x80') == false
+	assert validate.utf8_string('\xE0\xBF\xC0\x80') == false
+	assert validate.utf8_string('\xE0\xC0\x80\x80') == false
+
+	// s3: lead E1 (3-byte lead with no restricted range); second and third bytes must be 80..BF
+	assert validate.utf8_string('\xE1\x7F\xBF\x80') == false
+	assert validate.utf8_string('\xE1\x80\x7F\x80') == false
+	assert validate.utf8_string('\xE1\xBF\xC0\x80') == false
+	assert validate.utf8_string('\xE1\xC0\x80\x80') == false
+
+	// s4: lead ED; second byte must be 80..9F (ED A0.. would encode surrogates U+D800 and up), third byte 80..BF
+	assert validate.utf8_string('\xED\x7F\xBF\x80') == false
+	assert validate.utf8_string('\xED\x80\x7F\x80') == false
+	assert validate.utf8_string('\xED\x9F\xC0\x80') == false
+	assert validate.utf8_string('\xED\xA0\x80\x80') == false
+
+	// s5: lead F0; second byte must be 90..BF, third and fourth bytes 80..BF
+	assert validate.utf8_string('\xF0\x8F\xBF\xBF') == false
+	assert validate.utf8_string('\xF0\x90\x7F\xBF') == false
+	assert validate.utf8_string('\xF0\x90\x80\x7F') == false
+	assert validate.utf8_string('\xF0\xBF\xBF\xC0') == false
+	assert validate.utf8_string('\xF0\xBF\xC0\x80') == false
+	assert validate.utf8_string('\xF0\xC0\x80\x80') == false
+
+	// s6: lead F1 (4-byte lead with no restricted range); second, third and fourth bytes must be 80..BF
+	assert validate.utf8_string('\xF1\x7F\xBF\xBF') == false
+	assert validate.utf8_string('\xF1\x80\x7F\xBF') == false
+	assert validate.utf8_string('\xF1\x80\x80\x7F') == false
+	assert validate.utf8_string('\xF1\xBF\xBF\xC0') == false
+	assert validate.utf8_string('\xF1\xBF\xC0\x80') == false
+	assert validate.utf8_string('\xF1\xC0\x80\x80') == false
 }
diff --git a/vlib/encoding/utf8/validate/validate_utf8.v b/vlib/encoding/utf8/validate/validate_utf8.v
index 44e42ea6ec..428c0e0658 100644
--- a/vlib/encoding/utf8/validate/validate_utf8.v
+++ b/vlib/encoding/utf8/validate/validate_utf8.v
@@ -48,6 +48,19 @@ fn (mut s Utf8State) seq(r0 bool, r1 bool, is_tail bool) bool {
 	return false
 }
 
+/* Check UTF-8 byte sequences against Table 3-7 (Well-Formed UTF-8 Byte Sequences) of the Unicode Standard
+ * https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/
+ * Code Points        1st      2nd      3rd      4th
+ * U+0000..U+007F     00..7F
+ * U+0080..U+07FF     C2..DF   80..BF
+ * U+0800..U+0FFF     E0       A0..BF   80..BF
+ * U+1000..U+CFFF     E1..EC   80..BF   80..BF
+ * U+D000..U+D7FF     ED       80..9F   80..BF
+ * U+E000..U+FFFF     EE..EF   80..BF   80..BF
+ * U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
+ * U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
+ * U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
+ */
 fn (mut s Utf8State) next_state(c u8) {
 	// sequence 1
 	if s.index == 0 {