decoder2: fix checking and decoding of escape characters (fix #24834) (#24915)

Co-authored-by: Lars Dumke <lars-luis.dumke@keysight.com>
This commit is contained in:
Larsimusrex 2025-07-17 07:34:05 +02:00 committed by GitHub
parent 7b3e7cb447
commit 54c6daab54
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 69 additions and 127 deletions

View File

@ -380,14 +380,16 @@ fn (mut checker Decoder) check_json_format(val string) ! {
checker.checker_idx++
// check if the JSON string is a valid escape sequence
for val[checker.checker_idx] != `"` && val[checker.checker_idx - 1] != `\\` {
for val[checker.checker_idx] != `"` {
if val[checker.checker_idx] == `\\` {
if checker.checker_idx + 1 >= checker_end - 1 {
return checker.error('invalid escape sequence')
}
escaped_char := val[checker.checker_idx + 1]
match escaped_char {
`/`, `b`, `f`, `n`, `r`, `t`, `"`, `\\` {}
`/`, `b`, `f`, `n`, `r`, `t`, `"`, `\\` {
checker.checker_idx++ // make sure escaped quotation marks are skipped
}
`u` {
// check if the JSON string is a valid unicode escape sequence
escaped_char_last_index := checker.checker_idx + 5
@ -406,7 +408,6 @@ fn (mut checker Decoder) check_json_format(val string) ! {
}
}
}
// REVIEW: Should we increment the index here?
continue
} else {
return checker.error('short unicode escape sequence ${checker.json[checker.checker_idx..
@ -560,51 +561,68 @@ fn (mut decoder Decoder) decode_value[T](mut val T) ! {
string_info := decoder.current_node.value
if string_info.value_kind == .string_ {
buffer_length, escape_positions := decoder.calculate_string_space_and_escapes()!
mut string_buffer := []u8{cap: string_info.length} // might be too long but most json strings don't contain many escape characters anyways
string_buffer := []u8{cap: buffer_length}
mut buffer_index := 1
mut string_index := 1
if escape_positions.len == 0 {
if string_info.length != 0 {
for string_index < string_info.length - 1 {
current_byte := decoder.json[string_info.position + string_index]
if current_byte == `\\` {
// push all characters up to this point
unsafe {
string_buffer.push_many(decoder.json.str + string_info.position + 1,
buffer_length)
}
}
} else {
for i := 0; i < escape_positions.len; i++ {
escape_position := escape_positions[i]
if i == 0 {
// Pushes a substring from the JSON string into the string buffer.
// The substring starts at the position of the value in the JSON string plus one,
// and ends at the escape position minus one.
// This is used to handle escaped characters within the JSON string.
unsafe {
string_buffer.push_many(decoder.json.str + string_info.position + 1,
escape_position - string_info.position - 1)
}
} else {
// Pushes a substring from the JSON string into the string buffer, starting after the previous escape position
// and ending just before the current escape position. This handles the characters between escape sequences.
unsafe {
string_buffer.push_many(decoder.json.str + escape_positions[i - 1] + 6,
escape_position - escape_positions[i - 1] - 6)
}
string_buffer.push_many(decoder.json.str + string_info.position +
buffer_index, string_index - buffer_index)
}
unescaped_buffer := generate_unicode_escape_sequence(unsafe {
(decoder.json.str + escape_positions[i] + 2).vbytes(4)
})!
string_index++
unsafe { string_buffer.push_many(&unescaped_buffer[0], unescaped_buffer.len) }
}
end_of_last_escape_position := escape_positions[escape_positions.len - 1] + 6
unsafe {
string_buffer.push_many(decoder.json.str + end_of_last_escape_position,
string_info.length - end_of_last_escape_position - 1)
escaped_char := decoder.json[string_info.position + string_index]
string_index++
match escaped_char {
`/`, `"`, `\\` {
string_buffer << escaped_char
}
`b` {
string_buffer << `\b`
}
`f` {
string_buffer << `\f`
}
`n` {
string_buffer << `\n`
}
`r` {
string_buffer << `\r`
}
`t` {
string_buffer << `\t`
}
`u` {
string_buffer << rune(strconv.parse_uint(decoder.json[
string_info.position + string_index..string_info.position +
string_index + 4], 16, 32)!).bytes()
string_index += 4
}
else {} // has already been checked
}
buffer_index = string_index
} else {
string_index++
}
}
// push the rest
unsafe {
string_buffer.push_many(decoder.json.str + string_info.position + buffer_index,
string_index - buffer_index)
}
val = string_buffer.bytestr()
} else {
return error('Expected string, but got ${string_info.value_kind}')
@ -979,94 +997,6 @@ fn utf8_byte_len(unicode_value u32) int {
}
}
// calculate_string_space_and_escapes scans the JSON string held by the decoder's current node.
// It returns two values:
//   1. the number of bytes the string occupies once every escape sequence is decoded, and
//   2. the positions (indexes into `decoder.json`) of the backslash of every `\uXXXX` escape.
// Simple escapes (`\/`, `\b`, `\f`, `\n`, `\r`, `\t`, `\"`, `\\`) count as one byte each but are
// NOT added to the positions list.
// An error is returned when the value is not wrapped in double quotes, when an escape is cut
// off by the closing quote, when the escaped character is unknown, or when the four characters
// after `\u` are not unsigned hexadecimal digits.
fn (mut decoder Decoder) calculate_string_space_and_escapes() !(int, []int) {
	value_info := decoder.current_node.value
	len := value_info.length

	// The value must be at least `""` and must start and end with a double quote
	if len < 2 || decoder.json[value_info.position] != `"`
		|| decoder.json[value_info.position + len - 1] != `"` {
		return error('Invalid JSON string format')
	}

	mut space_required := 0
	mut escape_positions := []int{}
	mut idx := 1 // Start after the opening quote

	for idx < len - 1 {
		if decoder.json[value_info.position + idx] != `\\` {
			// Regular byte, it is copied verbatim and needs exactly one byte
			space_required++
			idx++
			continue
		}

		// idx is on the backslash: remember where the escape starts, then look at the escaped character
		backslash_position := value_info.position + idx
		idx++
		if idx >= len - 1 {
			return error('Invalid escape sequence at the end of string')
		}

		match decoder.json[value_info.position + idx] {
			// All simple escapes take 1 byte of space
			`/`, `b`, `f`, `n`, `r`, `t`, `"`, `\\` {
				space_required++
			}
			`u` {
				// Unicode escape sequence \uXXXX: all 4 hex digits must lie before the closing quote
				if idx + 4 >= len - 1 {
					return error('Invalid unicode escape sequence')
				}
				hex_str := decoder.json[value_info.position + idx + 1..value_info.position + idx + 5]
				// parse_uint is used instead of parse_int so that a leading `+` or `-`
				// is rejected rather than silently producing a wrapped u32 value
				unicode_value := u32(strconv.parse_uint(hex_str, 16, 32)!)
				// utf8_byte_len is expected to give the UTF-8 length of the code point
				space_required += utf8_byte_len(unicode_value)
				escape_positions << backslash_position
				// NOTE(review): surrogate pairs (two consecutive \uXXXX) are counted as two
				// independent code points here - confirm whether they need to be combined
				idx += 4 // Skip the 4 hex digits
			}
			else {
				return error('Unknown escape sequence')
			}
		}
		idx++ // Step past the escaped character (or the last hex digit)
	}

	return space_required, escape_positions
}
// generate_unicode_escape_sequence converts the four hex digits of a `\uXXXX` escape
// (passed without the leading `\u`) into the UTF-8 bytes of that code point.
// It returns an error when the input is not exactly 4 bytes long or when those bytes
// are not unsigned hexadecimal digits.
fn generate_unicode_escape_sequence(escape_sequence_byte []u8) ![]u8 {
	if escape_sequence_byte.len != 4 {
		return error('Invalid unicode escape sequence')
	}

	// parse_uint is used instead of parse_int so that input such as `-123` or `+123`
	// is rejected instead of being cast to a wrapped u32 value
	unicode_value := u32(strconv.parse_uint(escape_sequence_byte.bytestr(), 16, 32)!)
	mut utf8_bytes := []u8{cap: utf8_byte_len(unicode_value)}

	// NOTE(review): values in the surrogate range are encoded like any other code point;
	// confirm whether surrogate pairs must be combined by the caller
	if unicode_value <= 0x7F {
		// 1 byte: 0xxxxxxx
		utf8_bytes << u8(unicode_value)
	} else if unicode_value <= 0x7FF {
		// 2 bytes: 110xxxxx 10xxxxxx
		utf8_bytes << u8(0xC0 | (unicode_value >> 6))
		utf8_bytes << u8(0x80 | (unicode_value & 0x3F))
	} else if unicode_value <= 0xFFFF {
		// 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
		utf8_bytes << u8(0xE0 | (unicode_value >> 12))
		utf8_bytes << u8(0x80 | ((unicode_value >> 6) & 0x3F))
		utf8_bytes << u8(0x80 | (unicode_value & 0x3F))
	} else {
		// 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		// Four hex digits cannot exceed 0xFFFF, so this branch is kept only for completeness
		utf8_bytes << u8(0xF0 | (unicode_value >> 18))
		utf8_bytes << u8(0x80 | ((unicode_value >> 12) & 0x3F))
		utf8_bytes << u8(0x80 | ((unicode_value >> 6) & 0x3F))
		utf8_bytes << u8(0x80 | (unicode_value & 0x3F))
	}

	return utf8_bytes
}
// string_buffer_to_generic_number converts a buffer of bytes (data) into a generic type T and
// stores the result in the provided result pointer.
// The function supports conversion to the following types:

View File

@ -0,0 +1,12 @@
import x.json2
import x.json2.decoder2
// test_decode_escaped_string encodes a list of strings containing backslashes, quotes,
// control characters and `\u` runes with json2.encode, decodes the result with
// decoder2.decode, and asserts that the decoded list equals the original one.
fn test_decode_escaped_string() {
	originals := [
		'test',
		'test\\sd',
		'test\nsd',
		'\ntest',
		'test\\"',
		'test\\',
		'test\u1234ps',
		'test\u1234',
		'\u1234\\\t"',
		'',
	]

	encoded := json2.encode[[]string](originals)
	round_tripped := decoder2.decode[[]string](encoded)!

	assert originals == round_tripped
}