diff --git a/vlib/x/json2/decoder2/decode.v b/vlib/x/json2/decoder2/decode.v index 599f660364..0733ad7a56 100644 --- a/vlib/x/json2/decoder2/decode.v +++ b/vlib/x/json2/decoder2/decode.v @@ -380,14 +380,16 @@ fn (mut checker Decoder) check_json_format(val string) ! { checker.checker_idx++ // check if the JSON string is a valid escape sequence - for val[checker.checker_idx] != `"` && val[checker.checker_idx - 1] != `\\` { + for val[checker.checker_idx] != `"` { if val[checker.checker_idx] == `\\` { if checker.checker_idx + 1 >= checker_end - 1 { return checker.error('invalid escape sequence') } escaped_char := val[checker.checker_idx + 1] match escaped_char { - `/`, `b`, `f`, `n`, `r`, `t`, `"`, `\\` {} + `/`, `b`, `f`, `n`, `r`, `t`, `"`, `\\` { + checker.checker_idx++ // make sure escaped quotation marks are skipped + } `u` { // check if the JSON string is a valid unicode escape sequence escaped_char_last_index := checker.checker_idx + 5 @@ -406,7 +408,6 @@ fn (mut checker Decoder) check_json_format(val string) ! { } } } - // REVIEW: Should we increment the index here? continue } else { return checker.error('short unicode escape sequence ${checker.json[checker.checker_idx.. @@ -560,51 +561,68 @@ fn (mut decoder Decoder) decode_value[T](mut val T) ! { string_info := decoder.current_node.value if string_info.value_kind == .string_ { - buffer_length, escape_positions := decoder.calculate_string_space_and_escapes()! + mut string_buffer := []u8{cap: string_info.length} // might be too long but most json strings don't contain many escape characters anyways - string_buffer := []u8{cap: buffer_length} + mut buffer_index := 1 + mut string_index := 1 - if escape_positions.len == 0 { - if string_info.length != 0 { + for string_index < string_info.length - 1 { + current_byte := decoder.json[string_info.position + string_index] + + if current_byte == `\\` { + // push all characters up to this point unsafe { - string_buffer.push_many(decoder.json.str + string_info.position + 1, - buffer_length) - } - } - } else { - for i := 0; i < escape_positions.len; i++ { - escape_position := escape_positions[i] - if i == 0 { - // Pushes a substring from the JSON string into the string buffer. - // The substring starts at the position of the value in the JSON string plus one, - // and ends at the escape position minus one. - // This is used to handle escaped characters within the JSON string. - unsafe { - string_buffer.push_many(decoder.json.str + string_info.position + 1, - escape_position - string_info.position - 1) - } - } else { - // Pushes a substring from the JSON string into the string buffer, starting after the previous escape position - // and ending just before the current escape position. This handles the characters between escape sequences. - unsafe { - string_buffer.push_many(decoder.json.str + escape_positions[i - 1] + 6, - escape_position - escape_positions[i - 1] - 6) - } + string_buffer.push_many(decoder.json.str + string_info.position + + buffer_index, string_index - buffer_index) } - unescaped_buffer := generate_unicode_escape_sequence(unsafe { - (decoder.json.str + escape_positions[i] + 2).vbytes(4) - })! + string_index++ - unsafe { string_buffer.push_many(&unescaped_buffer[0], unescaped_buffer.len) } - } - end_of_last_escape_position := escape_positions[escape_positions.len - 1] + 6 - unsafe { - string_buffer.push_many(decoder.json.str + end_of_last_escape_position, - string_info.length - end_of_last_escape_position - 1) + escaped_char := decoder.json[string_info.position + string_index] + + string_index++ + + match escaped_char { + `/`, `"`, `\\` { + string_buffer << escaped_char + } + `b` { + string_buffer << `\b` + } + `f` { + string_buffer << `\f` + } + `n` { + string_buffer << `\n` + } + `r` { + string_buffer << `\r` + } + `t` { + string_buffer << `\t` + } + `u` { + string_buffer << rune(strconv.parse_uint(decoder.json[ + string_info.position + string_index..string_info.position + + string_index + 4], 16, 32)!).bytes() + + string_index += 4 + } + else {} // has already been checked + } + + buffer_index = string_index + } else { + string_index++ } } + // push the rest + unsafe { + string_buffer.push_many(decoder.json.str + string_info.position + buffer_index, + string_index - buffer_index) + } + val = string_buffer.bytestr() } else { return error('Expected string, but got ${string_info.value_kind}') @@ -979,94 +997,6 @@ fn utf8_byte_len(unicode_value u32) int { } } -fn (mut decoder Decoder) calculate_string_space_and_escapes() !(int, []int) { - value_info := decoder.current_node.value - len := value_info.length - - if len < 2 || decoder.json[value_info.position] != `"` - || decoder.json[value_info.position + len - 1] != `"` { - return error('Invalid JSON string format') - } - - mut space_required := 0 - mut escape_positions := []int{} - mut idx := 1 // Start after the opening quote - - for idx < len - 1 { - current_byte := decoder.json[value_info.position + idx] - - if current_byte == `\\` { - // Escape sequence, handle accordingly - idx++ - if idx >= len - 1 { - return error('Invalid escape sequence at the end of string') - } - escaped_char := decoder.json[value_info.position + idx] - match escaped_char { - // All simple escapes take 1 byte of space - `/`, `b`, `f`, `n`, `r`, `t`, `"`, `\\` { - space_required++ - } - `u` { - // Unicode escape sequence \uXXXX - if idx + 4 >= len - 1 { - return error('Invalid unicode escape sequence') - } - // Extract the hex value from the \uXXXX sequence - hex_str := decoder.json[value_info.position + idx + 1..value_info.position + - idx + 5] - unicode_value := u32(strconv.parse_int(hex_str, 16, 32)!) - // Determine the number of bytes needed for this Unicode character in UTF-8 - space_required += utf8_byte_len(unicode_value) - idx += 4 // Skip the next 4 hex digits - - // REVIEW: If the Unicode character is a surrogate pair, we need to skip the next \uXXXX sequence? - - // \\uXXXX is 6 bytes, so we need to skip 5 more bytes - escape_positions << value_info.position + idx - 5 - } - else { - return error('Unknown escape sequence') - } - } - } else { - // Regular character, just increment space required by 1 byte - space_required++ - } - idx++ - } - - return space_required, escape_positions -} - -// \uXXXX to unicode with 4 hex digits -fn generate_unicode_escape_sequence(escape_sequence_byte []u8) ![]u8 { - if escape_sequence_byte.len != 4 { - return error('Invalid unicode escape sequence') - } - - unicode_value := u32(strconv.parse_int(escape_sequence_byte.bytestr(), 16, 32)!) - mut utf8_bytes := []u8{cap: utf8_byte_len(unicode_value)} - - if unicode_value <= 0x7F { - utf8_bytes << u8(unicode_value) - } else if unicode_value <= 0x7FF { - utf8_bytes << u8(0xC0 | (unicode_value >> 6)) - utf8_bytes << u8(0x80 | (unicode_value & 0x3F)) - } else if unicode_value <= 0xFFFF { - utf8_bytes << u8(0xE0 | (unicode_value >> 12)) - utf8_bytes << u8(0x80 | ((unicode_value >> 6) & 0x3F)) - utf8_bytes << u8(0x80 | (unicode_value & 0x3F)) - } else { - utf8_bytes << u8(0xF0 | (unicode_value >> 18)) - utf8_bytes << u8(0x80 | ((unicode_value >> 12) & 0x3F)) - utf8_bytes << u8(0x80 | ((unicode_value >> 6) & 0x3F)) - utf8_bytes << u8(0x80 | (unicode_value & 0x3F)) - } - - return utf8_bytes -} - // string_buffer_to_generic_number converts a buffer of bytes (data) into a generic type T and // stores the result in the provided result pointer. // The function supports conversion to the following types: diff --git a/vlib/x/json2/decoder2/tests/decode_escaped_string_test.v b/vlib/x/json2/decoder2/tests/decode_escaped_string_test.v new file mode 100644 index 0000000000..e79a61c3fb --- /dev/null +++ b/vlib/x/json2/decoder2/tests/decode_escaped_string_test.v @@ -0,0 +1,12 @@ +import x.json2 +import x.json2.decoder2 + +fn test_decode_escaped_string() { + escaped_strings := ['test', 'test\\sd', 'test\nsd', '\ntest', 'test\\"', 'test\\', 'test\u1234ps', + 'test\u1234', '\u1234\\\t"', ''] + + json_string := json2.encode[[]string](escaped_strings) + decoded_strings := decoder2.decode[[]string](json_string)! + + assert escaped_strings == decoded_strings +}