diff --git a/vlib/builtin/string_test.v b/vlib/builtin/string_test.v index b4e01c7e96..3c1125ec42 100644 --- a/vlib/builtin/string_test.v +++ b/vlib/builtin/string_test.v @@ -1067,22 +1067,57 @@ fn test_split_into_lines() { } } -fn test_string_literal_with_backslash() { - a := 'HelloWorld' +const single_backslash = '\\' +const double_backslash = '\\\\' +const newline = '\n' + +// vfmt off +fn test_string_literal_with_backslash_followed_by_newline() { + // Note `\` is followed *directly* by a newline, then some more whitespace, then a non whitespace string. + // In this case, the \ is treated as line breaking, and the whitespace after that on the new line, + // should be just ignored. + // + // See also https://doc.rust-lang.org/reference/tokens.html#string-literals + // >> Both byte sequences are normally translated to U+000A, but as a special exception, + // when an unescaped U+005C character occurs immediately before the line-break, + // the U+005C character, the line-break, and all whitespace at the beginning of the + // next line are ignored. + a := 'Hello\ + World' assert a == 'HelloWorld' - b := 'OneTwoThree' - assert b == 'OneTwoThree' -} + // Here, `\\\` means `\\` followed by `\`, followed by a newline. + // the first is a single escaped \, that should go into the literal, the second together with + // the newline and the whitespace after it, is a line-break, and should be simply ignored. + // Same with `\\\\\`, which is `\\\\`, followed by `\`, i.e. an escaped double backslash, + // and a line-break after it: + b := 'One \ + Two Three \\\ + Four \\\\ + Five \\\\\ + end' + assert b == 'One Two Three ${single_backslash}Four ${double_backslash}${newline} Five ${double_backslash}end' + + // Note `\\` is followed *directly* by a newline, but `\\` is just an escape for `\`, + // and thus the newline has no special meaning, and should go into the string literal. + c := 'Hello\\ + World' + assert c == 'Hello\\\n World' + + d := 'One\\ + Two Three \\ + Four' + assert d == 'One\\\n Two Three \\\n Four' +} +// vfmt on -/* type MyString = string fn test_string_alias() { s := MyString('hi') ss := s + '!' + assert ss == 'hi!' } -*/ // sort an array of structs, by their string field values diff --git a/vlib/v/scanner/scanner.v b/vlib/v/scanner/scanner.v index 09fef00e8b..9402ba73fd 100644 --- a/vlib/v/scanner/scanner.v +++ b/vlib/v/scanner/scanner.v @@ -1239,7 +1239,7 @@ pub fn (mut s Scanner) ident_string() string { backslash_count++ } // end of string - if c == s.quote && (is_raw || backslash_count % 2 == 0) { + if c == s.quote && (is_raw || backslash_count & 1 == 0) { // handle '123\\' backslash at the end break } @@ -1253,7 +1253,7 @@ pub fn (mut s Scanner) ident_string() string { s.inc_line_number() } // Escape `\x` `\u` `\U` - if backslash_count % 2 == 1 && !is_raw && !is_cstr { + if backslash_count & 1 == 1 && !is_raw && !is_cstr { // Escape `\x` if c == `x` { if s.text[s.pos + 1] == s.quote || !(s.text[s.pos + 1].is_hex_digit() @@ -1287,13 +1287,13 @@ pub fn (mut s Scanner) ident_string() string { u32_escapes_pos << s.pos - 1 } // Unknown escape sequence - if !is_escape_sequence(c) && !c.is_digit() { + if !is_escape_sequence(c) && !c.is_digit() && c != `\n` { s.error('`${c.ascii_str()}` unknown escape sequence') } } // ${var} (ignore in vfmt mode) (skip \$) if prevc == `$` && c == `{` && !is_raw - && s.count_symbol_before(s.pos - 2, scanner.backslash) % 2 == 0 { + && s.count_symbol_before(s.pos - 2, scanner.backslash) & 1 == 0 { s.is_inside_string = true if s.is_enclosed_inter { s.is_nested_enclosed_inter = true @@ -1306,7 +1306,7 @@ pub fn (mut s Scanner) ident_string() string { } // $var if prevc == `$` && util.is_name_char(c) && !is_raw - && s.count_symbol_before(s.pos - 2, scanner.backslash) % 2 == 0 { + && s.count_symbol_before(s.pos - 2, scanner.backslash) & 1 == 0 { s.is_inside_string = true s.is_inter_start = true s.pos -= 2 @@ -1483,13 +1483,26 @@ fn trim_slash_line_break(s string) string { mut start := 0 mut ret_str := s for { + // find the position of the first `\` followed by a newline, after `start`: idx := ret_str.index_after('\\\n', start) - if idx != -1 { - ret_str = ret_str[..idx] + ret_str[idx + 2..].trim_left(' \n\t\v\f\r') - start = idx - } else { + if idx == -1 { break } + start = idx + // Here, ret_str[idx] is \, and ret_str[idx+1] is newline. + // Depending on the number of backslashes before the newline, we should either + // treat the last one and the whitespace after it as line-break, or just ignore it: + mut nbackslashes := 0 + for eidx := idx; eidx >= 0 && ret_str[eidx] == `\\`; eidx-- { + nbackslashes++ + } + // eprintln('>> start: ${start:-5} | nbackslashes: ${nbackslashes:-5} | ret_str: $ret_str') + if idx == 0 || (nbackslashes & 1) == 1 { + ret_str = ret_str[..idx] + ret_str[idx + 2..].trim_left(' \n\t\v\f\r') + } else { + // ensure the loop will terminate, when we could not strip anything: + start++ + } } return ret_str } @@ -1560,7 +1573,7 @@ pub fn (mut s Scanner) ident_char() string { // e.g. (octal) \141 (hex) \x61 or (unicode) \u2605 or (32 bit unicode) \U00002605 // we don't handle binary escape codes in rune literals orig := c - if c.len % 2 == 0 + if c.len & 1 == 0 && (escaped_hex || escaped_unicode_16 || escaped_unicode_32 || escaped_octal) { if escaped_unicode_16 { // there can only be one, so attempt to decode it now