parser,scanner,ast: make the scanner and parser more robust by implementing more limits (preventing panics discovered by fuzzing)

This commit is contained in:
Delyan Angelov 2024-08-20 13:21:25 +03:00
parent 50457647de
commit c92577e6ed
No known key found for this signature in database
GPG Key ID: 66886C0F12D595ED
14 changed files with 158 additions and 61 deletions

2
.gitignore vendored
View File

@ -152,3 +152,5 @@ vlib/v/tests/*.js
# ignore .NET7.0 Assembly Files
bench/vectors/bin
bench/vectors/obj
autofuzz.log

View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
## Note: radamsa is a fuzzer, available from https://gitlab.com/akihe/radamsa
## ./v -g cmd/tools/measure/parser_speed.v
# Repeatedly feed radamsa-mutated copies of hello_world.v to the parser tool,
# stopping at the first input that makes it exit with a non-zero status.
for (( ; ; )); do
	radamsa --meta autofuzz.log examples/hello_world.v > x.v
	VFUZZER=true cmd/tools/measure/parser_speed x.v || break
done

View File

@ -0,0 +1,29 @@
module file_lists
import os
// expand_files accepts a list of files and folders, and returns a list of all the .v and .vsh files found in them.
// The input list of files supports recursive `@file.lst` expansion, where each line is treated as another file/folder.
pub fn expand_files(files []string) ![]string {
	mut res := []string{}
	for file in files {
		if file == '' {
			// skip blank lines coming from .lst files
			continue
		}
		if file.starts_with('@') {
			// BUGFIX: expand the *current* entry; the previous code used
			// `files[0]`, which read the wrong path for any `@list`
			// argument that was not the first element of `files`.
			lst_path := file.all_after('@').trim_space()
			listed_files := os.read_file(lst_path)!.split('\n').map(it.trim_space())
			res << expand_files(listed_files)!
			continue
		}
		if os.is_dir(file) {
			res << os.walk_ext(file, '.vsh')
			res << os.walk_ext(file, '.v')
			continue
		}
		if os.exists(file) {
			res << file
		}
	}
	return res
}

View File

@ -7,29 +7,33 @@ import v.parser
import v.errors
import v.scanner
import term
import file_lists
const skip_tests = os.getenv_opt('SKIP_TESTS') or { '' }.bool()
const skip_tests = os.getenv('SKIP_TESTS').bool()
const fuzzer_mode = os.getenv('VFUZZER').bool()
const comments_mode = scanner.CommentsMode.from(os.getenv('SCANNER_MODE')) or {
scanner.CommentsMode.parse_comments
}
fn main() {
dump(comments_mode)
files := os.args#[1..]
if files.len > 0 && files[0].starts_with('@') {
lst_path := files[0].all_after('@')
listed_files := os.read_file(lst_path)!.split('\n')
process_files(listed_files)!
return
if !fuzzer_mode {
dump(comments_mode)
}
process_files(files)!
all_files := file_lists.expand_files(os.args#[1..])!
process_files(all_files)!
}
// hline prints a horizontal separator line; it is suppressed in fuzzer
// mode, where only failures matter and extra output slows the fuzz loop.
fn hline() {
	if !fuzzer_mode {
		println('----------------------------------------------------------------------------------------------------------------------------------------------------------')
	}
}
// theader prints the column header for the per-file stats table;
// suppressed in fuzzer mode, where no per-file stats are printed.
fn theader() {
	if !fuzzer_mode {
		println('    Time Tokens Bytes Lines Bytes/Token Errors FMT.len')
	}
}
@ -71,7 +75,9 @@ fn process_files(files []string) ! {
total_lines += ast_file.nr_lines
total_errors += p.errors.len
total_fmt_len += formatted_content.len
println('${f_us:10}us ${p.scanner.all_tokens.len:10} ${p.scanner.text.len:10} ${ast_file.nr_lines:10} ${(f64(p.scanner.text.len) / p.scanner.all_tokens.len):13.3} ${p.errors.len:10} ${formatted_content.len:8} ${f}')
if !fuzzer_mode {
println('${f_us:10}us ${p.scanner.all_tokens.len:10} ${p.scanner.text.len:10} ${ast_file.nr_lines:10} ${(f64(p.scanner.text.len) / p.scanner.all_tokens.len):13.3} ${p.errors.len:10} ${formatted_content.len:8} ${f}')
}
}
hline()
theader()

View File

@ -6,29 +6,33 @@ import v.parser
import v.errors
import v.scanner
import term
import file_lists
const skip_tests = os.getenv_opt('SKIP_TESTS') or { '' }.bool()
const skip_tests = os.getenv('SKIP_TESTS').bool()
const fuzzer_mode = os.getenv('VFUZZER').bool()
const comments_mode = scanner.CommentsMode.from(os.getenv('SCANNER_MODE')) or {
scanner.CommentsMode.skip_comments
}
fn main() {
dump(comments_mode)
files := os.args#[1..]
if files.len > 0 && files[0].starts_with('@') {
lst_path := files[0].all_after('@')
listed_files := os.read_file(lst_path)!.split('\n')
process_files(listed_files)!
return
if !fuzzer_mode {
dump(comments_mode)
}
process_files(files)!
all_files := file_lists.expand_files(os.args#[1..])!
process_files(all_files)!
}
// hline prints a horizontal separator line; it is suppressed in fuzzer
// mode, where only failures matter and extra output slows the fuzz loop.
fn hline() {
	if !fuzzer_mode {
		println('---------------------------------------------------------------------------------------------------------------------------------------------------')
	}
}
// theader prints the column header for the per-file stats table;
// suppressed in fuzzer mode, where no per-file stats are printed.
fn theader() {
	if !fuzzer_mode {
		println('    Time Tokens Bytes Lines Bytes/Token Errors')
	}
}
@ -53,20 +57,25 @@ fn process_files(files []string) ! {
if skip_tests && f.ends_with('_test.v') {
continue
}
total_files++
// do not measure the scanning, but only the parsing:
mut p := new_parser(f, comments_mode, table, pref_)
///
if fuzzer_mode {
p.scanner.max_eofs = 200
}
sw.restart()
ast_file := p.parse()
f_us := sw.elapsed().microseconds()
///
total_us += f_us
total_bytes += p.scanner.text.len
total_tokens += p.scanner.all_tokens.len
total_lines += ast_file.nr_lines
total_errors += p.errors.len
println('${f_us:10}us ${p.scanner.all_tokens.len:10} ${p.scanner.text.len:10} ${ast_file.nr_lines:10} ${(f64(p.scanner.text.len) / p.scanner.all_tokens.len):13.3} ${p.errors.len:10} ${f}')
if !fuzzer_mode {
println('${f_us:10}us ${p.scanner.all_tokens.len:10} ${p.scanner.text.len:10} ${ast_file.nr_lines:10} ${(f64(p.scanner.text.len) / p.scanner.all_tokens.len):13.3} ${p.errors.len:10} ${f}')
}
total_files++
}
hline()
theader()

View File

@ -2,30 +2,34 @@ import os
import time
import term
import v.scanner
import file_lists
import v.pref
const skip_tests = os.getenv_opt('SKIP_TESTS') or { '' }.bool()
const skip_tests = os.getenv('SKIP_TESTS').bool()
const fuzzer_mode = os.getenv('VFUZZER').bool()
const comments_mode = scanner.CommentsMode.from(os.getenv('SCANNER_MODE')) or {
scanner.CommentsMode.skip_comments
}
fn main() {
dump(comments_mode)
files := os.args#[1..]
if files.len > 0 && files[0].starts_with('@') {
lst_path := files[0].all_after('@')
listed_files := os.read_file(lst_path)!.split('\n')
process_files(listed_files)!
return
if !fuzzer_mode {
dump(comments_mode)
}
process_files(files)!
all_files := file_lists.expand_files(os.args#[1..])!
process_files(all_files)!
}
// hline prints a horizontal separator line; it is suppressed in fuzzer
// mode, where only failures matter and extra output slows the fuzz loop.
fn hline() {
	if !fuzzer_mode {
		println('----------------------------------------------------------------------------------------------------------------------')
	}
}
// theader prints the column header for the per-file stats table;
// suppressed in fuzzer mode, where no per-file stats are printed.
fn theader() {
	if !fuzzer_mode {
		println('    Time Tokens Bytes Lines Bytes/Token Errors')
	}
}
@ -58,7 +62,9 @@ fn process_files(files []string) ! {
total_tokens += s.all_tokens.len
total_lines += s.nr_lines
total_errors += s.errors.len
println('${f_us:10}us ${s.all_tokens.len:10} ${s.text.len:10} ${s.nr_lines:10} ${(f64(s.text.len) / s.all_tokens.len):13.3f} ${s.errors.len:10} ${f}')
if !fuzzer_mode {
println('${f_us:10}us ${s.all_tokens.len:10} ${s.text.len:10} ${s.nr_lines:10} ${(f64(s.text.len) / s.all_tokens.len):13.3f} ${s.errors.len:10} ${f}')
}
}
hline()
theader()

View File

@ -764,6 +764,7 @@ pub fn prepare_test_session(zargs string, folder string, oskipped []string, main
$if windows {
// skip process/command examples on windows. TODO: remove the need for this, fix os.Command
if fnormalised.ends_with('examples/process/command.v') {
skipped << fnormalised.replace(nparent_dir + '/', '')
continue
}
}
@ -771,6 +772,7 @@ pub fn prepare_test_session(zargs string, folder string, oskipped []string, main
start := c#[0..testing.header_bytes_to_search_for_module_main]
if start.contains('module ') && !start.contains('module main') {
skipped << fnormalised.replace(nparent_dir + '/', '')
continue next_file
}
for skip_prefix in oskipped {
skip_folder := skip_prefix + '/'

View File

@ -2464,6 +2464,9 @@ pub fn (mut lx IndexExpr) recursive_arraymap_set_is_setter() {
pub fn all_registers(mut t Table, arch pref.Arch) map[string]ScopeObject {
mut res := map[string]ScopeObject{}
match arch {
._auto {
return all_registers(mut t, .amd64)
}
.amd64, .i386 {
for bit_size, array in ast.x86_no_number_register_list {
for name in array {
@ -2522,7 +2525,7 @@ pub fn all_registers(mut t Table, arch pref.Arch) map[string]ScopeObject {
// no registers
}
else { // TODO
panic('all_registers: unhandled arch')
panic('all_registers: unhandled arch: ${arch}')
}
}

View File

@ -18,16 +18,12 @@ fn (mut p Parser) assign_stmt() ast.Stmt {
return p.partial_assign_stmt(exprs)
}
const max_expr_level = 100
fn (mut p Parser) check_undefined_variables(names []string, val ast.Expr) ! {
p.expr_level++
defer {
p.expr_level--
}
if p.expr_level > parser.max_expr_level {
return error('expr level > ${parser.max_expr_level}')
}
p.check_expr_level()!
match val {
ast.Ident {
for name in names {

View File

@ -6,6 +6,15 @@ module parser
import v.ast
import v.token
const max_expr_level = 100
// check_expr_level returns an error once the expression nesting depth
// exceeds max_expr_level, so that pathological inputs (found by fuzzing)
// cannot drive the recursive-descent parser into a stack overflow.
@[inline]
fn (mut p Parser) check_expr_level() ! {
	if p.expr_level <= parser.max_expr_level {
		return
	}
	return error('expr level > ${parser.max_expr_level}')
}
fn (mut p Parser) expr(precedence int) ast.Expr {
return p.check_expr(precedence) or {
if token.is_decl(p.tok.kind) && p.disallow_declarations_in_script_mode() {
@ -17,6 +26,11 @@ fn (mut p Parser) expr(precedence int) ast.Expr {
fn (mut p Parser) check_expr(precedence int) !ast.Expr {
p.trace_parser('expr(${precedence})')
p.expr_level++
defer {
p.expr_level--
}
p.check_expr_level()!
mut node := ast.empty_expr
is_stmt_ident := p.is_stmt_ident
p.is_stmt_ident = false

View File

@ -163,6 +163,10 @@ fn (mut p Parser) if_expr(is_comptime bool) ast.IfExpr {
body_pos := p.tok.pos()
p.inside_if = false
p.inside_comptime_if = false
if p.opened_scopes > p.max_opened_scopes {
p.error('too many nested conditionals, scopes: ${p.opened_scopes}')
return ast.IfExpr{}
}
p.open_scope()
stmts := p.parse_block_no_scope(false)
branches << ast.IfBranch{

View File

@ -106,9 +106,13 @@ mut:
script_mode bool
script_mode_start_token token.Token
pub mut:
scanner &scanner.Scanner = unsafe { nil }
table &ast.Table = unsafe { nil }
scope &ast.Scope = unsafe { nil }
scanner &scanner.Scanner = unsafe { nil }
table &ast.Table = unsafe { nil }
scope &ast.Scope = unsafe { nil }
opened_scopes int
max_opened_scopes int = 100 // values above 300 risk stack overflow
errors []errors.Error
warnings []errors.Warning
notices []errors.Notice
@ -451,10 +455,7 @@ fn (p &Parser) peek_token(n int) token.Token {
fn (p &Parser) peek_token_after_var_list() token.Token {
mut n := 0
mut tok := p.tok
for {
if tok.kind == .eof {
break
}
for tok.kind != .eof {
if tok.kind == .key_mut {
n += 2
} else {
@ -546,10 +547,14 @@ fn (p &Parser) is_array_type() bool {
}
// open_scope pushes a new child scope onto p.scope.
// The nesting depth is tracked in p.opened_scopes (decremented by
// close_scope); once it exceeds p.max_opened_scopes an error is reported,
// limiting how deep the scope chain can grow on pathological input.
// NOTE(review): the scope is still created even after p.error fires —
// presumably p.error aborts or recovers elsewhere; confirm before relying on it.
fn (mut p Parser) open_scope() {
	if p.opened_scopes > p.max_opened_scopes {
		p.error('nested opened scopes limit reached: ${p.max_opened_scopes}')
	}
	p.scope = &ast.Scope{
		parent: p.scope
		start_pos: p.tok.pos
	}
	p.opened_scopes++
}
fn (mut p Parser) close_scope() {
@ -561,6 +566,7 @@ fn (mut p Parser) close_scope() {
p.scope.end_pos = p.prev_tok.pos
p.scope.parent.children << p.scope
p.scope = p.scope.parent
p.opened_scopes--
}
fn (mut p Parser) parse_block() []ast.Stmt {
@ -1202,7 +1208,7 @@ fn (mut p Parser) asm_stmt(is_top_level bool) ast.AsmStmt {
// x86: https://www.felixcloutier.com/x86/
// arm: https://developer.arm.com/documentation/dui0068/b/arm-instruction-reference
mut templates := []ast.AsmTemplate{}
for p.tok.kind !in [.semicolon, .rcbr] {
for p.tok.kind !in [.semicolon, .rcbr, .eof] {
template_pos := p.tok.pos()
mut name := ''
if p.tok.kind == .name && arch == .amd64 && p.tok.lit in ['rex', 'vex', 'xop'] {
@ -1299,7 +1305,7 @@ fn (mut p Parser) asm_stmt(is_top_level bool) ast.AsmStmt {
}
}
else {
verror('p.parse_number_literal() invalid output: `${number_lit}`')
p.error('p.parse_number_literal() invalid output: `${number_lit}`')
}
}
}
@ -1441,7 +1447,7 @@ fn (mut p Parser) reg_or_alias() ast.AsmArg {
if x is ast.AsmRegister {
return ast.AsmArg(x as ast.AsmRegister)
} else {
verror('non-register ast.ScopeObject found in scope')
p.error('non-register ast.ScopeObject found in scope')
return ast.AsmDisp{} // should not be reached
}
} else if p.prev_tok.len >= 2 && p.prev_tok.lit[0] in [`b`, `f`]
@ -1713,6 +1719,10 @@ fn (mut p Parser) asm_ios(output bool) []ast.AsmIO {
return []
}
for {
if p.tok.kind == .eof {
p.error('reached eof in asm_ios')
return []
}
pos := p.tok.pos()
mut constraint := ''
@ -1747,6 +1757,7 @@ fn (mut p Parser) asm_ios(output bool) []ast.AsmIO {
// Numbered constraints - https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html
if p.tok.lit.int() >= 10 {
p.error_with_pos('The digit must be between 0 and 9 only', pos)
return []
}
p.check(.number)
} else {
@ -1759,6 +1770,7 @@ fn (mut p Parser) asm_ios(output bool) []ast.AsmIO {
expr = expr.expr
} else {
p.error('asm in/output must be enclosed in brackets')
return []
}
mut alias := ''
if p.tok.kind == .key_as {
@ -4348,6 +4360,10 @@ fn (mut p Parser) type_decl() ast.TypeDecl {
// type SumType = Aaa | Bbb | Ccc
if sum_variants.len > 1 {
for variant in sum_variants {
if variant.typ == 0 {
// the type symbol is probably coming from another .v file
continue
}
variant_sym := p.table.sym(variant.typ)
// TODO: implement this check for error too
if variant_sym.kind == .none_ {
@ -4394,9 +4410,13 @@ fn (mut p Parser) type_decl() ast.TypeDecl {
}
// sum_variants will have only one element
parent_type := sum_variants[0].typ
parent_sym := p.table.sym(parent_type)
pidx := parent_type.idx()
p.check_for_impure_v(parent_sym.language, decl_pos)
mut parent_language := ast.Language.v
if parent_type != 0 {
parent_sym := p.table.sym(parent_type)
parent_language = parent_sym.language
p.check_for_impure_v(parent_sym.language, decl_pos)
}
prepend_mod_name := if language == .v { p.prepend_mod(name) } else { name } // `C.time_t`, not `time.C.time_t`
idx := p.table.register_sym(ast.TypeSymbol{
kind: .alias
@ -4406,7 +4426,7 @@ fn (mut p Parser) type_decl() ast.TypeDecl {
parent_idx: pidx
info: ast.Alias{
parent_type: parent_type
language: parent_sym.language
language: parent_language
}
is_pub: is_pub
})
@ -4474,11 +4494,6 @@ fn (p &Parser) new_true_expr() ast.Expr {
}
}
@[noreturn]
fn verror(s string) {
util.verror('parser error', s)
}
fn (mut p Parser) top_level_statement_start() {
if p.scanner.comments_mode == .toplevel_comments {
p.scanner.set_is_inside_toplevel_statement(true)

View File

@ -53,6 +53,7 @@ pub mut:
all_tokens []token.Token // *only* used in comments_mode: .toplevel_comments, contains all tokens
tidx int
eofs int
max_eofs int = 50
inter_cbr_count int
pref &pref.Preferences
error_details []string
@ -557,7 +558,7 @@ fn (mut s Scanner) skip_whitespace() {
fn (mut s Scanner) end_of_file() token.Token {
s.eofs++
if s.eofs > 50 {
if s.eofs > s.max_eofs {
s.line_nr--
if s.file_path == scanner.internally_generated_v_code {
// show a bit more context for that case, since the source may not be easily visible by just inspecting a source file on the filesystem
@ -566,7 +567,7 @@ fn (mut s Scanner) end_of_file() token.Token {
dump(s.text.len)
}
panic(
'the end of file `${s.file_path}` has been reached 50 times already, the v parser is probably stuck.\n' +
'the end of file `${s.file_path}` has been reached ${s.max_eofs} times already, the v parser is probably stuck.\n' +
'This should not happen. Please report the bug here, and include the last 2-3 lines of your source code:\n' +
'https://github.com/vlang/v/issues/new?labels=Bug&template=bug_report.md')
}

View File

@ -156,7 +156,7 @@ pub fn source_file_context(kind string, filepath string, pos token.Pos) []string
aline := mu.max(0, mu.min(source_lines.len - 1, pos.line_nr + util.error_context_after))
tab_spaces := ' '
for iline := bline; iline <= aline; iline++ {
sline := source_lines[iline]
sline := source_lines[iline] or { '' }
start_column := mu.max(0, mu.min(pos.col, sline.len))
end_column := mu.max(0, mu.min(pos.col + mu.max(0, pos.len), sline.len))
cline := if iline == pos.line_nr {
@ -179,7 +179,7 @@ pub fn source_file_context(kind string, filepath string, pos token.Pos) []string
i++
} else {
char_len := utf8_char_len(sline[i])
spaces := ' '.repeat(utf8_str_visible_length(sline[i..i + char_len]))
spaces := ' '.repeat(utf8_str_visible_length(sline#[i..i + char_len]))
pointerline_builder.write_string(spaces)
i += char_len
}