parser,scanner,ast: make the scanner and parser more robust by implementing more limits (preventing panics discovered by fuzzing)

This commit is contained in:
Delyan Angelov 2024-08-20 13:21:25 +03:00
parent 50457647de
commit c92577e6ed
No known key found for this signature in database
GPG Key ID: 66886C0F12D595ED
14 changed files with 158 additions and 61 deletions

2
.gitignore vendored
View File

@ -152,3 +152,5 @@ vlib/v/tests/*.js
# ignore .NET7.0 Assembly Files
bench/vectors/bin
bench/vectors/obj
autofuzz.log

View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
## Note: radamsa is a fuzzer, available from https://gitlab.com/akihe/radamsa
## ./v -g cmd/tools/measure/parser_speed.v
# Repeatedly feed radamsa-mutated copies of hello_world.v to the parser tool,
# stopping at the first input that makes it exit with a non-zero status.
for (( ; ; )); do
	radamsa --meta autofuzz.log examples/hello_world.v > x.v
	VFUZZER=true cmd/tools/measure/parser_speed x.v || break
done

View File

@ -0,0 +1,29 @@
module file_lists
import os
// expand_files accepts a list of files and folders, and returns a list of all the .v and .vsh files found in them.
// The input list of files supports recursive `@file.lst` expansion, where each line is treated as another file/folder.
pub fn expand_files(files []string) ![]string {
	mut res := []string{}
	for file in files {
		if file == '' {
			// skip blank lines coming from .lst files
			continue
		}
		if file.starts_with('@') {
			// BUGFIX: expand the *current* entry; the previous code used
			// `files[0]`, which read the wrong path for any `@list`
			// argument that was not the first element of `files`.
			lst_path := file.all_after('@').trim_space()
			listed_files := os.read_file(lst_path)!.split('\n').map(it.trim_space())
			res << expand_files(listed_files)!
			continue
		}
		if os.is_dir(file) {
			res << os.walk_ext(file, '.vsh')
			res << os.walk_ext(file, '.v')
			continue
		}
		if os.exists(file) {
			res << file
		}
	}
	return res
}

View File

@ -7,29 +7,33 @@ import v.parser
import v.errors
import v.scanner
import term
import file_lists
const skip_tests = os.getenv_opt('SKIP_TESTS') or { '' }.bool()
const skip_tests = os.getenv('SKIP_TESTS').bool()
const fuzzer_mode = os.getenv('VFUZZER').bool()
const comments_mode = scanner.CommentsMode.from(os.getenv('SCANNER_MODE')) or {
scanner.CommentsMode.parse_comments
}
fn main() {
dump(comments_mode)
files := os.args#[1..]
if files.len > 0 && files[0].starts_with('@') {
lst_path := files[0].all_after('@')
listed_files := os.read_file(lst_path)!.split('\n')
process_files(listed_files)!
return
if !fuzzer_mode {
dump(comments_mode)
}
process_files(files)!
all_files := file_lists.expand_files(os.args#[1..])!
process_files(all_files)!
}
// hline prints a horizontal separator line; it is suppressed in fuzzer
// mode, where only failures matter and extra output slows the fuzz loop.
fn hline() {
	if !fuzzer_mode {
		println('----------------------------------------------------------------------------------------------------------------------------------------------------------')
	}
}
// theader prints the column header for the per-file stats table;
// suppressed in fuzzer mode, where no per-file stats are printed.
fn theader() {
	if !fuzzer_mode {
		println('    Time Tokens Bytes Lines Bytes/Token Errors FMT.len')
	}
}
@ -71,7 +75,9 @@ fn process_files(files []string) ! {
total_lines += ast_file.nr_lines
total_errors += p.errors.len
total_fmt_len += formatted_content.len
println('${f_us:10}us ${p.scanner.all_tokens.len:10} ${p.scanner.text.len:10} ${ast_file.nr_lines:10} ${(f64(p.scanner.text.len) / p.scanner.all_tokens.len):13.3} ${p.errors.len:10} ${formatted_content.len:8} ${f}')
if !fuzzer_mode {
println('${f_us:10}us ${p.scanner.all_tokens.len:10} ${p.scanner.text.len:10} ${ast_file.nr_lines:10} ${(f64(p.scanner.text.len) / p.scanner.all_tokens.len):13.3} ${p.errors.len:10} ${formatted_content.len:8} ${f}')
}
}
hline()
theader()

View File

@ -6,29 +6,33 @@ import v.parser
import v.errors
import v.scanner
import term
import file_lists
const skip_tests = os.getenv_opt('SKIP_TESTS') or { '' }.bool()
const skip_tests = os.getenv('SKIP_TESTS').bool()
const fuzzer_mode = os.getenv('VFUZZER').bool()
const comments_mode = scanner.CommentsMode.from(os.getenv('SCANNER_MODE')) or {
scanner.CommentsMode.skip_comments
}
fn main() {
dump(comments_mode)
files := os.args#[1..]
if files.len > 0 && files[0].starts_with('@') {
lst_path := files[0].all_after('@')
listed_files := os.read_file(lst_path)!.split('\n')
process_files(listed_files)!
return
if !fuzzer_mode {
dump(comments_mode)
}
process_files(files)!
all_files := file_lists.expand_files(os.args#[1..])!
process_files(all_files)!
}
// hline prints a horizontal separator line; it is suppressed in fuzzer
// mode, where only failures matter and extra output slows the fuzz loop.
fn hline() {
	if !fuzzer_mode {
		println('---------------------------------------------------------------------------------------------------------------------------------------------------')
	}
}
// theader prints the column header for the per-file stats table;
// suppressed in fuzzer mode, where no per-file stats are printed.
fn theader() {
	if !fuzzer_mode {
		println('    Time Tokens Bytes Lines Bytes/Token Errors')
	}
}
@ -53,20 +57,25 @@ fn process_files(files []string) ! {
if skip_tests && f.ends_with('_test.v') {
continue
}
total_files++
// do not measure the scanning, but only the parsing:
mut p := new_parser(f, comments_mode, table, pref_)
///
if fuzzer_mode {
p.scanner.max_eofs = 200
}
sw.restart()
ast_file := p.parse()
f_us := sw.elapsed().microseconds()
///
total_us += f_us
total_bytes += p.scanner.text.len
total_tokens += p.scanner.all_tokens.len
total_lines += ast_file.nr_lines
total_errors += p.errors.len
println('${f_us:10}us ${p.scanner.all_tokens.len:10} ${p.scanner.text.len:10} ${ast_file.nr_lines:10} ${(f64(p.scanner.text.len) / p.scanner.all_tokens.len):13.3} ${p.errors.len:10} ${f}')
if !fuzzer_mode {
println('${f_us:10}us ${p.scanner.all_tokens.len:10} ${p.scanner.text.len:10} ${ast_file.nr_lines:10} ${(f64(p.scanner.text.len) / p.scanner.all_tokens.len):13.3} ${p.errors.len:10} ${f}')
}
total_files++
}
hline()
theader()

View File

@ -2,30 +2,34 @@ import os
import time
import term
import v.scanner
import file_lists
import v.pref
const skip_tests = os.getenv_opt('SKIP_TESTS') or { '' }.bool()
const skip_tests = os.getenv('SKIP_TESTS').bool()
const fuzzer_mode = os.getenv('VFUZZER').bool()
const comments_mode = scanner.CommentsMode.from(os.getenv('SCANNER_MODE')) or {
scanner.CommentsMode.skip_comments
}
fn main() {
dump(comments_mode)
files := os.args#[1..]
if files.len > 0 && files[0].starts_with('@') {
lst_path := files[0].all_after('@')
listed_files := os.read_file(lst_path)!.split('\n')
process_files(listed_files)!
return
if !fuzzer_mode {
dump(comments_mode)
}
process_files(files)!
all_files := file_lists.expand_files(os.args#[1..])!
process_files(all_files)!
}
// hline prints a horizontal separator line; it is suppressed in fuzzer
// mode, where only failures matter and extra output slows the fuzz loop.
fn hline() {
	if !fuzzer_mode {
		println('----------------------------------------------------------------------------------------------------------------------')
	}
}
// theader prints the column header for the per-file stats table;
// suppressed in fuzzer mode, where no per-file stats are printed.
fn theader() {
	if !fuzzer_mode {
		println('    Time Tokens Bytes Lines Bytes/Token Errors')
	}
}
@ -58,7 +62,9 @@ fn process_files(files []string) ! {
total_tokens += s.all_tokens.len
total_lines += s.nr_lines
total_errors += s.errors.len
println('${f_us:10}us ${s.all_tokens.len:10} ${s.text.len:10} ${s.nr_lines:10} ${(f64(s.text.len) / s.all_tokens.len):13.3f} ${s.errors.len:10} ${f}')
if !fuzzer_mode {
println('${f_us:10}us ${s.all_tokens.len:10} ${s.text.len:10} ${s.nr_lines:10} ${(f64(s.text.len) / s.all_tokens.len):13.3f} ${s.errors.len:10} ${f}')
}
}
hline()
theader()

View File

@ -764,6 +764,7 @@ pub fn prepare_test_session(zargs string, folder string, oskipped []string, main
$if windows {
// skip process/command examples on windows. TODO: remove the need for this, fix os.Command
if fnormalised.ends_with('examples/process/command.v') {
skipped << fnormalised.replace(nparent_dir + '/', '')
continue
}
}
@ -771,6 +772,7 @@ pub fn prepare_test_session(zargs string, folder string, oskipped []string, main
start := c#[0..testing.header_bytes_to_search_for_module_main]
if start.contains('module ') && !start.contains('module main') {
skipped << fnormalised.replace(nparent_dir + '/', '')
continue next_file
}
for skip_prefix in oskipped {
skip_folder := skip_prefix + '/'

View File

@ -2464,6 +2464,9 @@ pub fn (mut lx IndexExpr) recursive_arraymap_set_is_setter() {
pub fn all_registers(mut t Table, arch pref.Arch) map[string]ScopeObject {
mut res := map[string]ScopeObject{}
match arch {
._auto {
return all_registers(mut t, .amd64)
}
.amd64, .i386 {
for bit_size, array in ast.x86_no_number_register_list {
for name in array {
@ -2522,7 +2525,7 @@ pub fn all_registers(mut t Table, arch pref.Arch) map[string]ScopeObject {
// no registers
}
else { // TODO
panic('all_registers: unhandled arch')
panic('all_registers: unhandled arch: ${arch}')
}
}

View File

@ -18,16 +18,12 @@ fn (mut p Parser) assign_stmt() ast.Stmt {
return p.partial_assign_stmt(exprs)
}
const max_expr_level = 100
fn (mut p Parser) check_undefined_variables(names []string, val ast.Expr) ! {
p.expr_level++
defer {
p.expr_level--
}
if p.expr_level > parser.max_expr_level {
return error('expr level > ${parser.max_expr_level}')
}
p.check_expr_level()!
match val {
ast.Ident {
for name in names {

View File

@ -6,6 +6,15 @@ module parser
import v.ast
import v.token
const max_expr_level = 100
// check_expr_level returns an error once the expression nesting depth
// exceeds max_expr_level, so that pathological inputs (found by fuzzing)
// cannot drive the recursive-descent parser into a stack overflow.
@[inline]
fn (mut p Parser) check_expr_level() ! {
	if p.expr_level <= parser.max_expr_level {
		return
	}
	return error('expr level > ${parser.max_expr_level}')
}
fn (mut p Parser) expr(precedence int) ast.Expr {
return p.check_expr(precedence) or {
if token.is_decl(p.tok.kind) && p.disallow_declarations_in_script_mode() {
@ -17,6 +26,11 @@ fn (mut p Parser) expr(precedence int) ast.Expr {
fn (mut p Parser) check_expr(precedence int) !ast.Expr {
p.trace_parser('expr(${precedence})')
p.expr_level++
defer {
p.expr_level--
}
p.check_expr_level()!
mut node := ast.empty_expr
is_stmt_ident := p.is_stmt_ident
p.is_stmt_ident = false

View File

@ -163,6 +163,10 @@ fn (mut p Parser) if_expr(is_comptime bool) ast.IfExpr {
body_pos := p.tok.pos()
p.inside_if = false
p.inside_comptime_if = false
if p.opened_scopes > p.max_opened_scopes {
p.error('too many nested conditionals, scopes: ${p.opened_scopes}')
return ast.IfExpr{}
}
p.open_scope()
stmts := p.parse_block_no_scope(false)
branches << ast.IfBranch{

View File

@ -106,9 +106,13 @@ mut:
script_mode bool
script_mode_start_token token.Token
pub mut:
scanner &scanner.Scanner = unsafe { nil }
table &ast.Table = unsafe { nil }
scope &ast.Scope = unsafe { nil }
scanner &scanner.Scanner = unsafe { nil }
table &ast.Table = unsafe { nil }
scope &ast.Scope = unsafe { nil }
opened_scopes int
max_opened_scopes int = 100 // values above 300 risk stack overflow
errors []errors.Error
warnings []errors.Warning
notices []errors.Notice
@ -451,10 +455,7 @@ fn (p &Parser) peek_token(n int) token.Token {
fn (p &Parser) peek_token_after_var_list() token.Token {
mut n := 0
mut tok := p.tok
for {
if tok.kind == .eof {
break
}
for tok.kind != .eof {
if tok.kind == .key_mut {
n += 2
} else {
@ -546,10 +547,14 @@ fn (p &Parser) is_array_type() bool {
}
// open_scope pushes a new child scope onto p.scope.
// The nesting depth is tracked in p.opened_scopes (decremented by
// close_scope); once it exceeds p.max_opened_scopes an error is reported,
// limiting how deep the scope chain can grow on pathological input.
// NOTE(review): the scope is still created even after p.error fires —
// presumably p.error aborts or recovers elsewhere; confirm before relying on it.
fn (mut p Parser) open_scope() {
	if p.opened_scopes > p.max_opened_scopes {
		p.error('nested opened scopes limit reached: ${p.max_opened_scopes}')
	}
	p.scope = &ast.Scope{
		parent: p.scope
		start_pos: p.tok.pos
	}
	p.opened_scopes++
}
fn (mut p Parser) close_scope() {
@ -561,6 +566,7 @@ fn (mut p Parser) close_scope() {
p.scope.end_pos = p.prev_tok.pos
p.scope.parent.children << p.scope
p.scope = p.scope.parent
p.opened_scopes--
}
fn (mut p Parser) parse_block() []ast.Stmt {
@ -1202,7 +1208,7 @@ fn (mut p Parser) asm_stmt(is_top_level bool) ast.AsmStmt {
// x86: https://www.felixcloutier.com/x86/
// arm: https://developer.arm.com/documentation/dui0068/b/arm-instruction-reference
mut templates := []ast.AsmTemplate{}
for p.tok.kind !in [.semicolon, .rcbr] {
for p.tok.kind !in [.semicolon, .rcbr, .eof] {
template_pos := p.tok.pos()
mut name := ''
if p.tok.kind == .name && arch == .amd64 && p.tok.lit in ['rex', 'vex', 'xop'] {
@ -1299,7 +1305,7 @@ fn (mut p Parser) asm_stmt(is_top_level bool) ast.AsmStmt {
}
}
else {
verror('p.parse_number_literal() invalid output: `${number_lit}`')
p.error('p.parse_number_literal() invalid output: `${number_lit}`')
}
}
}
@ -1441,7 +1447,7 @@ fn (mut p Parser) reg_or_alias() ast.AsmArg {
if x is ast.AsmRegister {
return ast.AsmArg(x as ast.AsmRegister)
} else {
verror('non-register ast.ScopeObject found in scope')
p.error('non-register ast.ScopeObject found in scope')
return ast.AsmDisp{} // should not be reached
}
} else if p.prev_tok.len >= 2 && p.prev_tok.lit[0] in [`b`, `f`]
@ -1713,6 +1719,10 @@ fn (mut p Parser) asm_ios(output bool) []ast.AsmIO {
return []
}
for {
if p.tok.kind == .eof {
p.error('reached eof in asm_ios')
return []
}
pos := p.tok.pos()
mut constraint := ''
@ -1747,6 +1757,7 @@ fn (mut p Parser) asm_ios(output bool) []ast.AsmIO {
// Numbered constraints - https://gcc.gnu.org/onlinedocs/gcc/Simple-Constraints.html
if p.tok.lit.int() >= 10 {
p.error_with_pos('The digit must be between 0 and 9 only', pos)
return []
}
p.check(.number)
} else {
@ -1759,6 +1770,7 @@ fn (mut p Parser) asm_ios(output bool) []ast.AsmIO {
expr = expr.expr
} else {
p.error('asm in/output must be enclosed in brackets')
return []
}
mut alias := ''
if p.tok.kind == .key_as {
@ -4348,6 +4360,10 @@ fn (mut p Parser) type_decl() ast.TypeDecl {
// type SumType = Aaa | Bbb | Ccc
if sum_variants.len > 1 {
for variant in sum_variants {
if variant.typ == 0 {
// the type symbol is probably coming from another .v file
continue
}
variant_sym := p.table.sym(variant.typ)
// TODO: implement this check for error too
if variant_sym.kind == .none_ {
@ -4394,9 +4410,13 @@ fn (mut p Parser) type_decl() ast.TypeDecl {
}
// sum_variants will have only one element
parent_type := sum_variants[0].typ
parent_sym := p.table.sym(parent_type)
pidx := parent_type.idx()
p.check_for_impure_v(parent_sym.language, decl_pos)
mut parent_language := ast.Language.v
if parent_type != 0 {
parent_sym := p.table.sym(parent_type)
parent_language = parent_sym.language
p.check_for_impure_v(parent_sym.language, decl_pos)
}
prepend_mod_name := if language == .v { p.prepend_mod(name) } else { name } // `C.time_t`, not `time.C.time_t`
idx := p.table.register_sym(ast.TypeSymbol{
kind: .alias
@ -4406,7 +4426,7 @@ fn (mut p Parser) type_decl() ast.TypeDecl {
parent_idx: pidx
info: ast.Alias{
parent_type: parent_type
language: parent_sym.language
language: parent_language
}
is_pub: is_pub
})
@ -4474,11 +4494,6 @@ fn (p &Parser) new_true_expr() ast.Expr {
}
}
@[noreturn]
fn verror(s string) {
util.verror('parser error', s)
}
fn (mut p Parser) top_level_statement_start() {
if p.scanner.comments_mode == .toplevel_comments {
p.scanner.set_is_inside_toplevel_statement(true)

View File

@ -53,6 +53,7 @@ pub mut:
all_tokens []token.Token // *only* used in comments_mode: .toplevel_comments, contains all tokens
tidx int
eofs int
max_eofs int = 50
inter_cbr_count int
pref &pref.Preferences
error_details []string
@ -557,7 +558,7 @@ fn (mut s Scanner) skip_whitespace() {
fn (mut s Scanner) end_of_file() token.Token {
s.eofs++
if s.eofs > 50 {
if s.eofs > s.max_eofs {
s.line_nr--
if s.file_path == scanner.internally_generated_v_code {
// show a bit more context for that case, since the source may not be easily visible by just inspecting a source file on the filesystem
@ -566,7 +567,7 @@ fn (mut s Scanner) end_of_file() token.Token {
dump(s.text.len)
}
panic(
'the end of file `${s.file_path}` has been reached 50 times already, the v parser is probably stuck.\n' +
'the end of file `${s.file_path}` has been reached ${s.max_eofs} times already, the v parser is probably stuck.\n' +
'This should not happen. Please report the bug here, and include the last 2-3 lines of your source code:\n' +
'https://github.com/vlang/v/issues/new?labels=Bug&template=bug_report.md')
}

View File

@ -156,7 +156,7 @@ pub fn source_file_context(kind string, filepath string, pos token.Pos) []string
aline := mu.max(0, mu.min(source_lines.len - 1, pos.line_nr + util.error_context_after))
tab_spaces := ' '
for iline := bline; iline <= aline; iline++ {
sline := source_lines[iline]
sline := source_lines[iline] or { '' }
start_column := mu.max(0, mu.min(pos.col, sline.len))
end_column := mu.max(0, mu.min(pos.col + mu.max(0, pos.len), sline.len))
cline := if iline == pos.line_nr {
@ -179,7 +179,7 @@ pub fn source_file_context(kind string, filepath string, pos token.Pos) []string
i++
} else {
char_len := utf8_char_len(sline[i])
spaces := ' '.repeat(utf8_str_visible_length(sline[i..i + char_len]))
spaces := ' '.repeat(utf8_str_visible_length(sline#[i..i + char_len]))
pointerline_builder.write_string(spaces)
i += char_len
}