From ac3045b472750f991a0e28c9b369b898d016cfbc Mon Sep 17 00:00:00 2001
From: Kim Shrier <gitkim@westryn.net>
Date: Mon, 12 Aug 2024 01:51:08 -0600
Subject: [PATCH] tools: add an amalgamate tool and description of usage
 (#22034)

---
 cmd/tools/amalgamate.v            | 221 ++++++++++++++++++++++++++++++
 thirdparty/libgc/amalgamation.txt |  55 +++++++-
 2 files changed, 270 insertions(+), 6 deletions(-)
 create mode 100644 cmd/tools/amalgamate.v
diff --git a/cmd/tools/amalgamate.v b/cmd/tools/amalgamate.v
new file mode 100644
index 0000000000..d5e95b7670
--- /dev/null
+++ b/cmd/tools/amalgamate.v
@@ -0,0 +1,221 @@
+// amalgamate multiple C source files into a single
+// C source file.  See https://sqlite.org/amalgamation.html
+// for a description of file amalgamation.
+//
+// If an input file is not specified, source is read
+// from stdin.
+//
+// If an output file is not specified, source is output
+// to stdout.
+
+module main
+
+import flag
+import os
+import regex
+
+const app_name = 'amalgamate'
+const app_version = '0.0.1'
+
+// pre-compiled regex for #include "file" lines; the capture group is the file name
+const re = regex.regex_opt(r'^\s*#\s*include\s*"([^"]+)"')!
+
+struct Config {
+mut:
+	input_files []string // arguments left after flag parsing; empty means read stdin
+	output_file string   // -o value; empty means write to stdout
+	search_dirs []string // -s values; directories tried, in order, when locating a file
+	blacklist   []string // -b values; include names whose contents are never copied in
+}
+
+struct Context {
+	config Config // options from the command line; declared outside the mut section
+mut:
+	processed_files []string // paths already expanded, plus blacklisted names already emitted
+}
+
+// parse_arguments builds a Config from the command line.  The
+// arguments left over after the flags become the input files.  On
+// a parse error the usage text is printed and the process exits with 1.
+fn parse_arguments() Config {
+	mut cfg := Config{}
+	mut parser := flag.new_flag_parser(os.args)
+	parser.skip_executable()
+	parser.application(app_name)
+	parser.version(app_version)
+	parser.arguments_description('[file ...]')
+
+	parser.description('combine multiple .c and .h files into one.')
+	parser.description('')
+	parser.description('Combine input, coming from either stdin or input files, into one')
+	parser.description('large file.  Include statements are processed and the contents')
+	parser.description('copied in place.  Only #include "file.h" statements cause their')
+	parser.description('contents to be copied, not #include <file.h> statements.  If no')
+	parser.description('input files are specified, read from stdin.')
+
+	parser.footer('\nAn example showing multiple blacklisted files and multiple search')
+	parser.footer('directories.')
+	parser.footer('')
+	parser.footer('    amalgamate -o output_file.c -b ignore_me.h \\')
+	parser.footer('        -b ignore_me_2.h -b other/ignore_me.h \\')
+	parser.footer('        -s relative/search/dir -s /absolute/search/dir \\')
+	parser.footer('        file1.c file2.c')
+	parser.footer('')
+
+	cfg.output_file = parser.string('output', `o`, '', 'output file.  If not specified,\n' +
+		flag.space + 'defaults to stdout.\n', val_desc: '<filename>')
+
+	cfg.blacklist = parser.string_multi('blacklist', `b`,
+		'blacklist a file name.  This prevents\n' + flag.space +
+		'the named file from being included.\n' + flag.space +
+		'This can be specified more than once.\n', val_desc: '<include_file>')
+
+	cfg.search_dirs = parser.string_multi('search_path', `s`,
+		'add a directory to the search path.\n' + flag.space +
+		'An include file is searched for in\n' + flag.space +
+		'the current working directory and\n' + flag.space +
+		'if not found, the directories in this\n' + flag.space +
+		'list are searched, in order, until the\n' + flag.space +
+		'file is found or the search list is\n' + flag.space +
+		'exhausted.  This can be specified\n' + flag.space + 'more than once.\n',
+		val_desc: '<search_dir>'
+	)
+
+	cfg.input_files = parser.finalize() or {
+		// this only reports the first unrecognized argument
+		eprintln('${err}\n')
+		eprintln('${parser.usage()}\n')
+		exit(1)
+	}
+	return cfg
+}
+
+// main parses the command line and runs the amalgamation; on
+// failure it prints the error to stderr and exits with status 1.
+fn main() {
+	mut ctx := Context{
+		config: parse_arguments()
+	}
+
+	ctx.amalgamate() or {
+		eprintln('error: ${err}')
+		exit(1)
+	}
+}
+
+// amalgamate expands every input, or stdin when no input files
+// were given, and sends the combined source to the output file,
+// or to stdout when no output file was specified.
+//
+// Blacklisted inputs and inputs whose resolved path was already
+// processed are skipped.  Lookup, read and write errors propagate.
+fn (mut c Context) amalgamate() ! {
+	mut chunks := []string{}
+
+	if c.config.input_files.len > 0 {
+		for name in c.config.input_files {
+			// blacklisted inputs are never read
+			if name in c.config.blacklist {
+				continue
+			}
+
+			path := c.find_file(name)!
+			// each resolved path is expanded only once
+			if path in c.processed_files {
+				continue
+			}
+
+			c.processed_files << path
+			// includes are first looked up next to the input file
+			base_dir := os.dir(path)
+			text := os.read_file(path)!
+			chunks << c.handle_includes(base_dir, text)!
+		}
+	} else {
+		// without input files, stdin is expanded with the
+		// current working directory as the local directory
+		cwd := os.getwd()
+		chunks << c.handle_includes(cwd, os.get_raw_lines_joined())!
+	}
+
+	combined := chunks.join('')
+	if c.config.output_file != '' {
+		os.write_file(c.config.output_file, combined)!
+		return
+	}
+	print(combined)
+}
+
+// find_file resolves file to the real path of an existing file.
+// The name is tried as given first, then joined onto each search
+// directory in order; the first candidate that is a file wins.
+// An error naming the file is returned when no candidate exists.
+fn (c Context) find_file(file string) !string {
+	direct := os.real_path(file)
+	if os.is_file(direct) {
+		return direct
+	}
+	for search_dir in c.config.search_dirs {
+		candidate := os.real_path(os.join_path_single(search_dir, file))
+		if os.is_file(candidate) {
+			return candidate
+		}
+	}
+	return error('file "${file}" not found')
+}
+
+// handle_includes copies input_source line by line, replacing each
+// #include "file" line with the expanded contents of that file.
+// Includes written with < and > never match and are copied as is.
+//
+// local_dir is the directory tried first for relative include names.
+// A name that cannot be found, or a file that cannot be read, is an error.
+fn (mut c Context) handle_includes(local_dir string, input_source string) !string {
+	mut result := []string{}
+
+	for text in input_source.split_into_lines() {
+		match_start, _ := re.match_string(text)
+
+		if match_start < 0 {
+			// not a local include: pass the line through
+			result << text
+			continue
+		}
+
+		// the capture group is the name between the quotes
+		name := text[re.groups[0]..re.groups[1]]
+
+		if name in c.config.blacklist {
+			// a blacklisted include stays in the output the first
+			// time only; repeats become a line break
+			if name in c.processed_files {
+				result << '\n'
+			} else {
+				result << text
+				c.processed_files << name
+			}
+			continue
+		}
+
+		// relative names are tried inside local_dir first
+		mut path := ''
+		if !os.is_abs_path(name) {
+			path = c.find_file(os.join_path_single(local_dir, name)) or { '' }
+		}
+		// fall back to the name as written; a miss here is an error
+		if path == '' {
+			path = c.find_file(name)!
+		}
+
+		// a file that was already expanded is dropped
+		if path in c.processed_files {
+			continue
+		}
+
+		c.processed_files << path
+		included := os.read_file(path)!
+		result << c.handle_includes(os.dir(path), included)!
+	}
+
+	return result.join_lines() + '\n'
+}
diff --git a/thirdparty/libgc/amalgamation.txt b/thirdparty/libgc/amalgamation.txt
index de25a3d367..4212281054 100644
--- a/thirdparty/libgc/amalgamation.txt
+++ b/thirdparty/libgc/amalgamation.txt
@@ -1,7 +1,50 @@
 The libgc source is distributed here as an amalgamation (https://sqlite.org/amalgamation.html).
-This means that, rather than mirroring the entire bdwgc repo here,
-[this script](https://gist.github.com/spaceface777/34d25420f2dc4953fb7864f44a211105) was used
-to bundle all local includes together into a single C file, which is much easier to handle.
-Furthermore, the script above was also used to minify (i.e. remove comments and whitespace in)
-the garbage collector source. Together, these details help keep the V source distribution small,
-can reduce compile times by 3%-15%, and can help C compilers generate more optimized code.
+This means that, rather than mirroring the entire bdwgc repo here, the amalgamate tool
+was used to bundle all C files and local includes together into a single C file, which is
+much easier to handle.  This helps keep the V source distribution small, can reduce compile
+times by 3%-15%, and can help C compilers generate more optimized code.
+
+To generate the libgc amalgamation, the following commands were run in thirdparty/libgc:
+
+    git clone https://github.com/ivmai/bdwgc.git
+    cd bdwgc
+    ./autogen.sh
+    ./configure --enable-threads=pthreads \
+        --enable-static \
+        --enable-shared=no \
+        --enable-thread-local-alloc=no \
+        --enable-parallel-mark \
+        --enable-single-obj-compilation \
+        --enable-gc-debug
+
+    ../../../cmd/tools/amalgamate -o ../gc.c \
+        -b atomic_ops.h \
+        -b gc/gc.h \
+        -b gc/gc_backptr.h \
+        -b gc/gc_disclaim.h \
+        -b gc/gc_gcj.h \
+        -b gc/gc_inline.h \
+        -b gc/gc_mark.h \
+        -b gc/gc_pthread_redirects.h \
+        -b gc/gc_tiny_fl.h \
+        -b gc/gc_typed.h \
+        -b gc/javaxfc.h \
+        -b il/PCR_IL.h \
+        -b mm/PCR_MM.h \
+        -b psp2-support.h \
+        -b stubinfo.h \
+        -b th/PCR_ThCtl.h \
+        -b vd/PCR_VD.h \
+        -s include \
+        -s include/private \
+        extra/gc.c
+
+The updated header files are then copied into thirdparty/libgc/include/gc.  The copied
+gc_cpp.h is deleted, since V does not need that header, and the bdwgc clone is removed
+once the amalgamation and the headers are in place.
+
+    cp include/gc/*.h ../include/gc
+    cd ..
+    rm include/gc/gc_cpp.h
+    rm -rf bdwgc
+