#!/usr/bin/env python3
"""Dwarfs benchmark runner.

Discovers dwarfs release artifacts (standalone tarballs, universal binaries
and fuse-extract binaries) for the current machine architecture in an input
directory, unpacks each one into a temporary directory and runs every
registered benchmark against it through ``hyperfine`` (optionally pinned to
a CPU list with ``taskset``).  Each benchmark run is written as one JSON
sample file into the output directory.

Benchmark input data is looked up in ``data/`` next to this script by
default.  The commit that introduced this script also git-ignores
``/.benchmark/data/`` and ``/.benchmark/*.db``.
"""
import argparse
import datetime
import glob
import json
import logging
import os
import platform
import shutil
import subprocess
import sys
import tempfile
import time  # noqa: F401 -- not used below; kept from the original import list

from packaging.version import Version

try:
    import coloredlogs
except ImportError:  # colored output is optional; main() falls back to logging
    coloredlogs = None

# A registry for benchmark functions.
benchmark_registry = []

# Image used by the perl extraction and dwarfsck benchmarks.
PERL_ZSTD_IMAGE = "perl-install-small-v0.7.5.dwarfs"

# Command template shared by the "cat all files" mount benchmarks.
CAT_FILES_CMD = (
    "find {mnt}/default/perl-5.2[0-9].* -type f -print0 "
    "| xargs -0 -P16 -n64 cat | dd of=/dev/null bs=1M"
)


def benchmark(func):
    """Decorator to register a benchmark function.

    The function is appended to ``benchmark_registry`` and returned unchanged.
    """
    benchmark_registry.append(func)
    return func


def needs_version(version):
    """Decorator to specify a required (minimum) version for a benchmark.

    ``version`` is a version string; it is stored on the function as a
    ``packaging.version.Version`` in ``required_version``.
    """

    def decorator(func):
        func.required_version = Version(version)
        return func

    return decorator


def needs_binary(binary):
    """Decorator to specify a required binary for a benchmark.

    The binary name is stored on the function as ``required_binary``.
    """

    def decorator(func):
        func.required_binary = binary
        return func

    return decorator


def needs_tag(tag):
    """Decorator to specify a required tag for a benchmark.

    The tag is stored on the function as ``required_tag``.
    """

    def decorator(func):
        func.required_tag = tag
        return func

    return decorator


def without_tag(tag):
    """Decorator to specify a tag that should not be present for a benchmark.

    The tag is stored on the function as ``excluded_tag``.
    """

    def decorator(func):
        func.excluded_tag = tag
        return func

    return decorator


def binary_size_benchmark(env, binary_name):
    """Record the on-disk size of ``binary_name`` as a sample."""
    binary = env.config.binary(binary_name)
    res = {
        "binary": binary_name,
        "binary_size": os.path.getsize(binary),
    }
    env.sample(res)


@benchmark
@needs_binary("mkdwarfs")
def mkdwarfs_size(env):
    binary_size_benchmark(env, "mkdwarfs")


@benchmark
@needs_binary("dwarfsck")
def dwarfsck_size(env):
    binary_size_benchmark(env, "dwarfsck")


@benchmark
@needs_binary("dwarfsextract")
def dwarfsextract_size(env):
    binary_size_benchmark(env, "dwarfsextract")


@benchmark
@needs_binary("dwarfs")
def dwarfs_size(env):
    binary_size_benchmark(env, "dwarfs")


def mkdwarfs_benchmark(env, inp, args, **kwargs):
    """Time ``mkdwarfs`` on data set ``inp`` and record the image size.

    ``args`` is appended to the mkdwarfs command line; ``kwargs`` are
    forwarded to the hyperfine wrapper (e.g. ``min_runs``).  The produced
    image is deleted before the sample is written.
    """
    image = env.tmp("output.dwarfs")
    res = env.mkdwarfs(
        f"-i {env.data(inp)} -o {image} {args} --force --no-progress --log-level=error",
        **kwargs,
    )
    res["image_size"] = os.path.getsize(image)
    os.remove(image)
    env.sample(res)


@benchmark
@needs_binary("mkdwarfs")
def segmenter_perl_l7(env):
    mkdwarfs_benchmark(
        env, "perl-install-small", "-C null -N4 -l7 --metadata-compression=null"
    )


@benchmark
@needs_binary("mkdwarfs")
def segmenter_perl_l9(env):
    mkdwarfs_benchmark(
        env, "perl-install-small", "-C null -N4 -l9 --metadata-compression=null"
    )


@benchmark
@needs_binary("mkdwarfs")
def compress_perl_l7(env):
    mkdwarfs_benchmark(
        env,
        "perl-install-small",
        "-N4 -l7 -C zstd:level=12 --metadata-compression=null",
        min_runs=5,
    )


@benchmark
@needs_binary("mkdwarfs")
def compress_perl_l9(env):
    mkdwarfs_benchmark(
        env,
        "perl-install-small",
        "-N4 -l9 -C lzma:level=3 --metadata-compression=null",
        min_runs=5,
    )


@benchmark
@needs_binary("mkdwarfs")
@needs_version("0.9.0")
@without_tag("minimal")
def compress_fits(env):
    mkdwarfs_benchmark(env, "2024-02-07", "-N4 --categorize")


@benchmark
@needs_binary("mkdwarfs")
@needs_version("0.8.0")
@without_tag("minimal")
def compress_pcmaudio(env):
    mkdwarfs_benchmark(env, "pcmaudio", "-N4 --categorize")


def _extract_to_directory(env, image_name):
    """Time ``dwarfsextract`` of ``image_name`` into a scratch directory."""
    output = env.tmp("output")
    os.makedirs(output, exist_ok=True)
    res = env.dwarfsextract(f"-i {env.data(image_name)} -o {output}")
    shutil.rmtree(output)
    env.sample(res)


def _extract_to_gnutar(env, image_name):
    """Time ``dwarfsextract`` of ``image_name`` into a gnutar archive file."""
    output = env.tmp("output.tar")
    res = env.dwarfsextract(f"-i {env.data(image_name)} -f gnutar -o {output}")
    os.remove(output)
    env.sample(res)


@benchmark
@needs_binary("dwarfsextract")
def extract_perl_zstd(env):
    _extract_to_directory(env, PERL_ZSTD_IMAGE)


@benchmark
@needs_binary("dwarfsextract")
@without_tag("minimal")
def extract_perl_zstd_gnutar(env):
    _extract_to_gnutar(env, PERL_ZSTD_IMAGE)


@benchmark
@needs_binary("dwarfsextract")
@without_tag("minimal")
def extract_perl_zstd_gnutar_devnull(env):
    # Writes the archive to /dev/null, so there is nothing to clean up.
    res = env.dwarfsextract(
        f"-i {env.data(PERL_ZSTD_IMAGE)} -f gnutar -o /dev/null"
    )
    env.sample(res)


@benchmark
@needs_binary("dwarfsextract")
@needs_version("0.9.0")
@without_tag("minimal")
def extract_fits(env):
    _extract_to_directory(env, "2024-02-07.dwarfs")


@benchmark
@needs_binary("dwarfsextract")
@needs_version("0.9.0")
@without_tag("minimal")
def extract_fits_gnutar(env):
    _extract_to_gnutar(env, "2024-02-07.dwarfs")


@benchmark
@needs_binary("dwarfsextract")
@needs_version("0.8.0")
@without_tag("minimal")
def extract_pcmaudio(env):
    _extract_to_directory(env, "pcmaudio.dwarfs")


@benchmark
@needs_binary("dwarfsextract")
@needs_version("0.9.0")
@without_tag("minimal")
def extract_pcmaudio_gnutar(env):
    _extract_to_gnutar(env, "pcmaudio.dwarfs")


@benchmark
@needs_binary("dwarfsck")
@needs_version("0.8.0")
def dwarfsck_no_check_perl_zstd(env):
    res = env.dwarfsck(f"{env.data(PERL_ZSTD_IMAGE)} --no-check")
    env.sample(res)


@benchmark
@needs_binary("dwarfsck")
def check_integrity_perl_zstd(env):
    res = env.dwarfsck(f"{env.data(PERL_ZSTD_IMAGE)} --check-integrity")
    env.sample(res)


@benchmark
@needs_binary("dwarfsck")
@needs_version("0.9.2")
def checksum_files_perl_zstd_sha256(env):
    res = env.dwarfsck(f"{env.data(PERL_ZSTD_IMAGE)} --checksum sha256")
    env.sample(res)


def make_script(filename, content):
    """Write ``content`` to ``filename`` and make the file executable (0755)."""
    with open(filename, "w") as f:
        f.write(content)
    os.chmod(filename, 0o755)


def mount_and_run_test(env, image, cmd, opts=None, **kwargs):
    """Time a bash script that mounts ``image``, runs ``cmd`` and unmounts.

    ``cmd`` is a ``str.format`` template; the fields ``{mnt}`` (mount point),
    ``{image}``, ``{opts}``, ``{script}`` and ``{env}`` are available.
    ``opts`` is appended to the ``dwarfs`` mount command line.  ``kwargs``
    are forwarded to the hyperfine wrapper.
    """
    mnt = env.tmp("mnt")
    os.makedirs(mnt, exist_ok=True)
    script = env.tmp("script.sh")
    if opts is None:
        opts = ""
    # Fill the template from an explicit set of fields rather than locals(),
    # so the names a template may rely on are spelled out here.
    cmd = cmd.format(env=env, image=image, mnt=mnt, opts=opts, script=script)
    make_script(
        script,
        f"""#!/bin/bash
set -e
{env.config.binary("dwarfs")} {image} {mnt} {opts}
trap 'fusermount -u {mnt}' EXIT
{cmd}
""",
    )
    env.sample(env.hyperfine(script, **kwargs))


@benchmark
@needs_binary("dwarfs")
def mount_and_run_emacs_l6(env):
    mount_and_run_test(
        env, env.data(f"emacs-{platform.machine()}-l6.dwarfs"), "{mnt}/AppRun --help"
    )


@benchmark
@needs_binary("dwarfs")
@needs_version("0.12.0")
def mount_and_run_emacs_l6_mmap(env):
    mount_and_run_test(
        env,
        env.data(f"emacs-{platform.machine()}-l6.dwarfs"),
        "{mnt}/AppRun --help",
        "-oblock_allocator=mmap",
    )


@benchmark
@needs_binary("dwarfs")
def mount_and_run_emacs_l9(env):
    mount_and_run_test(
        env, env.data(f"emacs-{platform.machine()}-l9.dwarfs"), "{mnt}/AppRun --help"
    )


@benchmark
@needs_binary("dwarfs")
def mount_and_cat_files(env):
    mount_and_run_test(
        env,
        env.data("perl-install-1M-zstd.dwarfs"),
        CAT_FILES_CMD,
        min_runs=5,
    )


@benchmark
@needs_binary("dwarfs")
@needs_version("0.12.0")
def mount_and_cat_files_mmap(env):
    mount_and_run_test(
        env,
        env.data("perl-install-1M-zstd.dwarfs"),
        CAT_FILES_CMD,
        "-oblock_allocator=mmap",
        min_runs=5,
    )


class BenchmarkEnvironment(object):
    """Everything a single benchmark function needs for one configuration.

    Bundles the prepared ``Config``, the data and output directories and the
    benchmark name, and offers helpers to build paths, run the tools under
    hyperfine and write a result sample.
    """

    def __init__(self, config, data_dir, output_dir, name):
        self.config = config
        self.data_dir = data_dir
        self.output = output_dir
        self.name = name

    def tmp(self, name):
        """Return the path of ``name`` inside the config's temporary directory."""
        return os.path.join(self.config.tmpdir, name)

    def data(self, name):
        """Return the path of ``name`` inside the benchmark data directory."""
        return os.path.join(self.data_dir, name)

    def mkdwarfs(self, *args, **kwargs):
        return self.hyperfine(self.config.binary("mkdwarfs"), *args, **kwargs)

    def dwarfs(self, *args, **kwargs):
        return self.hyperfine(self.config.binary("dwarfs"), *args, **kwargs)

    def dwarfsck(self, *args, **kwargs):
        return self.hyperfine(self.config.binary("dwarfsck"), *args, **kwargs)

    def dwarfsextract(self, *args, **kwargs):
        return self.hyperfine(self.config.binary("dwarfsextract"), *args, **kwargs)

    def hyperfine(self, *cmd, **kwargs):
        """Run the space-joined ``cmd`` under hyperfine.

        Returns the first entry of the ``results`` list from hyperfine's
        JSON export.
        """
        res = self.config.hyperfine(" ".join(cmd), self.name, **kwargs)
        return res["results"][0]

    def sample(self, result):
        """Write one JSON sample file combining metadata and ``result``.

        ``result`` is a dict; its keys are merged over the metadata keys, so
        a key present in both takes the value from ``result``.
        """
        # "clang" takes precedence when both compiler tags are present.
        compiler = None
        if "gcc" in self.config.tags:
            compiler = "gcc"
        if "clang" in self.config.tags:
            compiler = "clang"
        # Read the clock once so the "time" field and the file name agree.
        now = datetime.datetime.now()
        obj = {
            "name": self.name,
            "type": self.config.config_type(),
            "is_release": self.config.is_release,
            "arch": platform.machine(),
            "compiler": compiler,
            "lto": "lto" in self.config.tags,
            "minsize": "minsize" in self.config.tags,
            "minimal": "minimal" in self.config.tags,
            "musl": "musl" in self.config.tags,
            "mimalloc": "mimalloc" in self.config.tags,
            "processor": platform.processor(),
            "cpus": self.config.cpus,
            "hostname": platform.node(),
            "config": self.config.full_config,
            "version": str(self.config.version),
            "commit": self.config.commit,
            "commit_time": self.config.commit_time.timestamp(),
            "time": now.timestamp(),
            "tags": list(self.config.tags),
        }
        obj.update(result)
        version = self.config.version
        if self.config.commit:
            version = f"{version}-{self.config.commit}"
        if self.config.full_config:
            version = f"{version}-{self.config.full_config}"
        sample_file = os.path.join(
            self.output,
            f"{self.name}-{self.config.config_type()}-{platform.machine()}-{version}-{now.strftime('%Y%m%d-%H%M%S.%f')}.json",
        )
        with open(sample_file, "w") as f:
            json.dump(obj, f, indent=4)


class Config(object):
    """One release artifact, described by what its file name encodes.

    The file name must look like
    ``{prefix}{version}[-{n}-g{commit}]-Linux-{arch}[-{tag}-{tag}...][{suffix}]``
    where ``{arch}`` is ``platform.machine()``.  Parsing fills in
    ``version``, ``commit``, ``is_release``, ``full_config`` and ``tags``.

    ``cpus``, ``tmpdir`` and ``commit_time`` are set by the runner before the
    benchmarks start; ``binaries`` is filled in by ``prepare()``.
    """

    def __init__(self, directory, filename, prefix, suffix=None):
        """Parse ``filename``.

        Raises:
            ValueError: if the file name does not follow the expected layout
                (this includes an unparseable version string).
        """
        self.directory = directory
        self.filename = filename

        # Set later by the runner / prepare(); initialised here so that
        # querying them early does not raise AttributeError.
        self.cpus = None
        self.tmpdir = None
        self.commit_time = None
        self.binaries = {}

        # remove prefix and suffix from filename to get version and config
        if not filename.startswith(prefix):
            raise ValueError(
                f"Filename {filename} does not start with prefix {prefix}"
            )
        if suffix is not None and not filename.endswith(suffix):
            raise ValueError(f"Filename {filename} does not end with suffix {suffix}")
        cfgver = filename[len(prefix) :]
        if suffix:
            cfgver = cfgver[: -len(suffix)]

        # everything before `-Linux-` is the version, everything after
        # `-{arch}-` is the config
        parts = cfgver.split(f"-Linux-{platform.machine()}")
        if len(parts) != 2:
            raise ValueError(
                f"Filename {filename} does not contain '-Linux-{platform.machine()}'"
            )
        verhash, config_part = parts
        if len(config_part) == 0:
            self.full_config = None
            self.tags = set()
        else:
            if not config_part.startswith("-"):
                raise ValueError(f"Config {config_part} does not start with '-'")
            self.full_config = config_part.lstrip("-")
            self.tags = set(self.full_config.split("-"))

        # the verhash contains the version, optionally followed by the number
        # of commits and the commit hash
        parts = verhash.split("-")
        if len(parts) == 1:
            self.version = Version(parts[0])
            self.commit = None
            self.is_release = True
        else:
            if len(parts) != 3:
                raise ValueError(f"Cannot parse version from {verhash}")
            self.version = Version(parts[0])
            if not parts[2].startswith("g"):
                raise ValueError(f"Commit hash {parts[2]} does not start with 'g'")
            self.commit = parts[2][1:]
            self.is_release = False

    def __repr__(self):
        return (
            f"{self.__class__.__name__}(directory={self.directory}, "
            f"filename={self.filename}, config={self.full_config}, "
            f"version={self.version}, commit={self.commit}, tags={self.tags})"
        )

    def config_type(self):
        """Return a short name for the kind of artifact (set by subclasses)."""
        raise NotImplementedError

    def prepare(self):
        """Unpack the artifact into ``tmpdir`` and fill in ``binaries``."""
        raise NotImplementedError

    def has_binary(self, binary):
        """Check if the configuration has a specific binary."""
        return binary in self.binaries

    def at_least_version(self, version):
        """Check if the configuration is at least a specific version.

        ``version`` may be a ``Version`` or a version string.
        """
        if not isinstance(version, Version):
            version = Version(version)
        return self.version >= version

    def set_cpus(self, cpus):
        """Set the CPUs to use for the benchmark."""
        self.cpus = cpus

    def set_tmpdir(self, tmpdir):
        """Set the temporary directory for the benchmark."""
        self.tmpdir = tmpdir

    def hyperfine(self, command, benchmark_name, **kwargs):
        """Run a command using hyperfine.

        Recognised ``kwargs`` are ``warmup`` (default 2) and ``min_runs``;
        any others are ignored.  The command is prefixed with
        ``taskset --cpu-list`` when ``cpus`` is set.

        Returns:
            The parsed content of hyperfine's JSON export.

        Raises:
            subprocess.CalledProcessError: if the command line exits non-zero.
        """
        cmd = []
        if self.cpus:
            cmd.extend(["taskset", "--cpu-list", self.cpus])
        cmd.append("hyperfine")
        # subprocess requires string arguments, so convert in case the
        # caller passed warmup as an int.
        cmd.extend(["--warmup", str(kwargs.get("warmup", "2"))])
        min_runs = kwargs.get("min_runs")
        if min_runs is not None:
            cmd.extend(["--min-runs", str(min_runs)])
        output = os.path.join(self.tmpdir, "__hyperfine.json")
        cmd.extend(["--export-json", output])
        cmd.extend(["--command-name", benchmark_name])
        cmd.append(command)
        logging.debug(f"Running command: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)
        # parse the JSON output and remove the JSON file
        with open(output, "r") as f:
            data = json.load(f)
        os.remove(output)
        return data

    def binary(self, name):
        """Get the path to a binary.

        Raises:
            ValueError: if this configuration does not provide ``name``.
        """
        path = self.binaries.get(name)
        if path is None:
            raise ValueError(
                f"Binary {name} not found in {self.__class__.__name__}({self.filename})"
            )
        return path

    def _install_symlinked_binary(self, tool_names):
        """Copy the artifact into ``tmpdir`` and symlink each tool name to it."""
        binary_path = os.path.join(self.directory, self.filename)
        logging.info(f"Copying {binary_path} to {self.tmpdir}")
        shutil.copy2(binary_path, self.tmpdir)
        target = os.path.join(self.tmpdir, self.filename)
        self.binaries = {name: os.path.join(self.tmpdir, name) for name in tool_names}
        for link in self.binaries.values():
            os.symlink(target, link)


class StandaloneConfig(Config):
    """A ``dwarfs-*.tar.zst`` tarball containing the individual tools."""

    def __init__(self, directory, tarball):
        super().__init__(directory, tarball, "dwarfs-", ".tar.zst")

    def config_type(self):
        return "standalone"

    def prepare(self):
        """Extract the tarball into ``tmpdir`` and locate the four tools.

        Raises:
            FileNotFoundError: if an expected tool is missing after extraction.
        """
        # Extract the tarball into the temporary directory
        tarball_path = os.path.join(self.directory, self.filename)
        logging.info(f"Extracting {tarball_path} to {self.tmpdir}")
        subprocess.run(
            ["tar", "-xf", tarball_path, "-C", self.tmpdir, "--strip-components=1"],
            check=True,
        )
        self.binaries = {
            "dwarfs": os.path.join(self.tmpdir, "sbin", "dwarfs"),
            "mkdwarfs": os.path.join(self.tmpdir, "bin", "mkdwarfs"),
            "dwarfsck": os.path.join(self.tmpdir, "bin", "dwarfsck"),
            "dwarfsextract": os.path.join(self.tmpdir, "bin", "dwarfsextract"),
        }

        # Ensure all binaries exist
        for binary in self.binaries.values():
            if not os.path.exists(binary):
                raise FileNotFoundError(f"Binary {binary} does not exist")


class UniversalConfig(Config):
    """A single ``dwarfs-universal-*`` binary that provides all four tools."""

    def __init__(self, directory, binary):
        super().__init__(directory, binary, "dwarfs-universal-")

    def config_type(self):
        return "universal"

    def prepare(self):
        # Copy the universal binary to the temporary directory and symlink
        # the tool names to it.
        self._install_symlinked_binary(
            ["dwarfs", "mkdwarfs", "dwarfsck", "dwarfsextract"]
        )


class FuseExtractConfig(Config):
    """A single ``dwarfs-fuse-extract-*`` binary: dwarfs and dwarfsextract."""

    def __init__(self, directory, binary):
        super().__init__(directory, binary, "dwarfs-fuse-extract-")

    def config_type(self):
        return "fuse-extract"

    def prepare(self):
        # Copy the fuse-extract binary to the temporary directory and symlink
        # the tool names to it.
        self._install_symlinked_binary(["dwarfs", "dwarfsextract"])


def find_configurations(input_dir):
    """Find all benchmarkable artifacts for this architecture in ``input_dir``.

    Artifacts whose file name contains ``-debug``, ``-reldbg`` or
    ``-stacktrace`` are ignored.  Results are sorted by file name within each
    artifact kind so the run order is reproducible.

    Returns:
        A list of ``StandaloneConfig``, ``UniversalConfig`` and
        ``FuseExtractConfig`` objects, in that order.
    """
    excluded = ("-debug", "-reldbg", "-stacktrace")
    machine = platform.machine()
    # Escape the directory so glob metacharacters in it are taken literally.
    base = glob.escape(input_dir)

    def matching(pattern):
        names = (os.path.basename(path) for path in glob.glob(os.path.join(base, pattern)))
        # Filter on the file name only; testing the whole path would drop
        # everything when the directory itself contains e.g. "-debug".
        return sorted(
            name for name in names if not any(marker in name for marker in excluded)
        )

    configs = []

    # Find all tarballs matching `dwarfs-*Linux*.tar.zst`
    configs.extend(
        StandaloneConfig(input_dir, tarball)
        for tarball in matching(f"dwarfs-*Linux-{machine}*.tar.zst")
    )

    # Find all universal binaries matching `dwarfs-universal-*Linux*`
    configs.extend(
        UniversalConfig(input_dir, binary)
        for binary in matching(f"dwarfs-universal-*Linux-{machine}*")
    )

    # Find all fuse-extract binaries matching `dwarfs-fuse-extract-*Linux*`
    configs.extend(
        FuseExtractConfig(input_dir, binary)
        for binary in matching(f"dwarfs-fuse-extract-*Linux-{machine}*")
    )

    return configs


def _skip_reason(benchmark_func, config):
    """Return why ``benchmark_func`` cannot run on ``config``, or ``None``."""
    required_version = getattr(benchmark_func, "required_version", None)
    if required_version is not None and not config.at_least_version(required_version):
        return f"version requirement {required_version}"

    required_binary = getattr(benchmark_func, "required_binary", None)
    if required_binary is not None and not config.has_binary(required_binary):
        return f"missing {required_binary}"

    required_tag = getattr(benchmark_func, "required_tag", None)
    if required_tag is not None and required_tag not in config.tags:
        return f"missing tag {required_tag}"

    excluded_tag = getattr(benchmark_func, "excluded_tag", None)
    if excluded_tag is not None and excluded_tag in config.tags:
        return f"excluded tag {excluded_tag}"

    return None


def _build_parser():
    """Build the command line parser."""
    parser = argparse.ArgumentParser(description="Dwarfs Benchmark Runner Script")
    parser.add_argument(
        "--input-dir",
        help="Directory containing tarballs and additional binaries.",
    )
    parser.add_argument(
        "--data-dir",
        default=os.path.join(os.path.dirname(__file__), "data"),
        help="Directory containing data files for benchmarks.",
    )
    parser.add_argument(
        "--tmp-dir",
        default=os.environ.get("XDG_RUNTIME_DIR"),
        help="Temporary directory for benchmarks. Defaults to XDG_RUNTIME_DIR.",
    )
    parser.add_argument(
        "--output-dir", help="Directory to store benchmark JSON samples."
    )
    parser.add_argument(
        "--cpus",
        help="CPUs to run benchmarks on (e.g., '0-3'). Passed to taskset if provided.",
    )
    parser.add_argument(
        "--commit-time",
        default="now",
        help="Commit time for the benchmark. Defaults to 'now'.",
    )
    parser.add_argument(
        "--tag",
        action="append",
        default=[],
        help="Additional tag in KEY=VALUE format (can be used multiple times).",
    )
    parser.add_argument(
        "--log-level",
        default="INFO",
        help="Set the logging level (e.g., DEBUG, INFO, WARNING).",
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="List all available benchmarks and exit.",
    )
    parser.add_argument(
        "--only",
        action="append",
        default=[],
        help="Run only the specified benchmarks (can be used multiple times).",
    )
    parser.add_argument(
        "--config",
        action="append",
        default=[],
        help="Run only the specified configurations (can be used multiple times).",
    )
    return parser


def main():
    """Parse the command line and run the selected benchmarks."""
    # Per-host defaults, applied only to options not given on the command line.
    defaults = {
        "gandalf": {
            "cpus": "0-15",
        },
        "tangerinepi5b": {
            "cpus": "4-7",
        },
        "orangepi": {
            "cpus": "4-7",
        },
    }

    parser = _build_parser()
    args = parser.parse_args()

    # Set up logging with colored output (plain logging if coloredlogs is
    # not installed)
    log_format = "%(asctime)s %(message)s"
    date_format = "%Y-%m-%d %H:%M:%S"
    if coloredlogs is not None:
        coloredlogs.install(
            level=args.log_level,
            fmt=log_format,
            datefmt=date_format,
            isatty=True,
        )
    else:
        logging.basicConfig(
            level=args.log_level.upper(), format=log_format, datefmt=date_format
        )

    if args.list:
        print("Available benchmarks:")
        for benchmark_func in benchmark_registry:
            print(f"  {benchmark_func.__name__}")
        sys.exit(0)

    if args.input_dir is None:
        parser.error("The --input-dir argument is required.")

    if args.output_dir is None:
        parser.error("The --output-dir argument is required.")

    if args.commit_time == "now":
        commit_time = datetime.datetime.now()
    else:
        try:
            commit_time = datetime.datetime.fromtimestamp(int(args.commit_time))
        except (ValueError, OverflowError, OSError):
            parser.error(
                f"Invalid --commit-time {args.commit_time!r}: "
                "expected 'now' or an integer Unix timestamp."
            )

    nodedef = defaults.get(platform.node())
    if nodedef is not None:
        logging.info(f"Using defaults for {platform.node()}: {nodedef}")
        for key, value in nodedef.items():
            if getattr(args, key) is None:
                setattr(args, key, value)

    configs = find_configurations(args.input_dir)
    if not configs:
        logging.warning(f"No configurations found in {args.input_dir}")

    benchmarks = set(args.only)
    known = {func.__name__ for func in benchmark_registry}
    for name in sorted(benchmarks - known):
        logging.warning(f"Unknown benchmark requested via --only: {name}")

    # NOTE: --tag is accepted but its values are not used yet.
    os.makedirs(args.output_dir, exist_ok=True)

    for config in configs:
        if args.config and config.full_config not in args.config:
            logging.debug(f"Skipping {config.filename} (not selected)")
            continue

        logging.info(f"Processing: {config}")
        config.set_cpus(args.cpus)
        config.commit_time = commit_time

        with tempfile.TemporaryDirectory(dir=args.tmp_dir) as temp_root:
            logging.debug(f"Using temporary directory: {temp_root}")
            config.set_tmpdir(temp_root)
            config.prepare()

            for benchmark_func in benchmark_registry:
                if benchmarks and benchmark_func.__name__ not in benchmarks:
                    logging.debug(f"Skipping {benchmark_func.__name__} (not selected)")
                    continue

                # Check version, binary and tag requirements
                reason = _skip_reason(benchmark_func, config)
                if reason is not None:
                    logging.info(
                        f"Skipping {benchmark_func.__name__} for {config.filename} due to {reason}."
                    )
                    continue

                # Call the benchmark function
                benchmark_func(
                    BenchmarkEnvironment(
                        config, args.data_dir, args.output_dir, benchmark_func.__name__
                    )
                )


if __name__ == "__main__":
    main()