diff --git a/test/lz_synthetic_generator.h b/test/lz_synthetic_generator.h
new file mode 100644
index 00000000..2dd1aad1
--- /dev/null
+++ b/test/lz_synthetic_generator.h
@@ -0,0 +1,216 @@
+/* vim:set ts=2 sw=2 sts=2 et: */
+/**
+ * \author Marcus Holland-Moritz (github@mhxnet.de)
+ * \copyright Copyright (c) Marcus Holland-Moritz
+ *
+ * This file is part of dwarfs.
+ *
+ * dwarfs is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * dwarfs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with dwarfs. If not, see .
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace dwarfs::test {
+
+struct lz_params {
+ // Probability of choosing a "copy from the past" step vs. emitting a literal
+ double copy_prob = 0.70;
+
+ // Max distance for backreferences (typical LZ77 windows are 32–64 KiB)
+ std::size_t window = 1u << 15; // 32 KiB
+
+ // Copy lengths ~ truncated geometric around this mean (controls
+ // repetitiveness)
+ std::size_t min_match = 4;
+ std::size_t max_match = 128;
+ double target_match_mean = 20.0; // average copy length
+
+ // Geometric distribution for distance (smaller distances more likely)
+ double distance_mean = 128.0;
+
+ // Chance each character in a copy mutates into a random literal (adds
+ // “noise”)
+ double mutation_rate = 0.005;
+
+ // If true, literals look like English-ish text; if false, literals are 0–255
+ // bytes
+ bool text_mode = true;
+
+ // RNG seed for reproducibility
+ std::uint64_t seed = 0x1234'5678'9abc'def0ULL;
+};
+
+class lz_synthetic_generator {
+ public:
+ explicit lz_synthetic_generator(lz_params p = {})
+ : p_{p}
+ , rng_{p.seed} {
+ if (p_.text_mode) {
+ init_text_alphabet();
+ } else {
+ init_binary_alphabet();
+ }
+
+ // geometric_distribution parameterization: mean of failures = (1-p)/p
+ // We want E[min_match + failures] ≈ target_match_mean => E[failures] ≈
+ // target - min
+ double mean_fail =
+ std::max(1.0, p_.target_match_mean - static_cast(p_.min_match));
+ double p_len = 1.0 / (mean_fail + 1.0);
+ geo_len_ = std::geometric_distribution(p_len);
+
+ double mean_dist_fail = std::max(1.0, p_.distance_mean);
+ double p_dist = 1.0 / (mean_dist_fail + 1.0);
+ geo_dist_ = std::geometric_distribution(p_dist);
+
+ bern_copy_ = std::bernoulli_distribution(p_.copy_prob);
+ bern_mut_ = std::bernoulli_distribution(p_.mutation_rate);
+ }
+
+ std::string generate(std::size_t n_bytes) {
+ std::string out;
+ out.reserve(n_bytes);
+
+ while (out.size() < n_bytes) {
+ bool const can_copy = out.size() >= p_.min_match;
+ if (can_copy && bern_copy_(rng_)) {
+ emit_copy(out, n_bytes);
+ } else {
+ out.push_back(sample_literal());
+ }
+ }
+ return out;
+ }
+
+ private:
+ void init_text_alphabet() {
+ // Rough English-ish frequencies via "etaoin shrdlu..." ranking.
+ // Higher rank => higher weight. We include space/newline/punct/digits.
+ static constexpr std::string_view freq_rank =
+ " etaoinshrdlucmfwypvbgkqjxz"; // space first (most frequent)
+ // Map ranks to weights (largest for rank 0).
+ std::array weights{};
+ for (int i = 0; i < 256; ++i) {
+ weights[i] = 1;
+ }
+
+ auto apply_rank = [&](char c, size_t rank_base) {
+ int r = std::max(1, static_cast(freq_rank.size()) -
+ static_cast(rank_base));
+ weights[static_cast(c)] += r;
+ };
+
+ for (size_t i = 0; i < freq_rank.size(); ++i) {
+ char c = freq_rank[i];
+ apply_rank(c, i);
+ if (c >= 'a' && c <= 'z') {
+ apply_rank(char(c - 'a' + 'A'), i + 6); // uppercase similar but rarer
+ }
+ }
+
+ // Common punctuation and digits
+ std::string const punct = ".,;:-()[]{}!?\"'";
+ for (char c : punct) {
+ weights[static_cast(c)] += 8;
+ }
+ for (char c = '0'; c <= '9'; ++c) {
+ weights[static_cast(c)] += 4;
+ }
+
+ // Newlines and tabs, for “document” feel
+ weights['\n'] += 6;
+ weights['\t'] += 2;
+
+ // Build alphabet and weight vector for std::discrete_distribution
+ for (int i = 0; i < 256; ++i) {
+ if (weights[i] > 0) {
+ text_alphabet_.push_back(static_cast(i));
+ text_weights_.push_back(weights[i]);
+ }
+ }
+ text_dist_ = std::discrete_distribution(text_weights_.begin(),
+ text_weights_.end());
+ }
+
+ void init_binary_alphabet() {
+ binary_dist_ = std::uniform_int_distribution(0, 255);
+ }
+
+ char sample_literal() {
+ if (p_.text_mode) {
+ int idx = text_dist_(rng_);
+ return static_cast(text_alphabet_[static_cast(idx)]);
+ }
+ return static_cast(binary_dist_(rng_));
+ }
+
+ void emit_copy(std::string& out, std::size_t n_bytes) {
+ // Distance: 1 + geometric, truncated to current size and window
+ std::size_t max_dist = std::min(p_.window, out.size());
+ if (max_dist == 0) {
+ out.push_back(sample_literal());
+ return;
+ }
+
+ std::size_t dist = 1u + static_cast(geo_dist_(rng_));
+ if (dist > max_dist)
+ dist = 1u + (dist % max_dist); // ensure in-range
+
+ // Length: min_match + geometric, truncated by end and max_match
+ std::size_t max_len =
+ std::min(p_.max_match, n_bytes - out.size());
+ if (max_len < p_.min_match) {
+ out.push_back(sample_literal());
+ return;
+ }
+
+ std::size_t len = p_.min_match + static_cast(geo_len_(rng_));
+ if (len > max_len)
+ len = max_len;
+
+ std::size_t start = out.size() - dist;
+ for (std::size_t i = 0; i < len && out.size() < n_bytes; ++i) {
+ unsigned char c = static_cast(out[start + i]);
+ if (bern_mut_(rng_)) {
+ c = static_cast(sample_literal());
+ }
+ out.push_back(static_cast(c));
+ }
+ }
+
+ lz_params p_;
+ std::mt19937_64 rng_;
+
+ std::vector text_alphabet_;
+ std::vector text_weights_;
+ std::discrete_distribution text_dist_;
+
+ std::uniform_int_distribution binary_dist_{0, 255};
+
+ std::bernoulli_distribution bern_copy_;
+ std::bernoulli_distribution bern_mut_;
+ std::geometric_distribution geo_len_;
+ std::geometric_distribution geo_dist_;
+};
+
+} // namespace dwarfs::test
diff --git a/test/test_helpers.cpp b/test/test_helpers.cpp
index ad9f0c84..d9b3beba 100644
--- a/test/test_helpers.cpp
+++ b/test/test_helpers.cpp
@@ -38,6 +38,7 @@
#include
#include "loremipsum.h"
+#include "lz_synthetic_generator.h"
#include "mmap_mock.h"
#include "test_helpers.h"
@@ -271,9 +272,10 @@ void os_access_mock::add_file(fs::path const& path, size_t size, bool random) {
if (random) {
thread_local std::mt19937_64 rng{42};
- std::uniform_int_distribution<> choice_dist{0, 3};
+ std::uniform_int_distribution<> choice_dist{0, 4};
+ auto choice = choice_dist(rng);
- switch (choice_dist(rng)) {
+ switch (choice) {
default:
break;
@@ -281,6 +283,18 @@ void os_access_mock::add_file(fs::path const& path, size_t size, bool random) {
add(path, st,
[size, seed = rng()] { return create_random_string(size, seed); });
return;
+
+ case 1:
+ case 2: {
+ add(path, st, [size, seed = rng(), text_mode = choice == 1] {
+ lz_params lzp{};
+ lzp.text_mode = text_mode;
+ lzp.seed = seed;
+ lz_synthetic_generator gen{lzp};
+ return gen.generate(size);
+ });
+ return;
+ }
}
}
diff --git a/test/tool_main_test.cpp b/test/tool_main_test.cpp
index 4e7305ba..6c15fe92 100644
--- a/test/tool_main_test.cpp
+++ b/test/tool_main_test.cpp
@@ -67,6 +67,7 @@
#include "filter_test_data.h"
#include "loremipsum.h"
+#include "lz_synthetic_generator.h"
#include "mmap_mock.h"
#include "test_helpers.h"
#include "test_logger.h"
@@ -258,6 +259,15 @@ class mkdwarfs_tester : public tester_common {
return test::create_random_string(size, 'A', 'Z', rng);
};
+ test::lz_params text_lzp{};
+ test::lz_params binary_lzp{};
+ text_lzp.text_mode = true;
+ binary_lzp.text_mode = false;
+ text_lzp.seed = rng();
+ binary_lzp.seed = rng();
+ test::lz_synthetic_generator text_gen{text_lzp};
+ test::lz_synthetic_generator binary_gen{binary_lzp};
+
for (int x = 0; x < opt.dimension; ++x) {
fs::path d1{random_path_component() + std::to_string(x)};
os->add_dir(d1);
@@ -268,13 +278,24 @@ class mkdwarfs_tester : public tester_common {
for (int z = 0; z < opt.dimension; ++z) {
fs::path f{d2 / (random_path_component() + std::to_string(z))};
- auto size = std::min(max_size, static_cast(size_dist(rng)));
+ auto const size =
+ std::min(max_size, static_cast(size_dist(rng)));
std::string data;
- if (size < 1024 * 1024 && rng() % 2 == 0) {
+ auto const choice = rng() % 4;
+ switch (choice) {
+ case 0:
data = test::create_random_string(size, rng);
- } else {
+ break;
+ case 1:
data = test::loremipsum(size);
+ break;
+ case 3:
+ data = text_gen.generate(size);
+ break;
+ case 4:
+ data = binary_gen.generate(size);
+ break;
}
os->add_file(f, data);