diff --git a/test/lz_synthetic_generator.h b/test/lz_synthetic_generator.h
new file mode 100644
index 00000000..2dd1aad1
--- /dev/null
+++ b/test/lz_synthetic_generator.h
@@ -0,0 +1,223 @@
+/* vim:set ts=2 sw=2 sts=2 et: */
+/**
+ * \author Marcus Holland-Moritz (github@mhxnet.de)
+ * \copyright Copyright (c) Marcus Holland-Moritz
+ *
+ * This file is part of dwarfs.
+ *
+ * dwarfs is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * dwarfs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with dwarfs.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * SPDX-License-Identifier: GPL-3.0-only
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <random>
+#include <string>
+#include <string_view>
+#include <vector>
+
+namespace dwarfs::test {
+
+struct lz_params {
+  // Probability of choosing a "copy from the past" step vs. emitting a literal
+  double copy_prob = 0.70;
+
+  // Max distance for backreferences (typical LZ77 windows are 32–64 KiB)
+  std::size_t window = 1u << 15; // 32 KiB
+
+  // Copy lengths ~ truncated geometric around this mean (controls
+  // repetitiveness)
+  std::size_t min_match = 4;
+  std::size_t max_match = 128;
+  double target_match_mean = 20.0; // average copy length
+
+  // Geometric distribution for distance (smaller distances more likely)
+  double distance_mean = 128.0;
+
+  // Chance each character in a copy mutates into a random literal (adds
+  // “noise”)
+  double mutation_rate = 0.005;
+
+  // If true, literals look like English-ish text; if false, literals are 0–255
+  // bytes
+  bool text_mode = true;
+
+  // RNG seed for reproducibility
+  std::uint64_t seed = 0x1234'5678'9abc'def0ULL;
+};
+
+// Deterministic generator of LZ77-style compressible test data: the output is
+// a mix of random literals and (optionally mutated) copies of earlier output.
+class lz_synthetic_generator {
+ public:
+  explicit lz_synthetic_generator(lz_params p = {})
+      : p_{p}
+      , rng_{p.seed} {
+    if (p_.text_mode) {
+      init_text_alphabet();
+    } else {
+      init_binary_alphabet();
+    }
+
+    // geometric_distribution parameterization: mean of failures = (1-p)/p
+    // We want E[min_match + failures] ≈ target_match_mean => E[failures] ≈
+    // target - min
+    double mean_fail =
+        std::max(1.0, p_.target_match_mean - static_cast<double>(p_.min_match));
+    double p_len = 1.0 / (mean_fail + 1.0);
+    geo_len_ = std::geometric_distribution<int>(p_len);
+
+    double mean_dist_fail = std::max(1.0, p_.distance_mean);
+    double p_dist = 1.0 / (mean_dist_fail + 1.0);
+    geo_dist_ = std::geometric_distribution<int>(p_dist);
+
+    bern_copy_ = std::bernoulli_distribution(p_.copy_prob);
+    bern_mut_ = std::bernoulli_distribution(p_.mutation_rate);
+  }
+
+  // Returns exactly `n_bytes` bytes; successive calls continue the same RNG
+  // stream, so results depend on the call history of this instance.
+  std::string generate(std::size_t n_bytes) {
+    std::string out;
+    out.reserve(n_bytes);
+
+    while (out.size() < n_bytes) {
+      bool const can_copy = out.size() >= p_.min_match;
+      if (can_copy && bern_copy_(rng_)) {
+        emit_copy(out, n_bytes);
+      } else {
+        out.push_back(sample_literal());
+      }
+    }
+    return out;
+  }
+
+ private:
+  void init_text_alphabet() {
+    // Rough English-ish frequencies via "etaoin shrdlu..." ranking.
+    // Higher rank => higher weight. We include space/newline/punct/digits.
+    static constexpr std::string_view freq_rank =
+        " etaoinshrdlucmfwypvbgkqjxz"; // space first (most frequent)
+    // Map ranks to weights (largest for rank 0).
+    std::array<double, 256> weights{};
+    for (int i = 0; i < 256; ++i) {
+      weights[i] = 1;
+    }
+
+    auto apply_rank = [&](char c, std::size_t rank_base) {
+      int r = std::max(1, static_cast<int>(freq_rank.size()) -
+                              static_cast<int>(rank_base));
+      weights[static_cast<unsigned char>(c)] += r;
+    };
+
+    for (std::size_t i = 0; i < freq_rank.size(); ++i) {
+      char c = freq_rank[i];
+      apply_rank(c, i);
+      if (c >= 'a' && c <= 'z') {
+        apply_rank(char(c - 'a' + 'A'), i + 6); // uppercase similar but rarer
+      }
+    }
+
+    // Common punctuation and digits
+    std::string const punct = ".,;:-()[]{}!?\"'";
+    for (char c : punct) {
+      weights[static_cast<unsigned char>(c)] += 8;
+    }
+    for (char c = '0'; c <= '9'; ++c) {
+      weights[static_cast<unsigned char>(c)] += 4;
+    }
+
+    // Newlines and tabs, for “document” feel
+    weights['\n'] += 6;
+    weights['\t'] += 2;
+
+    // Build alphabet and weight vector for std::discrete_distribution
+    for (int i = 0; i < 256; ++i) {
+      if (weights[i] > 0) {
+        text_alphabet_.push_back(static_cast<unsigned char>(i));
+        text_weights_.push_back(weights[i]);
+      }
+    }
+    text_dist_ = std::discrete_distribution<int>(text_weights_.begin(),
+                                                 text_weights_.end());
+  }
+
+  void init_binary_alphabet() {
+    binary_dist_ = std::uniform_int_distribution<int>(0, 255);
+  }
+
+  char sample_literal() {
+    if (p_.text_mode) {
+      int idx = text_dist_(rng_);
+      return static_cast<char>(text_alphabet_[static_cast<std::size_t>(idx)]);
+    }
+    return static_cast<char>(binary_dist_(rng_));
+  }
+
+  void emit_copy(std::string& out, std::size_t n_bytes) {
+    // Distance: 1 + geometric, truncated to current size and window
+    std::size_t max_dist = std::min<std::size_t>(p_.window, out.size());
+    if (max_dist == 0) {
+      out.push_back(sample_literal());
+      return;
+    }
+
+    std::size_t dist = 1u + static_cast<std::size_t>(geo_dist_(rng_));
+    if (dist > max_dist)
+      dist = 1u + (dist % max_dist); // ensure in-range
+
+    // Length: min_match + geometric, truncated by end and max_match
+    std::size_t max_len =
+        std::min<std::size_t>(p_.max_match, n_bytes - out.size());
+    if (max_len < p_.min_match) {
+      out.push_back(sample_literal());
+      return;
+    }
+
+    std::size_t len = p_.min_match + static_cast<std::size_t>(geo_len_(rng_));
+    if (len > max_len)
+      len = max_len;
+
+    // Reads may overlap bytes appended by this loop (dist < len), as in LZ77.
+    std::size_t start = out.size() - dist;
+    for (std::size_t i = 0; i < len && out.size() < n_bytes; ++i) {
+      unsigned char c = static_cast<unsigned char>(out[start + i]);
+      if (bern_mut_(rng_)) {
+        c = static_cast<unsigned char>(sample_literal());
+      }
+      out.push_back(static_cast<char>(c));
+    }
+  }
+
+  lz_params p_;
+  std::mt19937_64 rng_;
+
+  std::vector<unsigned char> text_alphabet_;
+  std::vector<double> text_weights_;
+  std::discrete_distribution<int> text_dist_;
+
+  std::uniform_int_distribution<int> binary_dist_{0, 255};
+
+  std::bernoulli_distribution bern_copy_;
+  std::bernoulli_distribution bern_mut_;
+  std::geometric_distribution<int> geo_len_;
+  std::geometric_distribution<int> geo_dist_;
+};
+
+} // namespace dwarfs::test
diff --git a/test/test_helpers.cpp b/test/test_helpers.cpp
index ad9f0c84..d9b3beba 100644
--- a/test/test_helpers.cpp
+++ b/test/test_helpers.cpp
@@ -38,6 +38,7 @@
 #include
 
 #include "loremipsum.h"
+#include "lz_synthetic_generator.h"
 #include "mmap_mock.h"
 #include "test_helpers.h"
 
@@ -271,9 +272,10 @@ void os_access_mock::add_file(fs::path const& path, size_t size, bool random) {
 
   if (random) {
     thread_local std::mt19937_64 rng{42};
-    std::uniform_int_distribution<> choice_dist{0, 3};
+    std::uniform_int_distribution<> choice_dist{0, 4};
+    auto choice = choice_dist(rng);
 
-    switch (choice_dist(rng)) {
+    switch (choice) {
     default:
       break;
 
@@ -281,6 +283,18 @@ void os_access_mock::add_file(fs::path const& path, size_t size, bool random) {
       add(path, st,
           [size, seed = rng()] { return create_random_string(size, seed); });
       return;
+
+    case 1:
+    case 2: {
+      add(path, st, [size, seed = rng(), text_mode = choice == 1] {
+        lz_params lzp{};
+        lzp.text_mode = text_mode;
+        lzp.seed = seed;
+        lz_synthetic_generator gen{lzp};
+        return gen.generate(size);
+      });
+      return;
+    }
     }
   }
 
diff --git a/test/tool_main_test.cpp b/test/tool_main_test.cpp
index 4e7305ba..6c15fe92 100644
--- a/test/tool_main_test.cpp
+++ b/test/tool_main_test.cpp
@@ -67,6 +67,7 @@
 
 #include "filter_test_data.h"
 #include "loremipsum.h"
+#include "lz_synthetic_generator.h"
 #include "mmap_mock.h"
 #include "test_helpers.h"
 #include "test_logger.h"
@@ -258,6 +259,15 @@ class mkdwarfs_tester : public tester_common {
       return test::create_random_string(size, 'A', 'Z', rng);
     };
 
+    test::lz_params text_lzp{};
+    test::lz_params binary_lzp{};
+    text_lzp.text_mode = true;
+    binary_lzp.text_mode = false;
+    text_lzp.seed = rng();
+    binary_lzp.seed = rng();
+    test::lz_synthetic_generator text_gen{text_lzp};
+    test::lz_synthetic_generator binary_gen{binary_lzp};
+
     for (int x = 0; x < opt.dimension; ++x) {
       fs::path d1{random_path_component() + std::to_string(x)};
       os->add_dir(d1);
@@ -268,13 +278,25 @@ class mkdwarfs_tester : public tester_common {
 
         for (int z = 0; z < opt.dimension; ++z) {
           fs::path f{d2 / (random_path_component() + std::to_string(z))};
-          auto size = std::min(max_size, static_cast<size_t>(size_dist(rng)));
+          auto const size =
+              std::min(max_size, static_cast<size_t>(size_dist(rng)));
           std::string data;
 
-          if (size < 1024 * 1024 && rng() % 2 == 0) {
+          // rng() % 4 yields 0..3, so the case labels must be exactly 0..3.
+          auto const choice = rng() % 4;
+          switch (choice) {
+          case 0:
             data = test::create_random_string(size, rng);
-          } else {
+            break;
+          case 1:
             data = test::loremipsum(size);
+            break;
+          case 2:
+            data = text_gen.generate(size);
+            break;
+          case 3:
+            data = binary_gen.generate(size);
+            break;
           }
 
           os->add_file(f, data);