/* vim:set ts=2 sw=2 sts=2 et: */ /** * \author Marcus Holland-Moritz (github@mhxnet.de) * \copyright Copyright (c) Marcus Holland-Moritz * * This file is part of dwarfs. * * dwarfs is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * dwarfs is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with dwarfs. If not, see . * * SPDX-License-Identifier: GPL-3.0-or-later */ #pragma once #include #include #include #include #include #include namespace dwarfs::test { struct lz_params { // Probability of choosing a "copy from the past" step vs. emitting a literal double copy_prob = 0.70; // Max distance for backreferences (typical LZ77 windows are 32–64 KiB) std::size_t window = 1u << 15; // 32 KiB // Copy lengths ~ truncated geometric around this mean (controls // repetitiveness) std::size_t min_match = 4; std::size_t max_match = 128; double target_match_mean = 20.0; // average copy length // Geometric distribution for distance (smaller distances more likely) double distance_mean = 128.0; // Chance each character in a copy mutates into a random literal (adds // “noise”) double mutation_rate = 0.005; // If true, literals look like English-ish text; if false, literals are 0–255 // bytes bool text_mode = true; // RNG seed for reproducibility std::uint64_t seed = 0x1234'5678'9abc'def0ULL; }; class lz_synthetic_generator { public: explicit lz_synthetic_generator(lz_params p = {}) : p_{p} , rng_{p.seed} { if (p_.text_mode) { init_text_alphabet(); } else { init_binary_alphabet(); } // geometric_distribution parameterization: mean of failures = (1-p)/p // We want E[min_match + failures] ≈ target_match_mean => E[failures] ≈ // target - min double mean_fail = std::max(1.0, p_.target_match_mean - static_cast(p_.min_match)); double p_len = 1.0 / (mean_fail + 1.0); geo_len_ = std::geometric_distribution(p_len); double mean_dist_fail = std::max(1.0, p_.distance_mean); double p_dist = 1.0 / (mean_dist_fail + 1.0); geo_dist_ = std::geometric_distribution(p_dist); bern_copy_ = std::bernoulli_distribution(p_.copy_prob); bern_mut_ = std::bernoulli_distribution(p_.mutation_rate); } std::string generate(std::size_t n_bytes) { std::string out; out.reserve(n_bytes); while (out.size() < n_bytes) { bool const can_copy = out.size() >= p_.min_match; if (can_copy && bern_copy_(rng_)) { emit_copy(out, n_bytes); } else { out.push_back(sample_literal()); } } return out; } private: void init_text_alphabet() { // Rough English-ish frequencies via "etaoin shrdlu..." ranking. // Higher rank => higher weight. We include space/newline/punct/digits. static constexpr std::string_view freq_rank = " etaoinshrdlucmfwypvbgkqjxz"; // space first (most frequent) // Map ranks to weights (largest for rank 0). std::array weights{}; for (int i = 0; i < 256; ++i) { weights[i] = 1; } auto apply_rank = [&](char c, size_t rank_base) { int r = std::max(1, static_cast(freq_rank.size()) - static_cast(rank_base)); weights[static_cast(c)] += r; }; for (size_t i = 0; i < freq_rank.size(); ++i) { char c = freq_rank[i]; apply_rank(c, i); if (c >= 'a' && c <= 'z') { apply_rank(char(c - 'a' + 'A'), i + 6); // uppercase similar but rarer } } // Common punctuation and digits std::string const punct = ".,;:-()[]{}!?\"'"; for (char c : punct) { weights[static_cast(c)] += 8; } for (char c = '0'; c <= '9'; ++c) { weights[static_cast(c)] += 4; } // Newlines and tabs, for “document” feel weights['\n'] += 6; weights['\t'] += 2; // Build alphabet and weight vector for std::discrete_distribution for (int i = 0; i < 256; ++i) { if (weights[i] > 0) { text_alphabet_.push_back(static_cast(i)); text_weights_.push_back(weights[i]); } } text_dist_ = std::discrete_distribution(text_weights_.begin(), text_weights_.end()); } void init_binary_alphabet() { binary_dist_ = std::uniform_int_distribution(0, 255); } char sample_literal() { if (p_.text_mode) { int idx = text_dist_(rng_); return static_cast(text_alphabet_[static_cast(idx)]); } return static_cast(binary_dist_(rng_)); } void emit_copy(std::string& out, std::size_t n_bytes) { // Distance: 1 + geometric, truncated to current size and window std::size_t max_dist = std::min(p_.window, out.size()); if (max_dist == 0) { out.push_back(sample_literal()); return; } std::size_t dist = 1u + static_cast(geo_dist_(rng_)); if (dist > max_dist) dist = 1u + (dist % max_dist); // ensure in-range // Length: min_match + geometric, truncated by end and max_match std::size_t max_len = std::min(p_.max_match, n_bytes - out.size()); if (max_len < p_.min_match) { out.push_back(sample_literal()); return; } std::size_t len = p_.min_match + static_cast(geo_len_(rng_)); if (len > max_len) len = max_len; std::size_t start = out.size() - dist; for (std::size_t i = 0; i < len && out.size() < n_bytes; ++i) { unsigned char c = static_cast(out[start + i]); if (bern_mut_(rng_)) { c = static_cast(sample_literal()); } out.push_back(static_cast(c)); } } lz_params p_; std::mt19937_64 rng_; std::vector text_alphabet_; std::vector text_weights_; std::discrete_distribution text_dist_; std::uniform_int_distribution binary_dist_{0, 255}; std::bernoulli_distribution bern_copy_; std::bernoulli_distribution bern_mut_; std::geometric_distribution geo_len_; std::geometric_distribution geo_dist_; }; } // namespace dwarfs::test