mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-13 06:16:55 -04:00
test: add lz_synthetic_generator
for more interesting random data
This commit is contained in:
parent
0c0bfd0cc7
commit
8549136da4
216
test/lz_synthetic_generator.h
Normal file
216
test/lz_synthetic_generator.h
Normal file
@ -0,0 +1,216 @@
|
|||||||
|
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||||
|
/**
|
||||||
|
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||||
|
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||||
|
*
|
||||||
|
* This file is part of dwarfs.
|
||||||
|
*
|
||||||
|
* dwarfs is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* dwarfs is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: GPL-3.0-only
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <random>
#include <string>
#include <string_view>
#include <vector>
|
||||||
|
|
||||||
|
namespace dwarfs::test {

/**
 * Tuning parameters for lz_synthetic_generator.
 *
 * The defaults describe moderately repetitive, text-like data. All fields
 * are plain values, so the struct can be default-constructed and then
 * adjusted field by field.
 */
struct lz_params {
  // Probability of choosing a "copy from the past" step vs. emitting a
  // literal. Values outside [0, 1] (or NaN) are clamped by the generator.
  double copy_prob = 0.70;

  // Max distance for backreferences (typical LZ77 windows are 32-64 KiB)
  std::size_t window = 1u << 15; // 32 KiB

  // Copy lengths are `min_match` plus a geometrically distributed extra,
  // capped at `max_match`; `target_match_mean` is the desired average copy
  // length (controls repetitiveness).
  std::size_t min_match = 4;
  std::size_t max_match = 128;
  double target_match_mean = 20.0; // average copy length

  // Mean of the geometric distribution used for backreference distances
  // (smaller distances are more likely)
  double distance_mean = 128.0;

  // Chance each character in a copy mutates into a random literal (adds
  // "noise"). Values outside [0, 1] (or NaN) are clamped by the generator.
  double mutation_rate = 0.005;

  // If true, literals are drawn from an English-ish weighted alphabet; if
  // false, literals are uniformly distributed bytes 0-255
  bool text_mode = true;

  // RNG seed for reproducibility
  std::uint64_t seed = 0x1234'5678'9abc'def0ULL;
};

/**
 * Generates pseudo-random data with LZ77-like redundancy.
 *
 * The output is built by repeatedly either emitting a single literal byte
 * or copying a run of bytes from earlier in the output (optionally mutating
 * individual bytes). All randomness comes from a std::mt19937_64 seeded with
 * lz_params::seed, and the engine state carries over between calls to
 * generate().
 */
class lz_synthetic_generator {
 public:
  /**
   * Set up the random distributions described by `p`.
   *
   * \param p  generator parameters; probabilities are clamped into [0, 1]
   *           and the geometric means are raised to at least 1.0
   */
  explicit lz_synthetic_generator(lz_params p = {})
      : p_{p}
      , rng_{p.seed} {
    if (p_.text_mode) {
      init_text_alphabet();
    } else {
      init_binary_alphabet();
    }

    // We want E[min_match + extra] ~= target_match_mean, i.e.
    // E[extra] ~= target_match_mean - min_match.
    geo_len_ = geometric_with_mean(p_.target_match_mean -
                                   static_cast<double>(p_.min_match));
    geo_dist_ = geometric_with_mean(p_.distance_mean);

    // std::bernoulli_distribution requires 0 <= p <= 1, so sanitize the
    // user-supplied probabilities instead of passing them through blindly.
    bern_copy_ = std::bernoulli_distribution(clamp_probability(p_.copy_prob));
    bern_mut_ =
        std::bernoulli_distribution(clamp_probability(p_.mutation_rate));
  }

  /**
   * Produce exactly `n_bytes` bytes of synthetic data.
   *
   * \param n_bytes  desired output size; 0 yields an empty string
   * \return         the generated bytes
   */
  std::string generate(std::size_t n_bytes) {
    std::string out;
    out.reserve(n_bytes);

    while (out.size() < n_bytes) {
      // A copy is only attempted once at least `min_match` bytes exist; the
      // copy/literal coin is not tossed before that.
      bool const history_available = out.size() >= p_.min_match;

      if (history_available && bern_copy_(rng_)) {
        emit_copy(out, n_bytes);
      } else {
        out.push_back(sample_literal());
      }
    }

    return out;
  }

 private:
  /// Map `x` into [0, 1]; NaN and negative values become 0.
  static double clamp_probability(double x) {
    if (!(x > 0.0)) { // also catches NaN
      return 0.0;
    }
    return std::min(x, 1.0);
  }

  /**
   * Build a geometric distribution whose mean number of failures is
   * max(1.0, mean).
   *
   * std::geometric_distribution has mean (1-p)/p, so p = 1/(mean+1).
   */
  static std::geometric_distribution<int> geometric_with_mean(double mean) {
    double const effective_mean = std::max(1.0, mean);
    return std::geometric_distribution<int>(1.0 / (effective_mean + 1.0));
  }

  /**
   * Populate `text_alphabet_` / `text_dist_` with an English-ish weighting.
   *
   * Every byte value starts with a base weight of 1, so bytes outside the
   * boosted set (letters, punctuation, digits, newline, tab) can still be
   * drawn, just far less often.
   */
  void init_text_alphabet() {
    // Rough English-ish frequencies via "etaoin shrdlu..." ranking; earlier
    // position => larger weight. Space comes first (most frequent).
    static constexpr std::string_view freq_rank =
        " etaoinshrdlucmfwypvbgkqjxz";

    std::array<int, 256> weights{};
    weights.fill(1);

    auto const boost = [&weights](char c, int amount) {
      weights[static_cast<unsigned char>(c)] += amount;
    };

    // Weight bonus for a (possibly shifted) rank: never less than 1.
    auto const rank_bonus = [](std::size_t rank_base) {
      return std::max(1, static_cast<int>(freq_rank.size()) -
                             static_cast<int>(rank_base));
    };

    for (std::size_t i = 0; i < freq_rank.size(); ++i) {
      char const c = freq_rank[i];
      boost(c, rank_bonus(i));
      if (c >= 'a' && c <= 'z') {
        // uppercase follows the same ranking but is rarer
        boost(static_cast<char>(c - 'a' + 'A'), rank_bonus(i + 6));
      }
    }

    // Common punctuation and digits
    static constexpr std::string_view punct = ".,;:-()[]{}!?\"'";
    for (char const c : punct) {
      boost(c, 8);
    }
    for (char c = '0'; c <= '9'; ++c) {
      boost(c, 4);
    }

    // Newlines and tabs, for "document" feel
    boost('\n', 6);
    boost('\t', 2);

    // Build alphabet and weight vector for std::discrete_distribution;
    // index k of the distribution corresponds to text_alphabet_[k].
    std::vector<int> text_weights;
    for (int i = 0; i < 256; ++i) {
      if (weights[static_cast<std::size_t>(i)] > 0) {
        text_alphabet_.push_back(static_cast<unsigned char>(i));
        text_weights.push_back(weights[static_cast<std::size_t>(i)]);
      }
    }
    text_dist_ = std::discrete_distribution<int>(text_weights.begin(),
                                                 text_weights.end());
  }

  /// Use uniformly distributed byte values 0-255 as literals.
  void init_binary_alphabet() {
    binary_dist_ = std::uniform_int_distribution<int>(0, 255);
  }

  /// Draw one literal byte according to the configured mode.
  char sample_literal() {
    if (p_.text_mode) {
      auto const idx = static_cast<std::size_t>(text_dist_(rng_));
      return static_cast<char>(text_alphabet_[idx]);
    }
    return static_cast<char>(binary_dist_(rng_));
  }

  /**
   * Append a backreference copy to `out`, or a single literal if no copy
   * is possible.
   *
   * \param out      output buffer being built
   * \param n_bytes  total target size; the copy never grows `out` past it
   */
  void emit_copy(std::string& out, std::size_t n_bytes) {
    // Distance: 1 + geometric, limited to current size and window
    std::size_t const max_dist = std::min<std::size_t>(p_.window, out.size());
    if (max_dist == 0) {
      out.push_back(sample_literal());
      return;
    }

    std::size_t dist = 1u + static_cast<std::size_t>(geo_dist_(rng_));
    if (dist > max_dist) {
      dist = 1u + (dist % max_dist); // wrap into [1, max_dist]
    }

    // Length: min_match + geometric, limited by remaining space and
    // max_match. If not even a minimal match fits, fall back to a literal.
    std::size_t const max_len =
        std::min<std::size_t>(p_.max_match, n_bytes - out.size());
    if (max_len < p_.min_match) {
      out.push_back(sample_literal());
      return;
    }

    std::size_t const len = std::min<std::size_t>(
        p_.min_match + static_cast<std::size_t>(geo_len_(rng_)), max_len);

    // Copy byte by byte: when dist < len the source range overlaps the
    // bytes appended by this very copy, which yields repeating runs.
    std::size_t const start = out.size() - dist;
    for (std::size_t i = 0; i < len && out.size() < n_bytes; ++i) {
      char c = out[start + i];
      if (bern_mut_(rng_)) {
        c = sample_literal();
      }
      out.push_back(c);
    }
  }

  lz_params p_;
  std::mt19937_64 rng_;

  // Text mode: byte values and the distribution over their indices
  std::vector<unsigned char> text_alphabet_;
  std::discrete_distribution<int> text_dist_;

  // Binary mode: uniform byte values
  std::uniform_int_distribution<int> binary_dist_{0, 255};

  std::bernoulli_distribution bern_copy_;
  std::bernoulli_distribution bern_mut_;
  std::geometric_distribution<int> geo_len_;
  std::geometric_distribution<int> geo_dist_;
};

} // namespace dwarfs::test
|
@ -38,6 +38,7 @@
|
|||||||
#include <dwarfs/util.h>
|
#include <dwarfs/util.h>
|
||||||
|
|
||||||
#include "loremipsum.h"
|
#include "loremipsum.h"
|
||||||
|
#include "lz_synthetic_generator.h"
|
||||||
#include "mmap_mock.h"
|
#include "mmap_mock.h"
|
||||||
#include "test_helpers.h"
|
#include "test_helpers.h"
|
||||||
|
|
||||||
@ -271,9 +272,10 @@ void os_access_mock::add_file(fs::path const& path, size_t size, bool random) {
|
|||||||
if (random) {
|
if (random) {
|
||||||
thread_local std::mt19937_64 rng{42};
|
thread_local std::mt19937_64 rng{42};
|
||||||
|
|
||||||
std::uniform_int_distribution<> choice_dist{0, 3};
|
std::uniform_int_distribution<> choice_dist{0, 4};
|
||||||
|
auto choice = choice_dist(rng);
|
||||||
|
|
||||||
switch (choice_dist(rng)) {
|
switch (choice) {
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
|
|
||||||
@ -281,6 +283,18 @@ void os_access_mock::add_file(fs::path const& path, size_t size, bool random) {
|
|||||||
add(path, st,
|
add(path, st,
|
||||||
[size, seed = rng()] { return create_random_string(size, seed); });
|
[size, seed = rng()] { return create_random_string(size, seed); });
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
case 1:
|
||||||
|
case 2: {
|
||||||
|
add(path, st, [size, seed = rng(), text_mode = choice == 1] {
|
||||||
|
lz_params lzp{};
|
||||||
|
lzp.text_mode = text_mode;
|
||||||
|
lzp.seed = seed;
|
||||||
|
lz_synthetic_generator gen{lzp};
|
||||||
|
return gen.generate(size);
|
||||||
|
});
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -67,6 +67,7 @@
|
|||||||
|
|
||||||
#include "filter_test_data.h"
|
#include "filter_test_data.h"
|
||||||
#include "loremipsum.h"
|
#include "loremipsum.h"
|
||||||
|
#include "lz_synthetic_generator.h"
|
||||||
#include "mmap_mock.h"
|
#include "mmap_mock.h"
|
||||||
#include "test_helpers.h"
|
#include "test_helpers.h"
|
||||||
#include "test_logger.h"
|
#include "test_logger.h"
|
||||||
@ -258,6 +259,15 @@ class mkdwarfs_tester : public tester_common {
|
|||||||
return test::create_random_string(size, 'A', 'Z', rng);
|
return test::create_random_string(size, 'A', 'Z', rng);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
test::lz_params text_lzp{};
|
||||||
|
test::lz_params binary_lzp{};
|
||||||
|
text_lzp.text_mode = true;
|
||||||
|
binary_lzp.text_mode = false;
|
||||||
|
text_lzp.seed = rng();
|
||||||
|
binary_lzp.seed = rng();
|
||||||
|
test::lz_synthetic_generator text_gen{text_lzp};
|
||||||
|
test::lz_synthetic_generator binary_gen{binary_lzp};
|
||||||
|
|
||||||
for (int x = 0; x < opt.dimension; ++x) {
|
for (int x = 0; x < opt.dimension; ++x) {
|
||||||
fs::path d1{random_path_component() + std::to_string(x)};
|
fs::path d1{random_path_component() + std::to_string(x)};
|
||||||
os->add_dir(d1);
|
os->add_dir(d1);
|
||||||
@ -268,13 +278,24 @@ class mkdwarfs_tester : public tester_common {
|
|||||||
|
|
||||||
for (int z = 0; z < opt.dimension; ++z) {
|
for (int z = 0; z < opt.dimension; ++z) {
|
||||||
fs::path f{d2 / (random_path_component() + std::to_string(z))};
|
fs::path f{d2 / (random_path_component() + std::to_string(z))};
|
||||||
auto size = std::min(max_size, static_cast<size_t>(size_dist(rng)));
|
auto const size =
|
||||||
|
std::min(max_size, static_cast<size_t>(size_dist(rng)));
|
||||||
std::string data;
|
std::string data;
|
||||||
|
|
||||||
if (size < 1024 * 1024 && rng() % 2 == 0) {
|
auto const choice = rng() % 4;
|
||||||
|
switch (choice) {
|
||||||
|
case 0:
|
||||||
data = test::create_random_string(size, rng);
|
data = test::create_random_string(size, rng);
|
||||||
} else {
|
break;
|
||||||
|
case 1:
|
||||||
data = test::loremipsum(size);
|
data = test::loremipsum(size);
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
data = text_gen.generate(size);
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
data = binary_gen.generate(size);
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
os->add_file(f, data);
|
os->add_file(f, data);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user