mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-19 01:07:00 -04:00
Nuke nilsimsa v1
This commit is contained in:
parent
7891608c82
commit
2546cc94f4
@ -98,21 +98,11 @@ struct filesystem_writer_options {
|
||||
};
|
||||
|
||||
// TODO: rename
|
||||
enum class file_order_mode {
|
||||
NONE,
|
||||
PATH,
|
||||
SCRIPT,
|
||||
SIMILARITY,
|
||||
NILSIMSA,
|
||||
NILSIMSA2
|
||||
};
|
||||
enum class file_order_mode { NONE, PATH, SCRIPT, SIMILARITY, NILSIMSA2 };
|
||||
|
||||
// TODO: rename
|
||||
struct file_order_options {
|
||||
file_order_mode mode{file_order_mode::NONE};
|
||||
int nilsimsa_depth{20000};
|
||||
int nilsimsa_min_depth{1000};
|
||||
int nilsimsa_limit{255};
|
||||
int nilsimsa2_max_children{8192};
|
||||
int nilsimsa2_max_cluster_size{8192};
|
||||
};
|
||||
|
@ -40,7 +40,6 @@ const std::map<std::string_view, file_order_mode> order_choices{
|
||||
{"script", file_order_mode::SCRIPT},
|
||||
#endif
|
||||
{"similarity", file_order_mode::SIMILARITY},
|
||||
{"nilsimsa", file_order_mode::NILSIMSA},
|
||||
{"nilsimsa2", file_order_mode::NILSIMSA2},
|
||||
};
|
||||
|
||||
@ -100,26 +99,6 @@ file_order_options fragment_order_parser::parse(std::string_view arg) const {
|
||||
auto ordname = order_opts[0];
|
||||
|
||||
switch (rv.mode) {
|
||||
case file_order_mode::NILSIMSA:
|
||||
if (order_opts.size() > 4) {
|
||||
throw std::runtime_error(fmt::format(
|
||||
"too many options for inode order mode '{}'", ordname));
|
||||
}
|
||||
|
||||
parse_order_option(ordname, order_opts[1], rv.nilsimsa_limit, "limit",
|
||||
0, 255);
|
||||
|
||||
if (order_opts.size() > 2) {
|
||||
parse_order_option(ordname, order_opts[2], rv.nilsimsa_depth, "depth",
|
||||
0);
|
||||
|
||||
if (order_opts.size() > 3) {
|
||||
parse_order_option(ordname, order_opts[3], rv.nilsimsa_min_depth,
|
||||
"min depth", 0);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case file_order_mode::NILSIMSA2:
|
||||
if (order_opts.size() > 4) {
|
||||
throw std::runtime_error(fmt::format(
|
||||
@ -163,11 +142,6 @@ fragment_order_parser::to_string(file_order_options const& opts) const {
|
||||
case file_order_mode::SIMILARITY:
|
||||
return "similarity";
|
||||
|
||||
case file_order_mode::NILSIMSA:
|
||||
return fmt::format("nilsimsa (limit={}, depth={}, min_depth={})",
|
||||
opts.nilsimsa_limit, opts.nilsimsa_depth,
|
||||
opts.nilsimsa_min_depth);
|
||||
|
||||
case file_order_mode::NILSIMSA2:
|
||||
return fmt::format("nilsimsa2 (max_children={}, max_cluster_size={})",
|
||||
opts.nilsimsa2_max_children,
|
||||
|
@ -63,41 +63,6 @@
|
||||
|
||||
namespace dwarfs {
|
||||
|
||||
#define DWARFS_FIND_SIMILAR_INODE_IMPL \
|
||||
std::pair<int_fast32_t, int_fast32_t> find_similar_inode( \
|
||||
uint64_t const* ref_hash, \
|
||||
std::vector<std::shared_ptr<inode>> const& inodes, \
|
||||
std::vector<uint32_t> const& index, int_fast32_t const limit, \
|
||||
int_fast32_t const end) { \
|
||||
int_fast32_t max_sim = 0; \
|
||||
int_fast32_t max_sim_ix = 0; \
|
||||
\
|
||||
for (int_fast32_t i = index.size() - 1; i >= end; --i) { \
|
||||
auto const* test_hash = \
|
||||
inodes[index[i]]->nilsimsa_similarity_hash().data(); \
|
||||
int sim; \
|
||||
DWARFS_NILSIMSA_SIMILARITY(sim =, ref_hash, test_hash); \
|
||||
\
|
||||
if (sim > max_sim) [[unlikely]] { \
|
||||
max_sim = sim; \
|
||||
max_sim_ix = i; \
|
||||
\
|
||||
if (max_sim >= limit) [[unlikely]] { \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
return {max_sim_ix, max_sim}; \
|
||||
} \
|
||||
static_assert(true, "")
|
||||
|
||||
#ifdef DWARFS_MULTIVERSIONING
|
||||
__attribute__((target("popcnt"))) DWARFS_FIND_SIMILAR_INODE_IMPL;
|
||||
__attribute__((target("default")))
|
||||
#endif
|
||||
DWARFS_FIND_SIMILAR_INODE_IMPL;
|
||||
|
||||
namespace {
|
||||
|
||||
class inode_ : public inode {
|
||||
@ -380,7 +345,6 @@ class inode_ : public inode {
|
||||
case file_order_mode::SIMILARITY:
|
||||
sc.try_emplace(f.category());
|
||||
break;
|
||||
case file_order_mode::NILSIMSA:
|
||||
case file_order_mode::NILSIMSA2:
|
||||
nc.try_emplace(f.category());
|
||||
break;
|
||||
@ -443,7 +407,6 @@ class inode_ : public inode {
|
||||
similarity_.emplace<uint32_t>(sc.finalize());
|
||||
} break;
|
||||
|
||||
case file_order_mode::NILSIMSA:
|
||||
case file_order_mode::NILSIMSA2: {
|
||||
nilsimsa nc;
|
||||
scan_range(mm, 0, mm->size(), nc);
|
||||
@ -580,7 +543,6 @@ class inode_manager_ final : public inode_manager::impl {
|
||||
|
||||
return opts.fragment_order.any_is([](auto const& order) {
|
||||
return order.mode == file_order_mode::SIMILARITY ||
|
||||
order.mode == file_order_mode::NILSIMSA ||
|
||||
order.mode == file_order_mode::NILSIMSA2;
|
||||
});
|
||||
}
|
||||
@ -606,7 +568,6 @@ class inode_manager_ final : public inode_manager::impl {
|
||||
void presort_index(std::vector<std::shared_ptr<inode>>& inodes,
|
||||
std::vector<uint32_t>& index);
|
||||
|
||||
void order_inodes_by_nilsimsa(inode_manager::order_cb const& fn);
|
||||
void order_inodes_by_nilsimsa2(worker_group& wg);
|
||||
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
@ -684,15 +645,6 @@ void inode_manager_<LoggerPolicy>::order_inodes(
|
||||
break;
|
||||
}
|
||||
|
||||
case file_order_mode::NILSIMSA: {
|
||||
LOG_INFO << "ordering " << count()
|
||||
<< " inodes using nilsimsa similarity...";
|
||||
auto ti = LOG_CPU_TIMED_INFO;
|
||||
order_inodes_by_nilsimsa(fn);
|
||||
ti << count() << " inodes ordered";
|
||||
return;
|
||||
}
|
||||
|
||||
case file_order_mode::NILSIMSA2: {
|
||||
LOG_INFO << "ordering " << count()
|
||||
<< " inodes using new nilsimsa similarity...";
|
||||
@ -750,122 +702,6 @@ void inode_manager_<LoggerPolicy>::presort_index(
|
||||
<< " path lookups)";
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
|
||||
inode_manager::order_cb const& fn) {
|
||||
auto count = inodes_.size();
|
||||
|
||||
if (auto fname = ::getenv("DWARFS_NILSIMSA_DUMP")) {
|
||||
std::ofstream ofs{fname};
|
||||
|
||||
for (auto const& i : inodes_) {
|
||||
auto const& h = i->nilsimsa_similarity_hash();
|
||||
if (!h.empty()) {
|
||||
ofs << fmt::format("{0:016x}{1:016x}{2:016x}{3:016x}\t{4}\t{5}\n", h[0],
|
||||
h[1], h[2], h[3], i->size(), i->any()->name());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::shared_ptr<inode>> inodes;
|
||||
inodes.swap(inodes_);
|
||||
inodes_.reserve(count);
|
||||
std::vector<uint32_t> index;
|
||||
index.resize(count);
|
||||
std::iota(index.begin(), index.end(), 0);
|
||||
|
||||
auto finalize_inode = [&]() {
|
||||
inodes_.push_back(std::move(inodes[index.back()]));
|
||||
index.pop_back();
|
||||
return fn(inodes_.back());
|
||||
};
|
||||
|
||||
{
|
||||
auto empty = std::partition(index.begin(), index.end(),
|
||||
[&](auto i) { return inodes[i]->size() > 0; });
|
||||
|
||||
if (empty != index.end()) {
|
||||
auto count = std::distance(empty, index.end());
|
||||
|
||||
LOG_DEBUG << "finalizing " << count << " empty inodes...";
|
||||
|
||||
for (auto n = count; n > 0; --n) {
|
||||
finalize_inode();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
auto unhashed = std::partition(index.begin(), index.end(), [&](auto i) {
|
||||
auto const& sh = inodes[i]->nilsimsa_similarity_hash();
|
||||
return std::any_of(sh.begin(), sh.end(), [](auto v) { return v != 0; });
|
||||
});
|
||||
|
||||
if (unhashed != index.end()) {
|
||||
auto count = std::distance(unhashed, index.end());
|
||||
|
||||
std::sort(unhashed, index.end(), [&inodes](auto a, auto b) {
|
||||
return inodes[a]->size() < inodes[b]->size();
|
||||
});
|
||||
|
||||
LOG_INFO << "finalizing " << count << " unhashed inodes...";
|
||||
|
||||
for (auto n = count; n > 0; --n) {
|
||||
finalize_inode();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!index.empty()) {
|
||||
auto const& file_order = opts_.fragment_order.get(); // TODO
|
||||
const int_fast32_t max_depth = file_order.nilsimsa_depth;
|
||||
const int_fast32_t min_depth =
|
||||
std::min<int32_t>(file_order.nilsimsa_min_depth, max_depth);
|
||||
const int_fast32_t limit = file_order.nilsimsa_limit;
|
||||
int_fast32_t depth = max_depth;
|
||||
int64_t processed = 0;
|
||||
|
||||
LOG_INFO << "nilsimsa: depth=" << depth << " (" << min_depth
|
||||
<< "), limit=" << limit;
|
||||
|
||||
presort_index(inodes, index);
|
||||
|
||||
finalize_inode();
|
||||
|
||||
while (!index.empty()) {
|
||||
auto [max_sim_ix, max_sim] = find_similar_inode(
|
||||
inodes_.back()->nilsimsa_similarity_hash().data(), inodes, index,
|
||||
limit, int(index.size()) > depth ? index.size() - depth : 0);
|
||||
|
||||
LOG_TRACE << max_sim << " @ " << max_sim_ix << "/" << index.size();
|
||||
|
||||
std::rotate(index.begin() + max_sim_ix, index.begin() + max_sim_ix + 1,
|
||||
index.end());
|
||||
|
||||
auto fill = finalize_inode();
|
||||
|
||||
if (++processed >= 4096 && processed % 32 == 0) {
|
||||
constexpr int64_t smooth = 512;
|
||||
auto target_depth = fill * max_depth / 2048;
|
||||
|
||||
depth = ((smooth - 1) * depth + target_depth) / smooth;
|
||||
|
||||
if (depth > max_depth) {
|
||||
depth = max_depth;
|
||||
} else if (depth < min_depth) {
|
||||
depth = min_depth;
|
||||
}
|
||||
}
|
||||
|
||||
prog_.nilsimsa_depth = depth;
|
||||
}
|
||||
}
|
||||
|
||||
if (count != inodes_.size()) {
|
||||
DWARFS_THROW(runtime_error, "internal error: nilsimsa ordering failed");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa2(worker_group& wg) {
|
||||
auto const& file_order = opts_.fragment_order.get(); // TODO
|
||||
|
@ -45,8 +45,8 @@ std::ostream& operator<<(std::ostream& os, file_order_mode mode) {
|
||||
case file_order_mode::SIMILARITY:
|
||||
modestr = "similarity";
|
||||
break;
|
||||
case file_order_mode::NILSIMSA:
|
||||
modestr = "nilsimsa";
|
||||
case file_order_mode::NILSIMSA2:
|
||||
modestr = "nilsimsa2";
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
@ -258,10 +258,10 @@ constexpr std::array<level_defaults, 10> levels{{
|
||||
/* 3 */ {21, ALG_DATA_3, ALG_SCHEMA, "null", 12, 1, "similarity"},
|
||||
/* 4 */ {22, ALG_DATA_4, ALG_SCHEMA, "null", 12, 2, "similarity"},
|
||||
/* 5 */ {23, ALG_DATA_5, ALG_SCHEMA, "null", 12, 2, "similarity"},
|
||||
/* 6 */ {24, ALG_DATA_6, ALG_SCHEMA, "null", 12, 3, "nilsimsa"},
|
||||
/* 7 */ {24, ALG_DATA_7, ALG_SCHEMA, ALG_METADATA_7, 12, 3, "nilsimsa"},
|
||||
/* 8 */ {24, ALG_DATA_8, ALG_SCHEMA, ALG_METADATA_9, 12, 4, "nilsimsa"},
|
||||
/* 9 */ {26, ALG_DATA_9, ALG_SCHEMA, ALG_METADATA_9, 12, 4, "nilsimsa"},
|
||||
/* 6 */ {24, ALG_DATA_6, ALG_SCHEMA, "null", 12, 3, "nilsimsa2"},
|
||||
/* 7 */ {24, ALG_DATA_7, ALG_SCHEMA, ALG_METADATA_7, 12, 3, "nilsimsa2"},
|
||||
/* 8 */ {24, ALG_DATA_8, ALG_SCHEMA, ALG_METADATA_9, 12, 4, "nilsimsa2"},
|
||||
/* 9 */ {26, ALG_DATA_9, ALG_SCHEMA, ALG_METADATA_9, 12, 4, "nilsimsa2"},
|
||||
// clang-format on
|
||||
}};
|
||||
|
||||
|
@ -157,7 +157,6 @@ void basic_end_to_end_test(std::string const& compressor,
|
||||
auto mm = std::make_shared<test::mmap_mock>(std::move(fsimage));
|
||||
|
||||
bool similarity = file_order == file_order_mode::SIMILARITY ||
|
||||
file_order == file_order_mode::NILSIMSA ||
|
||||
file_order == file_order_mode::NILSIMSA2;
|
||||
|
||||
size_t const num_fail_empty = access_fail ? 1 : 0;
|
||||
@ -599,8 +598,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||
::testing::Combine(
|
||||
::testing::ValuesIn(compressions), ::testing::Values(12, 15, 20, 28),
|
||||
::testing::Values(file_order_mode::NONE, file_order_mode::PATH,
|
||||
file_order_mode::SCRIPT, file_order_mode::NILSIMSA,
|
||||
file_order_mode::NILSIMSA2,
|
||||
file_order_mode::SCRIPT, file_order_mode::NILSIMSA2,
|
||||
file_order_mode::SIMILARITY),
|
||||
::testing::Values(std::nullopt, "xxh3-128")));
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user