Nuke nilsimsa v1

This commit is contained in:
Marcus Holland-Moritz 2023-08-12 20:33:54 +02:00
parent 7891608c82
commit 2546cc94f4
6 changed files with 8 additions and 210 deletions

View File

@ -98,21 +98,11 @@ struct filesystem_writer_options {
};
// TODO: rename
// Ordering strategy stored in file_order_options::mode. The visible part of
// the option table maps the strings "script", "similarity", "nilsimsa" and
// "nilsimsa2" to the enumerators of the same name.
// Multi-line spelling of the enum, which still lists NILSIMSA:
enum class file_order_mode {
NONE,
PATH,
SCRIPT,
SIMILARITY,
NILSIMSA,
NILSIMSA2
};
// Single-line spelling of the enum, without the NILSIMSA enumerator:
enum class file_order_mode { NONE, PATH, SCRIPT, SIMILARITY, NILSIMSA2 };
// TODO: rename
// Bundles the selected ordering mode with the tuning parameters of the
// nilsimsa-based modes.
struct file_order_options {
// Selected ordering strategy; defaults to file_order_mode::NONE.
file_order_mode mode{file_order_mode::NONE};
// "depth" option of the nilsimsa mode. order_inodes_by_nilsimsa() uses it as
// the upper bound for how many trailing index entries are searched per step.
int nilsimsa_depth{20000};
// "min depth" option of the nilsimsa mode. order_inodes_by_nilsimsa() uses it
// (capped at nilsimsa_depth) as the lower bound of the adaptive search depth.
int nilsimsa_min_depth{1000};
// "limit" option of the nilsimsa mode; the parser passes 0 and 255 alongside
// it (presumably the accepted range -- confirm in parse_order_option). The
// similarity search stops early once a candidate reaches this value.
int nilsimsa_limit{255};
// Reported as "max_children" when the nilsimsa2 mode is printed.
int nilsimsa2_max_children{8192};
// Reported as "max_cluster_size" when the nilsimsa2 mode is printed.
int nilsimsa2_max_cluster_size{8192};
};

View File

@ -40,7 +40,6 @@ const std::map<std::string_view, file_order_mode> order_choices{
{"script", file_order_mode::SCRIPT},
#endif
{"similarity", file_order_mode::SIMILARITY},
{"nilsimsa", file_order_mode::NILSIMSA},
{"nilsimsa2", file_order_mode::NILSIMSA2},
};
@ -100,26 +99,6 @@ file_order_options fragment_order_parser::parse(std::string_view arg) const {
auto ordname = order_opts[0];
switch (rv.mode) {
case file_order_mode::NILSIMSA:
if (order_opts.size() > 4) {
throw std::runtime_error(fmt::format(
"too many options for inode order mode '{}'", ordname));
}
parse_order_option(ordname, order_opts[1], rv.nilsimsa_limit, "limit",
0, 255);
if (order_opts.size() > 2) {
parse_order_option(ordname, order_opts[2], rv.nilsimsa_depth, "depth",
0);
if (order_opts.size() > 3) {
parse_order_option(ordname, order_opts[3], rv.nilsimsa_min_depth,
"min depth", 0);
}
}
break;
case file_order_mode::NILSIMSA2:
if (order_opts.size() > 4) {
throw std::runtime_error(fmt::format(
@ -163,11 +142,6 @@ fragment_order_parser::to_string(file_order_options const& opts) const {
case file_order_mode::SIMILARITY:
return "similarity";
case file_order_mode::NILSIMSA:
return fmt::format("nilsimsa (limit={}, depth={}, min_depth={})",
opts.nilsimsa_limit, opts.nilsimsa_depth,
opts.nilsimsa_min_depth);
case file_order_mode::NILSIMSA2:
return fmt::format("nilsimsa2 (max_children={}, max_cluster_size={})",
opts.nilsimsa2_max_children,

View File

@ -63,41 +63,6 @@
namespace dwarfs {
// Expands to the definition of
//
//   find_similar_inode(ref_hash, inodes, index, limit, end)
//
// which walks `index` backwards, from its last entry down to position `end`
// (inclusive), and compares `ref_hash` against the nilsimsa similarity hash
// of each referenced inode using DWARFS_NILSIMSA_SIMILARITY (defined
// elsewhere; presumably it assigns the similarity score to `sim`).
//
// Returns {position in `index`, similarity} of the best candidate seen. Only
// a strictly greater similarity replaces the current best, so among equal
// scores the entry closest to the back of `index` wins. The scan stops early
// as soon as the best similarity reaches `limit`. If no candidate scores
// above 0 (including when the scanned range is empty), the result is {0, 0}.
//
// The body is kept in a macro so that it can be emitted more than once with
// different target attributes (see below). The trailing static_assert makes
// the expansion end in a declaration, so each use can be followed by `;`.
#define DWARFS_FIND_SIMILAR_INODE_IMPL \
std::pair<int_fast32_t, int_fast32_t> find_similar_inode( \
uint64_t const* ref_hash, \
std::vector<std::shared_ptr<inode>> const& inodes, \
std::vector<uint32_t> const& index, int_fast32_t const limit, \
int_fast32_t const end) { \
int_fast32_t max_sim = 0; \
int_fast32_t max_sim_ix = 0; \
\
for (int_fast32_t i = index.size() - 1; i >= end; --i) { \
auto const* test_hash = \
inodes[index[i]]->nilsimsa_similarity_hash().data(); \
int sim; \
DWARFS_NILSIMSA_SIMILARITY(sim =, ref_hash, test_hash); \
\
if (sim > max_sim) [[unlikely]] { \
max_sim = sim; \
max_sim_ix = i; \
\
if (max_sim >= limit) [[unlikely]] { \
break; \
} \
} \
} \
\
return {max_sim_ix, max_sim}; \
} \
static_assert(true, "")
// With DWARFS_MULTIVERSIONING defined, the function is emitted twice: once
// with target("popcnt") and once with target("default"). Without it, the
// single plain definition on the last line is used.
#ifdef DWARFS_MULTIVERSIONING
__attribute__((target("popcnt"))) DWARFS_FIND_SIMILAR_INODE_IMPL;
__attribute__((target("default")))
#endif
DWARFS_FIND_SIMILAR_INODE_IMPL;
namespace {
class inode_ : public inode {
@ -380,7 +345,6 @@ class inode_ : public inode {
case file_order_mode::SIMILARITY:
sc.try_emplace(f.category());
break;
case file_order_mode::NILSIMSA:
case file_order_mode::NILSIMSA2:
nc.try_emplace(f.category());
break;
@ -443,7 +407,6 @@ class inode_ : public inode {
similarity_.emplace<uint32_t>(sc.finalize());
} break;
case file_order_mode::NILSIMSA:
case file_order_mode::NILSIMSA2: {
nilsimsa nc;
scan_range(mm, 0, mm->size(), nc);
@ -580,7 +543,6 @@ class inode_manager_ final : public inode_manager::impl {
return opts.fragment_order.any_is([](auto const& order) {
return order.mode == file_order_mode::SIMILARITY ||
order.mode == file_order_mode::NILSIMSA ||
order.mode == file_order_mode::NILSIMSA2;
});
}
@ -606,7 +568,6 @@ class inode_manager_ final : public inode_manager::impl {
void presort_index(std::vector<std::shared_ptr<inode>>& inodes,
std::vector<uint32_t>& index);
void order_inodes_by_nilsimsa(inode_manager::order_cb const& fn);
void order_inodes_by_nilsimsa2(worker_group& wg);
LOG_PROXY_DECL(LoggerPolicy);
@ -684,15 +645,6 @@ void inode_manager_<LoggerPolicy>::order_inodes(
break;
}
case file_order_mode::NILSIMSA: {
LOG_INFO << "ordering " << count()
<< " inodes using nilsimsa similarity...";
auto ti = LOG_CPU_TIMED_INFO;
order_inodes_by_nilsimsa(fn);
ti << count() << " inodes ordered";
return;
}
case file_order_mode::NILSIMSA2: {
LOG_INFO << "ordering " << count()
<< " inodes using new nilsimsa similarity...";
@ -750,122 +702,6 @@ void inode_manager_<LoggerPolicy>::presort_index(
<< " path lookups)";
}
// Reorders `inodes_` by nilsimsa similarity, calling `fn` on every inode at
// the moment it is placed in its final position.
//
// All inodes are moved out of `inodes_` and appended back one at a time:
//   1. inodes whose size() is 0;
//   2. inodes whose nilsimsa similarity hash contains only zeros ("unhashed"),
//      largest first;
//   3. all remaining inodes, chained greedily: each step appends the
//      candidate that find_similar_inode() reports as most similar to the
//      inode appended last.
//
// If the environment variable DWARFS_NILSIMSA_DUMP is set, the hash, size and
// name of every inode with a non-empty hash are written to the file it names
// before any reordering takes place.
//
// Raises runtime_error through DWARFS_THROW if the number of inodes after
// ordering differs from the number before.
template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa(
inode_manager::order_cb const& fn) {
auto count = inodes_.size();
// Optional debugging dump: one tab-separated line per hashed inode.
if (auto fname = ::getenv("DWARFS_NILSIMSA_DUMP")) {
std::ofstream ofs{fname};
for (auto const& i : inodes_) {
auto const& h = i->nilsimsa_similarity_hash();
if (!h.empty()) {
ofs << fmt::format("{0:016x}{1:016x}{2:016x}{3:016x}\t{4}\t{5}\n", h[0],
h[1], h[2], h[3], i->size(), i->any()->name());
}
}
}
// Work on a local copy of the inode list; `inodes_` is rebuilt in the new
// order. `index` holds the positions (into `inodes`) still to be placed.
std::vector<std::shared_ptr<inode>> inodes;
inodes.swap(inodes_);
inodes_.reserve(count);
std::vector<uint32_t> index;
index.resize(count);
std::iota(index.begin(), index.end(), 0);
// Moves the inode referenced by the LAST entry of `index` to the end of
// `inodes_`, drops that entry and forwards the callback's return value.
auto finalize_inode = [&]() {
inodes_.push_back(std::move(inodes[index.back()]));
index.pop_back();
return fn(inodes_.back());
};
{
// Move empty inodes to the back of `index` and place them first.
auto empty = std::partition(index.begin(), index.end(),
[&](auto i) { return inodes[i]->size() > 0; });
if (empty != index.end()) {
auto count = std::distance(empty, index.end());
LOG_DEBUG << "finalizing " << count << " empty inodes...";
for (auto n = count; n > 0; --n) {
finalize_inode();
}
}
}
{
// Next, move inodes with an all-zero hash to the back of `index`.
auto unhashed = std::partition(index.begin(), index.end(), [&](auto i) {
auto const& sh = inodes[i]->nilsimsa_similarity_hash();
return std::any_of(sh.begin(), sh.end(), [](auto v) { return v != 0; });
});
if (unhashed != index.end()) {
auto count = std::distance(unhashed, index.end());
// Ascending sort by size; since finalize_inode() takes entries from the
// back, the largest unhashed inode is placed first.
std::sort(unhashed, index.end(), [&inodes](auto a, auto b) {
return inodes[a]->size() < inodes[b]->size();
});
LOG_INFO << "finalizing " << count << " unhashed inodes...";
for (auto n = count; n > 0; --n) {
finalize_inode();
}
}
}
if (!index.empty()) {
auto const& file_order = opts_.fragment_order.get(); // TODO
const int_fast32_t max_depth = file_order.nilsimsa_depth;
const int_fast32_t min_depth =
std::min<int32_t>(file_order.nilsimsa_min_depth, max_depth);
const int_fast32_t limit = file_order.nilsimsa_limit;
int_fast32_t depth = max_depth;
int64_t processed = 0;
LOG_INFO << "nilsimsa: depth=" << depth << " (" << min_depth
<< "), limit=" << limit;
presort_index(inodes, index);
// The last entry of the presorted index seeds the similarity chain.
finalize_inode();
while (!index.empty()) {
// Search only the last `depth` entries of `index` (or all of them if
// fewer remain) for the best match to the inode placed last.
auto [max_sim_ix, max_sim] = find_similar_inode(
inodes_.back()->nilsimsa_similarity_hash().data(), inodes, index,
limit, int(index.size()) > depth ? index.size() - depth : 0);
LOG_TRACE << max_sim << " @ " << max_sim_ix << "/" << index.size();
// Rotate the chosen entry to the back of `index`, keeping the relative
// order of all other entries, so that finalize_inode() picks it up.
std::rotate(index.begin() + max_sim_ix, index.begin() + max_sim_ix + 1,
index.end());
auto fill = finalize_inode();
// After the first 4096 inodes, re-tune the search depth on every 32nd
// inode: the target scales max_depth by the callback's return value
// (fill / 2048), is blended in with weight 1/512 and then clamped to
// [min_depth, max_depth].
if (++processed >= 4096 && processed % 32 == 0) {
constexpr int64_t smooth = 512;
auto target_depth = fill * max_depth / 2048;
depth = ((smooth - 1) * depth + target_depth) / smooth;
if (depth > max_depth) {
depth = max_depth;
} else if (depth < min_depth) {
depth = min_depth;
}
}
// Publish the current search depth through the progress object.
prog_.nilsimsa_depth = depth;
}
}
// Sanity check: every inode must have been placed exactly once.
if (count != inodes_.size()) {
DWARFS_THROW(runtime_error, "internal error: nilsimsa ordering failed");
}
}
template <typename LoggerPolicy>
void inode_manager_<LoggerPolicy>::order_inodes_by_nilsimsa2(worker_group& wg) {
auto const& file_order = opts_.fragment_order.get(); // TODO

View File

@ -45,8 +45,8 @@ std::ostream& operator<<(std::ostream& os, file_order_mode mode) {
case file_order_mode::SIMILARITY:
modestr = "similarity";
break;
case file_order_mode::NILSIMSA:
modestr = "nilsimsa";
case file_order_mode::NILSIMSA2:
modestr = "nilsimsa2";
break;
default:
break;

View File

@ -258,10 +258,10 @@ constexpr std::array<level_defaults, 10> levels{{
/* 3 */ {21, ALG_DATA_3, ALG_SCHEMA, "null", 12, 1, "similarity"},
/* 4 */ {22, ALG_DATA_4, ALG_SCHEMA, "null", 12, 2, "similarity"},
/* 5 */ {23, ALG_DATA_5, ALG_SCHEMA, "null", 12, 2, "similarity"},
/* 6 */ {24, ALG_DATA_6, ALG_SCHEMA, "null", 12, 3, "nilsimsa"},
/* 7 */ {24, ALG_DATA_7, ALG_SCHEMA, ALG_METADATA_7, 12, 3, "nilsimsa"},
/* 8 */ {24, ALG_DATA_8, ALG_SCHEMA, ALG_METADATA_9, 12, 4, "nilsimsa"},
/* 9 */ {26, ALG_DATA_9, ALG_SCHEMA, ALG_METADATA_9, 12, 4, "nilsimsa"},
/* 6 */ {24, ALG_DATA_6, ALG_SCHEMA, "null", 12, 3, "nilsimsa2"},
/* 7 */ {24, ALG_DATA_7, ALG_SCHEMA, ALG_METADATA_7, 12, 3, "nilsimsa2"},
/* 8 */ {24, ALG_DATA_8, ALG_SCHEMA, ALG_METADATA_9, 12, 4, "nilsimsa2"},
/* 9 */ {26, ALG_DATA_9, ALG_SCHEMA, ALG_METADATA_9, 12, 4, "nilsimsa2"},
// clang-format on
}};

View File

@ -157,7 +157,6 @@ void basic_end_to_end_test(std::string const& compressor,
auto mm = std::make_shared<test::mmap_mock>(std::move(fsimage));
bool similarity = file_order == file_order_mode::SIMILARITY ||
file_order == file_order_mode::NILSIMSA ||
file_order == file_order_mode::NILSIMSA2;
size_t const num_fail_empty = access_fail ? 1 : 0;
@ -599,8 +598,7 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Combine(
::testing::ValuesIn(compressions), ::testing::Values(12, 15, 20, 28),
::testing::Values(file_order_mode::NONE, file_order_mode::PATH,
file_order_mode::SCRIPT, file_order_mode::NILSIMSA,
file_order_mode::NILSIMSA2,
file_order_mode::SCRIPT, file_order_mode::NILSIMSA2,
file_order_mode::SIMILARITY),
::testing::Values(std::nullopt, "xxh3-128")));