Update mkdwarfs options with level-dependent ordering

This commit is contained in:
Marcus Holland-Moritz 2020-12-09 17:55:50 +01:00
parent 2efc231393
commit a2377ead41
2 changed files with 53 additions and 39 deletions

View File

@ -131,18 +131,23 @@ Most other options are concerned with compression tuning:
the `mtime` field in order to save metadata space. If you want to save
`atime` and `ctime` as well, use this option.
* `--order=none`|`path`|`similarity`|`nilsimsa`|`script`:
The order in which files will be written to the filesystem. Choosing `none`,
the files will be stored in the order in which they are discovered. With
`path`, they will be sorted asciibetically by path name. With `similarity`,
they will be ordered using a simple, yet fast and efficient, similarity
hash function. This is the default, as it will cause similar files to be
located close to each other, which means compression will be better.
`nilsimsa` ordering uses a different similarity function that is *likely*
even better than `similarity`, but is comparatively slow to compute. It
*can* be very slow, though it shouldn't be for typical inputs. YMMV.
Last but not least, if scripting support is built into `mkdwarfs`, you
can choose `script` to let the script determine the order.
* `--order=none`|`path`|`similarity`|`nilsimsa`[`:`*limit*[`:`*depth*]]|`script`:
The order in which inodes will be written to the file system. With `none`,
the inodes will be stored in the order in which they are discovered. With
`path`, they will be sorted asciibetically by path name of the first file
representing this inode. With `similarity`, they will be ordered using a
simple, yet fast and efficient, similarity hash function. `nilsimsa` ordering
uses a more sophisticated similarity function that is typically better than
`similarity`, but is significantly slower to compute. However, the computation
can happen in the background while the file system is already being built.
`nilsimsa` ordering can be further tweaked by specifying a *limit* and
*depth*. The *limit* determines how soon an inode is considered similar
enough for adding. A *limit* of 255 means "essentially identical", whereas
a *limit* of 0 means "not similar at all". The *depth* determines the maximum
number of inodes that are checked while searching for a similar one.
The default if you omit these values is a *limit* of 255 and a *depth*
of 25000. Last but not least, if scripting support is built into `mkdwarfs`,
you can choose `script` to let the script determine the order.
* `--blockhash-window-sizes=`*value*[,*value*]...:
Window sizes used for block hashing. These sizes, separated by commas,

View File

@ -175,6 +175,7 @@ struct level_defaults {
char const* schema_compression;
char const* metadata_compression;
char const* window_sizes;
char const* order;
};
#if defined(DWARFS_HAVE_LIBLZ4)
@ -252,16 +253,18 @@ struct level_defaults {
#endif
// Per-compression-level defaults; the array index is the level (0..9).
// Each row lists, in order: block size bits, block data compression, schema
// compression, metadata compression, window sizes and, in the six-field
// rows, the inode order that is used when `--order` is not given on the
// command line.
constexpr std::array<level_defaults, 10> levels{{
/* 0 */ {20, "null", "null", "null", "-"},
/* 1 */ {20, ALG_DATA_LEVEL1, ALG_SCHEMA, "null", "-"},
/* 2 */ {20, ALG_DATA_LEVEL2, ALG_SCHEMA, "null", "-"},
/* 3 */ {20, ALG_DATA_LEVEL3, ALG_SCHEMA, "null", "13"},
/* 4 */ {21, ALG_DATA_LEVEL4, ALG_SCHEMA, "null", "11"},
/* 5 */ {22, ALG_DATA_LEVEL5, ALG_SCHEMA, "null", "11"},
/* 6 */ {23, ALG_DATA_LEVEL6, ALG_SCHEMA, "null", "15,11"},
/* 7 */ {24, ALG_DATA_LEVEL7, ALG_SCHEMA, "null", "17,15,13,11"},
/* 8 */ {24, ALG_DATA_LEVEL8, ALG_SCHEMA, ALG_METADATA, "17,15,13,11"},
/* 9 */ {24, ALG_DATA_LEVEL9, ALG_SCHEMA, ALG_METADATA, "17,15,13,11"},
// clang-format off
/* 0 */ {20, "null", "null" , "null", "-", "none"},
/* 1 */ {20, ALG_DATA_LEVEL1, ALG_SCHEMA, "null", "-", "path"},
/* 2 */ {20, ALG_DATA_LEVEL2, ALG_SCHEMA, "null", "-", "path"},
/* 3 */ {20, ALG_DATA_LEVEL3, ALG_SCHEMA, "null", "13", "similarity"},
/* 4 */ {21, ALG_DATA_LEVEL4, ALG_SCHEMA, "null", "11", "similarity"},
/* 5 */ {22, ALG_DATA_LEVEL5, ALG_SCHEMA, "null", "11", "similarity"},
/* 6 */ {23, ALG_DATA_LEVEL6, ALG_SCHEMA, "null", "15,11", "nilsimsa:250:10000"},
/* 7 */ {24, ALG_DATA_LEVEL7, ALG_SCHEMA, "null", "17,15,13,11", "nilsimsa"},
/* 8 */ {24, ALG_DATA_LEVEL8, ALG_SCHEMA, ALG_METADATA, "17,15,13,11", "nilsimsa"},
/* 9 */ {24, ALG_DATA_LEVEL9, ALG_SCHEMA, ALG_METADATA, "17,15,13,11", "nilsimsa"},
// clang-format on
}};
constexpr unsigned default_level = 7;
@ -283,7 +286,7 @@ int mkdwarfs(int argc, char** argv) {
scanner_options options;
auto order_desc =
"file order (" + (from(order_choices) | get<0>() | unsplit(", ")) + ")";
"inode order (" + (from(order_choices) | get<0>() | unsplit(", ")) + ")";
auto resolution_desc = "time resolution in seconds or (" +
(from(time_resolutions) | get<0>() | unsplit(", ")) +
@ -341,8 +344,7 @@ int mkdwarfs(int argc, char** argv) {
po::value<std::string>(&time_resolution)->default_value("sec"),
resolution_desc.c_str())
("order",
po::value<std::string>(&order)
->default_value("similarity"),
po::value<std::string>(&order),
order_desc.c_str())
#ifdef DWARFS_HAVE_PYTHON
("script",
@ -390,33 +392,36 @@ int mkdwarfs(int argc, char** argv) {
}
if (vm.count("help") or !vm.count("input") or !vm.count("output")) {
size_t l_dc = 0, l_sc = 0, l_mc = 0, l_ws = 0;
size_t l_dc = 0, l_sc = 0, l_mc = 0, l_ws = 0, l_or = 0;
for (auto const& l : levels) {
l_dc = std::max(l_dc, ::strlen(l.data_compression));
l_sc = std::max(l_sc, ::strlen(l.schema_compression));
l_mc = std::max(l_mc, ::strlen(l.metadata_compression));
l_ws = std::max(l_ws, ::strlen(l.window_sizes));
l_or = std::max(l_or, ::strlen(l.order));
}
std::string sep(21 + l_dc + l_sc + l_mc + l_ws, '-');
std::string sep(22 + l_dc + l_sc + l_mc + l_ws + l_or, '-');
std::cout << "mkdwarfs (" << DWARFS_VERSION << ")\n" << opts << std::endl;
std::cout << "Compression level defaults:\n"
<< " " << sep << "\n"
<< fmt::format(" Level Block {:{}s} Window Sizes\n",
"Compression Algorithm", 4 + l_dc + l_sc + l_mc)
<< fmt::format(" Size {:{}s} {:{}s} {:{}s}\n",
<< fmt::format(" Level Block {:{}s} {:{}s} Inode Order\n",
"Compression Algorithm", 4 + l_dc + l_sc + l_mc,
"Window", l_ws)
<< fmt::format(" Size {:{}s} {:{}s} {:{}s} {:{}s}\n",
"Block Data", l_dc, "Schema", l_sc, "Metadata",
l_mc)
l_mc, "Sizes", l_ws)
<< " " << sep << std::endl;
int level = 0;
for (auto const& l : levels) {
std::cout << fmt::format(
" {:1d} {:2d} {:{}s} {:{}s} {:{}s} {:{}s}",
level, l.block_size_bits, l.data_compression, l_dc,
l.schema_compression, l_sc, l.metadata_compression, l_mc,
l.window_sizes, l_ws)
std::cout << fmt::format(" {:1d} {:2d} {:{}s} {:{}s} {:{}s} "
"{:{}s} {:{}s}",
level, l.block_size_bits, l.data_compression,
l_dc, l.schema_compression, l_sc,
l.metadata_compression, l_mc, l.window_sizes,
l_ws, l.order, l_or)
<< std::endl;
++level;
}
@ -475,6 +480,10 @@ int mkdwarfs(int argc, char** argv) {
window_sizes = defaults.window_sizes;
}
if (!vm.count("order")) {
order = defaults.order;
}
std::vector<std::string> order_opts;
boost::split(order_opts, order, boost::is_any_of(":"));
if (auto it = order_choices.find(order_opts.front());
@ -483,12 +492,12 @@ int mkdwarfs(int argc, char** argv) {
if (order_opts.size() > 1) {
if (options.file_order.mode != file_order_mode::NILSIMSA) {
throw std::runtime_error(
fmt::format("file order mode '{}' does not support options",
fmt::format("inode order mode '{}' does not support options",
order_opts.front()));
}
if (order_opts.size() > 3) {
throw std::runtime_error(fmt::format(
"too many options for file order mode '{}'", order_opts.front()));
"too many options for inode order mode '{}'", order_opts.front()));
}
options.file_order.nilsimsa_limit = folly::to<int>(order_opts[1]);
if (options.file_order.nilsimsa_limit < 0 ||
@ -506,7 +515,7 @@ int mkdwarfs(int argc, char** argv) {
}
}
} else {
throw std::runtime_error("invalid file order mode: " + order);
throw std::runtime_error("invalid inode order mode: " + order);
}
size_t mem_limit = parse_size_with_unit(memory_limit);