Update mkdwarfs options with level-dependent ordering

This commit is contained in:
Marcus Holland-Moritz 2020-12-09 17:55:50 +01:00
parent 2efc231393
commit a2377ead41
2 changed files with 53 additions and 39 deletions

View File

@ -131,18 +131,23 @@ Most other options are concerned with compression tuning:
the `mtime` field in order to save metadata space. If you want to save the `mtime` field in order to save metadata space. If you want to save
`atime` and `ctime` as well, use this option. `atime` and `ctime` as well, use this option.
* `--order=none`|`path`|`similarity`|`nilsimsa`|`script`: * `--order=none`|`path`|`similarity`|`nilsimsa`[`:`*limit*[`:`*depth*]]|`script`:
The order in which files will be written to the filesystem. Choosing `none`, The order in which inodes will be written to the file system. Choosing `none`,
the files will be stored in the order in which they are discovered. With the inodes will be stored in the order in which they are discovered. With
`path`, they will be sorted asciibetically by path name. With `similarity`, `path`, they will be sorted asciibetically by path name of the first file
they will be ordered using a simple, yet fast and efficient, similarity representing this inode. With `similarity`, they will be ordered using a
hash function. This is the default, as it will cause similar files to be simple, yet fast and efficient, similarity hash function. `nilsimsa` ordering
located close to each other, which means compression will be better. uses a more sophisticated similarity function that is typically better than
`nilsimsa` ordering uses a different similarity function that is *likely* `similarity`, but is significantly slower to compute. However, computation
even better than `similarity`, but is comparatively slow to compute. It can happen in the background while already building the file system.
*can* be very slow, though it shouldn't be for typical inputs. YMMV. `nilsimsa` ordering can be further tweaked by specifying a *limit* and
Last but not least, if scripting support is built into `mkdwarfs`, you *depth*. The *limit* determines how soon an inode is considered similar
can choose `script` to let the script determine the order. enough for adding. A *limit* of 255 means "essentially identical", whereas
a *limit* of 0 means "not similar at all". The *depth* determines how many
inodes can be checked at most while searching for a similar one.
The default if you omit these values is a *limit* of 255 and a *depth*
of 25000. Last but not least, if scripting support is built into `mkdwarfs`,
you can choose `script` to let the script determine the order.
* `--blockhash-window-sizes=`*value*[,*value*]...: * `--blockhash-window-sizes=`*value*[,*value*]...:
Window sizes used for block hashing. These sizes, separated by commas, Window sizes used for block hashing. These sizes, separated by commas,

View File

@ -175,6 +175,7 @@ struct level_defaults {
char const* schema_compression; char const* schema_compression;
char const* metadata_compression; char const* metadata_compression;
char const* window_sizes; char const* window_sizes;
char const* order;
}; };
#if defined(DWARFS_HAVE_LIBLZ4) #if defined(DWARFS_HAVE_LIBLZ4)
@ -252,16 +253,18 @@ struct level_defaults {
#endif #endif
constexpr std::array<level_defaults, 10> levels{{ constexpr std::array<level_defaults, 10> levels{{
/* 0 */ {20, "null", "null", "null", "-"}, // clang-format off
/* 1 */ {20, ALG_DATA_LEVEL1, ALG_SCHEMA, "null", "-"}, /* 0 */ {20, "null", "null" , "null", "-", "none"},
/* 2 */ {20, ALG_DATA_LEVEL2, ALG_SCHEMA, "null", "-"}, /* 1 */ {20, ALG_DATA_LEVEL1, ALG_SCHEMA, "null", "-", "path"},
/* 3 */ {20, ALG_DATA_LEVEL3, ALG_SCHEMA, "null", "13"}, /* 2 */ {20, ALG_DATA_LEVEL2, ALG_SCHEMA, "null", "-", "path"},
/* 4 */ {21, ALG_DATA_LEVEL4, ALG_SCHEMA, "null", "11"}, /* 3 */ {20, ALG_DATA_LEVEL3, ALG_SCHEMA, "null", "13", "similarity"},
/* 5 */ {22, ALG_DATA_LEVEL5, ALG_SCHEMA, "null", "11"}, /* 4 */ {21, ALG_DATA_LEVEL4, ALG_SCHEMA, "null", "11", "similarity"},
/* 6 */ {23, ALG_DATA_LEVEL6, ALG_SCHEMA, "null", "15,11"}, /* 5 */ {22, ALG_DATA_LEVEL5, ALG_SCHEMA, "null", "11", "similarity"},
/* 7 */ {24, ALG_DATA_LEVEL7, ALG_SCHEMA, "null", "17,15,13,11"}, /* 6 */ {23, ALG_DATA_LEVEL6, ALG_SCHEMA, "null", "15,11", "nilsimsa:250:10000"},
/* 8 */ {24, ALG_DATA_LEVEL8, ALG_SCHEMA, ALG_METADATA, "17,15,13,11"}, /* 7 */ {24, ALG_DATA_LEVEL7, ALG_SCHEMA, "null", "17,15,13,11", "nilsimsa"},
/* 9 */ {24, ALG_DATA_LEVEL9, ALG_SCHEMA, ALG_METADATA, "17,15,13,11"}, /* 8 */ {24, ALG_DATA_LEVEL8, ALG_SCHEMA, ALG_METADATA, "17,15,13,11", "nilsimsa"},
/* 9 */ {24, ALG_DATA_LEVEL9, ALG_SCHEMA, ALG_METADATA, "17,15,13,11", "nilsimsa"},
// clang-format on
}}; }};
constexpr unsigned default_level = 7; constexpr unsigned default_level = 7;
@ -283,7 +286,7 @@ int mkdwarfs(int argc, char** argv) {
scanner_options options; scanner_options options;
auto order_desc = auto order_desc =
"file order (" + (from(order_choices) | get<0>() | unsplit(", ")) + ")"; "inode order (" + (from(order_choices) | get<0>() | unsplit(", ")) + ")";
auto resolution_desc = "time resolution in seconds or (" + auto resolution_desc = "time resolution in seconds or (" +
(from(time_resolutions) | get<0>() | unsplit(", ")) + (from(time_resolutions) | get<0>() | unsplit(", ")) +
@ -341,8 +344,7 @@ int mkdwarfs(int argc, char** argv) {
po::value<std::string>(&time_resolution)->default_value("sec"), po::value<std::string>(&time_resolution)->default_value("sec"),
resolution_desc.c_str()) resolution_desc.c_str())
("order", ("order",
po::value<std::string>(&order) po::value<std::string>(&order),
->default_value("similarity"),
order_desc.c_str()) order_desc.c_str())
#ifdef DWARFS_HAVE_PYTHON #ifdef DWARFS_HAVE_PYTHON
("script", ("script",
@ -390,33 +392,36 @@ int mkdwarfs(int argc, char** argv) {
} }
if (vm.count("help") or !vm.count("input") or !vm.count("output")) { if (vm.count("help") or !vm.count("input") or !vm.count("output")) {
size_t l_dc = 0, l_sc = 0, l_mc = 0, l_ws = 0; size_t l_dc = 0, l_sc = 0, l_mc = 0, l_ws = 0, l_or = 0;
for (auto const& l : levels) { for (auto const& l : levels) {
l_dc = std::max(l_dc, ::strlen(l.data_compression)); l_dc = std::max(l_dc, ::strlen(l.data_compression));
l_sc = std::max(l_sc, ::strlen(l.schema_compression)); l_sc = std::max(l_sc, ::strlen(l.schema_compression));
l_mc = std::max(l_mc, ::strlen(l.metadata_compression)); l_mc = std::max(l_mc, ::strlen(l.metadata_compression));
l_ws = std::max(l_ws, ::strlen(l.window_sizes)); l_ws = std::max(l_ws, ::strlen(l.window_sizes));
l_or = std::max(l_or, ::strlen(l.order));
} }
std::string sep(21 + l_dc + l_sc + l_mc + l_ws, '-'); std::string sep(22 + l_dc + l_sc + l_mc + l_ws + l_or, '-');
std::cout << "mkdwarfs (" << DWARFS_VERSION << ")\n" << opts << std::endl; std::cout << "mkdwarfs (" << DWARFS_VERSION << ")\n" << opts << std::endl;
std::cout << "Compression level defaults:\n" std::cout << "Compression level defaults:\n"
<< " " << sep << "\n" << " " << sep << "\n"
<< fmt::format(" Level Block {:{}s} Window Sizes\n", << fmt::format(" Level Block {:{}s} {:{}s} Inode Order\n",
"Compression Algorithm", 4 + l_dc + l_sc + l_mc) "Compression Algorithm", 4 + l_dc + l_sc + l_mc,
<< fmt::format(" Size {:{}s} {:{}s} {:{}s}\n", "Window", l_ws)
<< fmt::format(" Size {:{}s} {:{}s} {:{}s} {:{}s}\n",
"Block Data", l_dc, "Schema", l_sc, "Metadata", "Block Data", l_dc, "Schema", l_sc, "Metadata",
l_mc) l_mc, "Sizes", l_ws)
<< " " << sep << std::endl; << " " << sep << std::endl;
int level = 0; int level = 0;
for (auto const& l : levels) { for (auto const& l : levels) {
std::cout << fmt::format( std::cout << fmt::format(" {:1d} {:2d} {:{}s} {:{}s} {:{}s} "
" {:1d} {:2d} {:{}s} {:{}s} {:{}s} {:{}s}", "{:{}s} {:{}s}",
level, l.block_size_bits, l.data_compression, l_dc, level, l.block_size_bits, l.data_compression,
l.schema_compression, l_sc, l.metadata_compression, l_mc, l_dc, l.schema_compression, l_sc,
l.window_sizes, l_ws) l.metadata_compression, l_mc, l.window_sizes,
l_ws, l.order, l_or)
<< std::endl; << std::endl;
++level; ++level;
} }
@ -475,6 +480,10 @@ int mkdwarfs(int argc, char** argv) {
window_sizes = defaults.window_sizes; window_sizes = defaults.window_sizes;
} }
if (!vm.count("order")) {
order = defaults.order;
}
std::vector<std::string> order_opts; std::vector<std::string> order_opts;
boost::split(order_opts, order, boost::is_any_of(":")); boost::split(order_opts, order, boost::is_any_of(":"));
if (auto it = order_choices.find(order_opts.front()); if (auto it = order_choices.find(order_opts.front());
@ -483,12 +492,12 @@ int mkdwarfs(int argc, char** argv) {
if (order_opts.size() > 1) { if (order_opts.size() > 1) {
if (options.file_order.mode != file_order_mode::NILSIMSA) { if (options.file_order.mode != file_order_mode::NILSIMSA) {
throw std::runtime_error( throw std::runtime_error(
fmt::format("file order mode '{}' does not support options", fmt::format("inode order mode '{}' does not support options",
order_opts.front())); order_opts.front()));
} }
if (order_opts.size() > 3) { if (order_opts.size() > 3) {
throw std::runtime_error(fmt::format( throw std::runtime_error(fmt::format(
"too many options for file order mode '{}'", order_opts.front())); "too many options for inode order mode '{}'", order_opts.front()));
} }
options.file_order.nilsimsa_limit = folly::to<int>(order_opts[1]); options.file_order.nilsimsa_limit = folly::to<int>(order_opts[1]);
if (options.file_order.nilsimsa_limit < 0 || if (options.file_order.nilsimsa_limit < 0 ||
@ -506,7 +515,7 @@ int mkdwarfs(int argc, char** argv) {
} }
} }
} else { } else {
throw std::runtime_error("invalid file order mode: " + order); throw std::runtime_error("invalid inode order mode: " + order);
} }
size_t mem_limit = parse_size_with_unit(memory_limit); size_t mem_limit = parse_size_with_unit(memory_limit);