Update mkdwarfs options with level-dependent ordering

2025-09-09 12:28:13 -04:00 · 2020-12-09 17:55:50 +01:00 · 2020-12-09 17:55:50 +01:00 · a2377ead41
commit a2377ead41
parent 2efc231393
2 changed files with 53 additions and 39 deletions
--- a/doc/mkdwarfs.md
+++ b/doc/mkdwarfs.md
@ -131,18 +131,23 @@ Most other options are concerned with compression tuning:
    the `mtime` field in order to save metadata space. If you want to save
    `atime` and `ctime` as well, use this option.

-  * `--order=none`|`path`|`similarity`|`nilsimsa`|`script`:
-    The order in which files will be written to the filesystem. Choosing `none`,
-    the files will be stored in the order in which they are discovered. With
-    `path`, they will be sorted asciibetically by path name. With `similarity`,
-    they will be ordered using a simple, yet fast and efficient, similarity
-    hash function. This is the default, as it will cause similar files to be
-    located close to each other, which means compression will be better.
-    `nilsimsa` ordering uses a different similarity function that is *likely*
-    even better than `similarity`, but is comparatively slow to compute. It
-    *can* be very slow, though it shouldn't be for typical inputs. YMMV.
-    Last but not least, if scripting support is built into `mkdwarfs`, you
-    can choose `script` to let the script determine the order.
+  * `--order=none`|`path`|`similarity`|`nilsimsa`[`:`*limit*[`:`*depth*]]|`script`:
+    The order in which inodes will be written to the file system. Choosing `none`,
+    the inodes will be stored in the order in which they are discovered. With
+    `path`, they will be sorted asciibetically by path name of the first file
+    representing this inode. With `similarity`, they will be ordered using a
+    simple, yet fast and efficient, similarity hash function. `nilsimsa` ordering
+    uses a more sophisticated similarity function that is typically better than
+    `similarity`, but is significantly slower to compute. However, computation
+    can happen in the background while already building the file system.
+    `nilsimsa` ordering can be further tweaked by specifying a *limit* and
+    *depth*. The *limit* determines how soon an inode is considered similar
+    enough for adding. A *limit* of 255 means "essentially identical", whereas
+    a *limit* of 0 means "not similar at all". The *depth* determines up to
+    how many inodes can be checked at most while searching for a similar one.
+    The default if you omit these values is a *limit* of 255 and a *depth*
+    of 25000. Last but not least, if scripting support is built into `mkdwarfs`,
+    you can choose `script` to let the script determine the order.

  * `--blockhash-window-sizes=`*value*[,*value*]...:
    Window sizes used for block hashing. These sizes, separated by commas,
--- a/src/mkdwarfs.cpp
+++ b/src/mkdwarfs.cpp
@ -175,6 +175,7 @@ struct level_defaults {
  char const* schema_compression;
  char const* metadata_compression;
  char const* window_sizes;
+  char const* order;
 };

 #if defined(DWARFS_HAVE_LIBLZ4)
@ -252,16 +253,18 @@ struct level_defaults {
 #endif

 constexpr std::array<level_defaults, 10> levels{{
-    /* 0 */ {20, "null", "null", "null", "-"},
-    /* 1 */ {20, ALG_DATA_LEVEL1, ALG_SCHEMA, "null", "-"},
-    /* 2 */ {20, ALG_DATA_LEVEL2, ALG_SCHEMA, "null", "-"},
-    /* 3 */ {20, ALG_DATA_LEVEL3, ALG_SCHEMA, "null", "13"},
-    /* 4 */ {21, ALG_DATA_LEVEL4, ALG_SCHEMA, "null", "11"},
-    /* 5 */ {22, ALG_DATA_LEVEL5, ALG_SCHEMA, "null", "11"},
-    /* 6 */ {23, ALG_DATA_LEVEL6, ALG_SCHEMA, "null", "15,11"},
-    /* 7 */ {24, ALG_DATA_LEVEL7, ALG_SCHEMA, "null", "17,15,13,11"},
-    /* 8 */ {24, ALG_DATA_LEVEL8, ALG_SCHEMA, ALG_METADATA, "17,15,13,11"},
-    /* 9 */ {24, ALG_DATA_LEVEL9, ALG_SCHEMA, ALG_METADATA, "17,15,13,11"},
+    // clang-format off
+    /* 0 */ {20, "null",          "null"    , "null",       "-",           "none"},
+    /* 1 */ {20, ALG_DATA_LEVEL1, ALG_SCHEMA, "null",       "-",           "path"},
+    /* 2 */ {20, ALG_DATA_LEVEL2, ALG_SCHEMA, "null",       "-",           "path"},
+    /* 3 */ {20, ALG_DATA_LEVEL3, ALG_SCHEMA, "null",       "13",          "similarity"},
+    /* 4 */ {21, ALG_DATA_LEVEL4, ALG_SCHEMA, "null",       "11",          "similarity"},
+    /* 5 */ {22, ALG_DATA_LEVEL5, ALG_SCHEMA, "null",       "11",          "similarity"},
+    /* 6 */ {23, ALG_DATA_LEVEL6, ALG_SCHEMA, "null",       "15,11",       "nilsimsa:250:10000"},
+    /* 7 */ {24, ALG_DATA_LEVEL7, ALG_SCHEMA, "null",       "17,15,13,11", "nilsimsa"},
+    /* 8 */ {24, ALG_DATA_LEVEL8, ALG_SCHEMA, ALG_METADATA, "17,15,13,11", "nilsimsa"},
+    /* 9 */ {24, ALG_DATA_LEVEL9, ALG_SCHEMA, ALG_METADATA, "17,15,13,11", "nilsimsa"},
+    // clang-format on
 }};

 constexpr unsigned default_level = 7;
@ -283,7 +286,7 @@ int mkdwarfs(int argc, char** argv) {
  scanner_options options;

  auto order_desc =
-      "file order (" + (from(order_choices) | get<0>() | unsplit(", ")) + ")";
+      "inode order (" + (from(order_choices) | get<0>() | unsplit(", ")) + ")";

  auto resolution_desc = "time resolution in seconds or (" +
                         (from(time_resolutions) | get<0>() | unsplit(", ")) +
@ -341,8 +344,7 @@ int mkdwarfs(int argc, char** argv) {
        po::value<std::string>(&time_resolution)->default_value("sec"),
        resolution_desc.c_str())
    ("order",
-        po::value<std::string>(&order)
-            ->default_value("similarity"),
+        po::value<std::string>(&order),
        order_desc.c_str())
 #ifdef DWARFS_HAVE_PYTHON
    ("script",
@ -390,33 +392,36 @@ int mkdwarfs(int argc, char** argv) {
  }

  if (vm.count("help") or !vm.count("input") or !vm.count("output")) {
-    size_t l_dc = 0, l_sc = 0, l_mc = 0, l_ws = 0;
+    size_t l_dc = 0, l_sc = 0, l_mc = 0, l_ws = 0, l_or = 0;
    for (auto const& l : levels) {
      l_dc = std::max(l_dc, ::strlen(l.data_compression));
      l_sc = std::max(l_sc, ::strlen(l.schema_compression));
      l_mc = std::max(l_mc, ::strlen(l.metadata_compression));
      l_ws = std::max(l_ws, ::strlen(l.window_sizes));
+      l_or = std::max(l_or, ::strlen(l.order));
    }

-    std::string sep(21 + l_dc + l_sc + l_mc + l_ws, '-');
+    std::string sep(22 + l_dc + l_sc + l_mc + l_ws + l_or, '-');

    std::cout << "mkdwarfs (" << DWARFS_VERSION << ")\n" << opts << std::endl;
    std::cout << "Compression level defaults:\n"
              << "  " << sep << "\n"
-              << fmt::format("  Level  Block  {:{}s}  Window Sizes\n",
-                             "Compression Algorithm", 4 + l_dc + l_sc + l_mc)
-              << fmt::format("         Size   {:{}s}  {:{}s}  {:{}s}\n",
+              << fmt::format("  Level  Block  {:{}s}  {:{}s}  Inode Order\n",
+                             "Compression Algorithm", 4 + l_dc + l_sc + l_mc,
+                             "Window", l_ws)
+              << fmt::format("         Size   {:{}s}  {:{}s}  {:{}s}  {:{}s}\n",
                             "Block Data", l_dc, "Schema", l_sc, "Metadata",
-                             l_mc)
+                             l_mc, "Sizes", l_ws)
              << "  " << sep << std::endl;

    int level = 0;
    for (auto const& l : levels) {
-      std::cout << fmt::format(
-                       "  {:1d}      {:2d}     {:{}s}  {:{}s}  {:{}s}  {:{}s}",
-                       level, l.block_size_bits, l.data_compression, l_dc,
-                       l.schema_compression, l_sc, l.metadata_compression, l_mc,
-                       l.window_sizes, l_ws)
+      std::cout << fmt::format("  {:1d}      {:2d}     {:{}s}  {:{}s}  {:{}s}  "
+                               "{:{}s}  {:{}s}",
+                               level, l.block_size_bits, l.data_compression,
+                               l_dc, l.schema_compression, l_sc,
+                               l.metadata_compression, l_mc, l.window_sizes,
+                               l_ws, l.order, l_or)
                << std::endl;
      ++level;
    }
@ -475,6 +480,10 @@ int mkdwarfs(int argc, char** argv) {
    window_sizes = defaults.window_sizes;
  }

+  if (!vm.count("order")) {
+    order = defaults.order;
+  }
+
  std::vector<std::string> order_opts;
  boost::split(order_opts, order, boost::is_any_of(":"));
  if (auto it = order_choices.find(order_opts.front());
@ -483,12 +492,12 @@ int mkdwarfs(int argc, char** argv) {
    if (order_opts.size() > 1) {
      if (options.file_order.mode != file_order_mode::NILSIMSA) {
        throw std::runtime_error(
-            fmt::format("file order mode '{}' does not support options",
+            fmt::format("inode order mode '{}' does not support options",
                        order_opts.front()));
      }
      if (order_opts.size() > 3) {
        throw std::runtime_error(fmt::format(
-            "too many options for file order mode '{}'", order_opts.front()));
+            "too many options for inode order mode '{}'", order_opts.front()));
      }
      options.file_order.nilsimsa_limit = folly::to<int>(order_opts[1]);
      if (options.file_order.nilsimsa_limit < 0 ||
@ -506,7 +515,7 @@ int mkdwarfs(int argc, char** argv) {
      }
    }
  } else {
-    throw std::runtime_error("invalid file order mode: " + order);
+    throw std::runtime_error("invalid inode order mode: " + order);
  }

  size_t mem_limit = parse_size_with_unit(memory_limit);