From 201b6ddb3bcadb6f1c965acf083e1bdaa9bda04b Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Wed, 30 Dec 2020 13:24:50 +0100 Subject: [PATCH] Dynamically adapt nilsimsa search depth --- doc/mkdwarfs.md | 14 ++++-- include/dwarfs/console_writer.h | 1 + include/dwarfs/filesystem_writer.h | 3 ++ include/dwarfs/inode_manager.h | 8 +-- include/dwarfs/options.h | 1 + include/dwarfs/progress.h | 3 ++ src/dwarfs/console_writer.cpp | 26 +++++++++- src/dwarfs/filesystem_writer.cpp | 1 + src/dwarfs/inode_manager.cpp | 63 ++++++++++++++++-------- src/dwarfs/scanner.cpp | 10 +++- src/mkdwarfs.cpp | 78 ++++++++++++++++++++---------- 11 files changed, 152 insertions(+), 56 deletions(-) diff --git a/doc/mkdwarfs.md b/doc/mkdwarfs.md index aafd65bf..42ffa2d2 100644 --- a/doc/mkdwarfs.md +++ b/doc/mkdwarfs.md @@ -154,7 +154,7 @@ Most other options are concerned with compression tuning: the `mtime` field in order to save metadata space. If you want to save `atime` and `ctime` as well, use this option. - * `--order=none`|`path`|`similarity`|`nilsimsa`[`:`*limit*[`:`*depth*]]|`script`: + * `--order=none`|`path`|`similarity`|`nilsimsa`[`:`*limit*[`:`*depth*[`:`*mindepth*]]]|`script`: The order in which inodes will be written to the file system. Choosing `none`, the inodes will be stored in the order in which they are discovered. With `path`, they will be sorted asciibetically by path name of the first file @@ -168,9 +168,15 @@ Most other options are concerned with compression tuning: enough for adding. A *limit* of 255 means "essentially identical", whereas a *limit* of 0 means "not similar at all". The *depth* determines up to how many inodes can be checked at most while searching for a similar one. - The default if you omit these values is a *limit* of 255 and a *depth* - of 20000. Last but not least, if scripting support is built into `mkdwarfs`, - you can choose `script` to let the script determine the order. 
+ To prevent nilsimsa ordering from becoming a bottleneck when ordering lots of + small files, the *depth* is adjusted dynamically to keep the input queue + to the segmentation/compression stages adequately filled. You can specify + how much the *depth* can be adjusted by also specifying *mindepth*. + The default if you omit these values is a *limit* of 255, a *depth* + of 20000 and a *mindepth* of 1000. Note that if you want reproducible + results, you need to set *depth* and *mindepth* to the same value. + Last but not least, if scripting support is built into `mkdwarfs`, you can + choose `script` to let the script determine the order. * `--blockhash-window-sizes=`*value*[,*value*]...: Window sizes used for block hashing. These sizes, separated by commas, diff --git a/include/dwarfs/console_writer.h b/include/dwarfs/console_writer.h index 04d91147..48f9c8c5 100644 --- a/include/dwarfs/console_writer.h +++ b/include/dwarfs/console_writer.h @@ -61,5 +61,6 @@ class console_writer : public logger { display_mode const mode_; bool const color_; bool const with_context_; + bool const debug_progress_; }; } // namespace dwarfs diff --git a/include/dwarfs/filesystem_writer.h b/include/dwarfs/filesystem_writer.h index 5ce4883e..4445d910 100644 --- a/include/dwarfs/filesystem_writer.h +++ b/include/dwarfs/filesystem_writer.h @@ -87,6 +87,8 @@ class filesystem_writer { size_t size() const { return impl_->size(); } + int queue_fill() const { return impl_->queue_fill(); } + class impl { public: virtual ~impl() = default; @@ -99,6 +101,7 @@ class filesystem_writer { folly::ByteRange data) = 0; virtual void flush() = 0; virtual size_t size() const = 0; + virtual int queue_fill() const = 0; }; private: diff --git a/include/dwarfs/inode_manager.h b/include/dwarfs/inode_manager.h index a807681a..f840fccb 100644 --- a/include/dwarfs/inode_manager.h +++ b/include/dwarfs/inode_manager.h @@ -29,6 +29,7 @@ namespace dwarfs { class inode; class logger; +class progress; class script; struct 
file_order_options; @@ -36,8 +37,9 @@ struct file_order_options; class inode_manager { public: using inode_cb = std::function const&)>; + using order_cb = std::function const&)>; - inode_manager(logger& lgr); + inode_manager(logger& lgr, progress& prog); std::shared_ptr create_inode() { return impl_->create_inode(); } @@ -45,7 +47,7 @@ class inode_manager { void order_inodes(std::shared_ptr