Small refactor to avoid excessive vector allocation

This commit is contained in:
Marcus Holland-Moritz 2021-03-22 10:21:07 +01:00
parent 6c24e55897
commit a22aa99729
4 changed files with 16 additions and 14 deletions

View File

@ -27,6 +27,7 @@
#include <folly/small_vector.h> #include <folly/small_vector.h>
#include "dwarfs/nilsimsa.h"
#include "dwarfs/object.h" #include "dwarfs/object.h"
namespace dwarfs { namespace dwarfs {
@ -50,7 +51,7 @@ class inode : public object {
virtual void set_num(uint32_t num) = 0; virtual void set_num(uint32_t num) = 0;
virtual uint32_t num() const = 0; virtual uint32_t num() const = 0;
virtual uint32_t similarity_hash() const = 0; virtual uint32_t similarity_hash() const = 0;
virtual std::vector<uint64_t> const& nilsimsa_similarity_hash() const = 0; virtual nilsimsa::hash_type const& nilsimsa_similarity_hash() const = 0;
virtual size_t size() const = 0; virtual size_t size() const = 0;
virtual file const* any() const = 0; virtual file const* any() const = 0;
virtual files_vector const& files() const = 0; virtual files_vector const& files() const = 0;

View File

@ -21,10 +21,10 @@
#pragma once #pragma once
#include <array>
#include <cstdint> #include <cstdint>
#include <memory> #include <memory>
#include <type_traits> #include <type_traits>
#include <vector>
#include "dwarfs/compiler.h" #include "dwarfs/compiler.h"
@ -45,11 +45,13 @@ namespace dwarfs {
class nilsimsa { class nilsimsa {
public: public:
using hash_type = std::array<uint64_t, 4>;
nilsimsa(); nilsimsa();
~nilsimsa(); ~nilsimsa();
void update(uint8_t const* data, size_t size); void update(uint8_t const* data, size_t size);
std::vector<uint64_t> finalize() const; void finalize(hash_type& hash) const;
#ifdef DWARFS_MULTIVERSIONING #ifdef DWARFS_MULTIVERSIONING
__attribute__((target("popcnt"))) static int __attribute__((target("popcnt"))) static int

View File

@ -93,6 +93,10 @@ class inode_ : public inode {
public: public:
using chunk_type = thrift::metadata::chunk; using chunk_type = thrift::metadata::chunk;
inode_() {
std::fill(nilsimsa_similarity_hash_.begin(), nilsimsa_similarity_hash_.end(), 0);
}
void set_num(uint32_t num) override { void set_num(uint32_t num) override {
DWARFS_CHECK(!num_, "attempt to set inode number multiple times"); DWARFS_CHECK(!num_, "attempt to set inode number multiple times");
num_ = num; num_ = num;
@ -107,7 +111,7 @@ class inode_ : public inode {
return similarity_hash_; return similarity_hash_;
} }
std::vector<uint64_t> const& nilsimsa_similarity_hash() const override { nilsimsa::hash_type const& nilsimsa_similarity_hash() const override {
if (files_.empty()) { if (files_.empty()) {
DWARFS_THROW(runtime_error, "inode has no file"); DWARFS_THROW(runtime_error, "inode has no file");
} }
@ -156,7 +160,7 @@ class inode_ : public inode {
} }
if (opts.with_nilsimsa) { if (opts.with_nilsimsa) {
nilsimsa_similarity_hash_ = nc.finalize(); nc.finalize(nilsimsa_similarity_hash_);
} }
} }
} }
@ -189,7 +193,7 @@ class inode_ : public inode {
uint32_t similarity_hash_{0}; uint32_t similarity_hash_{0};
files_vector files_; files_vector files_;
std::vector<chunk_type> chunks_; std::vector<chunk_type> chunks_;
std::vector<uint64_t> nilsimsa_similarity_hash_; nilsimsa::hash_type nilsimsa_similarity_hash_;
}; };
} // namespace } // namespace

View File

@ -19,8 +19,6 @@
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>. * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/ */
#include <array>
#include "dwarfs/compiler.h" #include "dwarfs/compiler.h"
#include "dwarfs/nilsimsa.h" #include "dwarfs/nilsimsa.h"
@ -76,7 +74,7 @@ class nilsimsa::impl {
update_fast(data, size); update_fast(data, size);
} }
std::vector<uint64_t> finalize() const { void finalize(hash_type& hash) const {
size_t total = 0; size_t total = 0;
if (size_ == 3) { if (size_ == 3) {
@ -89,16 +87,13 @@ class nilsimsa::impl {
size_t threshold = total / acc_.size(); size_t threshold = total / acc_.size();
std::vector<uint64_t> hash; std::fill(hash.begin(), hash.end(), 0);
hash.resize(4);
for (size_t i = 0; i < acc_.size(); i++) { for (size_t i = 0; i < acc_.size(); i++) {
if (acc_[i] > threshold) { if (acc_[i] > threshold) {
hash[i >> 6] |= UINT64_C(1) << (i & 0x3F); hash[i >> 6] |= UINT64_C(1) << (i & 0x3F);
} }
} }
return hash;
} }
private: private:
@ -195,7 +190,7 @@ void nilsimsa::update(uint8_t const* data, size_t size) {
impl_->update(data, size); impl_->update(data, size);
} }
std::vector<uint64_t> nilsimsa::finalize() const { return impl_->finalize(); } void nilsimsa::finalize(hash_type& hash) const { impl_->finalize(hash); }
#ifdef DWARFS_MULTIVERSIONING #ifdef DWARFS_MULTIVERSIONING
__attribute__((target("popcnt"))) int __attribute__((target("popcnt"))) int