diff --git a/include/dwarfs/offset_cache.h b/include/dwarfs/offset_cache.h
new file mode 100644
index 00000000..6db3028b
--- /dev/null
+++ b/include/dwarfs/offset_cache.h
@@ -0,0 +1,256 @@
+/* vim:set ts=2 sw=2 sts=2 et: */
+/**
+ * \author Marcus Holland-Moritz (github@mhxnet.de)
+ * \copyright Copyright (c) Marcus Holland-Moritz
+ *
+ * This file is part of dwarfs.
+ *
+ * dwarfs is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * dwarfs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <iterator>
+#include <memory>
+#include <mutex>
+#include <ostream>
+#include <span>
+#include <utility>
+#include <vector>
+
+#include <folly/container/EvictingCacheMap.h>
+#include <folly/small_vector.h>
+
+namespace dwarfs {
+
+/**
+ * Per-inode cache of file offsets at regularly spaced chunk indices.
+ *
+ * For each cached inode, one file offset is stored per
+ * `ChunkIndexInterval` chunks, so that looking up a file offset can
+ * resume scanning the chunk list from a cached position instead of
+ * from chunk 0. Entries live in an evicting cache keyed by inode.
+ */
+template <typename InodeT, typename FileOffsetT, typename ChunkIndexT,
+          size_t ChunkIndexInterval, size_t UpdaterMaxInlineOffsets>
+class basic_offset_cache {
+ public:
+  using inode_type = InodeT;
+  using file_offset_type = FileOffsetT;
+  using chunk_index_type = ChunkIndexT;
+
+  static constexpr size_t const chunk_index_interval = ChunkIndexInterval;
+
+  class updater;
+
+  /**
+   * Cached offsets of a single inode.
+   *
+   * Slot `i` holds the file offset recorded for chunk index
+   * `(i + 1) * chunk_index_interval`. Every member function locks an
+   * internal mutex while it accesses the slots.
+   */
+  class chunk_offsets {
+   public:
+    chunk_offsets(chunk_index_type total_chunks) {
+      // TODO: we can use an offset as we don't have to cache (0, 0)
+      //       also, we may not have to cache the largest offset
+      //
+      // TODO: potentially add an `open` call to filesystem_v2 and
+      //       use that to cache the last position; this should be
+      //       more efficient as it won't require any mutexes; still
+      //       keep the offset cache, but don't overcomplicate it
+      //       with the "last_*" stuff
+
+      // The subtraction is unsigned: without the guard it wraps around
+      // for inodes with fewer than `chunk_index_interval` chunks and
+      // reserve() is asked for an impossibly large capacity.
+      auto const num_intervals = total_chunks / chunk_index_interval;
+      if (num_intervals > 1) {
+        offsets_.reserve(num_intervals - 1);
+      }
+    }
+
+    /**
+     * Append offsets that are not cached yet.
+     *
+     * `offsets[k]` belongs to slot `first_index + k`; only the tail
+     * that extends past the already cached slots is appended.
+     */
+    void update(chunk_index_type first_index,
+                std::span<file_offset_type const> offsets) {
+      std::lock_guard lock(mx_);
+
+      if (first_index + offsets.size() > offsets_.size()) {
+        assert(first_index <= offsets_.size());
+        auto new_offsets = offsets.subspan(offsets_.size() - first_index);
+        std::copy(new_offsets.begin(), new_offsets.end(),
+                  std::back_inserter(offsets_));
+      }
+    }
+
+    void update(updater const& upd) {
+      update(upd.first_index(), upd.offsets());
+    }
+
+    // void set_last(chunk_index_type chunk_index, file_offset_type file_offset)
+    // {
+    //   std::lock_guard lock(mx_);
+    //   last_chunk_index_ = chunk_index;
+    //   last_file_offset_ = file_offset;
+    // }
+
+    /**
+     * Find the cached position from which to scan for `offset`.
+     *
+     * Also sets the first index of `upd` to the number of slots that
+     * are currently cached.
+     *
+     * \returns a chunk index and the file offset cached for it, or
+     *          `{0, 0}` if no cached slot can be used
+     */
+    std::pair<chunk_index_type, file_offset_type>
+    find(file_offset_type offset, updater& upd) {
+      std::lock_guard lock(mx_);
+
+      upd.set_first_index(offsets_.size());
+
+      if (!offsets_.empty()) {
+        chunk_index_type best_index = offsets_.size();
+
+        if (offset < offsets_.back()) {
+          auto it = std::lower_bound(offsets_.begin(), offsets_.end(), offset);
+
+          if (it != offsets_.end()) {
+            best_index = std::distance(offsets_.begin(), it);
+          }
+        }
+
+        if (best_index > 0) {
+          return {chunk_index_interval * best_index, offsets_[best_index - 1]};
+        }
+      }
+
+      return {0, 0};
+    }
+
+    void dump(std::ostream& os) const {
+      std::vector<file_offset_type> offsets;
+      {
+        std::lock_guard lock(mx_);
+        offsets = offsets_;
+      }
+      for (auto off : offsets) {
+        os << " " << off << "\n";
+      }
+    }
+
+   private:
+    // chunk_index_type last_chunk_index_;
+    // file_offset_type last_file_offset_;
+    std::vector<file_offset_type> offsets_;
+    std::mutex mutable mx_;
+  };
+
+  using value_type = std::shared_ptr<chunk_offsets>;
+
+  /**
+   * Collects offsets while a chunk list is scanned so that they can be
+   * appended to a `chunk_offsets` object with `chunk_offsets::update()`.
+   */
+  class updater {
+   public:
+    static constexpr size_t const max_inline_offsets = UpdaterMaxInlineOffsets;
+
+    void set_first_index(chunk_index_type first_ix) { first_index_ = first_ix; }
+
+    /**
+     * Record `offset` for chunk `index`.
+     *
+     * Ignored unless `index` is a non-zero multiple of
+     * `chunk_index_interval` that maps to the next uncollected slot.
+     */
+    void add_offset(chunk_index_type index, file_offset_type offset) {
+      if (index < chunk_index_interval || index % chunk_index_interval != 0)
+          [[likely]] {
+        return;
+      }
+
+      auto ix = index / chunk_index_interval - 1;
+      assert(ix <= first_index_ + offsets_.size());
+
+      if (ix == first_index_ + offsets_.size()) {
+        offsets_.push_back(offset);
+      }
+    }
+
+    chunk_index_type first_index() const { return first_index_; }
+
+    std::span<file_offset_type const> offsets() const { return offsets_; }
+
+   private:
+    folly::small_vector<file_offset_type, max_inline_offsets> offsets_;
+    chunk_index_type first_index_{0};
+  };
+
+  basic_offset_cache(size_t cache_size)
+      : cache_{cache_size} {}
+
+  /**
+   * Return the cached entry for `inode` or, if there is none, a new
+   * entry for `num_chunks` chunks that is not stored in the cache;
+   * pass it to `set()` to store it.
+   */
+  value_type find(inode_type inode, chunk_index_type num_chunks) const {
+    {
+      std::lock_guard lock(mx_);
+
+      if (auto it = cache_.find(inode); it != cache_.end()) {
+        return it->second;
+      }
+    }
+
+    return std::make_shared<chunk_offsets>(num_chunks);
+  }
+
+  void set(inode_type inode, value_type value) {
+    std::lock_guard lock(mx_);
+    cache_.set(inode, std::move(value));
+  }
+
+  void dump(std::ostream& os) const {
+    std::vector<std::pair<inode_type, value_type>> contents;
+
+    {
+      std::lock_guard lock(mx_);
+      std::copy(cache_.begin(), cache_.end(), std::back_inserter(contents));
+    }
+
+    for (auto const& [inode, ent] : contents) {
+      os << "inode " << inode << ":\n";
+      ent->dump(os);
+    }
+  }
+
+ private:
+  using cache_type = folly::EvictingCacheMap<inode_type, value_type>;
+
+  cache_type mutable cache_;
+  std::mutex mutable mx_;
+};
+
+} // namespace dwarfs
diff --git a/test/utils_test.cpp b/test/utils_test.cpp
index cd006e1f..42598b50 100644
--- a/test/utils_test.cpp
+++ b/test/utils_test.cpp
@@ -21,14 +21,16 @@
 
 #include <filesystem>
+#include <numeric>
+#include <span>
+#include <tuple>
 
 #include <gtest/gtest.h>
 
+#include "dwarfs/offset_cache.h"
 #include "dwarfs/util.h"
 
 using namespace dwarfs;
 
-namespace {} // namespace
-
 TEST(utf8_display_width, basic) {
   EXPECT_EQ(0, utf8_display_width(""));
   EXPECT_EQ(1, utf8_display_width(u8string_to_string(u8"a")));
@@ -154,3 +156,139 @@ TEST(shorten_path, string_utf8) {
     }
   }
 }
+
+namespace {
+
+using cache_type = basic_offset_cache<uint32_t, uint32_t, uint32_t, 4, 4>;
+constexpr std::array<cache_type::file_offset_type, 32> const test_chunks{
+    3, 15, 13, 1, 11, 6, 9, 15, 1, 16, 1, 13, 11, 16, 10, 14,
+    4, 14, 4, 16, 8, 12, 16, 2, 16, 10, 15, 15, 2, 15, 5, 8,
+};
+constexpr cache_type::inode_type const test_inode = 42;
+constexpr size_t const total_size =
+    std::accumulate(test_chunks.begin(), test_chunks.end(), 0);
+
+/**
+ * Locate `file_offset` in `chunks`, optionally assisted by `cache`.
+ *
+ * \returns the chunk index reached, the remaining offset relative to
+ *          that chunk, and the number of chunks that were inspected
+ */
+std::tuple<cache_type::chunk_index_type, cache_type::file_offset_type, size_t>
+find_file_position(cache_type::inode_type const inode,
+                   std::span<cache_type::file_offset_type const> chunks,
+                   cache_type::file_offset_type file_offset,
+                   cache_type* cache = nullptr) {
+  cache_type::value_type ent;
+
+  if (cache) {
+    ent = cache->find(inode, chunks.size());
+
+    if (!ent) {
+      throw std::runtime_error("find() did not return an object");
+    }
+  }
+
+  auto upd = cache_type::updater();
+  auto it = chunks.begin();
+  auto end = chunks.end();
+  cache_type::chunk_index_type chunk_index = 0;
+  cache_type::file_offset_type chunk_offset = 0;
+
+  if (ent) {
+    std::tie(chunk_index, chunk_offset) = ent->find(file_offset, upd);
+    std::advance(it, chunk_index);
+  }
+
+  auto remaining_offset = file_offset - chunk_offset;
+  size_t num_lookups = 0;
+
+  while (it < end) {
+    ++num_lookups;
+    auto chunk_size = *it;
+
+    if (remaining_offset < chunk_size) {
+      break;
+    }
+
+    remaining_offset -= chunk_size;
+    chunk_offset += chunk_size;
+    ++it;
+
+    upd.add_offset(++chunk_index, chunk_offset);
+  }
+
+  if (ent) {
+    ent->update(upd);
+    cache->set(inode, ent);
+  }
+
+  return {chunk_index, remaining_offset, num_lookups};
+}
+
+} // namespace
+
+TEST(offset_cache, basic) {
+  cache_type cache(4);
+
+  size_t total_ref_lookups = 0;
+  size_t total_test_lookups = 0;
+
+  for (cache_type::file_offset_type offset = 0; offset < total_size; ++offset) {
+    auto [ref_ix, ref_off, ref_lookups] =
+        find_file_position(test_inode, test_chunks, offset);
+
+    auto [test_ix, test_off, test_lookups] =
+        find_file_position(test_inode, test_chunks, offset, &cache);
+
+    auto ref_offset = std::accumulate(test_chunks.begin(),
+                                      test_chunks.begin() + ref_ix, ref_off);
+
+    EXPECT_EQ(offset, ref_offset);
+
+    EXPECT_EQ(ref_ix + 1, ref_lookups);
+    EXPECT_LE(test_lookups, 5);
+
+    EXPECT_EQ(ref_ix, test_ix);
+    EXPECT_EQ(ref_off, test_off);
+
+    total_ref_lookups += ref_lookups;
+    total_test_lookups += test_lookups;
+  }
+
+  EXPECT_GT(total_test_lookups, 0);
+  EXPECT_LT(total_test_lookups, total_ref_lookups);
+
+  for (cache_type::file_offset_type offset = total_size; offset-- > 0;) {
+    auto [ref_ix, ref_off, ref_lookups] =
+        find_file_position(test_inode, test_chunks, offset);
+
+    auto [test_ix, test_off, test_lookups] =
+        find_file_position(test_inode, test_chunks, offset, &cache);
+
+    auto ref_offset = std::accumulate(test_chunks.begin(),
+                                      test_chunks.begin() + ref_ix, ref_off);
+
+    EXPECT_EQ(offset, ref_offset);
+
+    EXPECT_EQ(ref_ix + 1, ref_lookups);
+    EXPECT_LE(test_lookups, 5);
+
+    EXPECT_EQ(ref_ix, test_ix);
+    EXPECT_EQ(ref_off, test_off);
+
+    total_ref_lookups += ref_lookups;
+    total_test_lookups += test_lookups;
+  }
+}
+
+TEST(offset_cache, prefill) {
+  cache_type prefilled_cache(4);
+
+  auto [prefill_ix, prefill_off, prefill_lookups] = find_file_position(
+      test_inode, test_chunks, total_size - 1, &prefilled_cache);
+
+  EXPECT_EQ(test_chunks.size(), prefill_lookups);
+  EXPECT_EQ(test_chunks.size() - 1, prefill_ix);
+  EXPECT_EQ(test_chunks.back() - 1, prefill_off);
+}