Skip to content

Commit

Permalink
fix(mkdwarfs): make --max-similarity-size work again + tests
Browse files Browse the repository at this point in the history
  • Loading branch information
mhx committed Jan 9, 2024
1 parent 08a7887 commit 26b15cb
Show file tree
Hide file tree
Showing 7 changed files with 245 additions and 37 deletions.
6 changes: 4 additions & 2 deletions include/dwarfs/inode.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include <iosfwd>
#include <memory>
#include <optional>
#include <vector>

#include <folly/small_vector.h>
Expand Down Expand Up @@ -54,8 +55,9 @@ class inode : public object {
virtual void set_num(uint32_t num) = 0;
virtual uint32_t num() const = 0;
virtual bool has_category(fragment_category cat) const = 0;
virtual uint32_t similarity_hash(fragment_category cat) const = 0;
virtual nilsimsa::hash_type const&
virtual std::optional<uint32_t>
similarity_hash(fragment_category cat) const = 0;
virtual nilsimsa::hash_type const*
nilsimsa_similarity_hash(fragment_category cat) const = 0;
virtual size_t size() const = 0;
virtual file const* any() const = 0;
Expand Down
3 changes: 2 additions & 1 deletion include/dwarfs/inode_ordering.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,12 @@ class logger;
class progress;
class worker_group;

struct inode_options;
struct similarity_ordering_options;

class inode_ordering {
public:
inode_ordering(logger& lgr, progress& prog);
inode_ordering(logger& lgr, progress& prog, inode_options const& opts);

void by_inode_number(sortable_inode_span& sp) const {
impl_->by_inode_number(sp);
Expand Down
7 changes: 1 addition & 6 deletions include/dwarfs/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,7 @@ struct file_order_options {
};

// Options that control how inodes are scanned for similarity and how their
// fragments are ordered.
struct inode_options {
  // TODO: - clean this all up and name properly
  //       - the file_order thing should really be "fragment_order"
  //       - it should all belong into inode_options, where scanner
  //         can still access it
  //       - python scripts need to die

  // Optional upper limit for similarity scanning. When set, data whose size
  // exceeds this value is skipped by the similarity scan, so the affected
  // inode may end up without a similarity hash.
  std::optional<size_t> max_similarity_scan_size;

  // Categorizer manager (shared ownership).
  std::shared_ptr<categorizer_manager> categorizer_mgr;

  // Ordering options, looked up per fragment category; defaults to a
  // default-constructed file_order_options.
  categorized_option<file_order_options> fragment_order{file_order_options()};
};
Expand Down
3 changes: 2 additions & 1 deletion src/dwarfs/inode_element_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ inode_element_view::inode_element_view(
, cat_{cat} {
hash_cache_.resize(inodes_.size());
for (auto i : index) {
hash_cache_[i] = &inodes_[i]->nilsimsa_similarity_hash(cat);
hash_cache_[i] = inodes_[i]->nilsimsa_similarity_hash(cat);
}
}

Expand Down Expand Up @@ -84,6 +84,7 @@ std::string inode_element_view::description(size_t i) const {
}

// Returns the cached nilsimsa hash for element `i`. The cache entry must be
// non-null, i.e. the inode at `i` must have a hash for this view's category.
nilsimsa::hash_type const& inode_element_view::get_bits(size_t i) const {
  auto const* bits = hash_cache_[i];
  assert(bits != nullptr);
  return *bits;
}

Expand Down
37 changes: 27 additions & 10 deletions src/dwarfs/inode_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,15 @@ class inode_ : public inode {
fragments_, [cat](auto const& f) { return f.category() == cat; });
}

uint32_t similarity_hash(fragment_category cat) const override {
return find_similarity<uint32_t>(cat);
std::optional<uint32_t>
similarity_hash(fragment_category cat) const override {
if (auto sim = find_similarity<uint32_t>(cat)) {
return *sim;
}
return std::nullopt;
}

// Returns a pointer to the nilsimsa hash stored for category `cat`, or
// nullptr when find_similarity() reports that none is available. The pointer
// refers to data held by this inode.
nilsimsa::hash_type const*
nilsimsa_similarity_hash(fragment_category cat) const override {
  return find_similarity<nilsimsa::hash_type>(cat);
}
Expand Down Expand Up @@ -290,24 +294,26 @@ class inode_ : public inode {
}

// Looks up the similarity value of type T stored for fragment category `cat`.
//
// Returns a pointer into this inode's similarity storage, or nullptr if no
// similarity data is stored at all or none is stored for `cat`.
//
// Raises via DWARFS_THROW(runtime_error, ...) if the inode has no fragments,
// or if it has exactly one fragment whose category differs from `cat`.
template <typename T>
T const* find_similarity(fragment_category cat) const {
  if (fragments_.empty()) [[unlikely]] {
    DWARFS_THROW(runtime_error, fmt::format("inode has no fragments ({})",
                                            folly::demangle(typeid(T))));
  }

  // Nothing was stored for this inode (presumably because similarity
  // scanning was skipped, e.g. due to max_similarity_scan_size).
  if (std::holds_alternative<std::monostate>(similarity_)) {
    return nullptr;
  }

  // Single fragment: the value is stored directly in the variant.
  if (fragments_.size() == 1) {
    if (fragments_.get_single_category() != cat) [[unlikely]] {
      DWARFS_THROW(runtime_error, fmt::format("category mismatch ({})",
                                              folly::demangle(typeid(T))));
    }
    return &std::get<T>(similarity_);
  }

  // Multiple fragments: values are stored per category in a map.
  auto& m = std::get<similarity_map_type>(similarity_);
  if (auto it = m.find(cat); it != m.end()) {
    return &std::get<T>(it->second);
  }

  // A missing category is not an error: fragments skipped by the similarity
  // scan never get an entry in the map.
  return nullptr;
}

template <typename T>
Expand Down Expand Up @@ -344,6 +350,11 @@ class inode_ : public inode {
std::unordered_map<fragment_category, nilsimsa> nc;

for (auto const& f : fragments_.span()) {
if (auto max = opts.max_similarity_scan_size;
max && static_cast<size_t>(f.size()) > *max) {
continue;
}

switch (opts.fragment_order.get(f.category()).mode) {
case file_order_mode::NONE:
case file_order_mode::PATH:
Expand Down Expand Up @@ -396,6 +407,12 @@ class inode_ : public inode {
size_t chunk_size) {
assert(fragments_.size() <= 1);

if (mm) {
if (auto max = opts.max_similarity_scan_size; max && mm->size() > *max) {
return;
}
}

auto order_mode =
fragments_.empty()
? opts.fragment_order.get().mode
Expand Down Expand Up @@ -477,7 +494,7 @@ class inode_manager_ final : public inode_manager::impl {
const override {
auto span = sortable_span();
span.all();
inode_ordering(LOG_GET_LOGGER, prog_).by_inode_number(span);
inode_ordering(LOG_GET_LOGGER, prog_, opts_).by_inode_number(span);
for (auto const& i : span) {
fn(i);
}
Expand Down Expand Up @@ -613,7 +630,7 @@ auto inode_manager_<LoggerPolicy>::ordered_span(fragment_category cat,
auto span = sortable_span();
span.select([cat](auto const& v) { return v->has_category(cat); });

inode_ordering order(LOG_GET_LOGGER, prog_);
inode_ordering order(LOG_GET_LOGGER, prog_, opts_);

switch (opts.mode) {
case file_order_mode::NONE:
Expand Down
98 changes: 81 additions & 17 deletions src/dwarfs/inode_ordering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "dwarfs/inode_element_view.h"
#include "dwarfs/inode_ordering.h"
#include "dwarfs/logger.h"
#include "dwarfs/options.h"
#include "dwarfs/promise_receiver.h"
#include "dwarfs/similarity_ordering.h"
#include "dwarfs/worker_group.h"
Expand All @@ -33,12 +34,19 @@ namespace dwarfs {

namespace {

// Strict ordering used for inodes: larger inodes sort first; inodes of equal
// size are ordered by less_revpath() on their representative file.
bool inode_less_by_size(inode const* a, inode const* b) {
  auto const size_a = a->size();
  auto const size_b = b->size();

  if (size_a != size_b) {
    return size_a > size_b;
  }

  return a->any()->less_revpath(*b->any());
}

template <typename LoggerPolicy>
class inode_ordering_ final : public inode_ordering::impl {
public:
inode_ordering_(logger& lgr, progress& prog)
inode_ordering_(logger& lgr, progress& prog, inode_options const& opts)
: LOG_PROXY_INIT(lgr)
, prog_{prog} {}
, prog_{prog}
, opts_{opts} {}

void by_inode_number(sortable_inode_span& sp) const override;
void by_path(sortable_inode_span& sp) const override;
Expand All @@ -50,8 +58,14 @@ class inode_ordering_ final : public inode_ordering::impl {
sortable_inode_span& sp, fragment_category cat) const override;

private:
void
by_nilsimsa_impl(worker_group& wg, similarity_ordering_options const& opts,
std::span<std::shared_ptr<inode> const> inodes,
std::vector<uint32_t>& index, fragment_category cat) const;

LOG_PROXY_DECL(LoggerPolicy);
progress& prog_;
inode_options const& opts_;
};

template <typename LoggerPolicy>
Expand Down Expand Up @@ -93,20 +107,42 @@ void inode_ordering_<LoggerPolicy>::by_reverse_path(
template <typename LoggerPolicy>
void inode_ordering_<LoggerPolicy>::by_similarity(sortable_inode_span& sp,
fragment_category cat) const {
std::vector<uint32_t> hash_cache;
std::vector<std::optional<uint32_t>> hash_cache;

auto raw = sp.raw();
auto& index = sp.index();
bool any_missing = false;

hash_cache.resize(raw.size());

for (auto i : index) {
hash_cache[i] = raw[i]->similarity_hash(cat);
auto& cache = hash_cache[i];
cache = raw[i]->similarity_hash(cat);
if (!cache.has_value()) {
any_missing = true;
}
}

std::sort(index.begin(), index.end(), [&](auto a, auto b) {
auto const ca = hash_cache[a];
auto const cb = hash_cache[b];
auto size_pred = [&](auto a, auto b) {
return inode_less_by_size(raw[a].get(), raw[b].get());
};

auto start = index.begin();

if (any_missing) {
start = std::stable_partition(index.begin(), index.end(), [&](auto i) {
return !hash_cache[i].has_value();
});

std::sort(index.begin(), start, size_pred);
}

std::sort(start, index.end(), [&](auto a, auto b) {
assert(hash_cache[a].has_value());
assert(hash_cache[b].has_value());

auto const ca = *hash_cache[a];
auto const cb = *hash_cache[b];

if (ca < cb) {
return true;
Expand All @@ -116,31 +152,59 @@ void inode_ordering_<LoggerPolicy>::by_similarity(sortable_inode_span& sp,
return false;
}

auto ia = raw[a].get();
auto ib = raw[b].get();

return ia->size() > ib->size() ||
(ia->size() == ib->size() && ia->any()->less_revpath(*ib->any()));
return size_pred(a, b);
});
}

// Orders the inodes selected in `sp` by nilsimsa similarity for category
// `cat`.
//
// When max_similarity_scan_size is set, some inodes may have no nilsimsa
// hash. Those are moved to the front and sorted by inode_less_by_size();
// only the remaining inodes are passed to the nilsimsa ordering.
template <typename LoggerPolicy>
void inode_ordering_<LoggerPolicy>::by_nilsimsa(
    worker_group& wg, similarity_ordering_options const& opts,
    sortable_inode_span& sp, fragment_category cat) const {
  auto raw = sp.raw();
  auto& index = sp.index();

  if (opts_.max_similarity_scan_size) {
    // [begin, mid) = inodes without a hash, [mid, end) = inodes with a hash.
    auto mid = std::stable_partition(index.begin(), index.end(), [&](auto i) {
      return !raw[i]->nilsimsa_similarity_hash(cat);
    });

    if (mid != index.begin()) {
      std::sort(index.begin(), mid, [&](auto a, auto b) {
        return inode_less_by_size(raw[a].get(), raw[b].get());
      });

      if (mid != index.end()) {
        // by_nilsimsa_impl() works on a whole vector, so order a copy of the
        // hashed tail and write the result back in place.
        std::vector<uint32_t> small_index(mid, index.end());
        by_nilsimsa_impl(wg, opts, raw, small_index, cat);
        std::copy(small_index.begin(), small_index.end(), mid);
      }

      return;
    }
  }

  // No limit configured, or every inode has a hash: order the full index.
  by_nilsimsa_impl(wg, opts, raw, index, cat);
}

// Runs the nilsimsa similarity ordering over the inodes referenced by
// `index` and replaces `index` with the resulting order.
//
// `index` is moved into the ordering call and then overwritten with the
// result; the call blocks on the future until that result is available.
// NOTE(review): every inode referenced by `index` is expected to have a
// nilsimsa hash for `cat` (inode_element_view::get_bits asserts this).
template <typename LoggerPolicy>
void inode_ordering_<LoggerPolicy>::by_nilsimsa_impl(
    worker_group& wg, similarity_ordering_options const& opts,
    std::span<std::shared_ptr<inode> const> inodes,
    std::vector<uint32_t>& index, fragment_category cat) const {
  auto ev = inode_element_view(inodes, index, cat);
  std::promise<std::vector<uint32_t>> promise;
  auto future = promise.get_future();
  auto sim_order = similarity_ordering(LOG_GET_LOGGER, prog_, wg, opts);
  sim_order.order_nilsimsa(ev, make_receiver(std::move(promise)),
                           std::move(index));
  future.get().swap(index);
}

} // namespace

// Creates the logger-policy-specific implementation and forwards the logger,
// progress object and inode options to it. The implementation keeps `opts`
// by reference, so `opts` must outlive this object.
inode_ordering::inode_ordering(logger& lgr, progress& prog,
                               inode_options const& opts)
    : impl_(make_unique_logging_object<impl, inode_ordering_, logger_policies>(
          lgr, prog, opts)) {}

} // namespace dwarfs
Loading

0 comments on commit 26b15cb

Please sign in to comment.