diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e64e305..5408dbb9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,6 +21,7 @@ project(dwarfs) cmake_minimum_required(VERSION 3.13.4) option(WITH_TESTS "build with tests" OFF) +option(WITH_PYTHON "build with Python scripting support" OFF) set(default_build_type "Release") @@ -38,10 +39,25 @@ set(DWARFS_VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" ) -find_package(PkgConfig REQUIRED) +list(APPEND DWARFS_BOOST_MODULES date_time filesystem program_options system) -find_package(Boost 1.67 REQUIRED COMPONENTS date_time filesystem - program_options system) +if(WITH_PYTHON) + # TODO: would be nicer to be able to support a range of python versions + find_package(Python3 ${WITH_PYTHON_VERSION} EXACT REQUIRED COMPONENTS Development) + list(APPEND DWARFS_BOOST_MODULES + "python${Python3_VERSION_MAJOR}${Python3_VERSION_MINOR}") + message(STATUS "Enabling support for Python ${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}") +endif() + +find_package(Boost 1.67 REQUIRED COMPONENTS ${DWARFS_BOOST_MODULES}) + +if(WITH_PYTHON) + set(BOOST_PYTHON_LIBS ${Boost_LIBRARIES}) + list(FILTER Boost_LIBRARIES EXCLUDE REGEX python) + list(FILTER BOOST_PYTHON_LIBS INCLUDE REGEX python) +endif() + +find_package(PkgConfig REQUIRED) pkg_check_modules(FUSE3 REQUIRED IMPORTED_TARGET fuse3>=3.4.1) pkg_check_modules(LIBLZ4 IMPORTED_TARGET liblz4>=1.8.3) @@ -117,6 +133,9 @@ list( src/dwarfs/util.cpp src/dwarfs/worker_group.cpp) +if(WITH_PYTHON) + list(APPEND LIBDWARFS_SRC src/dwarfs/python_script.cpp) +endif() add_library(dwarfs ${LIBDWARFS_SRC}) @@ -251,7 +270,7 @@ target_include_directories(metadata_thrift PRIVATE ${INCLUDE_DIRS}) add_dependencies(metadata_thrift thrift_light) foreach(tgt dwarfs ${BINARY_TARGETS}) - target_include_directories(${tgt} SYSTEM PRIVATE ${Boost_INCLUDE_DIRS} + target_include_directories(${tgt} SYSTEM PRIVATE ${Boost_INCLUDE_DIRS} ${Python3_INCLUDE_DIRS} ${INCLUDE_DIRS}) target_include_directories(${tgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include) @@ -261,7 +280,8 @@ foreach(tgt dwarfs ${BINARY_TARGETS}) PRIVATE DWARFS_VERSION=\"${DWARFS_VERSION}\" $<$:DWARFS_HAVE_LIBLZ4> $<$:DWARFS_HAVE_LIBLZMA> - $<$:DWARFS_HAVE_LIBZSTD>) + $<$:DWARFS_HAVE_LIBZSTD> + $<$:DWARFS_HAVE_PYTHON>) target_compile_options(${tgt} PRIVATE -Wall -Wextra -pedantic) @@ -287,6 +307,9 @@ foreach(tgt ${BINARY_TARGETS}) PkgConfig::LIBLZMA PkgConfig::LIBZSTD) + if(WITH_PYTHON) + target_link_libraries(${tgt} ${BOOST_PYTHON_LIBS} ${Python3_LIBRARIES}) + endif() endforeach() target_link_libraries(dwarfs-bin PkgConfig::FUSE3) diff --git a/include/dwarfs/python_script.h b/include/dwarfs/python_script.h new file mode 100644 index 00000000..e06db6f1 --- /dev/null +++ b/include/dwarfs/python_script.h @@ -0,0 +1,53 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#pragma once + +#include + +#include "dwarfs/inode.h" +#include "dwarfs/script.h" + +namespace dwarfs { + +class logger; + +class python_script : public script { + public: + python_script(logger& lgr, const std::string& code, const std::string& ctor); + ~python_script(); + + bool has_configure() const override; + bool has_filter() const override; + bool has_transform() const override; + bool has_order() const override; + + void configure(options_interface const& oi) override; + bool filter(entry_interface const& ei) override; + void transform(entry_interface& ei) override; + void order(inode_vector& iv) override; + + private: + class impl; + std::unique_ptr impl_; +}; + +} // namespace dwarfs diff --git a/scripts/example.py b/scripts/example.py new file mode 100644 index 00000000..ee38bddb --- /dev/null +++ b/scripts/example.py @@ -0,0 +1,30 @@ +class mkdwarfs(object): + def __init__(self): + logger.info("this is python!") + + def configure(self, config): + config.enable_similarity() + config.set_order(file_order_mode.script, set_mode.override) + config.set_remove_empty_dirs(True, set_mode.default) + + def filter(self, entry): + logger.debug(f"filter: {entry.path()} [{entry.type()}]") + if entry.type() == 'directory' and entry.name() == 'dev': + return False + return True + + def transform(self, entry): + logger.debug(f"transform {entry.path()}") + entry.set_permissions(entry.permissions() & 0o7555) + return entry + + def order(self, inodes): + logger.info("order") + for i in inodes: + logger.debug(f"inode: {i.similarity_hash()} {i.size()} {i.refcount()}") + for p in i.paths(): + logger.debug(f" file: {p}") + return reversed(inodes) + + def _something_private(self): + pass diff --git a/src/dwarfs/python_script.cpp b/src/dwarfs/python_script.cpp new file mode 100644 index 00000000..57ea1887 --- /dev/null +++ b/src/dwarfs/python_script.cpp @@ -0,0 +1,453 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#include +#include +#include + +#include +#include + +#include + +#include + +#include "dwarfs/entry.h" +#include "dwarfs/inode.h" +#include "dwarfs/logger.h" +#include "dwarfs/options_interface.h" +#include "dwarfs/python_script.h" + +namespace dwarfs { + +namespace py = boost::python; + +namespace { + +std::unordered_set supported_methods{"configure", "filter", + "transform", "order"}; + +void init_python() { + static bool initialized = false; + if (!initialized) { + Py_Initialize(); + initialized = true; + } +} + +bool callable(py::object object) { return 1 == PyCallable_Check(object.ptr()); } + +bool hasattr(py::object obj, const char* name) { + return PyObject_HasAttrString(obj.ptr(), name); +} + +bool has_callable(py::object obj, char const* method) { + return hasattr(obj, method) && callable(obj.attr(method)); +} + +class py_logger { + public: + py_logger(logger& lgr) + : log_(lgr) {} + + void error(std::string msg) { log_.error() << "[script] " << msg; } + void warn(std::string msg) { log_.warn() << "[script] " << msg; } + void info(std::string msg) { log_.info() << "[script] " << msg; } + void debug(std::string msg) { log_.debug() << "[script] " << msg; } + void trace(std::string msg) { log_.trace() << "[script] " << msg; } + + private: + using log_proxy_t = log_proxy; + log_proxy_t log_; +}; + +template +class basic_entry_wrapper { + public: + basic_entry_wrapper(T& entry) + : entry_(&entry) {} + + size_t size() const { return entry_->size(); } + std::string path() const { return entry_->path(); } + std::string name() const { return entry_->name(); } + std::string type() const { return entry_->type_string(); } + + uint16_t permissions() const { return entry_->get_permissions(); } + void set_permissions(uint16_t perm) { entry_->set_permissions(perm); } + uint16_t uid() const { return entry_->get_uid(); } + void set_uid(uint16_t uid) { entry_->set_uid(uid); } + uint16_t gid() const { return entry_->get_gid(); } + void set_gid(uint16_t gid) { entry_->set_gid(gid); } + uint64_t atime() const { return entry_->get_atime(); } + void set_atime(uint64_t atime) { entry_->set_atime(atime); } + uint64_t mtime() const { return entry_->get_mtime(); } + void set_mtime(uint64_t mtime) { entry_->set_mtime(mtime); } + uint64_t ctime() const { return entry_->get_ctime(); } + void set_ctime(uint64_t ctime) { entry_->set_ctime(ctime); } + + private: + T* entry_; +}; + +using entry_wrapper = basic_entry_wrapper; +using mutable_entry_wrapper = basic_entry_wrapper; + +class inode_wrapper { + public: + inode_wrapper(inode const* ino) + : ino_(ino) {} + + size_t similarity_hash() const { return ino_->similarity_hash(); } + size_t refcount() const { return ino_->files().size(); } + py::list paths() const { + py::list ps; + auto& fs = ino_->files(); + for (auto& f : fs) { + ps.append(f->path()); + } + return ps; + } + size_t size() const { return ino_->any()->size(); } + inode const* get() const { return ino_; } + + private: + inode const* ino_; +}; + +} // namespace + +class python_script::impl { + public: + impl(logger& lgr, const std::string& code, const std::string& ctor); + ~impl(); + + void configure(options_interface const& oi); + bool filter(entry_interface const& ei); + void transform(entry_interface& ei); + void order(inode_vector& iv); + + bool has_configure() const { return has_configure_; } + bool has_filter() const { return has_filter_; } + bool has_transform() const { return has_transform_; } + bool has_order() const { return has_order_; } + + private: + void check_instance_methods(py::object obj) const; + void log_py_error() const; + + using log_proxy_t = log_proxy; + using clock = std::chrono::steady_clock; + + class timer { + public: + timer(clock::duration& d) + : start_(clock::now()) + , d_(d) {} + + ~timer() { d_ += clock::now() - start_; } + + private: + clock::time_point start_; + clock::duration& d_; + }; + + log_proxy_t log_; + py_logger pylog_; + bool has_configure_{false}; + bool has_filter_{false}; + bool has_transform_{false}; + bool has_order_{false}; + py::object instance_; + py::object main_module_; + py::object main_namespace_; + clock::duration configure_time_{}; + clock::duration filter_time_{}; + clock::duration transform_time_{}; + clock::duration order_time_{}; +}; + +python_script::impl::impl(logger& lgr, const std::string& code, + const std::string& ctor) + : log_(lgr) + , pylog_(lgr) { + try { + init_python(); + + main_module_ = py::import("__main__"); + main_namespace_ = main_module_.attr("__dict__"); + + py::scope scope(main_module_); + + main_namespace_["dwarfs_logger"] = + py::class_("dwarfs_logger", py::no_init) + .def("error", &py_logger::error) + .def("warn", &py_logger::warn) + .def("info", &py_logger::info) + .def("debug", &py_logger::debug) + .def("trace", &py_logger::trace); + + main_namespace_["file_order_mode"] = + py::enum_("file_order_mode") + .value("none", file_order_mode::NONE) + .value("path", file_order_mode::PATH) + .value("script", file_order_mode::SCRIPT) + .value("similarity", file_order_mode::SIMILARITY); + + main_namespace_["set_mode"] = + py::enum_("set_mode") + .value("default", options_interface::DEFAULT) + .value("override", options_interface::OVERRIDE); + + main_namespace_["dwarfs_options"] = + py::class_("dwarfs_options", + py::no_init) + .def("enable_similarity", &options_interface::enable_similarity) + .def("set_order", &options_interface::set_order) + .def("set_remove_empty_dirs", + &options_interface::set_remove_empty_dirs); + + main_namespace_["inode_wrapper"] = + py::class_>( + "inode_wrapper", py::no_init) + .def("similarity_hash", &inode_wrapper::similarity_hash) + .def("refcount", &inode_wrapper::refcount) + .def("paths", &inode_wrapper::paths) + .def("size", &inode_wrapper::size); + + main_namespace_["entry_wrapper"] = + py::class_>( + "entry_wrapper", py::no_init) + .def("name", &entry_wrapper::name) + .def("type", &entry_wrapper::type) + .def("path", &entry_wrapper::path) + .def("size", &entry_wrapper::size) + .def("permissions", &entry_wrapper::permissions) + .def("uid", &entry_wrapper::uid) + .def("gid", &entry_wrapper::gid) + .def("atime", &entry_wrapper::atime) + .def("mtime", &entry_wrapper::mtime) + .def("ctime", &entry_wrapper::ctime); + + main_namespace_["mutable_entry_wrapper"] = + py::class_>( + "mutable_entry_wrapper", py::no_init) + .def("name", &mutable_entry_wrapper::name) + .def("type", &mutable_entry_wrapper::type) + .def("path", &mutable_entry_wrapper::path) + .def("size", &mutable_entry_wrapper::size) + .def("permissions", &mutable_entry_wrapper::permissions) + .def("uid", &mutable_entry_wrapper::uid) + .def("gid", &mutable_entry_wrapper::gid) + .def("atime", &mutable_entry_wrapper::atime) + .def("mtime", &mutable_entry_wrapper::mtime) + .def("ctime", &mutable_entry_wrapper::ctime) + .def("set_permissions", &mutable_entry_wrapper::set_permissions) + .def("set_uid", &mutable_entry_wrapper::set_uid) + .def("set_gid", &mutable_entry_wrapper::set_gid) + .def("set_atime", &mutable_entry_wrapper::set_atime) + .def("set_mtime", &mutable_entry_wrapper::set_mtime) + .def("set_ctime", &mutable_entry_wrapper::set_ctime); + + main_namespace_["logger"] = py::ptr(&pylog_); + + py::exec(code.c_str(), main_namespace_); + + instance_ = py::eval(ctor.c_str(), main_namespace_); + + check_instance_methods(instance_); + + has_configure_ = has_callable(instance_, "configure"); + has_filter_ = has_callable(instance_, "filter"); + has_transform_ = has_callable(instance_, "transform"); + has_order_ = has_callable(instance_, "order"); + } catch (py::error_already_set) { + log_py_error(); + throw std::runtime_error("error initializing script"); + } +} + +void python_script::impl::log_py_error() const { + PyObject *exc, *val, *tb; + PyErr_Fetch(&exc, &val, &tb); + PyErr_NormalizeException(&exc, &val, &tb); + + py::handle<> hexc(exc), hval(py::allow_null(val)), htb(py::allow_null(tb)); + + if (!hval) { + log_.error() << std::string(py::extract(py::str(hexc))); + } else { + py::object traceback(py::import("traceback")); + py::object format_exception(traceback.attr("format_exception")); + py::list formatted_list(format_exception(hexc, hval, htb)); + for (int count = 0; count < len(formatted_list); ++count) { + log_.error() << std::string( + py::extract(formatted_list[count].slice(0, -1))); + } + } +} + +void python_script::impl::check_instance_methods(py::object obj) const { + for (py::stl_input_iterator + it(py::object(py::handle<>(PyObject_Dir(obj.ptr())))), + end; + it != end; ++it) { + if (!it->startswith("_") && callable(obj.attr(*it))) { + std::string name{py::extract(*it)}; + if (supported_methods.find(name) == supported_methods.end()) { + log_.warn() << "unknown method '" << name << "' found in Python class"; + } + } + } +} + +python_script::impl::~impl() { + std::vector timings; + auto add_timing = [&](bool flag, std::string_view name, auto const& d) { + using floatsec = std::chrono::duration; + if (flag) { + timings.push_back( + fmt::format("{0} {1:.2f}s", name, + std::chrono::duration_cast(d).count())); + } + }; + + add_timing(has_configure_, "configure", configure_time_); + add_timing(has_filter_, "filter", filter_time_); + add_timing(has_transform_, "transform", transform_time_); + add_timing(has_order_, "order", order_time_); + + log_.info() << "script time: " << boost::join(timings, ", "); + + // nothing else, really, as boost::python docs forbid using Py_Finalize +} + +void python_script::impl::configure(options_interface const& oi) { + timer tmr(configure_time_); + try { + instance_.attr("configure")(py::ptr(&oi)); + } catch (py::error_already_set) { + log_py_error(); + throw std::runtime_error("error in configure"); + } +} + +bool python_script::impl::filter(entry_interface const& ei) { + timer tmr(filter_time_); + try { + return py::extract( + instance_.attr("filter")(std::make_shared(ei))); + } catch (py::error_already_set) { + log_py_error(); + throw std::runtime_error("error filtering entry"); + } +} + +void python_script::impl::transform(entry_interface& ei) { + timer tmr(transform_time_); + try { + instance_.attr("transform")(std::make_shared(ei)); + } catch (py::error_already_set) { + log_py_error(); + throw std::runtime_error("error transforming entry"); + } +} + +void python_script::impl::order(inode_vector& iv) { + timer tmr(order_time_); + try { + py::list files; + + { + auto td = log_.timed_debug(); + + for (size_t i = 0; i < iv.size(); ++i) { + files.append(std::make_shared(iv[i].get())); + } + + td << "prepared files for ordering"; + } + + py::object ordered; + + { + auto td = log_.timed_debug(); + ordered = instance_.attr("order")(files); + td << "ordered files in script code"; + } + + google::dense_hash_map priority(iv.size()); + priority.set_empty_key(nullptr); + + auto td = log_.timed_debug(); + size_t index = 0; + + for (py::stl_input_iterator it(ordered), end; it != end; ++it) { + auto wrapper{py::extract>(*it)()}; + priority[wrapper->get()] = index++; + } + + if (index != iv.size()) { + throw std::runtime_error("order() returned different number of files"); + } + + std::sort(iv.begin(), iv.end(), + [&](inode_ptr const& a, inode_ptr const& b) { + auto ia = priority.find(a.get()); + auto ib = priority.find(b.get()); + if (ia == priority.end() || ib == priority.end()) { + throw std::runtime_error( + "invalid inode pointer while ordering files"); + } + return ia->second < ib->second; + }); + + td << "applied new inode order"; + } catch (py::error_already_set) { + log_py_error(); + throw std::runtime_error("error ordering inodes"); + } +} + +python_script::python_script(logger& lgr, const std::string& code, + const std::string& ctor) + : impl_(std::make_unique(lgr, code, ctor)) {} + +python_script::~python_script() = default; + +bool python_script::has_configure() const { return impl_->has_configure(); } +bool python_script::has_filter() const { return impl_->has_filter(); } +bool python_script::has_transform() const { return impl_->has_transform(); } +bool python_script::has_order() const { return impl_->has_order(); } + +void python_script::configure(options_interface const& oi) { + impl_->configure(oi); +} + +bool python_script::filter(entry_interface const& ei) { + return impl_->filter(ei); +} + +void python_script::transform(entry_interface& ei) { impl_->transform(ei); } + +void python_script::order(inode_vector& iv) { impl_->order(iv); } + +} // namespace dwarfs diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index 3ef85df5..b2dda100 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -341,9 +341,15 @@ scanner_::scan_tree(const std::string& path, progress& prog) { try { auto pe = entry_->create(*os_, name, parent); - if (script_ && !script_->filter(*pe)) { - log_.debug() << "skipping " << name; - continue; + if (script_) { + if (script_->has_filter() && !script_->filter(*pe)) { + log_.debug() << "skipping " << name; + continue; + } + + if (script_->has_transform()) { + script_->transform(*pe); + } } if (pe) { @@ -418,10 +424,16 @@ void scanner_::order_files(inode_manager& im) { break; } - case file_order_mode::SCRIPT: + case file_order_mode::SCRIPT: { + if (!script_->has_order()) { + throw std::runtime_error("script cannot order inodes"); + } log_.info() << "ordering " << im.count() << " inodes using script..."; + auto ti = log_.timed_info(); im.order_inodes(script_); + ti << im.count() << " inodes ordered"; break; + } case file_order_mode::SIMILARITY: { log_.info() << "ordering " << im.count() << " inodes by similarity..."; diff --git a/src/mkdwarfs.cpp b/src/mkdwarfs.cpp index 0b9f9243..778f04f8 100644 --- a/src/mkdwarfs.cpp +++ b/src/mkdwarfs.cpp @@ -45,6 +45,7 @@ #include #include +#include #include #include @@ -62,12 +63,16 @@ #include "dwarfs/logger.h" #include "dwarfs/mmap.h" #include "dwarfs/options.h" +#include "dwarfs/options_interface.h" #include "dwarfs/os_access_posix.h" #include "dwarfs/progress.h" #include "dwarfs/scanner.h" #include "dwarfs/script.h" #include "dwarfs/util.h" +#ifdef DWARFS_HAVE_PYTHON +#include "dwarfs/python_script.h" +#endif namespace po = boost::program_options; @@ -87,7 +92,11 @@ namespace { const std::map order_choices{ {"none", file_order_mode::NONE}, {"path", file_order_mode::PATH}, +#ifdef DWARFS_HAVE_PYTHON + {"script", file_order_mode::SCRIPT}, +#endif {"similarity", file_order_mode::SIMILARITY}}; + } // namespace namespace dwarfs { @@ -106,6 +115,57 @@ void validate(boost::any& v, const std::vector& values, v = boost::any(it->second); } +class script_options : public options_interface { + public: + script_options(logger& lgr, po::variables_map& vm, scanner_options& opts, + bool& force_similarity) + : log_(lgr) + , vm_(vm) + , opts_(opts) + , force_similarity_(force_similarity) {} + + void set_order(file_order_mode order_mode, set_mode mode = DEFAULT) override { + set(opts_.file_order, order_mode, "order", mode); + } + + void + set_remove_empty_dirs(bool remove_empty, set_mode mode = DEFAULT) override { + set(opts_.remove_empty_dirs, remove_empty, "remove-empty-dirs", mode); + } + + void enable_similarity() override { + log_.debug() << "script is forcing similarity hash computation"; + force_similarity_ = true; + } + + private: + template + void set(T& target, T const& value, std::string const& name, set_mode mode) { + switch (mode) { + case options_interface::DEFAULT: + if (!vm_.count(name) || vm_[name].defaulted()) { + log_.info() << "script is setting " << name << "=" << value; + target = value; + } + break; + + case options_interface::OVERRIDE: + if (vm_.count(name) && !vm_[name].defaulted()) { + log_.warn() << "script is overriding " << name << "=" << value; + } else { + log_.info() << "script is setting " << name << "=" << value; + } + target = value; + break; + } + } + + log_proxy log_; + po::variables_map& vm_; + scanner_options& opts_; + bool& force_similarity_; +}; + } // namespace dwarfs namespace { @@ -280,6 +340,11 @@ int mkdwarfs(int argc, char** argv) { po::value(&options.file_order) ->default_value(file_order_mode::SIMILARITY, "similarity"), order_desc.c_str()) +#ifdef DWARFS_HAVE_PYTHON + ("script", + po::value(&script_arg), + "Python script for customization") +#endif ("blockhash-window-sizes", po::value(&window_sizes), "window sizes for block hashing") @@ -425,6 +490,34 @@ int mkdwarfs(int argc, char** argv) { std::shared_ptr