diff --git a/CMakeLists.txt b/CMakeLists.txt index fcef43f9e0..3765ed4bd9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -112,7 +112,6 @@ option(ENABLE_LCMS "Compile with LCMS support" ON) option(WITH_SVG2 "Compile with support for new SVG2 features" ON) option(WITH_LPETOOL "Compile with LPE Tool" OFF) option(LPE_ENABLE_TEST_EFFECTS "Compile with test experimental LPEs enabled" OFF) -option(WITH_OPENMP "Compile with OpenMP support" ON) option(WITH_PROFILING "Turn on profiling" OFF) # Set to true if compiler/linker should enable profiling option(BUILD_SHARED_LIBS "Compile libraries as shared and not static" ON) @@ -287,7 +286,6 @@ message("WITH_LIBCDR: ${WITH_LIBCDR}") message("WITH_LIBVISIO: ${WITH_LIBVISIO}") message("WITH_LIBWPG: ${WITH_LIBWPG}") message("WITH_NLS: ${WITH_NLS}") -message("WITH_OPENMP: ${WITH_OPENMP}") message("WITH_JEMALLOC: ${WITH_JEMALLOC}") message("WITH_ASAN: ${WITH_ASAN}") message("WITH_INTERNAL_2GEOM: ${WITH_INTERNAL_2GEOM}") diff --git a/CMakeScripts/DefineDependsandFlags.cmake b/CMakeScripts/DefineDependsandFlags.cmake index 454b16d59d..bebb49e5e0 100644 --- a/CMakeScripts/DefineDependsandFlags.cmake +++ b/CMakeScripts/DefineDependsandFlags.cmake @@ -391,26 +391,6 @@ list(APPEND INKSCAPE_INCS_SYS ${LIBXML2_INCLUDE_DIR}) list(APPEND INKSCAPE_LIBS ${LIBXML2_LIBRARIES}) add_definitions(${LIBXML2_DEFINITIONS}) -if(WITH_OPENMP) - find_package(OpenMP) - if(OPENMP_FOUND) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - list(APPEND INKSCAPE_CXX_FLAGS ${OpenMP_CXX_FLAGS}) - if(APPLE OR (MINGW AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")) - list(APPEND INKSCAPE_LIBS "-lomp") - endif() - mark_as_advanced(OpenMP_C_FLAGS) - mark_as_advanced(OpenMP_CXX_FLAGS) - # '-fopenmp' is in OpenMP_C_FLAGS, OpenMP_CXX_FLAGS and implies '-lgomp' - # uncomment explicit linking below if still needed: - set(HAVE_OPENMP ON) - #list(APPEND INKSCAPE_LIBS "-lgomp") # FIXME - else() - set(HAVE_OPENMP OFF) - set(WITH_OPENMP OFF) - endif() -endif() - 
find_package(ZLIB REQUIRED) list(APPEND INKSCAPE_INCS_SYS ${ZLIB_INCLUDE_DIRS}) list(APPEND INKSCAPE_LIBS ${ZLIB_LIBRARIES}) diff --git a/config.h.cmake b/config.h.cmake index 982547b162..be0d833ad3 100644 --- a/config.h.cmake +++ b/config.h.cmake @@ -41,9 +41,6 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_MALLOC_H 1 -/* Use OpenMP (via cmake) */ -#cmakedefine HAVE_OPENMP 1 - /* Use libpoppler for direct PDF import */ #cmakedefine HAVE_POPPLER 1 diff --git a/src/display/CMakeLists.txt b/src/display/CMakeLists.txt index 0e1d1c4908..0ad06d6c08 100644 --- a/src/display/CMakeLists.txt +++ b/src/display/CMakeLists.txt @@ -3,6 +3,7 @@ set(display_SRC cairo-utils.cpp curve.cpp + dispatch-pool.cpp drawing-context.cpp drawing-group.cpp drawing-image.cpp @@ -67,6 +68,7 @@ set(display_SRC cairo-templates.h cairo-utils.h curve.h + dispatch-pool.h drawing-context.h drawing-group.h drawing-image.h diff --git a/src/display/cairo-templates.h b/src/display/cairo-templates.h index b781561669..2595327265 100644 --- a/src/display/cairo-templates.h +++ b/src/display/cairo-templates.h @@ -13,17 +13,12 @@ #ifndef SEEN_INKSCAPE_DISPLAY_CAIRO_TEMPLATES_H #define SEEN_INKSCAPE_DISPLAY_CAIRO_TEMPLATES_H -#ifdef HAVE_CONFIG_H -# include "config.h" // only include where actually required! -#endif - #include -#ifdef HAVE_OPENMP -#include +#include "dispatch-pool.h" + // single-threaded operation if the number of pixels is below this threshold -static const int OPENMP_THRESHOLD = 2048; -#endif +static const int POOL_THRESHOLD = 2048; #include #include @@ -69,20 +64,14 @@ void ink_cairo_surface_blend_internal(cairo_surface_t *out, cairo_surface_t *in1 surface_accessor acc_in2(in2); // NOTE - // OpenMP probably doesn't help much here. + // This probably doesn't help much here. // It would be better to render more than 1 tile at a time. 
- #if HAVE_OPENMP - int const num_threads = get_num_filter_threads(); - #endif - - #if HAVE_OPENMP - #pragma omp parallel for if((w * h) > OPENMP_THRESHOLD) num_threads(num_threads) - #endif - for (int i = 0; i < h; ++i) { + auto const pool = get_global_dispatch_pool(); + pool->dispatch_threshold(h, (w * h) > POOL_THRESHOLD, [&](int i, int) { for (int j = 0; j < w; ++j) { acc_out.set(j, i, blend(acc_in1.get(j, i), acc_in2.get(j, i))); } - } + }); } template @@ -92,20 +81,14 @@ void ink_cairo_surface_filter_internal(cairo_surface_t *out, cairo_surface_t *in surface_accessor acc_in(in); // NOTE - // OpenMP probably doesn't help much here. + // This probably doesn't help much here. // It would be better to render more than 1 tile at a time. - #if HAVE_OPENMP - int const num_threads = get_num_filter_threads(); - #endif - - #if HAVE_OPENMP - #pragma omp parallel for if((w * h) > OPENMP_THRESHOLD) num_threads(num_threads) - #endif - for (int i = 0; i < h; ++i) { + auto const pool = get_global_dispatch_pool(); + pool->dispatch_threshold(h, (w * h) > POOL_THRESHOLD, [&](int i, int) { for (int j = 0; j < w; ++j) { acc_out.set(j, i, filter(acc_in.get(j, i))); } - } + }); } template @@ -114,21 +97,17 @@ void ink_cairo_surface_synthesize_internal(cairo_surface_t *out, int x0, int y0, surface_accessor acc_out(out); // NOTE - // OpenMP probably doesn't help much here. + // This probably doesn't help much here. // It would be better to render more than 1 tile at a time. 
- #if HAVE_OPENMP - int const num_threads = get_num_filter_threads(); - #endif - - #if HAVE_OPENMP int const limit = (x1 - x0) * (y1 - y0); - #pragma omp parallel for if(limit > OPENMP_THRESHOLD) num_threads(num_threads) - #endif - for (int i = y0; i < y1; ++i) { + auto const pool = get_global_dispatch_pool(); + pool->dispatch_threshold(y1 - y0, limit > POOL_THRESHOLD, [&](int y, int) { + int const i = y0 + y; + for (int j = x0; j < x1; ++j) { acc_out.set(j, i, synth(j, i)); } - } + }); } /** diff --git a/src/display/cairo-utils.cpp b/src/display/cairo-utils.cpp index 9271edf3a9..6ef55e4e55 100644 --- a/src/display/cairo-utils.cpp +++ b/src/display/cairo-utils.cpp @@ -1294,7 +1294,7 @@ static int ink_cairo_surface_average_color_internal(cairo_surface_t *surface, do int stride = cairo_image_surface_get_stride(surface); unsigned char *data = cairo_image_surface_get_data(surface); - /* TODO convert this to OpenMP somehow */ + // TODO parallelize this somehow for (int y = 0; y < height; ++y, data += stride) { for (int x = 0; x < width; ++x) { guint32 px = *reinterpret_cast(data + 4*x); diff --git a/src/display/dispatch-pool.cpp b/src/display/dispatch-pool.cpp new file mode 100644 index 0000000000..25daddcd3e --- /dev/null +++ b/src/display/dispatch-pool.cpp @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Author: Liam White + * Copyright (C) 2024 Authors + * Released under GNU GPL v2+, read the file 'COPYING' for more information. 
+ */
+
+#include "dispatch-pool.h"
+
+#include <algorithm>
+
+#include "cairo-utils.h"
+
+namespace Inkscape {
+
+/**
+ * Create a pool whose size() is max(size, 1).
+ *
+ * The thread that calls dispatch() always takes part in the work as local ID 0, so only
+ * max(size, 1) - 1 worker threads are spawned here; they receive local IDs 1 .. size() - 1.
+ */
+dispatch_pool::dispatch_pool(int size)
+{
+    int const num_threads = std::max(size, 1) - 1;
+
+    _threads.reserve(num_threads);
+
+    for (int i = 0; i < num_threads; ++i) {
+        // local_id of created threads is offset by 1 to allow calling thread to always be 0
+        _threads.emplace_back([i, this] { thread_func(local_id{i + 1}); });
+    }
+}
+
+/**
+ * Request shutdown, wake every worker and join all of them.
+ */
+dispatch_pool::~dispatch_pool()
+{
+    // TODO C++20: this would be completely trivial with jthread
+    // TODO C++20: dispatch_pool::~dispatch_pool() = default;
+    {
+        // The flag is written under _lock because the workers read it under _lock
+        std::scoped_lock lk(_lock);
+        _shutdown = true;
+    }
+
+    _available_cv.notify_all();
+
+    for (auto &thread : _threads) {
+        thread.join();
+    }
+}
+
+/**
+ * Run function(global_id, local_id) once for every global_id in [0, count) and return
+ * when all of those calls have finished.
+ *
+ * The calling thread executes one batch itself (as local ID 0) and then blocks until the
+ * workers have completed the rest. _dispatch_lock is held for the whole call, so concurrent
+ * callers are serialized; because that mutex is not recursive, function must not call
+ * dispatch() on the same pool.
+ */
+void dispatch_pool::dispatch(int count, dispatch_func function)
+{
+    std::scoped_lock lk(_dispatch_lock);
+    std::unique_lock lk2(_lock);
+
+    _available_work = global_id{};
+    _completed_work = global_id{};
+    _target_work = global_id{count};
+    _function = std::move(function);
+
+    // Execute the caller's batch, and signal to the next waiting thread
+    execute_batch(lk2, local_id{}, size());
+
+    // Wait for other threads to finish
+    _completed_cv.wait(lk2, [&] { return _completed_work == _target_work; });
+
+    // Release any extra memory held by the function
+    _function = {};
+}
+
+/**
+ * Main loop of a worker thread: sleep until work is available or shutdown is requested,
+ * execute one batch, repeat.
+ *
+ * @param id local ID passed to every job this thread executes
+ */
+void dispatch_pool::thread_func(local_id id)
+{
+    std::unique_lock lk(_lock);
+
+    // TODO C++20: no need for _shutdown member once stop_token is available
+    // TODO C++20: while (_cv.wait(lk, stop_token, [&] { ... }))
+    while (true) {
+        _available_cv.wait(lk, [&] { return _shutdown || _available_work < _target_work; });
+
+        if (_shutdown) {
+            // When shutdown is requested, stop immediately
+            return;
+        }
+
+        // Otherwise, execute the batch.
+        //
+        // size() reads _threads, which the constructor may still be appending to while the
+        // first workers are already running. Querying it at thread start would therefore be
+        // a data race and could cache a too-small thread count (and thus an oversized batch
+        // size) for the lifetime of the thread. Work only becomes available through
+        // dispatch(), i.e. after construction has finished, and _lock orders that publication
+        // before this read, so the value observed here is the final one.
+        execute_batch(lk, id, size());
+    }
+}
+
+/**
+ * Claim the next batch of at most ceil(_target_work / thread_count) jobs and run it.
+ *
+ * @param lk           lock on _lock; must be held on entry, is released while the jobs run
+ *                     and is held again on return
+ * @param id           local ID passed to every job of the batch
+ * @param thread_count number of threads the work is divided between
+ */
+void dispatch_pool::execute_batch(std::unique_lock<std::mutex> &lk, local_id id, int thread_count)
+{
+    // Determine how much work to take
+    global_id const batch_size = (_target_work + thread_count - 1) / thread_count;
+    global_id const start = _available_work;
+    global_id const end = std::min(start + batch_size, _target_work);
+
+    // Take that much work
+    _available_work = end;
+
+    // Unlock and begin executing the function
+    {
+        lk.unlock();
+
+        // Now that the lock is released, potentially signal work availability
+        // to the next waiting thread
+        _available_cv.notify_one();
+
+        // Execute the function
+        for (global_id index = start; index < end; index++) {
+            _function(index, id);
+        }
+
+        lk.lock();
+    }
+
+    // Signal completion
+    _completed_work += (end - start);
+
+    if (_completed_work == _target_work) {
+        // Only the thread blocked in dispatch() waits on this condition
+        _completed_cv.notify_one();
+    }
+}
+
+namespace {
+
+// Guards g_dispatch_pool and g_dispatch_threads
+std::mutex g_dispatch_lock;
+// Pool most recently handed out by get_global_dispatch_pool(); null until the first call
+std::shared_ptr<dispatch_pool> g_dispatch_pool;
+// Thread count g_dispatch_pool was created with
+int g_dispatch_threads = 0;
+
+} // namespace
+
+/**
+ * Return the shared pool sized according to get_num_filter_threads().
+ *
+ * The pool is created on first use and replaced by a new one whenever the requested thread
+ * count differs from the one the current pool was built with. A replaced pool stays alive
+ * until the last shared_ptr to it is released.
+ */
+std::shared_ptr<dispatch_pool> get_global_dispatch_pool()
+{
+    int const num_threads = get_num_filter_threads();
+
+    std::scoped_lock lk(g_dispatch_lock);
+
+    if (g_dispatch_pool && num_threads == g_dispatch_threads) {
+        return g_dispatch_pool;
+    }
+
+    g_dispatch_pool = std::make_shared<dispatch_pool>(num_threads);
+    g_dispatch_threads = num_threads;
+
+    return g_dispatch_pool;
+}
+
+} // namespace Inkscape
+
+/*
+  Local Variables:
+  mode:c++
+  c-file-style:"stroustrup"
+  c-file-offsets:((innamespace . 0)(inline-open . 0)(case-label . 
+))
+  indent-tabs-mode:nil
+  fill-column:99
+  End:
+*/
+// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:fileencoding=utf-8:textwidth=99 :
diff --git a/src/display/dispatch-pool.h b/src/display/dispatch-pool.h
new file mode 100644
index 0000000000..b240001948
--- /dev/null
+++ b/src/display/dispatch-pool.h
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Liam White
+ * Copyright (C) 2024 Authors
+ * Released under GNU GPL v2+, read the file 'COPYING' for more information.
+ */
+
+#ifndef INKSCAPE_DISPLAY_DISPATCH_POOL_H
+#define INKSCAPE_DISPLAY_DISPATCH_POOL_H
+
+#include <condition_variable>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+namespace Inkscape {
+
+/**
+ * General-purpose, parallel thread dispatch mechanism.
+ *
+ * A dispatch is a compute job which is parameterized by a counter. It can also be thought of
+ * as a way to parallelize a for loop. For example, the following single-threaded loop
+ *
+ * for (int i = 0; i < count; ++i) {
+ *     do_work(i);
+ * }
+ *
+ * can be rewritten to use a dispatch_pool and operate in parallel like this:
+ *
+ * pool.dispatch(count, [&](int i, int local_id) {
+ *     do_work(i);
+ * });
+ *
+ * Finally, it is also possible to perform all jobs on the calling thread unless a threshold
+ * condition is met (like dispatch size). This can be used if threading the operation would be
+ * less efficient unless the work is at least a certain size:
+ *
+ * pool.dispatch_threshold(count, count > 1024, [&](int i, int local_id) {
+ *     do_work(i);
+ * });
+ *
+ * Unlike boost's asio::thread_pool, which pushes work for threads onto a queue, this class only
+ * supports operation via a counter. The simpler design allows dispatching a very large amount of
+ * work (potentially millions of jobs, for every pixel in a megapixel image) with constant
+ * memory and space used.
+ *
+ * A pool's thread count is fixed upon construction and cannot change during operation. 
If you
+ * allocate work buffers for each thread in the pool, you can use the size() method to determine
+ * how many threads it has been created with.
+ *
+ * By design, only one dispatch may run at a time. It is safe to call dispatch() from multiple
+ * threads without extra locking. Because dispatches are serialized with a non-recursive mutex,
+ * a job must not call dispatch() on the pool that is executing it.
+ *
+ * Terminology used is designed to loosely follow that of OpenCL kernels or GL/VK compute shaders:
+ * - Global ID within a dispatch refers to the 0-based counter value for a given job.
+ * - Local ID within a dispatch refers to the 0-based index of thread which is processing the job.
+ *   This will always be less than the pool's size().
+ *
+ * The first parameter to the callback is global ID. The second parameter, which is unused in the
+ * example, is the local ID. The local ID is primarily useful if a work buffer is allocated for
+ * each thread in the dispatch_pool ahead of time.
+ */
+class dispatch_pool
+{
+public:
+    using global_id = int;
+    using local_id = int;
+    /// Job callback: invoked as function(global_id, local_id).
+    using dispatch_func = std::function<void(global_id, local_id)>;
+
+    /// Create a pool of max(size, 1) threads, counting the thread that calls dispatch().
+    explicit dispatch_pool(int size);
+    /// Stop and join all worker threads.
+    ~dispatch_pool();
+
+    /// Run function once for every global ID in [0, count); returns when all jobs are done.
+    void dispatch(int count, dispatch_func function);
+
+    /**
+     * Like dispatch() when threshold is true; otherwise run every job in order on the calling
+     * thread with local ID 0, without involving the pool at all.
+     */
+    template <typename F>
+    void dispatch_threshold(int count, bool threshold, F &&function)
+    {
+        if (threshold) {
+            dispatch(count, std::forward<F>(function));
+        } else {
+            for (auto i = global_id{}; i < global_id{count}; i++) {
+                function(i, local_id{});
+            }
+        }
+    }
+
+    /// Number of threads taking part in a dispatch; every local ID is smaller than this.
+    int size() const
+    {
+        // The calling thread participates in the dispatch
+        return static_cast<int>(_threads.size()) + 1;
+    }
+
+private:
+    void thread_func(local_id id);
+    void execute_batch(std::unique_lock<std::mutex> &lk, local_id id, int thread_count);
+
+private:
+    // Progress of the current dispatch; accessed under _lock
+    global_id _available_work{}; ///< first global ID not yet claimed by any thread
+    global_id _completed_work{}; ///< number of jobs that have finished
+    global_id _target_work{};    ///< total number of jobs in the dispatch
+    bool _shutdown{};            ///< set by the destructor to make the workers exit
+
+    std::mutex _dispatch_lock;             ///< held for a whole dispatch() call; serializes callers
+    std::mutex _lock;                      ///< guards the counters, _shutdown and _function
+    std::condition_variable _available_cv; ///< workers wait here for work or shutdown
+    std::condition_variable _completed_cv; ///< dispatch() waits here for the last job to finish
+    dispatch_func _function;               ///< callback of the current dispatch
+    std::vector<std::thread> _threads;     ///< worker threads (excludes the calling thread)
+};
+
+/// Shared pool used by the display code; defined in dispatch-pool.cpp.
+std::shared_ptr<dispatch_pool> get_global_dispatch_pool();
+
+} // namespace Inkscape
+
+#endif // INKSCAPE_DISPLAY_DISPATCH_POOL_H + +/* + Local Variables: + mode:c++ + c-file-style:"stroustrup" + c-file-offsets:((innamespace . 0)(inline-open . 0)(case-label . +)) + indent-tabs-mode:nil + fill-column:99 + End: +*/ +// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:fileencoding=utf-8:textwidth=99 : diff --git a/src/display/nr-filter-gaussian.cpp b/src/display/nr-filter-gaussian.cpp index 4e04bfd62f..eccf3b249d 100644 --- a/src/display/nr-filter-gaussian.cpp +++ b/src/display/nr-filter-gaussian.cpp @@ -12,21 +12,15 @@ * Released under GNU GPL v2+, read the file 'COPYING' for more information. */ -#ifdef HAVE_CONFIG_H -# include "config.h" // only include where actually required! -#endif - #include #include #include #include #include #include -#if HAVE_OPENMP -#include -#endif //HAVE_OPENMP #include "display/cairo-utils.h" +#include "display/dispatch-pool.h" #include "display/nr-filter-primitive.h" #include "display/nr-filter-gaussian.h" #include "display/nr-filter-types.h" @@ -290,7 +284,7 @@ static void filter2D_IIR(PT *const dest, int const dstr1, int const dstr2, PT const *const src, int const sstr1, int const sstr2, int const n1, int const n2, IIRValue const b[N+1], double const M[N*N], - IIRValue *const tmpdata[], int const num_threads) + IIRValue *const tmpdata[], dispatch_pool &pool) { assert(src && dest); @@ -302,16 +296,7 @@ filter2D_IIR(PT *const dest, int const dstr1, int const dstr2, #define PREMUL_ALPHA_LOOP for(unsigned int c=1; c(v[0][c]); } } - } + }); } // Filters over 1st dimension @@ -367,18 +352,13 @@ template static void filter2D_FIR(PT *const dst, int const dstr1, int const dstr2, PT const *const src, int const sstr1, int const sstr2, - int const n1, int const n2, FIRValue const *const kernel, int const scr_len, int const num_threads) + int const n1, int const n2, FIRValue const *const kernel, int const scr_len, dispatch_pool &pool) { assert(src && dst); - // Past pixels seen (to enable in-place operation) - PT 
history[scr_len+1][PC]; - -INK_UNUSED(num_threads); // suppresses unused argument compiler warning -#if HAVE_OPENMP -#pragma omp parallel for num_threads(num_threads) private(history) -#endif // HAVE_OPENMP - for ( int c2 = 0 ; c2 < n2 ; c2++ ) { + pool.dispatch(n2, [&](int c2, int) { + // Past pixels seen (to enable in-place operation) + PT history[scr_len + 1][PC]; // corresponding line in the source buffer int const src_line = c2 * sstr2; @@ -465,12 +445,12 @@ INK_UNUSED(num_threads); // suppresses unused argument compiler warning } } } - } + }); } static void gaussian_pass_IIR(Geom::Dim2 d, double deviation, cairo_surface_t *src, cairo_surface_t *dest, - IIRValue **tmpdata, int num_threads) + IIRValue **tmpdata, dispatch_pool &pool) { // Filter variables IIRValue b[N+1]; // scaling coefficient + filter coefficients (can be 10.21 fixed point) @@ -500,13 +480,13 @@ gaussian_pass_IIR(Geom::Dim2 d, double deviation, cairo_surface_t *src, cairo_su filter2D_IIR( cairo_image_surface_get_data(dest), d == Geom::X ? 1 : stride, d == Geom::X ? stride : 1, cairo_image_surface_get_data(src), d == Geom::X ? 1 : stride, d == Geom::X ? stride : 1, - w, h, b, M, tmpdata, num_threads); + w, h, b, M, tmpdata, pool); break; case CAIRO_FORMAT_ARGB32: ///< Premultiplied 8 bit RGBA filter2D_IIR( cairo_image_surface_get_data(dest), d == Geom::X ? 4 : stride, d == Geom::X ? stride : 4, cairo_image_surface_get_data(src), d == Geom::X ? 4 : stride, d == Geom::X ? 
stride : 4, - w, h, b, M, tmpdata, num_threads); + w, h, b, M, tmpdata, pool); break; default: g_warning("gaussian_pass_IIR: unsupported image format"); @@ -515,7 +495,7 @@ gaussian_pass_IIR(Geom::Dim2 d, double deviation, cairo_surface_t *src, cairo_su static void gaussian_pass_FIR(Geom::Dim2 d, double deviation, cairo_surface_t *src, cairo_surface_t *dest, - int num_threads) + dispatch_pool &pool) { int scr_len = _effect_area_scr(deviation); // Filter kernel for x direction @@ -533,13 +513,13 @@ gaussian_pass_FIR(Geom::Dim2 d, double deviation, cairo_surface_t *src, cairo_su filter2D_FIR( cairo_image_surface_get_data(dest), d == Geom::X ? 1 : stride, d == Geom::X ? stride : 1, cairo_image_surface_get_data(src), d == Geom::X ? 1 : stride, d == Geom::X ? stride : 1, - w, h, &kernel[0], scr_len, num_threads); + w, h, &kernel[0], scr_len, pool); break; case CAIRO_FORMAT_ARGB32: ///< Premultiplied 8 bit RGBA filter2D_FIR( cairo_image_surface_get_data(dest), d == Geom::X ? 4 : stride, d == Geom::X ? stride : 4, cairo_image_surface_get_data(src), d == Geom::X ? 4 : stride, d == Geom::X ? 
stride : 4, - w, h, &kernel[0], scr_len, num_threads); + w, h, &kernel[0], scr_len, pool); break; default: g_warning("gaussian_pass_FIR: unsupported image format"); @@ -597,8 +577,9 @@ void FilterGaussian::render_cairo(FilterSlot &slot) const bytes_per_pixel = 4; break; } + auto const pool = get_global_dispatch_pool(); int quality = slot.get_blurquality(); - int threads = get_num_filter_threads(); + int threads = pool->size(); int x_step = 1 << _effect_subsample_step_log2(deviation_x_orig, quality); int y_step = 1 << _effect_subsample_step_log2(deviation_y_orig, quality); bool resampling = x_step > 1 || y_step > 1; @@ -647,17 +628,17 @@ void FilterGaussian::render_cairo(FilterSlot &slot) const if (scr_len_x > 0) { if (use_IIR_x) { - gaussian_pass_IIR(Geom::X, deviation_x, downsampled, downsampled, tmpdata, threads); + gaussian_pass_IIR(Geom::X, deviation_x, downsampled, downsampled, tmpdata, *pool); } else { - gaussian_pass_FIR(Geom::X, deviation_x, downsampled, downsampled, threads); + gaussian_pass_FIR(Geom::X, deviation_x, downsampled, downsampled, *pool); } } if (scr_len_y > 0) { if (use_IIR_y) { - gaussian_pass_IIR(Geom::Y, deviation_y, downsampled, downsampled, tmpdata, threads); + gaussian_pass_IIR(Geom::Y, deviation_y, downsampled, downsampled, tmpdata, *pool); } else { - gaussian_pass_FIR(Geom::Y, deviation_y, downsampled, downsampled, threads); + gaussian_pass_FIR(Geom::Y, deviation_y, downsampled, downsampled, *pool); } } diff --git a/src/display/nr-filter-morphology.cpp b/src/display/nr-filter-morphology.cpp index 1cc2aadea7..52f09ad3fd 100644 --- a/src/display/nr-filter-morphology.cpp +++ b/src/display/nr-filter-morphology.cpp @@ -65,11 +65,9 @@ void morphologicalFilter1D(cairo_surface_t * const input, cairo_surface_t * cons int ri = round(radius); // TODO: Support fractional radii? 
int wi = 2*ri+1; - #if HAVE_OPENMP - int limit = w * h; - #pragma omp parallel for if(limit > OPENMP_THRESHOLD) num_threads(get_num_filter_threads()) - #endif // HAVE_OPENMP - for (int i = 0; i < h; ++i) { + int const limit = w * h; + auto const pool = get_global_dispatch_pool(); + pool->dispatch_threshold(h, limit > POOL_THRESHOLD, [&](int i, int) { // TODO: Store position and value in one 32 bit integer? 24 bits should be enough for a position, it would be quite strange to have an image with a width/height of more than 16 million(!). std::deque> vals[BPP]; // In my tests it was actually slightly faster to allocate it here than allocate it once for all threads and retrieving the correct set based on the thread id. @@ -148,7 +146,7 @@ void morphologicalFilter1D(cairo_surface_t * const input, cairo_surface_t * cons } if (axis == Geom::Y) out_p += strideout - BPP; } - } + }); cairo_surface_mark_dirty(out); }