diff --git a/README.md b/README.md index ebb7886..7492114 100644 --- a/README.md +++ b/README.md @@ -3,18 +3,16 @@ Crumsort and Quadsort in C++ ![build status](https://github.com/psadda/crumsort-cpp/actions/workflows/test.yaml/badge.svg) -This is a C99 to C++17 port of Igor van den Hoven's crumsort and quadsort. +This is a [lightning fast](https://github.com/psadda/crumsort-cpp/blob/main/bench/README.md) C++17 port of Igor van den Hoven's crumsort and quadsort. Porting crumsort and quadsort to C++ is not as trivial as one might expect. The original crumsort and quadsort have many C-isms that don't map well to modern C++: - They take raw pointers as input, not random access iterators. That means they only work for arrays of contiguous memory, like `std::vector`, and not on discontiguous containers, like `std::deque`. - They use C99 variable length arrays, which are not part of the C++ standard. Some C++ compilers support VLAs as a language extension, but others (MSVC) do not. -- They assume that that the sorted type is [trivial](https://en.cppreference.com/w/cpp/named_req/TrivialType). That rules out huge swaths of types that you'd probably want to sort, like `std::string` and `std::unique_ptr`. +- They assume that the sorted type is [trivial](https://en.cppreference.com/w/cpp/named_req/TrivialType). That rules out huge swaths of types that you'd probably like to be able to sort, like `std::string` and `std::unique_ptr`. This respository fixes those all those issues and more, allowing you to use crumsort and quadsort as drop in replacements for `std::sort` and `std::stable_sort`, respectively. -See the original C implementations at [scandum/crumsort](https://github.com/scandum/crumsort) and [scandum/quadsort](https://github.com/scandum/quadsort) for detailed descriptions of the algorithms and their properties. 
- Example ------- @@ -38,6 +36,16 @@ int main(int argc, char** argv) { } ``` +Benchmarks +---------- + +Available [here](https://github.com/psadda/crumsort-cpp/blob/main/bench/README.md). + +Algorithm +--------- + +See the original C implementations at [scandum/crumsort](https://github.com/scandum/crumsort) and [scandum/quadsort](https://github.com/scandum/quadsort) for detailed descriptions of the algorithms and their properties. + Progress -------- @@ -49,8 +57,7 @@ Progress - [x] Support types that do not a have a trivial default constructor - [x] Support types that do not have *any* default constructor - [x] Support move only types -- [ ] Re-enable optimizations for primitive types -- [ ] Update benchmarks +- [x] Update benchmarks License ------- diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt index 6b62b2b..e1e50c4 100644 --- a/bench/CMakeLists.txt +++ b/bench/CMakeLists.txt @@ -9,8 +9,16 @@ file(DOWNLOAD https://raw.githubusercontent.com/skarupke/ska_sort/master/ska_sor file(DOWNLOAD https://raw.githubusercontent.com/timsort/cpp-TimSort/master/include/gfx/timsort.hpp ${CMAKE_CURRENT_BINARY_DIR}/timsort.hpp) -FetchContent_Declare(wolfsort GIT_REPOSITORY https://github.com/scandum/wolfsort.git) -FetchContent_MakeAvailable(wolfsort) +SET(HWY_ENABLE_EXAMPLES OFF CACHE BOOL "" FORCE) +SET(HWY_ENABLE_INSTALL OFF CACHE BOOL "" FORCE) +SET(HWY_ENABLE_TESTS OFF CACHE BOOL "" FORCE) +SET(HWY_SYSTEM_GTEST ON CACHE BOOL "" FORCE) +SET(INSTALL_GTEST OFF CACHE BOOL "" FORCE) + +FetchContent_Declare(simdsort GIT_REPOSITORY https://github.com/intel/x86-simd-sort GIT_TAG main GIT_SHALLOW TRUE) +FetchContent_Declare(highway GIT_REPOSITORY https://github.com/google/highway GIT_SHALLOW TRUE) +FetchContent_Declare(wolfsort GIT_REPOSITORY https://github.com/scandum/wolfsort.git GIT_SHALLOW TRUE) +FetchContent_MakeAvailable(simdsort highway wolfsort) add_executable(benchmarks bench.cpp @@ -18,10 +26,17 @@ add_executable(benchmarks scandum_sorts.c 
${CMAKE_CURRENT_BINARY_DIR}/rhsort.c) -target_include_directories(benchmarks PUBLIC +target_include_directories(benchmarks PRIVATE ${CMAKE_CURRENT_BINARY_DIR} + ${FETCHCONTENT_BASE_DIR}/simdsort-src/src ${FETCHCONTENT_BASE_DIR}/wolfsort-src/src) -target_link_libraries(benchmarks crumsortcpp) +target_link_libraries(benchmarks crumsortcpp hwy hwy_contrib) + +if(MSVC) +target_compile_options(benchmarks PRIVATE /arch:AVX2) +else() +target_compile_options(benchmarks PRIVATE -mavx2) +endif() set_property(TARGET benchmarks PROPERTY COMPILE_WARNING_AS_ERROR OFF) diff --git a/bench/README.md b/bench/README.md new file mode 100644 index 0000000..2b3a428 --- /dev/null +++ b/bench/README.md @@ -0,0 +1,99 @@ +Benchmarks +========== + +The results shown here were obtained from a release build compiled with ClangCL with optimization flags `/O3` and `/DNDEBUG`. + +## Results ## + +#### Random #### + +*The input array is composed of unsorted random integers.* + +graph of benchmark results on unsorted random integers + +#### Random High Bits #### + +*The input array is composed of random unsorted integers, but with the randomness in the high (most significant) bits of the integer rather than the low bits.* + +graph of benchmark results on unsorted integers with random high bits + +#### Random Half #### + +*Half of the input array is already in sorted order; the other half is unsorted.* + +graph of benchmark results on half unsorted, half sorted integers + +#### Ascending #### + +*The input array is already in sorted order.* + +graph of benchmark results on sorted, ascending integers + +#### Descending #### + +*The input array is already sorted in reverse order.* + +graph of benchmark results on sorted, descending integers + +#### Ascending Saw #### + +*The input array is composed of alternating runs of sorted and unsorted integers.* + +graph of benchmark results on alternating runs of sorted and unsorted integers + +#### Ascending Tiles #### + +*The input array is divided into two chunks, each of 
which is internally sorted.* + +graph of benchmark results on ascending tiles + +#### Pipe Organ #### + +*The first half of the input array is sorted in ascending order and the second half is sorted in descending order.* + +graph of benchmark results on a pipe organ array + +#### Bit Reversal #### + +*The input array is composed of sequential integers whose bits have been reversed.* + +graph of benchmark results on a bit-reversed array + +## A Deeper Look at the Candidates ## + +### General Purpose Sorts: `crumsort`, `quadsort`, `pdqsort`, `timsort` ### + +These functions accept any comparison predicate (as long as it imposes a strict weak ordering) and can sort any type of movable data. `pdqsort` and the C++ version of `crumsort` can be used as drop in replacements for `std::sort`. `timsort` and the C++ version of `quadsort` can be used as drop in replacements for `std::stable_sort`. + +The C versions of `crumsort` and `quadsort` can be compiled with a `cmp` macro that replaces the general purpose predicate with a hardcoded greater-than or less-than predicate for numeric types, slightly improving performance. They were _**not**_ compiled with `cmp` in this benchmark. + +### Numeric-Only Sorts: `ska_sort`, `rhsort`, `x86-simd-sort`, `vqsort` ### + +These sorts incorporate a radix sort or counting sort. That makes them very fast for sorting numeric data, but it also comes with some limitations: + +* They are _**not**_ general purpose sorts: the key has to be numeric, and the sorting predicate has to be a simple numeric less-than or greater-than. + + `ska_sort` is slightly more flexible than the others — it also accepts arrays of numbers as the key. (This includes strings, but thanks to the limitation on the predicate you wouldn't be able to do, say, a case-insensitive string sort with `ska_sort` without first transforming the input array.) + +* Their performance is sensitive to the bit-length of the key. They may be limited to keys of a particular size. 
+* `x86-simd-sort` and `vqsort` are SIMD accelerated and will only work on CPUs that implement one of their supported instruction sets. +* All of the above means that these functions are _**not**_ drop in replacements for `std::sort`, although they could be used to optimize specializations of `std::sort`. + +| Algorithm | Time (Avg) | Space (Avg) | Stable | Key Type | Key Size | Portability | +| --------------- | ---------- | ----------- | -------| --------------------------- | ------------ | ------------ | +| `ska_sort` | O(n) | O(n) | no | numeric or array of numeric | any | any | +| `rhsort` | O(n) | O(n) | yes | numeric only | 32 bit only | any | +| `x86-simd-sort` | O(n) | O(1)/O(n)† | no | numeric only | 32 or 64 bit | AVX2, AVX512 | +| `vqsort` | O(n) | O(n) | no | numeric only | 32 or 64 bit | AVX2, AVX512, NEON | + +† `x86-simd-sort` uses O(1) space when sorting numeric data. It uses O(n) space when sorting arbitrary data with a numeric key. + +This benchmark uses the AVX2 flavors of `x86-simd-sort` and `vqsort`. + +## Concluding Remarks ## + +* The C++ versions of `crumsort` and `quadsort` are competitive across the entire suite of tests — almost always best or second best. Performance is generally on par with that of the C versions. +* `std::sort` and `std::stable_sort` (at least the Microsoft implementations tested in this benchmark) have generally okay performance. If sorting isn't a bottleneck, it's very reasonable to stick with the standard library sorts to avoid introducing a new dependency. +* `qsort`, on the other hand, is absolutely terrible. But this may be Microsoft specific, as C and the C standard library are very much second class citizens in MSVC space. +* `pdqsort` is pretty good across the board. It's a step ahead of `timsort`, which struggles with some data patterns. It's also a much simpler algorithm than `crumsort`, so it's a good option for those who are looking to balance runtime performance with binary size and code complexity. 
+* If you only need to sort numeric data, `rhsort` is very fast. `x86-simd-sort` and `vqsort` are sometimes faster if you can rely on compatible vector extensions being present. However, the more general purpose `crumsort` and `quadsort` still perform better in several benchmarks. diff --git a/bench/bench.cpp b/bench/bench.cpp index bc4c9a1..22017e7 100644 --- a/bench/bench.cpp +++ b/bench/bench.cpp @@ -7,8 +7,8 @@ #include #ifdef _WIN32 -# define WIN32_LEAN_AND_MEAN -# define NOMINMAX +# define WIN32_LEAN_AND_MEAN 1 +# define NOMINMAX 1 # include # include // for QueryPerformanceFrequency and QueryPerformanceCounter #else @@ -30,7 +30,9 @@ const char *sorts[] = { "pdqsort", "rhsort", "skasort", - "timsort" + "timsort", + "simdsort", + "vqsort" }; //#define SKIP_STRINGS @@ -56,6 +58,13 @@ extern "C" void rhsort32(int* array, size_t n); #include #include +#include +#include +#include +#include + +#include + #include typedef void SRTFUNC(void *array, size_t nmemb, size_t size, CMPFUNC *cmpf); @@ -66,9 +75,9 @@ typedef void SRTFUNC(void *array, size_t nmemb, size_t size, CMPFUNC *cmpf); size_t comparisons; -#define COMPARISON_PP comparisons++ +#define COMPARISON_PP //comparisons++ -#define NO_INLINE __attribute__ ((noinline)) +#define NO_INLINE //__attribute__ ((noinline)) // primitive type comparison functions @@ -288,6 +297,12 @@ void test_sort(void *array, void *unsorted, void *valid, int minimum, int maximu #ifdef SCANDUM_QUADSORT_HPP case 'c' + 'x' * 32 + 'q' * 1024: if (size == sizeof(int)) scandum::quadsort(pta, pta + max); else if (size == sizeof(long long)) scandum::quadsort(ptla, ptla + max); else scandum::quadsort(ptda, ptda + max); break; #endif +#ifdef AVX2_QSORT_32BIT + case 's' + 'i' * 32 + 'm' * 1024: if (size == sizeof(int)) avx2_qsort(pta, max); else if (size == sizeof(long long)) avx2_qsort(ptla, max); else avx2_qsort(ptda, max, true); break; +#endif +#ifdef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_ + case 'v' + 'q' * 32 + 's' * 1024: if (size == 
sizeof(int32_t)) hwy::VQSort((int32_t*)pta, max, hwy::SortAscending()); else if (size == sizeof(int64_t)) hwy::VQSort((int64_t*)pta, max, hwy::SortAscending()); else hwy::VQSort((double*)pta, max, hwy::SortAscending()); break; +#endif #ifdef BLITSORT_H case 'b' + 'l' * 32 + 'i' * 1024: blitsort(array, max, size, cmpf); break; #endif @@ -343,20 +358,8 @@ void test_sort(void *array, void *unsorted, void *valid, int minimum, int maximu case 't' + 'i' * 32 + 'm' * 1024: if (size == sizeof(int)) gfx::timsort(pta, pta + max, cpp_cmp_int); else if (size == sizeof(long long)) gfx::timsort(ptla, ptla + max); else gfx::timsort(ptda, ptda + max); break; #endif default: - switch (name32) - { - case 's' + 'o' * 32 + 'r' * 1024: - case 's' + 't' * 32 + 'a' * 1024: - case 'p' + 'd' * 32 + 'q' * 1024: - case 'r' + 'h' * 32 + 's' * 1024: - case 's' + 'k' * 32 + 'a' * 1024: - case 't' + 'i' * 32 + 'm' * 1024: - printf("unknown sort: %s (compile with g++ instead of gcc?)\n", name); - return; - default: - printf("unknown sort: %s\n", name); - return; - } + printf("unknown sort: %s\n", name); + return; } average_comp += comparisons; @@ -753,8 +756,8 @@ void range_test(int max, int samples, int repetitions, int seed) int main(int argc, char **argv) { - int max = 100000; - int samples = 10; + int max = 10000; + int samples = 1000; int repetitions = 1; int seed = 0; int cnt, mem; diff --git a/bench/plot.py b/bench/plot.py new file mode 100644 index 0000000..9bbfa3a --- /dev/null +++ b/bench/plot.py @@ -0,0 +1,116 @@ +import matplotlib.pyplot as plt +import numpy as np +from tqdm import tqdm +import fileinput + +TESTS = { + 'random int' : 'random', + 'random order' : 'random high bits', + 'random half' : 'random half', + 'ascending order' : 'ascending', + 'descending order' : 'descending', + 'ascending saw' : 'ascending saw', + 'ascending tiles' : 'ascending tiles', + 'pipe organ' : 'pipe organ', + 'bit reversal' : 'bit reversal' +} + +ALGORITHMS = { + 'crumsort' : 'crumsort (C)', + 
'quadsort' : 'quadsort (C)', + 'cxcrumsort' : 'crumsort (C++)', + 'cxquadsort' : 'quadsort (C++)', + 'qsort' : 'qsort', + 'sort' : 'std::sort', + 'stablesort' : 'std::stable_sort', + 'pdqsort' : 'pdqsort', + 'timsort' : 'timsort', + 'skasort' : 'ska_sort', + 'rhsort' : 'rhsort', + 'simdsort' : 'x86-simd-sort (AVX2)', + 'vqsort' : 'vqsort' +} + +EXPECTED_X_VALUES = [ 10, 100, 1000, 10000, 100000 ] + +benchmark_results = {} + +for line in fileinput.input(): + + line = line.strip() + + if len(line) > 0 and line[0] == '|' and line[-1] == '|': # This line actually has a table row + + # Parse the table row + cells = line.split('|')[1:-1] # Throw out empty first and last elements + cells = [c.strip() for c in cells] + if cells[0] == 'Name' or len(cells[0]) == 0 or cells[0][0] == '-': # Throw out header rows + continue + + # Extract the benchmark results + algorithm = cells[0] + array_len = int(cells[1]) + time = float(cells[4]) + test_name = cells[-1] + + if not algorithm in ALGORITHMS or not test_name in TESTS: + continue + algorithm = ALGORITHMS[algorithm] + test_name = TESTS[test_name] + + # Store results in the global result table + benchmark_results.setdefault(test_name, {}) + benchmark_results[test_name].setdefault(algorithm, {}) + benchmark_results[test_name][algorithm][array_len] = time + +# Bar graphs +for test in tqdm(TESTS.values()): + + fig, ax = plt.subplots() + + labels = [] + values = [] + + for algorithm in reversed(ALGORITHMS.values()): + + if not algorithm in benchmark_results[test]: + continue + + labels.append(algorithm) + values.append(benchmark_results[test][algorithm][10000]) + + y_pos = np.arange(len(labels)) + plt.barh(y_pos, values, align='center') + plt.yticks(y_pos, labels) + plt.xlabel('run time (ns/value)') + plt.title(test + ' (10,000 elements)') + plt.tight_layout() + + fig.savefig(test + " 10000.png") + plt.close() + +# Line/scaling graphs +for test in tqdm(TESTS.values()): + + fig, ax = plt.subplots() + ax.set_xscale('log') + 
ax.set_yscale('log') + + ax.set(xlabel='array length', ylabel='run time (ns/value)', title=test) + + for algorithm in reversed(ALGORITHMS.values()): + + if not algorithm in benchmark_results[test]: + continue + + times = benchmark_results[test][algorithm] + array_lens = [key for key in times] + array_lens.sort() + datapoints = [times[array_len] for array_len in array_lens] + datapoints = np.array(datapoints) + + ax.plot(array_lens, datapoints, label=algorithm) + + ax.legend() + fig.savefig(test + ".png") + plt.close() diff --git a/bench/results/ascending 10000.png b/bench/results/ascending 10000.png new file mode 100644 index 0000000..511d34a Binary files /dev/null and b/bench/results/ascending 10000.png differ diff --git a/bench/results/ascending saw 10000.png b/bench/results/ascending saw 10000.png new file mode 100644 index 0000000..f3edec0 Binary files /dev/null and b/bench/results/ascending saw 10000.png differ diff --git a/bench/results/ascending saw.png b/bench/results/ascending saw.png new file mode 100644 index 0000000..b6ba69f Binary files /dev/null and b/bench/results/ascending saw.png differ diff --git a/bench/results/ascending tiles 10000.png b/bench/results/ascending tiles 10000.png new file mode 100644 index 0000000..a4820ba Binary files /dev/null and b/bench/results/ascending tiles 10000.png differ diff --git a/bench/results/ascending tiles.png b/bench/results/ascending tiles.png new file mode 100644 index 0000000..b631a16 Binary files /dev/null and b/bench/results/ascending tiles.png differ diff --git a/bench/results/ascending.png b/bench/results/ascending.png new file mode 100644 index 0000000..9fa3b52 Binary files /dev/null and b/bench/results/ascending.png differ diff --git a/bench/results/bit reversal 10000.png b/bench/results/bit reversal 10000.png new file mode 100644 index 0000000..5223cf6 Binary files /dev/null and b/bench/results/bit reversal 10000.png differ diff --git a/bench/results/bit reversal.png b/bench/results/bit reversal.png new 
file mode 100644 index 0000000..4dd9443 Binary files /dev/null and b/bench/results/bit reversal.png differ diff --git a/bench/results/descending 10000.png b/bench/results/descending 10000.png new file mode 100644 index 0000000..0c97e71 Binary files /dev/null and b/bench/results/descending 10000.png differ diff --git a/bench/results/descending.png b/bench/results/descending.png new file mode 100644 index 0000000..911aa17 Binary files /dev/null and b/bench/results/descending.png differ diff --git a/bench/results/pipe organ 10000.png b/bench/results/pipe organ 10000.png new file mode 100644 index 0000000..338733c Binary files /dev/null and b/bench/results/pipe organ 10000.png differ diff --git a/bench/results/pipe organ.png b/bench/results/pipe organ.png new file mode 100644 index 0000000..bb1155c Binary files /dev/null and b/bench/results/pipe organ.png differ diff --git a/bench/results/random 10000.png b/bench/results/random 10000.png new file mode 100644 index 0000000..635239d Binary files /dev/null and b/bench/results/random 10000.png differ diff --git a/bench/results/random half 10000.png b/bench/results/random half 10000.png new file mode 100644 index 0000000..216db5b Binary files /dev/null and b/bench/results/random half 10000.png differ diff --git a/bench/results/random half.png b/bench/results/random half.png new file mode 100644 index 0000000..ef2ed16 Binary files /dev/null and b/bench/results/random half.png differ diff --git a/bench/results/random high bits 10000.png b/bench/results/random high bits 10000.png new file mode 100644 index 0000000..7f2ddeb Binary files /dev/null and b/bench/results/random high bits 10000.png differ diff --git a/bench/results/random high bits.png b/bench/results/random high bits.png new file mode 100644 index 0000000..a6eb160 Binary files /dev/null and b/bench/results/random high bits.png differ diff --git a/bench/results/random.png b/bench/results/random.png new file mode 100644 index 0000000..eb6d389 Binary files /dev/null 
and b/bench/results/random.png differ