From fc224c0ff434b1c749350da137f2084fe7d8c43c Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Sun, 15 Nov 2020 14:46:28 +0000 Subject: [PATCH 1/8] [fix] Fixed remote and added testing documentation. --- .circleci/config.yml | 20 ++++++++++++++++---- Makefile | 15 +++++++++++++-- README.md | 17 +++++++++++++---- 3 files changed, 42 insertions(+), 10 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 132bf6b..4e0cb8f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -32,10 +32,22 @@ jobs: name: Build & Test command: | make clean - make unit_tests - cd build - make test ARGS="-V" make coverage bash <(curl -s https://codecov.io/bash) -f coverage.info -X gcov -x gcov-7 || echo "Codecov did not collect coverage reports" - + + +workflows: + commit: + jobs: + - build + nightly: + triggers: + - schedule: + cron: "0 0 * * *" + filters: + branches: + only: + - master + jobs: + - build diff --git a/Makefile b/Makefile index 7375e77..a802f58 100644 --- a/Makefile +++ b/Makefile @@ -69,9 +69,20 @@ library_shared: library_all: ( cd build ; cmake $(CMAKE_LIBRARY_OPTIONS) .. ; $(MAKE) ) +# just build the static and shared libraries and produce measurements +# of accuracy versus compression factor for fixed data size +# TODO: + # just build the static and shared libraries and tests -unit_tests: - ( cd build ; cmake $(CMAKE_TEST_OPTIONS) .. ; $(MAKE) ) +unit_tests: + ( cd build ; cmake $(CMAKE_TEST_OPTIONS) .. ; $(MAKE) ; $(MAKE) test ) + +test: + $(MAKE) unit_tests + +coverage: + ( cd build ; cmake $(CMAKE_TEST_OPTIONS) .. ; $(MAKE) ; $(MAKE) test; make coverage; ) + # build all full: diff --git a/README.md b/README.md index dadb263..cb92557 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ ![License](https://img.shields.io/badge/License-MIT-blue.svg) -[![CircleCI](https://circleci.com/gh/RedisBloom/tdigest.svg?style=svg)](https://circleci.com/gh/RedisBloom/tdigest) -[![codecov](https://codecov.io/gh/filipecosta90/tdigest/branch/master/graph/badge.svg)](https://codecov.io/gh/filipecosta90/tdigest) +[![CircleCI](https://circleci.com/gh/RedisBloom/t-digest-c.svg?style=svg)](https://circleci.com/gh/RedisBloom/t-digest-c) +[![codecov](https://codecov.io/gh/RedisBloom/t-digest-c/branch/master/graph/badge.svg)](https://codecov.io/gh/RedisBloom/t-digest-c) + # T-Digest Adaptive histogram based on something like streaming k-means crossed with Q-digest. 
@@ -52,11 +53,19 @@ The following functions are implemented: ``` # Build -git clone https://github.com/filipecosta90/tdigest.git -cd tdigest/ +git clone https://github.com/RedisBloom/t-digest-c.git +cd t-digest-c/ git submodule update --init --recursive make ``` + +## Testing +Assuming you've followed the previous build steps, it should be as easy as: +``` +# Run the unit tests +make test +``` + ## Benchmarking Assuming you've followed the previous build steps, it should be as easy as: From 82c668b3bb86106660948fe2610e75c2ab94a6f3 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Sun, 15 Nov 2020 14:51:04 +0000 Subject: [PATCH 2/8] [add] Added release drafter automation --- .github/release-drafter-config.yml | 21 +++++++++++++++++++++ .github/workflows/release-drafter.yml | 19 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 .github/release-drafter-config.yml create mode 100644 .github/workflows/release-drafter.yml diff --git a/.github/release-drafter-config.yml b/.github/release-drafter-config.yml new file mode 100644 index 0000000..9c05778 --- /dev/null +++ b/.github/release-drafter-config.yml @@ -0,0 +1,21 @@ +name-template: 'Version $NEXT_PATCH_VERSION' +tag-template: 'v$NEXT_PATCH_VERSION' +categories: + - title: 'Features' + labels: + - 'feature' + - 'enhancement' + - title: 'Bug Fixes' + labels: + - 'fix' + - 'bugfix' + - 'bug' + - title: 'Maintenance' + label: 'chore' +change-template: '- $TITLE @$AUTHOR (#$NUMBER)' +exclude-labels: + - 'skip-changelog' +template: | + ## Changes + + $CHANGES \ No newline at end of file diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml new file mode 100644 index 0000000..c205014 --- /dev/null +++ b/.github/workflows/release-drafter.yml @@ -0,0 +1,19 @@ +name: Release Drafter + +on: + push: + # branches to consider in the event; optional, defaults to all + branches: + - master + +jobs: + update_release_draft: + runs-on: ubuntu-latest + steps: + # Drafts your next Release notes as Pull Requests are merged into "master" + - uses: release-drafter/release-drafter@v5 + with: + # (Optional) specify config name to use, relative to .github/. 
Default: release-drafter.yml + config-name: release-drafter-config.yml + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file From ed6bb4788bbceb6868509818de57653290a29724 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Sun, 15 Nov 2020 15:48:12 +0000 Subject: [PATCH 3/8] [fix] Fixed missing coverage info --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4e0cb8f..3d816ce 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -33,7 +33,7 @@ jobs: command: | make clean make coverage - bash <(curl -s https://codecov.io/bash) -f coverage.info -X gcov -x gcov-7 || echo "Codecov did not collect coverage reports" + cd build && bash <(curl -s https://codecov.io/bash) -f coverage.info -X gcov -x gcov-7 || echo "Codecov did not collect coverage reports" From eb9378ae586ad126879be4ae7a2d2e1048e74c23 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Sun, 15 Nov 2020 15:57:02 +0000 Subject: [PATCH 4/8] [add] Applied clang format --- .clang-format | 5 + Makefile | 15 ++ src/CMakeLists.txt | 16 +- src/tdigest.c | 415 +++++++++++++++++++++------------------------ src/tdigest.h | 165 +++++++++--------- 5 files changed, 301 insertions(+), 315 deletions(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..b0138d9 --- /dev/null +++ b/.clang-format @@ -0,0 +1,5 @@ +IndentWidth: 4 +ColumnLimit: 100 +SortIncludes: false +AlignEscapedNewlinesLeft: false +SpacesBeforeTrailingComments: 1 \ No newline at end of file diff --git a/Makefile b/Makefile index a802f58..7b8785e 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,12 @@ # Use CMAKE_LIBRARY_OPTIONS,CMAKE_LIBRARY_SHARED_OPTIONS,CMAKE_LIBRARY_STATIC_OPTIONS or CMAKE_FULL_OPTIONS argument to this Makefile to pass options to cmake. #---------------------------------------------------------------------------------------------------- +CC?=gcc +INFER?=./deps/infer +INFER_DOCKER?=redisbench/infer-linux64:1.0.0 +ROOT=$(shell pwd) +SRCDIR := $(ROOT)/src + ifndef CMAKE_LIBRARY_SHARED_OPTIONS CMAKE_LIBRARY_SHARED_OPTIONS=\ -DBUILD_SHARED=ON \ @@ -83,11 +89,20 @@ test: coverage: ( cd build ; cmake $(CMAKE_TEST_OPTIONS) .. ; $(MAKE) ; $(MAKE) test; make coverage; ) +format: + clang-format -style=file -i $(SRCDIR)/* + +lint: + clang-format -style=file -Werror -n $(SRCDIR)/* # build all full: ( cd build ; cmake $(CMAKE_FULL_OPTIONS) .. 
; $(MAKE) ) +static-analysis-docker: + $(MAKE) clean + docker run -v $(ROOT)/:/RedisBloom/ --user "$(username):$(usergroup)" $(INFER_DOCKER) bash -c "cd RedisBloom && CC=clang infer run --fail-on-issue --biabduction --skip-analysis-in-path ".*rmutil.*" -- make" + clean: distclean distclean: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index eec0a66..579b1ba 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,15 +2,11 @@ FILE(GLOB c_files "*.c") FILE(GLOB header_files "*.h") if (BUILD_SHARED) - add_library(tdigest SHARED ${c_files} ${header_files}) - target_link_libraries(tdigest m) +add_library(tdigest SHARED ${c_files} ${header_files}) target_link_libraries(tdigest m) target_include_directories(tdigest SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) - install(TARGETS tdigest DESTINATION lib${LIB_SUFFIX}) -endif (BUILD_SHARED) + install(TARGETS tdigest DESTINATION lib${LIB_SUFFIX}) endif(BUILD_SHARED) -if (BUILD_STATIC) - add_library(tdigest_static STATIC ${c_files} ${header_files}) - target_link_libraries(tdigest_static m) - target_include_directories(tdigest_static SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) - install(TARGETS tdigest_static DESTINATION lib${LIB_SUFFIX}) -endif (BUILD_STATIC) \ No newline at end of file + if (BUILD_STATIC) add_library(tdigest_static STATIC ${c_files} ${ + header_files}) target_link_libraries(tdigest_static m) + target_include_directories(tdigest_static SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + install(TARGETS tdigest_static DESTINATION lib${LIB_SUFFIX}) endif(BUILD_STATIC) \ No newline at end of file diff --git a/src/tdigest.c b/src/tdigest.c index 9d4d088..e99a8dc 100644 --- a/src/tdigest.c +++ b/src/tdigest.c @@ -4,39 +4,28 @@ #include <math.h> #include "tdigest.h" -void bbzero(void *to, size_t count) { - memset(to, 0, count); -} +void bbzero(void *to, size_t count) { memset(to, 0, count); } -static bool is_very_small(double val) { - return !(val > .000000001 || val < -.000000001); -} +static bool is_very_small(double val) { return !(val > .000000001 || val < -.000000001); } -static int cap_from_compression(double compression) { - return (6 * (int)(compression)) + 10; -} +static int cap_from_compression(double compression) { return (6 * (int)(compression)) + 10; } static bool should_td_compress(td_histogram_t *h) { - return ((h->merged_nodes + h->unmerged_nodes) == h->cap); + return ((h->merged_nodes + h->unmerged_nodes) == h->cap); } -static int next_node(td_histogram_t *h) { - return h->merged_nodes + h->unmerged_nodes; -} +static int next_node(td_histogram_t *h) { return h->merged_nodes + h->unmerged_nodes; } void td_compress(td_histogram_t *h); -int td_number_centroids(td_histogram_t *h){ - return next_node(h); -} +int td_number_centroids(td_histogram_t *h) { return next_node(h); } //////////////////////////////////////////////////////////////////////////////// // Constructors //////////////////////////////////////////////////////////////////////////////// static size_t td_required_buf_size(double compression) { - return sizeof(td_histogram_t) + - (cap_from_compression(compression) * sizeof(node_t)); + return sizeof(td_histogram_t) + (cap_from_compression(compression) * sizeof(node_t)); } // td_init will initialize a td_histogram_t inside buf which is buf_size bytes. // If buf_size is too small or buf is NULL, the returned pointer will be NULL. // In general use td_required_buf_size to figure out what size buffer to // pass.
static td_histogram_t *td_init(double compression, size_t buf_size, char *buf) { - td_histogram_t *h = (td_histogram_t *)(buf); - if (!h) { - return NULL; - } - bbzero((void *)(h), buf_size); - *h = (td_histogram_t) { - .compression = compression, - .cap = (buf_size - sizeof(td_histogram_t)) / sizeof(node_t), - .min = __DBL_MAX__, - .max = __DBL_MIN__, - .total_compressions = 0, - .merged_nodes = 0, - .merged_weight = 0, - .unmerged_nodes = 0, - .unmerged_weight = 0, - }; - return h; + td_histogram_t *h = (td_histogram_t *)(buf); + if (!h) { + return NULL; + } + bbzero((void *)(h), buf_size); + *h = (td_histogram_t){ + .compression = compression, + .cap = (buf_size - sizeof(td_histogram_t)) / sizeof(node_t), + .min = __DBL_MAX__, + .max = __DBL_MIN__, + .total_compressions = 0, + .merged_nodes = 0, + .merged_weight = 0, + .unmerged_nodes = 0, + .unmerged_weight = 0, + }; + return h; } td_histogram_t *td_new(double compression) { - size_t memsize = td_required_buf_size(compression); - return td_init(compression, memsize, (char *)(malloc(memsize))); + size_t memsize = td_required_buf_size(compression); + return td_init(compression, memsize, (char *)(malloc(memsize))); } -void td_free(td_histogram_t *h) { - free((void *)(h)); -} +void td_free(td_histogram_t *h) { free((void *)(h)); } void td_merge(td_histogram_t *into, td_histogram_t *from) { - td_compress(into); - td_compress(from); - for (int i = 0; i < from->merged_nodes; i++) { - node_t *n = &from->nodes[i]; - td_add(into, n->mean, n->count); - } + td_compress(into); + td_compress(from); + for (int i = 0; i < from->merged_nodes; i++) { + node_t *n = &from->nodes[i]; + td_add(into, n->mean, n->count); + } } -int td_centroid_count(td_histogram_t *h){ - return next_node(h); -} +int td_centroid_count(td_histogram_t *h) { return next_node(h); } -double td_size(td_histogram_t *h) { - return h->merged_weight + h->unmerged_weight; -} +double td_size(td_histogram_t *h) { return h->merged_weight + h->unmerged_weight; } double td_cdf(td_histogram_t *h, double val) { - td_compress(h); - if (h->merged_nodes == 0) { - return NAN; - } - double k = 0; - int i = 0; - node_t *n = NULL; - for (i = 0; i < h->merged_nodes; i++) { - n = &h->nodes[i]; - if (n->mean >= val) { - break; - } - k += n->count; - } - if (val == n->mean) { - // technically this needs to find all of the nodes which contain this value and sum their weight - double count_at_value = n->count; - for (i += 1; i < h->merged_nodes && h->nodes[i].mean == n->mean; i++) { - count_at_value += h->nodes[i].count; - } - return (k + (count_at_value/2)) / h->merged_weight; - } else if (val > n->mean) { // past the largest - return 1; - } else if (i == 0) { - return 0; - } - // we want to figure out where along the line from the prev node to this node, the value falls - node_t *nr = n; - node_t *nl = n-1; - k -= (nl->count/2); - // we say that at zero we're at nl->mean - // and at (nl->count/2 + nr->count/2) we're at nr - double m = (nr->mean - nl->mean) / (nl->count/2 + nr->count/2); - double x = (val - nl->mean) / m; - return (k + x) / h->merged_weight; + td_compress(h); + if (h->merged_nodes == 0) { + return NAN; + } + double k = 0; + int i = 0; + node_t *n = NULL; + for (i = 0; i < h->merged_nodes; i++) { + n = &h->nodes[i]; + if (n->mean >= val) { + break; + } + k += n->count; + } + if (val == n->mean) { + // technically this needs to find all of the nodes which contain this value and sum their + // weight + double count_at_value = n->count; + for (i += 1; i < h->merged_nodes && h->nodes[i].mean == 
n->mean; i++) { + count_at_value += h->nodes[i].count; + } + return (k + (count_at_value / 2)) / h->merged_weight; + } else if (val > n->mean) { // past the largest + return 1; + } else if (i == 0) { + return 0; + } + // we want to figure out where along the line from the prev node to this node, the value falls + node_t *nr = n; + node_t *nl = n - 1; + k -= (nl->count / 2); + // we say that at zero we're at nl->mean + // and at (nl->count/2 + nr->count/2) we're at nr + double m = (nr->mean - nl->mean) / (nl->count / 2 + nr->count / 2); + double x = (val - nl->mean) / m; + return (k + x) / h->merged_weight; } - double td_quantile(td_histogram_t *h, double q) { - td_compress(h); - if (q < 0 || q > 1 || h->merged_nodes == 0) { - return NAN; - } - // if left of the first node, use the first node - // if right of the last node, use the last node, use it - double goal = q * h->merged_weight; - double k = 0; - int i = 0; - node_t *n = NULL; - for (i = 0; i < h->merged_nodes; i++) { - n = &h->nodes[i]; - if (k + n->count > goal) { - break; - } - k += n->count; - } - double delta_k = goal - k - (n->count/2); - if (is_very_small(delta_k)) { - return n->mean; - } - bool right = delta_k > 0; - if ((right && ((i+1) == h->merged_nodes)) || - (!right && (i == 0))) { - return n->mean; - } - node_t *nl; - node_t *nr; - if (right) { - nl = n; - nr = &h->nodes[i+1]; - k += (n->count/2); - } else { - nl = &h->nodes[i-1]; - nr = n; - k -= (n->count/2); - } - double x = goal - k; - // we have two points (0, nl->mean), (nr->count, nr->mean) - // and we want x - double m = (nr->mean - nl->mean) / (nl->count/2 + nr->count/2); - return m * x + nl->mean; + td_compress(h); + if (q < 0 || q > 1 || h->merged_nodes == 0) { + return NAN; + } + // if left of the first node, use the first node + // if right of the last node, use the last node, use it + double goal = q * h->merged_weight; + double k = 0; + int i = 0; + node_t *n = NULL; + for (i = 0; i < h->merged_nodes; i++) { + n = &h->nodes[i]; + if (k + n->count > goal) { + break; + } + k += n->count; + } + double delta_k = goal - k - (n->count / 2); + if (is_very_small(delta_k)) { + return n->mean; + } + bool right = delta_k > 0; + if ((right && ((i + 1) == h->merged_nodes)) || (!right && (i == 0))) { + return n->mean; + } + node_t *nl; + node_t *nr; + if (right) { + nl = n; + nr = &h->nodes[i + 1]; + k += (n->count / 2); + } else { + nl = &h->nodes[i - 1]; + nr = n; + k -= (n->count / 2); + } + double x = goal - k; + // we have two points (0, nl->mean), (nr->count, nr->mean) + // and we want x + double m = (nr->mean - nl->mean) / (nl->count / 2 + nr->count / 2); + return m * x + nl->mean; } - void td_add(td_histogram_t *h, double mean, double count) { - if (should_td_compress(h)) { - td_compress(h); - } - if (mean < h->min){ - h->min = mean; - } - if (mean > h->max){ - h->max = mean; - } - h->nodes[next_node(h)] = (node_t) { - .mean = mean, - .count = count, - }; - h->unmerged_nodes++; - h->unmerged_weight += count; + if (should_td_compress(h)) { + td_compress(h); + } + if (mean < h->min) { + h->min = mean; + } + if (mean > h->max) { + h->max = mean; + } + h->nodes[next_node(h)] = (node_t){ + .mean = mean, + .count = count, + }; + h->unmerged_nodes++; + h->unmerged_weight += count; } static int compare_nodes(const void *v1, const void *v2) { - node_t *n1 = (node_t *)(v1); - node_t *n2 = (node_t *)(v2); - if (n1->mean < n2->mean) { - return -1; - } else if (n1->mean > n2->mean) { - return 1; - } else { - return 0; - } + node_t *n1 = (node_t *)(v1); + node_t *n2 = 
(node_t *)(v2); + if (n1->mean < n2->mean) { + return -1; + } else if (n1->mean > n2->mean) { + return 1; + } else { + return 0; + } } void td_compress(td_histogram_t *h) { - if (h->unmerged_nodes == 0) { - return; - } - int N = h->merged_nodes + h->unmerged_nodes; - qsort((void *)(h->nodes), N, sizeof(node_t), &compare_nodes); - double total_count = h->merged_weight + h->unmerged_weight; - double denom = 2 * MM_PI * total_count * log(total_count); - double normalizer = h->compression / denom; - int cur = 0; - double count_so_far = 0; - for (int i = 1; i < N; i++) { - double proposed_count = h->nodes[cur].count + h->nodes[i].count; - double z = proposed_count * normalizer; - double q0 = count_so_far / total_count; - double q2 = (count_so_far + proposed_count) / total_count; - bool should_add = (z <= (q0 * (1 - q0))) && (z <= (q2 * (1 - q2))); - // next point will fit - // so merge into existing centroid - if (should_add) { - h->nodes[cur].count += h->nodes[i].count; - double delta = h->nodes[i].mean - h->nodes[cur].mean; - double weighted_delta = (delta * h->nodes[i].count) / h->nodes[cur].count; - h->nodes[cur].mean += weighted_delta; - } else { - count_so_far += h->nodes[cur].count; - cur++; - h->nodes[cur] = h->nodes[i]; - } - if (cur != i) { - h->nodes[i] = (node_t) { - .mean = 0, - .count = 0, - }; - } - } - h->merged_nodes = cur+1; - h->merged_weight = total_count; - h->unmerged_nodes = 0; - h->unmerged_weight = 0; - h->total_compressions++; + if (h->unmerged_nodes == 0) { + return; + } + int N = h->merged_nodes + h->unmerged_nodes; + qsort((void *)(h->nodes), N, sizeof(node_t), &compare_nodes); + double total_count = h->merged_weight + h->unmerged_weight; + double denom = 2 * MM_PI * total_count * log(total_count); + double normalizer = h->compression / denom; + int cur = 0; + double count_so_far = 0; + for (int i = 1; i < N; i++) { + double proposed_count = h->nodes[cur].count + h->nodes[i].count; + double z = proposed_count * normalizer; + double q0 = count_so_far / total_count; + double q2 = (count_so_far + proposed_count) / total_count; + bool should_add = (z <= (q0 * (1 - q0))) && (z <= (q2 * (1 - q2))); + // next point will fit + // so merge into existing centroid + if (should_add) { + h->nodes[cur].count += h->nodes[i].count; + double delta = h->nodes[i].mean - h->nodes[cur].mean; + double weighted_delta = (delta * h->nodes[i].count) / h->nodes[cur].count; + h->nodes[cur].mean += weighted_delta; + } else { + count_so_far += h->nodes[cur].count; + cur++; + h->nodes[cur] = h->nodes[i]; + } + if (cur != i) { + h->nodes[i] = (node_t){ + .mean = 0, + .count = 0, + }; + } + } + h->merged_nodes = cur + 1; + h->merged_weight = total_count; + h->unmerged_nodes = 0; + h->unmerged_weight = 0; + h->total_compressions++; } -double td_min(td_histogram_t *h) { - return h->min; -} +double td_min(td_histogram_t *h) { return h->min; } -double td_max(td_histogram_t *h) { - return h->max; -} +double td_max(td_histogram_t *h) { return h->max; } -int td_compression(td_histogram_t *h) { - return h->compression; -} +int td_compression(td_histogram_t *h) { return h->compression; } -// TODO: -const double* td_centroids_weight(td_histogram_t *h){ - return NULL; -} +// TODO: +const double *td_centroids_weight(td_histogram_t *h) { return NULL; } -// TODO: -const double *td_centroids_mean(td_histogram_t *h){ - return NULL; -} +// TODO: +const double *td_centroids_mean(td_histogram_t *h) { return NULL; } -double td_centroids_weight_at(td_histogram_t *h, int pos){ - if (pos < 0 || pos > h->merged_nodes) { - 
return NAN; - } - return h->nodes[pos].count; +double td_centroids_weight_at(td_histogram_t *h, int pos) { + if (pos < 0 || pos > h->merged_nodes) { + return NAN; + } + return h->nodes[pos].count; } -double td_centroids_mean_at(td_histogram_t *h, int pos){ - if (pos < 0 || pos > h->merged_nodes) { - return NAN; - } - return h->nodes[pos].mean; +double td_centroids_mean_at(td_histogram_t *h, int pos) { + if (pos < 0 || pos > h->merged_nodes) { + return NAN; + } + return h->nodes[pos].mean; } \ No newline at end of file diff --git a/src/tdigest.h b/src/tdigest.h index 6649a3d..8dc7fce 100644 --- a/src/tdigest.h +++ b/src/tdigest.h @@ -5,14 +5,15 @@ * Adaptive histogram based on something like streaming k-means crossed with Q-digest. * The implementation is a direct descendent of MergingDigest * https://github.com/tdunning/t-digest/ - * + * * Copyright (c) 2018 Andrew Werner, All rights reserved. * * The special characteristics of this algorithm are: * * - smaller summaries than Q-digest * - * - provides part per million accuracy for extreme quantiles and typically <1000 ppm accuracy for middle quantiles + * - provides part per million accuracy for extreme quantiles and typically <1000 ppm accuracy + * for middle quantiles * * - fast * @@ -23,61 +24,59 @@ #define MM_PI 3.14159265358979323846 -typedef struct node -{ - double mean; - double count; +typedef struct node { + double mean; + double count; } node_t; -struct td_histogram -{ - // compression is a setting used to configure the size of centroids when merged. - double compression; +struct td_histogram { + // compression is a setting used to configure the size of centroids when merged. + double compression; - double min; - double max; + double min; + double max; - // cap is the total size of nodes - int cap; - // merged_nodes is the number of merged nodes at the front of nodes. - int merged_nodes; - // unmerged_nodes is the number of buffered nodes. - int unmerged_nodes; + // cap is the total size of nodes + int cap; + // merged_nodes is the number of merged nodes at the front of nodes. + int merged_nodes; + // unmerged_nodes is the number of buffered nodes. + int unmerged_nodes; - // we run the merge in reverse every other merge to avoid left-to-right bias in merging - long long total_compressions; + // we run the merge in reverse every other merge to avoid left-to-right bias in merging + long long total_compressions; - double merged_weight; - double unmerged_weight; + double merged_weight; + double unmerged_weight; - node_t nodes[]; + node_t nodes[]; }; typedef struct td_histogram td_histogram_t; #ifdef __cplusplus -extern "C" -{ +extern "C" { #endif - /** +/** * Allocate the memory, initialise the t-digest, and return the histogram as output parameter. - * @param compression The compression parameter. - * 100 is a common value for normal uses. - * 1000 is extremely large. - * The number of centroids retained will be a smallish (usually less than 10) multiple of this number. + * @param compression The compression parameter. + * 100 is a common value for normal uses. + * 1000 is extremely large. + * The number of centroids retained will be a smallish (usually less than 10) multiple of this + * number. * @return the histogram on success, NULL if allocation failed. */ - td_histogram_t *td_new(double compression); +td_histogram_t *td_new(double compression); - /** +/** * Frees the memory associated with the t-digest. * * @param h The histogram you want to free. 
*/ - void td_free(td_histogram_t *h); +void td_free(td_histogram_t *h); - /** +/** * Reset a histogram to zero - empty out a histogram and re-initialise it * * If you want to re-use an existing histogram, but reset everything back to zero, this @@ -86,128 +85,128 @@ extern "C" * @param h The histogram you want to reset to empty. * */ - void td_reset(td_histogram_t *h); +void td_reset(td_histogram_t *h); - /** +/** * Adds a sample to a histogram. * * @param val The value to add. * @param weight The weight of this point. */ - void td_add(td_histogram_t *h, double val, double weight); +void td_add(td_histogram_t *h, double val, double weight); - /** +/** * Re-examines a t-digest to determine whether some centroids are redundant. If your data are * perversely ordered, this may be a good idea. Even if not, this may save 20% or so in space. * * The cost is roughly the same as adding as many data points as there are centroids. This * is typically < 10 * compression, but could be as high as 100 * compression. * This is a destructive operation that is not thread-safe. - * + * * @param h The histogram you want to compress. * */ - void td_compress(td_histogram_t *h); +void td_compress(td_histogram_t *h); - /** - * Merges all of the values from 'from' to 'this' histogram. +/** + * Merges all of the values from 'from' to 'this' histogram. * * @param h "This" pointer * @param from Histogram to copy values from. */ - void td_merge(td_histogram_t *h, td_histogram_t *from); +void td_merge(td_histogram_t *h, td_histogram_t *from); - /** +/** * Returns the fraction of all points added which are ≤ x. * * @param x The cutoff for the cdf. * @return The fraction of all data which is less or equal to x. */ - double td_cdf(td_histogram_t *h, double x); +double td_cdf(td_histogram_t *h, double x); - /** - * Returns an estimate of the cutoff such that a specified fraction of the data - * added to this TDigest would be less than or equal to the cutoff. - * - * @param q The desired fraction - * @return The value x such that cdf(x) == q; - */ - double td_quantile(td_histogram_t *h, double q); +/** + * Returns an estimate of the cutoff such that a specified fraction of the data + * added to this TDigest would be less than or equal to the cutoff. + * + * @param q The desired fraction + * @return The value x such that cdf(x) == q; + */ +double td_quantile(td_histogram_t *h, double q); - /** +/** * Returns the current compression factor. * * @return The compression factor originally used to set up the TDigest. */ - int td_compression(td_histogram_t *h); +int td_compression(td_histogram_t *h); - /** - * Returns the number of points that have been added to this TDigest. - * - * @return The sum of the weights on all centroids. - */ - double td_size(td_histogram_t *h); +/** + * Returns the number of points that have been added to this TDigest. + * + * @return The sum of the weights on all centroids. + */ +double td_size(td_histogram_t *h); - /** +/** * Returns the number of centroids being used by this TDigest. * * @return The number of centroids being used. */ - int td_centroid_count(td_histogram_t *h); +int td_centroid_count(td_histogram_t *h); - /** +/** * Get minimum value from the histogram. Will return __DBL_MAX__ if the histogram * is empty. * * @param h "This" pointer */ - double td_min(td_histogram_t *h); +double td_min(td_histogram_t *h); - /** +/** * Get maximum value from the histogram. Will return __DBL_MIN__ if the histogram * is empty. 
* * @param h "This" pointer */ - double td_max(td_histogram_t *h); +double td_max(td_histogram_t *h); - /** - * Get the full centroids weight array for 'this' histogram. +/** + * Get the full centroids weight array for 'this' histogram. * * @param h "This" pointer - * + * * @return The full centroids weight array. */ - const double *td_centroids_weight(td_histogram_t *h); +const double *td_centroids_weight(td_histogram_t *h); - /** - * Get the full centroids mean array for 'this' histogram. +/** + * Get the full centroids mean array for 'this' histogram. * * @param h "This" pointer - * + * * @return The full centroids mean array. */ - const double *td_centroids_mean(td_histogram_t *h); +const double *td_centroids_mean(td_histogram_t *h); - /** - * Get the centroid weight for 'this' histogram and 'pos'. +/** + * Get the centroid weight for 'this' histogram and 'pos'. * * @param h "This" pointer * @param pos centroid position. - * + * * @return The centroid weight. */ - double td_centroids_weight_at(td_histogram_t *h, int pos); +double td_centroids_weight_at(td_histogram_t *h, int pos); - /** - * Get the centroid mean for 'this' histogram and 'pos'. +/** + * Get the centroid mean for 'this' histogram and 'pos'. * * @param h "This" pointer * @param pos centroid position. - * + * * @return The centroid mean. */ - double td_centroids_mean_at(td_histogram_t *h, int pos); +double td_centroids_mean_at(td_histogram_t *h, int pos); #ifdef __cplusplus } From 351f12253b05f8ec2e4bffae8510626843f09257 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Sun, 15 Nov 2020 16:16:36 +0000 Subject: [PATCH 5/8] [add] Ensuring linter is run on CI --- .circleci/config.yml | 23 +++++++++++++++++++++++ .gitignore | 3 +++ Makefile | 12 +++++++----- build/.gitignore | 4 ---- src/CMakeLists.txt | 16 ++++++++++------ src/tdigest.c | 8 +++++++- 6 files changed, 50 insertions(+), 16 deletions(-) delete mode 100644 build/.gitignore diff --git a/.circleci/config.yml b/.circleci/config.yml index 3d816ce..2bb2c7f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,6 +2,28 @@ version: 2.1 jobs: + lint: + docker: + - image: redislabsmodules/llvm-toolset:latest + steps: + - checkout + - run: + name: lint + command: | + make lint + + static-analysis-infer: + docker: + - image: redisbench/infer-linux64:1.0.0 + steps: + - checkout + - run: + name: Submodule checkout + command: git submodule update --init --recursive + - run: + name: run fbinfer + command: | + CC=clang CXX=clang++ INFER=infer make static-analysis build: docker: - image: "debian:stretch" @@ -40,6 +62,7 @@ jobs: workflows: commit: jobs: + - lint - build nightly: triggers: diff --git a/.gitignore b/.gitignore index c299066..86d0f78 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,9 @@ build/* .vscode tests/vendor/* +# fb infer static analysis +infer-out/* + # Prerequisites *.d diff --git a/Makefile b/Makefile index 7b8785e..5f5686d 100644 --- a/Makefile +++ b/Makefile @@ -90,18 +90,20 @@ coverage: ( cd build ; cmake $(CMAKE_TEST_OPTIONS) .. ; $(MAKE) ; $(MAKE) test; make coverage; ) format: - clang-format -style=file -i $(SRCDIR)/* + clang-format -style=file -i $(SRCDIR)/*.c + clang-format -style=file -i $(SRCDIR)/*.h lint: - clang-format -style=file -Werror -n $(SRCDIR)/* + clang-format -style=file -Werror -n $(SRCDIR)/*.c + clang-format -style=file -Werror -n $(SRCDIR)/*.h # build all full: ( cd build ; cmake $(CMAKE_FULL_OPTIONS) .. 
; $(MAKE) ) -static-analysis-docker: - $(MAKE) clean - docker run -v $(ROOT)/:/RedisBloom/ --user "$(username):$(usergroup)" $(INFER_DOCKER) bash -c "cd RedisBloom && CC=clang infer run --fail-on-issue --biabduction --skip-analysis-in-path ".*rmutil.*" -- make" +# static-analysis-docker: +# $(MAKE) clean +# docker run -v $(ROOT)/:/t-digest-c/ --user "$(username):$(usergroup)" $(INFER_DOCKER) bash -c "cd t-digest-c && CC=clang infer run --keep-going --fail-on-issue --biabduction -- make test" clean: distclean diff --git a/build/.gitignore b/build/.gitignore deleted file mode 100644 index 153d0f5..0000000 --- a/build/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# Ignores all files in tdigest/build except this one. -* -*/ -!.gitignore \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 579b1ba..668d7f4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,11 +2,15 @@ FILE(GLOB c_files "*.c") FILE(GLOB header_files "*.h") if (BUILD_SHARED) -add_library(tdigest SHARED ${c_files} ${header_files}) target_link_libraries(tdigest m) + add_library(tdigest SHARED ${c_files} ${header_files}) + target_link_libraries(tdigest m) target_include_directories(tdigest SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) - install(TARGETS tdigest DESTINATION lib${LIB_SUFFIX}) endif(BUILD_SHARED) + install(TARGETS tdigest DESTINATION lib${LIB_SUFFIX}) +endif(BUILD_SHARED) - if (BUILD_STATIC) add_library(tdigest_static STATIC ${c_files} ${ - header_files}) target_link_libraries(tdigest_static m) - target_include_directories(tdigest_static SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) - install(TARGETS tdigest_static DESTINATION lib${LIB_SUFFIX}) endif(BUILD_STATIC) \ No newline at end of file +if (BUILD_STATIC) + add_library(tdigest_static STATIC ${c_files} ${header_files}) + target_link_libraries(tdigest_static m) + target_include_directories(tdigest_static SYSTEM PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) + install(TARGETS tdigest_static DESTINATION lib${LIB_SUFFIX}) +endif(BUILD_STATIC) diff --git a/src/tdigest.c b/src/tdigest.c index e99a8dc..0d31da8 100644 --- a/src/tdigest.c +++ b/src/tdigest.c @@ -89,6 +89,9 @@ double td_cdf(td_histogram_t *h, double val) { } k += n->count; } + if (n == NULL) { + return NAN; + } if (val == n->mean) { // technically this needs to find all of the nodes which contain this value and sum their // weight @@ -131,6 +134,9 @@ double td_quantile(td_histogram_t *h, double q) { } k += n->count; } + if (n == NULL) { + return NAN; + } double delta_k = goal - k - (n->count / 2); if (is_very_small(delta_k)) { return n->mean; @@ -254,4 +260,4 @@ double td_centroids_mean_at(td_histogram_t *h, int pos) { return NAN; } return h->nodes[pos].mean; -} \ No newline at end of file +} From 29dbac4b337f117624ae33f6710159f724e814d1 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Sun, 15 Nov 2020 16:27:47 +0000 Subject: [PATCH 6/8] [fix] Fixed build/* dir issue --- Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 5f5686d..881f268 100644 --- a/Makefile +++ b/Makefile @@ -65,15 +65,15 @@ default: full # just build the static library. Do not build tests or benchmarks library_static: - ( cd build ; cmake $(CMAKE_LIBRARY_STATIC_OPTIONS) .. ; $(MAKE) ) + ( mkdir -p build; cd build ; cmake $(CMAKE_LIBRARY_STATIC_OPTIONS) .. ; $(MAKE) ) # just build the shared library. Do not build tests or benchmarks library_shared: - ( cd build ; cmake $(CMAKE_LIBRARY_SHARED_OPTIONS) .. 
; $(MAKE) ) + ( mkdir -p build; cd build ; cmake $(CMAKE_LIBRARY_SHARED_OPTIONS) .. ; $(MAKE) ) # just build the static and shared libraries. Do not build tests or benchmarks library_all: - ( cd build ; cmake $(CMAKE_LIBRARY_OPTIONS) .. ; $(MAKE) ) + ( mkdir -p build; cd build ; cmake $(CMAKE_LIBRARY_OPTIONS) .. ; $(MAKE) ) # just build the static and shared libraries and produce measurements # of accuracy versus compression factor for fixed data size @@ -81,13 +81,13 @@ library_all: # just build the static and shared libraries and tests unit_tests: - ( cd build ; cmake $(CMAKE_TEST_OPTIONS) .. ; $(MAKE) ; $(MAKE) test ) + ( mkdir -p build; cd build ; cmake $(CMAKE_TEST_OPTIONS) .. ; $(MAKE) ; $(MAKE) test ) test: $(MAKE) unit_tests coverage: - ( cd build ; cmake $(CMAKE_TEST_OPTIONS) .. ; $(MAKE) ; $(MAKE) test; make coverage; ) + ( mkdir -p build; cd build ; cmake $(CMAKE_TEST_OPTIONS) .. ; $(MAKE) ; $(MAKE) test; make coverage; ) format: clang-format -style=file -i $(SRCDIR)/*.c @@ -99,7 +99,7 @@ lint: # build all full: - ( cd build ; cmake $(CMAKE_FULL_OPTIONS) .. ; $(MAKE) ) + ( mkdir -p build; cd build ; cmake $(CMAKE_FULL_OPTIONS) .. ; $(MAKE) ) # static-analysis-docker: # $(MAKE) clean From 334215f9c148c979034ccfa829ffadb830c19cbd Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Sun, 15 Nov 2020 17:15:26 +0000 Subject: [PATCH 7/8] [add] Added perf makefile helpers --- Makefile | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 881f268..9561fcd 100644 --- a/Makefile +++ b/Makefile @@ -14,6 +14,7 @@ ifndef CMAKE_LIBRARY_SHARED_OPTIONS -DBUILD_SHARED=ON \ -DBUILD_STATIC=OFF \ -DENABLE_FRAME_POINTER=ON \ + -DENABLE_CODECOVERAGE=OFF \ -DBUILD_TESTS=OFF \ -DBUILD_BENCHMARK=OFF \ -DBUILD_EXAMPLES=OFF @@ -24,6 +25,7 @@ ifndef CMAKE_LIBRARY_STATIC_OPTIONS -DBUILD_SHARED=OFF \ -DBUILD_STATIC=ON \ -DENABLE_FRAME_POINTER=ON \ + -DENABLE_CODECOVERAGE=OFF \ -DBUILD_TESTS=OFF \ -DBUILD_BENCHMARK=OFF \ -DBUILD_EXAMPLES=OFF @@ -34,6 +36,7 @@ ifndef CMAKE_LIBRARY_OPTIONS -DBUILD_SHARED=ON \ -DBUILD_STATIC=ON \ -DENABLE_FRAME_POINTER=ON \ + -DENABLE_CODECOVERAGE=OFF \ -DBUILD_TESTS=OFF \ -DBUILD_BENCHMARK=OFF \ -DBUILD_EXAMPLES=OFF @@ -44,6 +47,7 @@ ifndef CMAKE_FULL_OPTIONS -DBUILD_SHARED=ON \ -DBUILD_STATIC=ON \ -DENABLE_FRAME_POINTER=ON \ + -DENABLE_CODECOVERAGE=OFF \ -DBUILD_TESTS=ON \ -DBUILD_BENCHMARK=ON \ -DBUILD_EXAMPLES=ON @@ -99,7 +103,7 @@ lint: # build all full: - ( mkdir -p build; cd build ; cmake $(CMAKE_FULL_OPTIONS) .. ; $(MAKE) ) + ( mkdir -p build; cd build ; cmake $(CMAKE_FULL_OPTIONS) .. 
; $(MAKE) VERBOSE=1 ) # static-analysis-docker: # $(MAKE) clean @@ -110,5 +114,17 @@ clean: distclean distclean: rm -rf build/* -bench: full - $(SHOW) build/tests/histogram_benchmark --benchmark_min_time=10 \ No newline at end of file +bench: clean + CFLAGS="-g -fno-omit-frame-pointer " CXXFLAGS="-g -fno-omit-frame-pointer " $(MAKE) + $(SHOW) build/tests/histogram_benchmark --benchmark_min_time=10 + +perf-stat-bench: + CFLAGS="-g -fno-omit-frame-pointer " CXXFLAGS="-g -fno-omit-frame-pointer " $(MAKE) + $(SHOW) perf stat build/tests/histogram_benchmark --benchmark_min_time=10 + +perf-record-bench: + CFLAGS="-g -fno-omit-frame-pointer " CXXFLAGS="-g -fno-omit-frame-pointer " $(MAKE) + $(SHOW) perf record -g -o perf.data.td_add build/tests/histogram_benchmark --benchmark_min_time=10 + +perf-report-bench: + $(SHOW) perf report -g -i perf.data.td_add \ No newline at end of file From 42f8b5a34d205396c791c951341e6ab8cb993a97 Mon Sep 17 00:00:00 2001 From: filipecosta90 Date: Sun, 15 Nov 2020 17:25:43 +0000 Subject: [PATCH 8/8] [add] Ignoring perf.data and also properly displaying perf report data --- .gitignore | 4 ++++ Makefile | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 86d0f78..e908041 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,10 @@ build/* .vscode tests/vendor/* +# perf related +perf.data* + + # fb infer static analysis infer-out/* diff --git a/Makefile b/Makefile index 9561fcd..b7e0948 100644 --- a/Makefile +++ b/Makefile @@ -127,4 +127,4 @@ perf-record-bench: $(SHOW) perf record -g -o perf.data.td_add build/tests/histogram_benchmark --benchmark_min_time=10 perf-report-bench: - $(SHOW) perf report -g -i perf.data.td_add \ No newline at end of file + $(SHOW) perf report -g 'graph,0.5,caller' -i perf.data.td_add \ No newline at end of file
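
A note on the td_compress hunk in PATCH 4/8 above, for reviewers checking the merge criterion: with total weight N and compression delta, the code computes normalizer = delta / (2 * MM_PI * N * log(N)) and merges a neighbouring centroid only while z = proposed_count * normalizer stays at or below q * (1 - q) at both ends of the merged span. As a worked example (the values are purely illustrative): for N = 10,000 and delta = 100, normalizer = 100 / (2 * pi * 10000 * ln 10000), which is about 1.7e-4, so a centroid near the median (where q * (1 - q) is about 0.25) may absorb roughly 0.25 / 1.7e-4, about 1,450 samples, while at q = 0.001 (where q * (1 - q) is about 1e-3) it is capped near 6 samples. That size gradient is what concentrates accuracy at the extreme quantiles, as the header comment in src/tdigest.h claims.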
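
Finally, for anyone reviewing this series who wants to exercise the library behind the new `make test` target: the public API is the one declared in src/tdigest.h in PATCH 4/8 above. The sketch below is illustrative only; the file name example.c, the sample data, and the link line are assumptions and not part of the series. It uses only functions declared in that header:

```
/* example.c - minimal, hypothetical driver for the t-digest API above */
#include <stdio.h>
#include "tdigest.h"

int main(void) {
    td_histogram_t *h = td_new(100); /* 100 is the common compression value per tdigest.h */
    if (h == NULL)
        return 1;
    /* 10,000 unit-weight samples: 1.0 .. 10000.0 */
    for (int i = 1; i <= 10000; i++)
        td_add(h, (double)i, 1);
    /* td_quantile and td_cdf compress the digest internally before answering */
    printf("p50 ~ %f\n", td_quantile(h, 0.50)); /* expect roughly 5000 */
    printf("p99 ~ %f\n", td_quantile(h, 0.99)); /* expect roughly 9900 */
    printf("cdf(9900) ~ %f\n", td_cdf(h, 9900.0));
    printf("min=%f max=%f n=%f centroids=%d\n", td_min(h), td_max(h), td_size(h),
           td_centroid_count(h));
    td_free(h);
    return 0;
}
```

Building it would look something like `cc example.c -Isrc -Lbuild/src -ltdigest -lm` after `make library_shared`; the exact library path depends on the local CMake output layout, and the `-lm` mirrors the target_link_libraries(tdigest m) line in src/CMakeLists.txt.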