Fix asv problems (#2065)
#### Reference Issues/PRs
Fixes problems with publishing the ASV benchmarks

#### What does this implement or fix?
This PR implements the following:
- Upgrades the Python analysis flow and the C++ tests to Python 3.11; this
is needed because some of the benchmarks require at least Python 3.10
- Fixes the problems with publishing the benchmarks, which fail because
`np.inf` in the results cannot be evaluated correctly (see the minimal sketch after this list)
- Fixes Git LFS handling for the BI benchmarks
- Incorporates changes from
#2060
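
The `np.inf` issue can be summarised with a minimal, hypothetical sketch (this is not the actual `transform_asv_results.py` logic, and it assumes the stored values are Python-literal strings that get evaluated during publishing):

```python
# Hypothetical illustration of the failure mode, not the real publishing code.
from numpy import inf  # without this import, the eval below raises NameError

raw = "[0.0012, inf, 0.0034]"  # example of a stored value containing a bare "inf"
values = eval(raw)             # succeeds only because numpy's inf is now in scope
print(values)                  # [0.0012, inf, 0.0034]
```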

See a successful run
[here](https://github.com/man-group/ArcticDB/actions/runs/12317667688/job/34380720877)
(it had to be started manually because of the changes to the workflow files).

#### Any other comments?

#### Checklist

<details>
  <summary>
   Checklist for code changes...
  </summary>
 
- [ ] Have you updated the relevant docstrings, documentation and
copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's
features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error
messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in
autogenerated release notes?
</details>

<!--
Thanks for contributing a Pull Request to ArcticDB! Please ensure you
have taken a look at:
- ArcticDB's Code of Conduct:
https://github.com/man-group/ArcticDB/blob/master/CODE_OF_CONDUCT.md
- ArcticDB's Contribution Licensing:
https://github.com/man-group/ArcticDB/blob/master/docs/mkdocs/docs/technical/contributing.md#contribution-licensing
-->
G-D-Petrov authored Dec 16, 2024
1 parent 9b3eed2 commit a02f90f
Showing 8 changed files with 210 additions and 148 deletions.
6 changes: 3 additions & 3 deletions .github/actions/setup_deps/action.yml
@@ -9,8 +9,8 @@ runs:
dnf update -y
dnf remove -y 'gcc-toolset-13-*'
dnf install -y zip flex bison gcc-toolset-10 gcc-toolset-10-gdb gcc-toolset-10-libatomic-devel krb5-devel cyrus-sasl-devel openssl-devel \
unzip tar epel-release jq wget libcurl-devel python3 \
python3-devel python3-pip perl-IPC-Cmd
unzip tar epel-release jq wget libcurl-devel \
python3.11-devel python3.11-pip perl-IPC-Cmd
dnf groupinstall -y 'Development Tools'
@@ -19,7 +19,7 @@ runs:
echo "CXX=/opt/rh/gcc-toolset-10/root/bin/g++" | tee -a $GITHUB_ENV
echo "CMAKE_CXX_COMPILER=/opt/rh/gcc-toolset-10/root/bin/g++" | tee -a $GITHUB_ENV
echo "LD_LIBRARY_PATH=/opt/rh/gcc-toolset-10/root/usr/lib64:/opt/rh/gcc-toolset-10/root/usr/lib:/opt/rh/gcc-toolset-10/root/usr/lib64/dyninst" | tee -a $GITHUB_ENV
echo "/opt/rh/devtoolset-10/root/usr/bin" | tee -a $GITHUB_PATH
echo "/opt/rh/devtoolset-10/root/usr/bin:/opt/python/cp311-cp311/bin" | tee -a $GITHUB_PATH
echo $GITHUB_ENV
2 changes: 1 addition & 1 deletion .github/workflows/analysis_workflow.yml
@@ -6,7 +6,7 @@ on:
type: boolean
default: false

schedule: # Schdeule the job to run at 12 a.m. daily
schedule: # Schedule the job to run at 12 a.m. daily
- cron: '0 0 * * *'

pull_request_target:
21 changes: 14 additions & 7 deletions .github/workflows/benchmark_commits.yml
@@ -31,8 +31,14 @@ jobs:
defaults:
run: {shell: bash}
steps:
- name: Initialize LFS
shell: bash -l {0}
run: |
dnf install -y git-lfs
- uses: actions/checkout@…
with:
lfs: 'true'
fetch-depth: 0
submodules: recursive
token: ${{ secrets.ARCTICDB_TEST_PAT }}
@@ -46,14 +52,15 @@
- name: Install deps
uses: ./.github/actions/setup_deps

# We are changing the python here because we want to use the default python to build (it is devel version)
# and this python for the rest of the testing
- name: Select Python (Linux)
shell: bash -el {0}
- name: Extra envs
shell: bash -l {0}
run: |
ls /opt/python
echo /opt/python/cp36-cp36m/bin >> $GITHUB_PATH
. build_tooling/vcpkg_caching.sh # Linux follower needs another call in CIBW
echo -e "VCPKG_BINARY_SOURCES=$VCPKG_BINARY_SOURCES
VCPKG_ROOT=$PLATFORM_VCPKG_ROOT" | tee -a $GITHUB_ENV
cmake -P cpp/CMake/CpuCount.cmake | sed 's/^-- //' | tee -a $GITHUB_ENV
env:
CMAKE_BUILD_PARALLEL_LEVEL: ${{vars.CMAKE_BUILD_PARALLEL_LEVEL}}

- name: Set persistent storage variables
uses: ./.github/actions/set_persistent_storage_env_vars
2 changes: 2 additions & 0 deletions build_tooling/transform_asv_results.py
@@ -5,7 +5,9 @@
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""

import pandas as pd
from numpy import inf
from arcticdb.storage_fixtures.s3 import real_s3_from_environment_variables
import json
from pathlib import Path
136 changes: 68 additions & 68 deletions python/.asv/results/benchmarks.json

Large diffs are not rendered by default.

101 changes: 74 additions & 27 deletions python/benchmarks/basic_functions.py
@@ -5,6 +5,8 @@
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""
import time
from typing import List
from arcticdb import Arctic
from arcticdb.version_store.library import WritePayload, ReadRequest
import pandas as pd
@@ -185,6 +187,7 @@ def time_read_batch_pure(self, rows, num_symbols):
def peakmem_read_batch(self, rows, num_symbols):
read_reqs = [ReadRequest(f"{sym}_sym") for sym in range(num_symbols)]
self.lib.read_batch(read_reqs)

def time_read_batch_with_columns(self, rows, num_symbols):
COLS = ["value"]
read_reqs = [
@@ -214,14 +217,22 @@ def peakmem_read_batch_with_date_ranges(self, rows, num_symbols):
]
self.lib.read_batch(read_reqs)

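# Shared helper for the modification benchmarks below: returns pd.Timestamp("1/1/2023")
# shifted by rows * (fraction - 1) seconds, so fraction == 1 is the anchor time itself
# and larger fractions give strictly later timestamps.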
def get_time_at_fraction_of_df(fraction, rows):
end_time = pd.Timestamp("1/1/2023")
time_delta = pd.tseries.offsets.DateOffset(seconds=round(rows * (fraction-1)))
return end_time + time_delta

from shutil import copytree, rmtree
class ModificationFunctions:
"""
Modification functions (update, append, delete) need a different setup/teardown process, thus we place them in a
separate group.
"""
rounds = 1
number = 1 # We do a single run between setup and teardown because we e.g. can't delete a symbol twice
repeat = 3
warmup_time=0

timeout = 6000
ARCTIC_DIR = "modification_functions"
ARCTIC_DIR_ORIGINAL = "modification_functions_original"
@@ -232,7 +243,42 @@ class ModificationFunctions:
params = PARAMS
param_names = PARAM_NAMES

class LargeAppendDataModify:
"""
This class holds a cache of large dataframes used for the append benchmarks.
The purpose of this cache is to create dataframes whose timestamps are
sequenced over time so that the appended ranges do not overlap.
"""

def __init__(self, num_rows_list:List[int], number_elements:int):
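# num_rows_list: the parametrised row counts; number_elements: how many benchmark
# invocations to cover (the loop below builds number_elements + 1 frames per row count).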
self.df_append_large = {}
self.df_append_short_wide = {}
start_time = time.time()
for rows in num_rows_list:
print("Generating dataframe with rows: ", rows)
lst = list()
lst_saw = list()
for n in range(number_elements+1):
print("Generating dataframe no: ", n)

df = generate_pseudo_random_dataframe(rows, "s", get_time_at_fraction_of_df(2*(n+1), rows))
df_saw = generate_random_floats_dataframe_with_index(
ModificationFunctions.WIDE_DF_ROWS, ModificationFunctions.WIDE_DF_COLS, "s",
get_time_at_fraction_of_df(2*(n+1), rows=ModificationFunctions.WIDE_DF_ROWS)
)

lst.append(df)
lst_saw.append(df_saw)
print(f"STANDARD Index {df.iloc[0].name} - {df.iloc[df.shape[0] - 1].name}")
print(f"SHORT_n_WIDE Index {df_saw.iloc[0].name} - {df_saw.iloc[df_saw.shape[0] - 1].name}")
print("Add dataframes: ", len(lst))
self.df_append_large[rows] = lst
self.df_append_short_wide[rows] = lst_saw
print("APPEND LARGE cache generation took (s) :", time.time() - start_time)

def setup_cache(self):
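# Runs once (rather than around every repeat): creates the pre-written libraries,
# snapshots ARCTIC_DIR so teardown can restore it, and builds the append cache that
# asv passes on to setup, teardown and the benchmark methods.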

self.ac = Arctic(ModificationFunctions.CONNECTION_STRING)
rows_values = ModificationFunctions.params

@@ -241,7 +287,9 @@ def setup_cache(self):
lib_name = get_prewritten_lib_name(rows)
self.ac.delete_library(lib_name)
lib = self.ac.create_library(lib_name)
lib.write("sym", self.init_dfs[rows])
df = self.init_dfs[rows]
lib.write("sym", df)
print(f"INITIAL DATAFRAME {rows} rows has Index {df.iloc[0].name} - {df.iloc[df.shape[0] - 1].name}")

lib_name = get_prewritten_lib_name(ModificationFunctions.WIDE_DF_ROWS)
self.ac.delete_library(lib_name)
@@ -257,62 +305,61 @@ def setup_cache(self):
# Then on each teardown we restore the initial state by overwriting the modified with the original.
copytree(ModificationFunctions.ARCTIC_DIR, ModificationFunctions.ARCTIC_DIR_ORIGINAL)

number_iteration = ModificationFunctions.repeat * ModificationFunctions.number * ModificationFunctions.rounds

def setup(self, rows):
def get_time_at_fraction_of_df(fraction, rows=rows):
end_time = pd.Timestamp("1/1/2023")
time_delta = pd.tseries.offsets.DateOffset(seconds=round(rows * (fraction-1)))
return end_time + time_delta
lad = ModificationFunctions.LargeAppendDataModify(ModificationFunctions.params, number_iteration)

return lad

def setup(self, lad: LargeAppendDataModify, rows):

self.df_update_single = generate_pseudo_random_dataframe(1, "s", get_time_at_fraction_of_df(0.5))
self.df_update_half = generate_pseudo_random_dataframe(rows//2, "s", get_time_at_fraction_of_df(0.75))
self.df_update_upsert = generate_pseudo_random_dataframe(rows, "s", get_time_at_fraction_of_df(1.5))
self.df_append_single = generate_pseudo_random_dataframe(1, "s", get_time_at_fraction_of_df(1.1))
self.df_append_large = generate_pseudo_random_dataframe(rows, "s", get_time_at_fraction_of_df(2))
self.df_update_single = generate_pseudo_random_dataframe(1, "s", get_time_at_fraction_of_df(0.5, rows))
self.df_update_half = generate_pseudo_random_dataframe(rows//2, "s", get_time_at_fraction_of_df(0.75, rows))
self.df_update_upsert = generate_pseudo_random_dataframe(rows, "s", get_time_at_fraction_of_df(1.5, rows))
self.df_append_single = generate_pseudo_random_dataframe(1, "s", get_time_at_fraction_of_df(1.1, rows))

self.df_update_short_wide = generate_random_floats_dataframe_with_index(
ModificationFunctions.WIDE_DF_ROWS, ModificationFunctions.WIDE_DF_COLS
)
self.df_append_short_wide = generate_random_floats_dataframe_with_index(
ModificationFunctions.WIDE_DF_ROWS, ModificationFunctions.WIDE_DF_COLS, "s", get_time_at_fraction_of_df(2, rows=ModificationFunctions.WIDE_DF_ROWS)
)

self.ac = Arctic(ModificationFunctions.CONNECTION_STRING)
self.lib = self.ac[get_prewritten_lib_name(rows)]
self.lib_short_wide = self.ac[get_prewritten_lib_name(ModificationFunctions.WIDE_DF_ROWS)]


def teardown(self, rows):
def teardown(self, lad: LargeAppendDataModify, rows):
# After the modification functions clean up the changes by replacing the modified ARCTIC_DIR with the original ARCTIC_DIR_ORIGINAL
# TODO: We can use dirs_exist_ok=True on copytree instead of removing first if we run with python version >=3.8
rmtree(ModificationFunctions.ARCTIC_DIR)
copytree(ModificationFunctions.ARCTIC_DIR_ORIGINAL, ModificationFunctions.ARCTIC_DIR)
del self.ac

del self.ac

def time_update_single(self, rows):
def time_update_single(self, lad: LargeAppendDataModify, rows):
self.lib.update(f"sym", self.df_update_single)

def time_update_half(self, rows):
def time_update_half(self, lad: LargeAppendDataModify, rows):
self.lib.update(f"sym", self.df_update_half)

def time_update_upsert(self, rows):
def time_update_upsert(self, lad: LargeAppendDataModify, rows):
self.lib.update(f"sym", self.df_update_upsert, upsert=True)

def time_update_short_wide(self, rows):
def time_update_short_wide(self, lad: LargeAppendDataModify, rows):
self.lib_short_wide.update("short_wide_sym", self.df_update_short_wide)

def time_append_single(self, rows):
def time_append_single(self, lad: LargeAppendDataModify, rows):
self.lib.append(f"sym", self.df_append_single)

def time_append_large(self, rows):
self.lib.append(f"sym", self.df_append_large)
def time_append_large(self, lad: LargeAppendDataModify, rows):
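# Pop the next pre-generated frame so each timed append starts after the previously appended range.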
large: pd.DataFrame = lad.df_append_large[rows].pop(0)
self.lib.append(f"sym", large)

def time_append_short_wide(self, rows):
self.lib_short_wide.append("short_wide_sym", self.df_append_short_wide)
def time_append_short_wide(self, lad: LargeAppendDataModify, rows):
large: pd.DataFrame = lad.df_append_short_wide[rows].pop(0)
self.lib_short_wide.append("short_wide_sym", large)

def time_delete(self, rows):
def time_delete(self, lad: LargeAppendDataModify, rows):
self.lib.delete(f"sym")

def time_delete_short_wide(self, rows):
def time_delete_short_wide(self, lad: LargeAppendDataModify, rows):
self.lib_short_wide.delete("short_wide_sym")
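
For context, the new `setup`/`teardown`/benchmark signatures above rely on asv passing the object returned by `setup_cache` as the first argument to `setup`, `teardown` and every benchmark method. A minimal, self-contained sketch of that pattern (hypothetical names, not ArcticDB code) looks like this:

```python
# Minimal asv-style benchmark class illustrating the setup_cache pattern used above.
class AppendBenchmarkSketch:
    number = 1        # one timed call per setup/teardown cycle
    repeat = 3        # three samples, each wrapped by setup/teardown
    rounds = 1
    warmup_time = 0   # no warm-up calls, so the pre-built cache is not consumed early
    params = [1_000, 10_000]
    param_names = ["rows"]

    def setup_cache(self):
        # Executed once; the returned object is handed to setup/teardown/benchmarks.
        iterations = self.repeat * self.number * self.rounds
        return {rows: list(range(iterations + 1)) for rows in self.params}

    def setup(self, cache, rows):
        self.items = cache[rows]

    def teardown(self, cache, rows):
        pass

    def time_consume_one(self, cache, rows):
        # Consume one pre-generated element per invocation, mirroring how the
        # ModificationFunctions appends pop one pre-built dataframe per call.
        self.items.pop(0)
```

With `number = 1` and `warmup_time = 0`, asv times exactly one call per setup/teardown cycle, which is why pre-generating `repeat * number * rounds` frames (plus one spare) in `setup_cache` is enough for the append benchmarks.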
6 changes: 1 addition & 5 deletions python/benchmarks/bi_benchmarks.py
@@ -184,8 +184,4 @@ def time_query_groupby_city_count_filter_two_aggregations(self, times_bigger) ->


def peakmem_query_groupby_city_count_filter_two_aggregations(self, times_bigger):
return self.query_groupby_city_count_filter_two_aggregations(times_bigger)




return self.query_groupby_city_count_filter_two_aggregations(times_bigger)