Fix asv problems (#2065)
#### Reference Issues/PRs
Fixes problems with publishing the ASV benchmarks

#### What does this implement or fix?
This PR implements the following:
- Upgrades the Python analysis flow and the C++ tests to Python 3.11; this
is needed because some of the benchmarks require at least Python 3.10
- Fixes the problems with publishing the benchmarks, which fail because
`np.inf` in the results cannot be evaluated correctly (see the minimal sketch after this list)
- Fixes Git LFS handling for the BI benchmarks
- Incorporates changes from
#2060
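
The `np.inf` issue can be summarised with a minimal, hypothetical sketch (this is not the actual `transform_asv_results.py` logic, and it assumes the stored values are Python-literal strings that get evaluated during publishing):

```python
# Hypothetical illustration of the failure mode, not the real publishing code.
from numpy import inf  # without this import, the eval below raises NameError

raw = "[0.0012, inf, 0.0034]"  # example of a stored value containing a bare "inf"
values = eval(raw)             # succeeds only because numpy's inf is now in scope
print(values)                  # [0.0012, inf, 0.0034]
```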

See a successful run
[here](https://github.com/man-group/ArcticDB/actions/runs/12317667688/job/34380720877)
(it had to be started manually because of the changes to the workflow files).

#### Any other comments?

#### Checklist

<details>
  <summary>
   Checklist for code changes...
  </summary>
 
- [ ] Have you updated the relevant docstrings, documentation and
copyright notice?
- [ ] Is this contribution tested against [all ArcticDB's
features](../docs/mkdocs/docs/technical/contributing.md)?
- [ ] Do all exceptions introduced raise appropriate [error
messages](https://docs.arcticdb.io/error_messages/)?
- [ ] Are API changes highlighted in the PR description?
- [ ] Is the PR labelled as enhancement or bug so it appears in
autogenerated release notes?
</details>

<!--
Thanks for contributing a Pull Request to ArcticDB! Please ensure you
have taken a look at:
- ArcticDB's Code of Conduct:
https://github.com/man-group/ArcticDB/blob/master/CODE_OF_CONDUCT.md
- ArcticDB's Contribution Licensing:
https://github.com/man-group/ArcticDB/blob/master/docs/mkdocs/docs/technical/contributing.md#contribution-licensing
-->
G-D-Petrov authored Dec 16, 2024
1 parent 9b3eed2 commit a02f90f
Showing 8 changed files with 210 additions and 148 deletions.
6 changes: 3 additions & 3 deletions .github/actions/setup_deps/action.yml
@@ -9,8 +9,8 @@ runs:
dnf update -y
dnf remove -y 'gcc-toolset-13-*'
dnf install -y zip flex bison gcc-toolset-10 gcc-toolset-10-gdb gcc-toolset-10-libatomic-devel krb5-devel cyrus-sasl-devel openssl-devel \
unzip tar epel-release jq wget libcurl-devel python3 \
python3-devel python3-pip perl-IPC-Cmd
unzip tar epel-release jq wget libcurl-devel \
python3.11-devel python3.11-pip perl-IPC-Cmd
dnf groupinstall -y 'Development Tools'
@@ -19,7 +19,7 @@ runs:
echo "CXX=/opt/rh/gcc-toolset-10/root/bin/g++" | tee -a $GITHUB_ENV
echo "CMAKE_CXX_COMPILER=/opt/rh/gcc-toolset-10/root/bin/g++" | tee -a $GITHUB_ENV
echo "LD_LIBRARY_PATH=/opt/rh/gcc-toolset-10/root/usr/lib64:/opt/rh/gcc-toolset-10/root/usr/lib:/opt/rh/gcc-toolset-10/root/usr/lib64/dyninst" | tee -a $GITHUB_ENV
echo "/opt/rh/devtoolset-10/root/usr/bin" | tee -a $GITHUB_PATH
echo "/opt/rh/devtoolset-10/root/usr/bin:/opt/python/cp311-cp311/bin" | tee -a $GITHUB_PATH
echo $GITHUB_ENV
2 changes: 1 addition & 1 deletion .github/workflows/analysis_workflow.yml
@@ -6,7 +6,7 @@ on:
type: boolean
default: false

schedule: # Schdeule the job to run at 12 a.m. daily
schedule: # Schedule the job to run at 12 a.m. daily
- cron: '0 0 * * *'

pull_request_target:
21 changes: 14 additions & 7 deletions .github/workflows/benchmark_commits.yml
@@ -31,8 +31,14 @@ jobs:
defaults:
run: {shell: bash}
steps:
- name: Initialize LFS
shell: bash -l {0}
run: |
dnf install -y git-lfs
- uses: actions/checkout@…
with:
lfs: 'true'
fetch-depth: 0
submodules: recursive
token: ${{ secrets.ARCTICDB_TEST_PAT }}
@@ -46,14 +52,15 @@
- name: Install deps
uses: ./.github/actions/setup_deps

# We are changing the python here because we want to use the default python to build (it is devel version)
# and this python for the rest of the testing
- name: Select Python (Linux)
shell: bash -el {0}
- name: Extra envs
shell: bash -l {0}
run: |
ls /opt/python
echo /opt/python/cp36-cp36m/bin >> $GITHUB_PATH
. build_tooling/vcpkg_caching.sh # Linux follower needs another call in CIBW
echo -e "VCPKG_BINARY_SOURCES=$VCPKG_BINARY_SOURCES
VCPKG_ROOT=$PLATFORM_VCPKG_ROOT" | tee -a $GITHUB_ENV
cmake -P cpp/CMake/CpuCount.cmake | sed 's/^-- //' | tee -a $GITHUB_ENV
env:
CMAKE_BUILD_PARALLEL_LEVEL: ${{vars.CMAKE_BUILD_PARALLEL_LEVEL}}

- name: Set persistent storage variables
uses: ./.github/actions/set_persistent_storage_env_vars
2 changes: 2 additions & 0 deletions build_tooling/transform_asv_results.py
@@ -5,7 +5,9 @@
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""

import pandas as pd
from numpy import inf
from arcticdb.storage_fixtures.s3 import real_s3_from_environment_variables
import json
from pathlib import Path
136 changes: 68 additions & 68 deletions python/.asv/results/benchmarks.json

Large diffs are not rendered by default.

101 changes: 74 additions & 27 deletions python/benchmarks/basic_functions.py
@@ -5,6 +5,8 @@
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""
import time
from typing import List
from arcticdb import Arctic
from arcticdb.version_store.library import WritePayload, ReadRequest
import pandas as pd
@@ -185,6 +187,7 @@ def time_read_batch_pure(self, rows, num_symbols):
def peakmem_read_batch(self, rows, num_symbols):
read_reqs = [ReadRequest(f"{sym}_sym") for sym in range(num_symbols)]
self.lib.read_batch(read_reqs)

def time_read_batch_with_columns(self, rows, num_symbols):
COLS = ["value"]
read_reqs = [
@@ -214,14 +217,22 @@ def peakmem_read_batch_with_date_ranges(self, rows, num_symbols):
]
self.lib.read_batch(read_reqs)

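# Shared helper for the modification benchmarks below: returns pd.Timestamp("1/1/2023")
# shifted by rows * (fraction - 1) seconds, so fraction == 1 is the anchor time itself
# and larger fractions give strictly later timestamps.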
def get_time_at_fraction_of_df(fraction, rows):
end_time = pd.Timestamp("1/1/2023")
time_delta = pd.tseries.offsets.DateOffset(seconds=round(rows * (fraction-1)))
return end_time + time_delta

from shutil import copytree, rmtree
class ModificationFunctions:
"""
Modification functions (update, append, delete) need a different setup/teardown process, thus we place them in a
separate group.
"""
rounds = 1
number = 1 # We do a single run between setup and teardown because we e.g. can't delete a symbol twice
repeat = 3
warmup_time=0

timeout = 6000
ARCTIC_DIR = "modification_functions"
ARCTIC_DIR_ORIGINAL = "modification_functions_original"
@@ -232,7 +243,42 @@ class ModificationFunctions:
params = PARAMS
param_names = PARAM_NAMES

class LargeAppendDataModify:
"""
This class holds a cache of large dataframes used for the append benchmarks.
The purpose of this cache is to create dataframes whose timestamps are
sequenced over time so that the appended ranges do not overlap.
"""

def __init__(self, num_rows_list:List[int], number_elements:int):
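# num_rows_list: the parametrised row counts; number_elements: how many benchmark
# invocations to cover (the loop below builds number_elements + 1 frames per row count).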
self.df_append_large = {}
self.df_append_short_wide = {}
start_time = time.time()
for rows in num_rows_list:
print("Generating dataframe with rows: ", rows)
lst = list()
lst_saw = list()
for n in range(number_elements+1):
print("Generating dataframe no: ", n)

df = generate_pseudo_random_dataframe(rows, "s", get_time_at_fraction_of_df(2*(n+1), rows))
df_saw = generate_random_floats_dataframe_with_index(
ModificationFunctions.WIDE_DF_ROWS, ModificationFunctions.WIDE_DF_COLS, "s",
get_time_at_fraction_of_df(2*(n+1), rows=ModificationFunctions.WIDE_DF_ROWS)
)

lst.append(df)
lst_saw.append(df_saw)
print(f"STANDARD Index {df.iloc[0].name} - {df.iloc[df.shape[0] - 1].name}")
print(f"SHORT_n_WIDE Index {df_saw.iloc[0].name} - {df_saw.iloc[df_saw.shape[0] - 1].name}")
print("Add dataframes: ", len(lst))
self.df_append_large[rows] = lst
self.df_append_short_wide[rows] = lst_saw
print("APPEND LARGE cache generation took (s) :", time.time() - start_time)

def setup_cache(self):
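# Runs once (rather than around every repeat): creates the pre-written libraries,
# snapshots ARCTIC_DIR so teardown can restore it, and builds the append cache that
# asv passes on to setup, teardown and the benchmark methods.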

self.ac = Arctic(ModificationFunctions.CONNECTION_STRING)
rows_values = ModificationFunctions.params

@@ -241,7 +287,9 @@ def setup_cache(self):
lib_name = get_prewritten_lib_name(rows)
self.ac.delete_library(lib_name)
lib = self.ac.create_library(lib_name)
lib.write("sym", self.init_dfs[rows])
df = self.init_dfs[rows]
lib.write("sym", df)
print(f"INITIAL DATAFRAME {rows} rows has Index {df.iloc[0].name} - {df.iloc[df.shape[0] - 1].name}")

lib_name = get_prewritten_lib_name(ModificationFunctions.WIDE_DF_ROWS)
self.ac.delete_library(lib_name)
@@ -257,62 +305,61 @@ def setup_cache(self):
# Then on each teardown we restore the initial state by overwriting the modified with the original.
copytree(ModificationFunctions.ARCTIC_DIR, ModificationFunctions.ARCTIC_DIR_ORIGINAL)

number_iteration = ModificationFunctions.repeat * ModificationFunctions.number * ModificationFunctions.rounds

def setup(self, rows):
def get_time_at_fraction_of_df(fraction, rows=rows):
end_time = pd.Timestamp("1/1/2023")
time_delta = pd.tseries.offsets.DateOffset(seconds=round(rows * (fraction-1)))
return end_time + time_delta
lad = ModificationFunctions.LargeAppendDataModify(ModificationFunctions.params, number_iteration)

return lad

def setup(self, lad: LargeAppendDataModify, rows):

self.df_update_single = generate_pseudo_random_dataframe(1, "s", get_time_at_fraction_of_df(0.5))
self.df_update_half = generate_pseudo_random_dataframe(rows//2, "s", get_time_at_fraction_of_df(0.75))
self.df_update_upsert = generate_pseudo_random_dataframe(rows, "s", get_time_at_fraction_of_df(1.5))
self.df_append_single = generate_pseudo_random_dataframe(1, "s", get_time_at_fraction_of_df(1.1))
self.df_append_large = generate_pseudo_random_dataframe(rows, "s", get_time_at_fraction_of_df(2))
self.df_update_single = generate_pseudo_random_dataframe(1, "s", get_time_at_fraction_of_df(0.5, rows))
self.df_update_half = generate_pseudo_random_dataframe(rows//2, "s", get_time_at_fraction_of_df(0.75, rows))
self.df_update_upsert = generate_pseudo_random_dataframe(rows, "s", get_time_at_fraction_of_df(1.5, rows))
self.df_append_single = generate_pseudo_random_dataframe(1, "s", get_time_at_fraction_of_df(1.1, rows))

self.df_update_short_wide = generate_random_floats_dataframe_with_index(
ModificationFunctions.WIDE_DF_ROWS, ModificationFunctions.WIDE_DF_COLS
)
self.df_append_short_wide = generate_random_floats_dataframe_with_index(
ModificationFunctions.WIDE_DF_ROWS, ModificationFunctions.WIDE_DF_COLS, "s", get_time_at_fraction_of_df(2, rows=ModificationFunctions.WIDE_DF_ROWS)
)

self.ac = Arctic(ModificationFunctions.CONNECTION_STRING)
self.lib = self.ac[get_prewritten_lib_name(rows)]
self.lib_short_wide = self.ac[get_prewritten_lib_name(ModificationFunctions.WIDE_DF_ROWS)]


def teardown(self, rows):
def teardown(self, lad: LargeAppendDataModify, rows):
# After the modification functions clean up the changes by replacing the modified ARCTIC_DIR with the original ARCTIC_DIR_ORIGINAL
# TODO: We can use dirs_exist_ok=True on copytree instead of removing first if we run with python version >=3.8
rmtree(ModificationFunctions.ARCTIC_DIR)
copytree(ModificationFunctions.ARCTIC_DIR_ORIGINAL, ModificationFunctions.ARCTIC_DIR)
del self.ac

del self.ac

def time_update_single(self, rows):
def time_update_single(self, lad: LargeAppendDataModify, rows):
self.lib.update(f"sym", self.df_update_single)

def time_update_half(self, rows):
def time_update_half(self, lad: LargeAppendDataModify, rows):
self.lib.update(f"sym", self.df_update_half)

def time_update_upsert(self, rows):
def time_update_upsert(self, lad: LargeAppendDataModify, rows):
self.lib.update(f"sym", self.df_update_upsert, upsert=True)

def time_update_short_wide(self, rows):
def time_update_short_wide(self, lad: LargeAppendDataModify, rows):
self.lib_short_wide.update("short_wide_sym", self.df_update_short_wide)

def time_append_single(self, rows):
def time_append_single(self, lad: LargeAppendDataModify, rows):
self.lib.append(f"sym", self.df_append_single)

def time_append_large(self, rows):
self.lib.append(f"sym", self.df_append_large)
def time_append_large(self, lad: LargeAppendDataModify, rows):
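# Pop the next pre-generated frame so each timed append starts after the previously appended range.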
large: pd.DataFrame = lad.df_append_large[rows].pop(0)
self.lib.append(f"sym", large)

def time_append_short_wide(self, rows):
self.lib_short_wide.append("short_wide_sym", self.df_append_short_wide)
def time_append_short_wide(self, lad: LargeAppendDataModify, rows):
large: pd.DataFrame = lad.df_append_short_wide[rows].pop(0)
self.lib_short_wide.append("short_wide_sym", large)

def time_delete(self, rows):
def time_delete(self, lad: LargeAppendDataModify, rows):
self.lib.delete(f"sym")

def time_delete_short_wide(self, rows):
def time_delete_short_wide(self, lad: LargeAppendDataModify, rows):
self.lib_short_wide.delete("short_wide_sym")
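
For context, the new `setup`/`teardown`/benchmark signatures above rely on asv passing the object returned by `setup_cache` as the first argument to `setup`, `teardown` and every benchmark method. A minimal, self-contained sketch of that pattern (hypothetical names, not ArcticDB code) looks like this:

```python
# Minimal asv-style benchmark class illustrating the setup_cache pattern used above.
class AppendBenchmarkSketch:
    number = 1        # one timed call per setup/teardown cycle
    repeat = 3        # three samples, each wrapped by setup/teardown
    rounds = 1
    warmup_time = 0   # no warm-up calls, so the pre-built cache is not consumed early
    params = [1_000, 10_000]
    param_names = ["rows"]

    def setup_cache(self):
        # Executed once; the returned object is handed to setup/teardown/benchmarks.
        iterations = self.repeat * self.number * self.rounds
        return {rows: list(range(iterations + 1)) for rows in self.params}

    def setup(self, cache, rows):
        self.items = cache[rows]

    def teardown(self, cache, rows):
        pass

    def time_consume_one(self, cache, rows):
        # Consume one pre-generated element per invocation, mirroring how the
        # ModificationFunctions appends pop one pre-built dataframe per call.
        self.items.pop(0)
```

With `number = 1` and `warmup_time = 0`, asv times exactly one call per setup/teardown cycle, which is why pre-generating `repeat * number * rounds` frames (plus one spare) in `setup_cache` is enough for the append benchmarks.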
6 changes: 1 addition & 5 deletions python/benchmarks/bi_benchmarks.py
@@ -184,8 +184,4 @@ def time_query_groupby_city_count_filter_two_aggregations(self, times_bigger) ->


def peakmem_query_groupby_city_count_filter_two_aggregations(self, times_bigger):
return self.query_groupby_city_count_filter_two_aggregations(times_bigger)




return self.query_groupby_city_count_filter_two_aggregations(times_bigger)