Skip to content

Refactor DataFrameNormalizer to improve performance #3699

Refactor DataFrameNormalizer to improve performance

Refactor DataFrameNormalizer to improve performance #3699

# Workflow header: display name plus the three events that start a run.
name: Build with analysis tools
on:
  # Manual runs; the caller may opt in to running every benchmark.
  workflow_dispatch:
    inputs:
      run_all_benchmarks:
        type: boolean
        default: false
  # Schedule the job to run at 12 a.m. daily
  schedule:
    - cron: '0 0 * * *'
  # Pull requests, except those that only touch Markdown files.
  # NOTE(review): pull_request_target runs with the base repository's
  # privileges; the jobs below check out the PR head explicitly.
  pull_request_target:
    paths-ignore:
      - "**/*.md"
jobs:
# Runs build_tooling/get_commits_for_benchmark.py and exposes the value it
# writes to the `commits` step output as the job output `matrix`
# (consumed through fromJson by the benchmark_commits job).
get_commits_to_benchmark:
  name: Get tag commits
  runs-on: ubuntu-22.04
  outputs:
    matrix: ${{ steps.get_tags.outputs.commits }}
  steps:
    - name: Checkout code
      # NOTE(review): "[email protected]" looks like a scraping artifact that
      # replaced the real "<action>@<version>" reference - restore before use.
      uses: actions/[email protected]
      with:
        # Full history (fetch-depth 0) rather than a shallow clone.
        fetch-depth: 0
        # On pull_request_target, build the PR head commit; otherwise an
        # empty ref, which leaves the checkout on its default.
        # Note: This is dangerous if we run automatic CI on external PRs
        ref: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.sha || '' }}
    - name: Get tags
      id: get_tags
      # The flag is appended only when the manual input run_all_benchmarks
      # is true.
      run: |
        python3 build_tooling/get_commits_for_benchmark.py ${{ inputs.run_all_benchmarks == true && '--run_all_benchmarks' || ''}}
# Calls the reusable workflow cibw_docker_image.yml. Its `tag` output is
# read by the benchmark_commits and code_coverage jobs.
cibw_docker_image:
  uses: ./.github/workflows/cibw_docker_image.yml
  # Grants the called workflow write access to packages.
  permissions:
    packages: write
  with:
    # Quoted so the version stays a string.
    cibuildwheel_ver: "2.12.1"
    force_update: false
# Fans out one run of the reusable workflow benchmark_commits.yml per entry
# of the matrix produced by get_commits_to_benchmark.
benchmark_commits:
  name: Benchmark commit ${{ matrix.commits }}
  needs:
    - get_commits_to_benchmark
    - cibw_docker_image
  strategy:
    # Let the remaining commits finish even if one of them fails.
    fail-fast: false
    matrix:
      commits: ${{ fromJson(needs.get_commits_to_benchmark.outputs.matrix)}}
  uses: ./.github/workflows/benchmark_commits.yml
  # Passes all of this workflow's secrets to the called workflow.
  secrets: inherit
  with:
    commit: ${{ matrix.commits }}
    cibw_image_tag: ${{ needs.cibw_docker_image.outputs.tag }}
    # Falls back to false when the input is not set (non-manual triggers).
    run_all_benchmarks: ${{ inputs.run_all_benchmarks || false }}
    run_on_pr_head: ${{ github.event_name == 'pull_request_target' }}
# After all benchmark runs finish, extracts the ASV results and publishes
# them to the gh-pages branch. Only runs when the workflow ref is master.
publish_benchmark_results_to_gh_pages:
  name: Publish benchmark results to gh-pages
  needs:
    - benchmark_commits
  if: github.ref == 'refs/heads/master'
  runs-on: ubuntu-22.04
  permissions:
    contents: write
  steps:
    # NOTE(review): "[email protected]" looks like a scraping artifact that
    # replaced the real "<action>@<version>" reference - restore before use.
    - uses: actions/[email protected]
      with:
        fetch-depth: 0
        # Checks out with a personal access token instead of the default one.
        token: ${{ secrets.ARCTICDB_TEST_PAT }}
        # Note: This is dangerous if we run automatic CI on external PRs
        ref: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.sha || '' }}

    - name: Set persistent storage variables
      uses: ./.github/actions/set_persistent_storage_env_vars
      with:
        bucket: "arcticdb-ci-benchmark-results"
        aws_access_key: "${{ secrets.AWS_S3_ACCESS_KEY }}"
        aws_secret_key: "${{ secrets.AWS_S3_SECRET_KEY }}"

    - name: Install ArcticDB[Testing]
      shell: bash -el {0}
      run: |
        pip install arcticdb[Testing] "protobuf<5"

    - name: Publish results to Github Pages
      shell: bash -el {0}
      # Git identity is taken from the actor that triggered the run; the
      # ASV commands then build the site and rewrite the gh-pages branch.
      run: |
        git config --global --add safe.directory /__w/ArcticDB/ArcticDB
        git config --global user.name "${GITHUB_ACTOR}"
        git config --global user.email "${GITHUB_ACTOR_ID}+${GITHUB_ACTOR}@users.noreply.github.com"
        python build_tooling/transform_asv_results.py --mode extract
        python -m asv publish -v
        python -m asv gh-pages -v --rewrite
# Builds the C++ code with the linux-debug preset and coverage enabled, runs
# the C++ tests, produces Python and C++ (gcovr) coverage reports, uploads
# them as artifacts, and notifies Slack when C++ coverage dropped compared
# with the value cached by an earlier run.
code_coverage:
  needs: [cibw_docker_image]
  runs-on: "ubuntu-22.04"
  container:
    image: ${{needs.cibw_docker_image.outputs.tag}}
  services:
    mongodb:
      image: mongo:4.4
      ports:
        - "27017:27017"
  env:
    VCPKG_NUGET_USER: ${{secrets.VCPKG_NUGET_USER || github.repository_owner}}
    VCPKG_NUGET_TOKEN: ${{secrets.VCPKG_NUGET_TOKEN || secrets.GITHUB_TOKEN}}
    # For forks to download pre-compiled dependencies from the Man repo
    VCPKG_MAN_NUGET_USER: ${{secrets.VCPKG_MAN_NUGET_USER}}
    VCPKG_MAN_NUGET_TOKEN: ${{secrets.VCPKG_MAN_NUGET_TOKEN}}
    ARCTIC_CMAKE_PRESET: linux-debug
    ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
  steps:
    # NOTE(review): every "[email protected]" below looks like a scraping artifact
    # that replaced the real "<action>@<version>" reference - restore the
    # pinned versions before using this file.
    - uses: actions/[email protected]
      with:
        submodules: recursive
        # Note: This is dangerous if we run automatic CI on external PRs
        ref: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.sha || '' }}

    - name: Get number of CPU cores
      uses: SimenB/[email protected]
      id: cpu-cores

    - name: Extra envs
      # Exports the vcpkg settings, the CMake CPU count and the coverage
      # build switch to all later steps through $GITHUB_ENV.
      run: |
        . build_tooling/vcpkg_caching.sh # Linux follower needs another call in CIBW
        echo -e "VCPKG_BINARY_SOURCES=$VCPKG_BINARY_SOURCES
        VCPKG_ROOT=$PLATFORM_VCPKG_ROOT" | tee -a $GITHUB_ENV
        cmake -P cpp/CMake/CpuCount.cmake | sed 's/^-- //' | tee -a $GITHUB_ENV
        echo "ARCTICDB_CODE_COVERAGE_BUILD=1" | tee -a $GITHUB_ENV
      env:
        CMAKE_BUILD_PARALLEL_LEVEL: ${{vars.CMAKE_BUILD_PARALLEL_LEVEL}}

    - name: Prepare C++ compilation env
      run: . build_tooling/prep_cpp_build.sh

    - name: CMake compile
      # We are pinning the version to 10.6 because >= 10.7, use node20 which is not supported in the container
      uses: lukka/[email protected]
      with:
        cmakeListsTxtPath: ${{github.workspace}}/cpp/CMakeLists.txt
        configurePreset: ${{env.ARCTIC_CMAKE_PRESET}}
        buildPreset: ${{env.ARCTIC_CMAKE_PRESET}}
      env:
        ARCTICDB_DEBUG_FIND_PYTHON: ${{vars.ARCTICDB_DEBUG_FIND_PYTHON}}
        python_impl_name: 'cp311'

    - name: Run C++ Tests
      shell: bash -l {0}
      run: |
        cd cpp/out/linux-debug-build/
        ls arcticdb
        make -j ${{ steps.cpu-cores.outputs.count }} arcticdb_rapidcheck_tests
        make -j ${{ steps.cpu-cores.outputs.count }} test_unit_arcticdb
        ctest

    # We are changing the python here because we want to use the default python to build (it is devel version)
    # and this python for the rest of the testing
    - name: Select Python (Linux)
      run: echo /opt/python/cp36-cp36m/bin >> $GITHUB_PATH

    - name: Install local dependencies with pip
      shell: bash
      # ARCTIC_CMAKE_PRESET=skip is set for this command only.
      # NOTE(review): presumably this makes the install reuse the existing
      # build instead of compiling again - confirm.
      run: |
        python -m pip install --upgrade pip
        ARCTIC_CMAKE_PRESET=skip pip install -ve .[Testing]

    # - name: Test with pytest
    #   uses: ./.github/actions/run_local_pytest
    #   with:
    #     build_type: debug
    #     threads: 1
    #     fast_tests_only: 0
    #     other_params: '-m coverage run '

    - name: Get python Coverage report
      shell: bash -l {0}
      # Exports the last column of the TOTAL line (without "%") as
      # PYTHON_COV_PERCENT.
      run: |
        cd python
        python -m coverage report -m | tee output.txt
        python -m coverage html
        zip -r python_cov.zip htmlcov/
        echo "PYTHON_COV_PERCENT=$(cat output.txt | grep 'TOTAL' | awk '{print $NF}' | tr -d '%')" >> $GITHUB_ENV

    - name: Run Gcovr manually post-pytest
      shell: bash -l {0}
      # Exports the last column of gcovr's TOTAL line (without "%") as
      # CPP_COV_PERCENT.
      run: |
        cd cpp/out/linux-debug-build/
        python -m pip install gcovr
        mkdir coverage
        python -m gcovr --txt --html-details coverage/index.html -e vcpkg_installed/ -e proto/ -e ../../third_party -e ../../arcticdb/util/test/ -r ../.. --exclude-throw-branches --exclude-unreachable-branches -u --exclude-function-lines | tee output.txt
        zip -r coverage.zip coverage/
        echo "CPP_COV_PERCENT=$(cat output.txt | grep 'TOTAL' | awk '{print $NF}' | tr -d '%')" >> $GITHUB_ENV

    - name: Upload Coverage
      uses: actions/[email protected]
      with:
        name: cpp-coverage-artifact
        path: cpp/out/linux-debug-build/coverage.zip

    - name: Upload Python Coverage
      uses: actions/[email protected]
      with:
        name: python-coverage-artifact
        path: python/python_cov.zip

    # The restore and save steps must use the SAME path: the cache version is
    # derived from the path, so a cache saved under one file name is never
    # found when restoring under another. Cache entries are also immutable,
    # so every run saves under a unique key and restores the most recent
    # entry via the "coverage-" prefix.
    - name: Restore cached CPP Coverage Percentage from the previous run
      id: cache-cov-restore
      uses: actions/cache/[email protected]
      with:
        path: cpp_coverage.txt
        key: coverage-${{ github.run_id }}-${{ github.run_attempt }}
        restore-keys: |
          coverage-

    - name: Get and compare coverage if cache was restored
      # Sets CPP_COV_PREV_PERCENT when a previous value exists and
      # CPP_COV_DECREASED=true when the current value is lower. The numeric
      # comparison is done here in the shell because env values are strings
      # inside workflow expressions.
      run: |
        # if cache was restored, compare coverage
        if [ -f cpp_coverage.txt ]; then
          # File format is "Coverage: <percent>"; take the second field.
          PREV_COVERAGE=$(cut -d' ' -f2 cpp_coverage.txt)
          CURR_COVERAGE="${CPP_COV_PERCENT}"
          echo "Previous coverage: $PREV_COVERAGE"
          echo "Current coverage: $CURR_COVERAGE"
          if [ -z "$PREV_COVERAGE" ] || [ -z "$CURR_COVERAGE" ]; then
            echo "Coverage value missing; skipping comparison"
          else
            echo "CPP_COV_PREV_PERCENT=$PREV_COVERAGE" >> "$GITHUB_ENV"
            # Assumes both values are integer percentages - TODO confirm
            # against the gcovr text report format.
            if [ "$CURR_COVERAGE" -gt "$PREV_COVERAGE" ]; then
              echo "Coverage increased"
            elif [ "$CURR_COVERAGE" -lt "$PREV_COVERAGE" ]; then
              echo "Coverage decreased"
              echo "CPP_COV_DECREASED=true" >> "$GITHUB_ENV"
            else
              echo "Coverage unchanged"
            fi
          fi
        fi

    - name: Save CPP Coverage Percentage to file
      # Overwrites the restored file so the next run sees this run's value.
      run: |
        echo "Coverage: ${{ env.CPP_COV_PERCENT }}" > cpp_coverage.txt

    - name: Save the current CPP Coverage Percentage to the cache
      id: cache-cov-save
      uses: actions/cache/[email protected]
      with:
        path: cpp_coverage.txt
        key: coverage-${{ github.run_id }}-${{ github.run_attempt }}

    - name: Check percentage and send Slack notification
      # Driven by the flag computed in the comparison step above.
      if: ${{ env.CPP_COV_DECREASED == 'true' }}
      uses: slackapi/[email protected]
      with:
        # For posting a rich message using Block Kit
        payload: |
          {
            "text": "The CPP Code Coverage has been reduced",
            "blocks": [
              {
                "type": "section",
                "text": {
                  "type": "mrkdwn",
                  "text": "The CPP Code Coverage from the current run(${{ env.CPP_COV_PERCENT }}%) is lower the previous one(${{ env.CPP_COV_PREV_PERCENT }}%)."
                }
              }
            ]
          }
      env:
        SLACK_WEBHOOK_URL: ${{ secrets.ARCTICDB_DEV_WEBHOOK_URL }}
        SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK