Update MATH dataset with model judge #1299

Workflow file for this run

.github/workflows/pr-run-test.yml at 165e8a7

	# PR regression smoke test for opencompass: triggers, concurrency policy and
	# the environment shared by every job in this workflow.
	name: pr_run_test

	on:
	  # Run on pull requests unless they only touch docs, configs or tools.
	  pull_request:
	    paths-ignore:
	      - 'README.md'
	      - 'README_zh-CN.md'
	      - 'docs/**'
	      - 'configs/**'
	      - 'tools/**'

	  # Allow manual runs from the Actions tab.
	  workflow_dispatch:
	  # Daily scheduled run at minute 56 of hour 22 (GitHub evaluates cron in UTC).
	  schedule:
	    - cron: '56 22 * * *'

	# Keep at most one run per workflow + ref; a newer run cancels the older one.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.ref }}
	  cancel-in-progress: true

	env:
	  # Prefix of the conda env name; the jobs append the runner name to it.
	  CONDA_ENV: opencompass_
	  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
	  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
	  # Environment variables are always strings; quote the number- and
	  # boolean-looking values so no YAML loader re-types them.
	  HF_DATASETS_OFFLINE: '1'
	  TRANSFORMERS_OFFLINE: '1'
	  HF_HUB_OFFLINE: '1'
	  VLLM_USE_MODELSCOPE: 'false'
	  LMDEPLOY_USE_MODELSCOPE: 'false'

	jobs:
	  # Installs opencompass from the checked-out tree into the runner's conda
	  # env, runs three small gsm8k demo evaluations and fails the job when any
	  # score falls outside its expected range.
	  pr_run_test:
	    runs-on: self-hosted
	    environment: 'prod'
	    timeout-minutes: 30
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v2
	      - name: Prepare - Install opencompass
	        run: |
	          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
	          conda activate ${{env.CONDA_ENV}}${{ runner.name }}
	          python3 -m pip uninstall opencompass -y
	          python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip
	          conda info --envs
	      - name: Prepare - prepare data and hf model
	        run: |
	          cp -r ${{env.USERSPACE_PREFIX}}/data .
	          # Replace any existing hub cache with a symlink to the shared cache.
	          rm -rf ~/.cache/huggingface/hub && mkdir -p ~/.cache/huggingface
	          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
	      - name: Run test
	        run: |
	          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
	          conda activate ${{env.CONDA_ENV}}${{ runner.name }}
	          conda info --envs
	          # Clear the work dirs actually used below so stale results from an
	          # earlier run cannot be picked up by the "Get result" step.
	          rm -rf regression_result1 regression_result2 regression_result3
	          opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir regression_result1 --debug
	          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir regression_result2 --debug --max-num-workers 2
	          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir regression_result3 --debug --max-num-workers 2
	      - name: Get result
	        run: |
	          # check_score WORK_DIR LOW HIGH
	          # Reads the last field of the last line of the summary CSV under
	          # WORK_DIR and exits 1 unless its integer part is within [LOW, HIGH].
	          check_score() {
	            local work_dir=$1 low=$2 high=$3 score
	            score=$(sed -n '$p' "$work_dir"/*/summary/*.csv | awk -F ',' '{print $NF}')
	            # ${score%.*} drops the fractional part: (( )) is integer-only.
	            if (( ${score%.*} >= low && ${score%.*} <= high )); then
	              echo "score is $score between $low and $high"
	            else
	              echo "score is $score not between $low and $high"
	              exit 1
	            fi
	          }
	          check_score regression_result1 88 89
	          check_score regression_result2 87 88
	          check_score regression_result3 84 87
	          rm -rf regression_result1 regression_result2 regression_result3
	      - name: Uninstall opencompass
	        # Always runs, so the conda env is left clean even after a failure.
	        if: always()
	        run: |
	          . /cpfs01/shared/public/qa-llm-cicd/miniconda3/bin/activate
	          conda activate ${{env.CONDA_ENV}}${{ runner.name }}
	          python3 -m pip uninstall opencompass -y
	          conda info --envs

	  # Posts a Feishu message when the test job failed on develop or main.
	  notify_to_feishu:
	    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
	    needs: [pr_run_test]
	    environment: 'prod'
	    timeout-minutes: 5
	    runs-on: self-hosted
	    steps:
	      - name: notify
	        # NOTE(review): secrets and contexts are interpolated straight into
	        # the script text; consider passing them through `env:` instead.
	        run: |
	          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"Opencompass- pr test failed","content":[[{"tag":"text","text":"branch: ${{github.ref_name}}, run action: ${{github.workflow}} failed. "},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'${{ github.repository }}'/actions/runs/'${GITHUB_RUN_ID}'"},{"tag":"at","user_id":"'${{ secrets.USER_ID }}'"}]]}}}}' ${{ secrets.WEBHOOK_URL }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Update MATH dataset with model judge #1299

Workflow file

Update MATH dataset with model judge #1299

Jobs

Run details

Workflow file for this run