From db74b6684646a4aad26c250d4335cf90e8d17f0e Mon Sep 17 00:00:00 2001 From: Stonepia Date: Sat, 14 Sep 2024 17:40:40 +0800 Subject: [PATCH] [CI] Print Kernel information for CI jobs (#903) Recently, we found that the Linux kernel version may affect the result. This PR adds kernel information for CI jobs: 1. For UT, I changed to use `python torch/utils/collect_env.py`. It will print the necessary info. 2. For E2E test, added the `KERNEL_VERSION` column in the report. Related PR: https://github.com/pytorch/pytorch/pull/135828 --- .github/workflows/_linux_ut.yml | 4 ++++ .github/workflows/nightly_ondemand.yml | 6 ++++-- .github/workflows/nightly_ondemand_rolling.yml | 7 +++++-- .github/workflows/nightly_ondemand_whl.yml | 7 +++++-- .github/workflows/pull.yml | 7 ++++++- 5 files changed, 24 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index e004f355f..d1700ca6e 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -113,6 +113,10 @@ jobs: python -c "import torch; print(torch.__config__.show())" python -c "import torch; print(torch.__config__.parallel_info())" python -c "import torch; print(torch.__config__.torch.xpu.device_count())" + python -c "import triton; print(triton.__version__)" + + cd .. + python pytorch/torch/utils/collect_env.py - name: Run XPU OP Examples if: contains(inputs.ut, 'op_example') || github.event_name == 'schedule' run: | diff --git a/.github/workflows/nightly_ondemand.yml b/.github/workflows/nightly_ondemand.yml index 4631138eb..83c70668f 100644 --- a/.github/workflows/nightly_ondemand.yml +++ b/.github/workflows/nightly_ondemand.yml @@ -158,6 +158,7 @@ jobs: echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "DRIVER_VERSION=$(dkms status 2>&1 |grep 'intel-i915-dkms' |sed 's/.*\///;s/,.*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + echo "KERNEL_VERSION=$(uname -rv 2>&1 |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "BUNDLE_VERSION=$(dpcpp --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" . /etc/os-release echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" @@ -307,6 +308,7 @@ jobs: TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCH_BRANCH_ID }}" TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.TORCH_COMMIT_ID }}" DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.DRIVER_VERSION }}" + KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.KERNEL_VERSION }}" BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.BUNDLE_VERSION }}" OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.OS_PRETTY_NAME }}" GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests.outputs.GCC_VERSION }}" @@ -350,8 +352,8 @@ jobs: printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Device | OS | GCC | Python | Driver(DKMS) | Bundle(DPCPP)\n--- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION| $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt + printf "Device | OS | GCC | Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt + echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION | $KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" if [ "${{ inputs.model }}" != "" ];then diff --git a/.github/workflows/nightly_ondemand_rolling.yml b/.github/workflows/nightly_ondemand_rolling.yml index 201c9c341..e753fa2f2 100644 --- a/.github/workflows/nightly_ondemand_rolling.yml +++ b/.github/workflows/nightly_ondemand_rolling.yml @@ -98,6 +98,7 @@ jobs: TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }} TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }} DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} + KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} @@ -157,6 +158,7 @@ jobs: echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "DRIVER_VERSION=$(dkms status 2>&1 |grep 'intel-i915-dkms' |sed 's/.*\///;s/,.*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + echo "KERNEL_VERSION=$(uname -rv 2>&1 |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "BUNDLE_VERSION=$(dpcpp --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" . /etc/os-release echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" @@ -305,6 +307,7 @@ jobs: repo="${{ github.repository }}" TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCH_BRANCH_ID }}" TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.TORCH_COMMIT_ID }}" + KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.KERNEL_VERSION }}" DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.DRIVER_VERSION }}" BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.BUNDLE_VERSION }}" OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-Tests-Rolling.outputs.OS_PRETTY_NAME }}" @@ -349,8 +352,8 @@ jobs: printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Device | OS | GCC | Python | Driver(DKMS) | Bundle(DPCPP)\n--- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | rolling-$DRIVER_VERSION| $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt + printf "Device | OS | GCC | Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt + echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | rolling-$DRIVER_VERSION |$KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" if [ "${{ inputs.model }}" != "" ];then diff --git a/.github/workflows/nightly_ondemand_whl.yml b/.github/workflows/nightly_ondemand_whl.yml index 641ca6865..fbfab9fdd 100644 --- a/.github/workflows/nightly_ondemand_whl.yml +++ b/.github/workflows/nightly_ondemand_whl.yml @@ -78,6 +78,7 @@ jobs: TORCH_BRANCH_ID: ${{ steps.pinned.outputs.TORCH_BRANCH_ID }} TORCH_COMMIT_ID: ${{ steps.pinned.outputs.TORCH_COMMIT_ID }} DRIVER_VERSION: ${{ steps.pinned.outputs.DRIVER_VERSION }} + KERNEL_VERSION: ${{ steps.pinned.outputs.KERNEL_VERSION }} BUNDLE_VERSION: ${{ steps.pinned.outputs.BUNDLE_VERSION }} OS_PRETTY_NAME: ${{ steps.pinned.outputs.OS_PRETTY_NAME }} GCC_VERSION: ${{ steps.pinned.outputs.GCC_VERSION }} @@ -130,6 +131,7 @@ jobs: echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "MODEL_ONLY_NAME=${{ inputs.model }}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "DRIVER_VERSION=$(dkms status 2>&1 |grep 'intel-i915-dkms' |sed 's/.*\///;s/,.*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" + echo "KERNEL_VERSION=$(uname -rv 2>&1 |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" echo "BUNDLE_VERSION=$(dpcpp --version 2>&1 |grep 'DPC++/C++' |sed 's/.*(//;s/).*//')" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" . /etc/os-release echo "OS_PRETTY_NAME=${PRETTY_NAME}" |tee -a "${GITHUB_OUTPUT}" >> "${GITHUB_ENV}" @@ -270,6 +272,7 @@ jobs: TORCH_BRANCH_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_BRANCH_ID }}" TORCH_COMMIT_ID="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.TORCH_COMMIT_ID }}" DRIVER_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.DRIVER_VERSION }}" + KERNEL_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.KERNEL_VERSION }}" BUNDLE_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.BUNDLE_VERSION }}" OS_PRETTY_NAME="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.OS_PRETTY_NAME }}" GCC_VERSION="${{ needs.Linux-Nightly-Ondemand-E2E-WHL-Tests.outputs.GCC_VERSION }}" @@ -313,8 +316,8 @@ jobs: printf "[${TORCHBENCH_COMMIT_ID:0:7}](https://github.com/pytorch/benchmark/commit/${TORCHBENCH_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt printf "[${TORCHVISION_COMMIT_ID:0:7}](https://github.com/pytorch/vision/commit/${TORCHVISION_COMMIT_ID:0:7}) | " >> ${{ github.workspace }}/report.txt echo -e "[${TORCHAUDIO_COMMIT_ID:0:7}](https://github.com/pytorch/audio/commit/${TORCHAUDIO_COMMIT_ID:0:7}) \n" >> ${{ github.workspace }}/report.txt - printf "Device | OS | GCC | Python | Driver(DKMS) | Bundle(DPCPP)\n--- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt - echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION| $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt + printf "Device | OS | GCC | Python | Driver(DKMS) | Kernel | Bundle(DPCPP)\n--- | --- | --- | --- | --- | --- | ---\n" >> ${{ github.workspace }}/report.txt + echo -e "$RUNNER_NAME | $OS_PRETTY_NAME | $GCC_VERSION | ${{ env.python }} | $DRIVER_VERSION |$KERNEL_VERSION | $BUNDLE_VERSION \n" >> ${{ github.workspace }}/report.txt if [ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ];then test_scope="${{ inputs.suite }}/${{ inputs.dt }}/${{ inputs.mode }}/${{ inputs.scenario }}" if [ "${{ inputs.model }}" != "" ];then diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 45fdef513..98a14e3d0 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -94,11 +94,16 @@ jobs: echo "TORCHAUDIO_COMMIT_ID=$(<.github/ci_commit_pins/audio.txt)" >> "${GITHUB_ENV}" echo "TRANSFORMERS_VERSION=$(<.ci/docker/ci_commit_pins/huggingface.txt)" >> "${GITHUB_ENV}" echo "TIMM_COMMIT_ID=$(<.ci/docker/ci_commit_pins/timm.txt)" >> "${GITHUB_ENV}" - - name: Show GITHUB_ENV + - name: Torch Config run: | echo "$GITHUB_ENV" rm -rf ../pytorch/inductor_log rm -rf /tmp/torchinductor_* + + cd .. + source activate e2e_ci + python -c "import triton; print(triton.__version__)" + python pytorch/torch/utils/collect_env.py - name: Huggingface BF16 Training Accuracy Test uses: ./.github/actions/inductor-xpu-e2e-test with: