Flag potential infra failures in test logs and cc the admins on Slack.

The reusable test template exposes a new `potential_infra_failure` output,
set to "true" when the test log mentions gpu/cuda/device. The Slack
notifier in cicd-main.yml uses that flag to append a cc to the failure
message, and now reports the PR author and branch.

Reviewer notes on this revision of the patch:
  * Line breaks and indentation were rebuilt from a whitespace-collapsed
    copy; verify the indentation against the target files before applying.
  * `index` lines were dropped because the post-image blob hashes no
    longer match the edited content.
  * Hunk headers were recomputed for the comment lines added below.

diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml
--- a/.github/workflows/_test_template.yml
+++ b/.github/workflows/_test_template.yml
@@ -33,6 +33,9 @@ on:
       log:
         description: Last 2000 characters of the test step's log
         value: ${{ jobs.main.outputs.log }}
+      potential_infra_failure:
+        description: Boolean flag when infra-related keyword spotted in logs.
+        value: ${{ jobs.main.outputs.potential_infra_failure }}
 
 jobs:
   main:
@@ -40,6 +43,7 @@ jobs:
     outputs:
       conclusion: ${{ steps.main.conclusion }}
       log: ${{ steps.main.outputs.log }}
+      potential_infra_failure: ${{ steps.main.outputs.potential_infra_failure }}
     steps:
       - name: Docker system cleanup
         run: |
@@ -75,6 +79,12 @@ jobs:
 
           echo "log=$(tail -c 2000 err.log | base64 -w 0)" >> "$GITHUB_OUTPUT"
 
+          # Flag runs whose log mentions gpu/cuda/device (case-insensitive).
+          # grep reads err.log directly: with pipefail enabled, piping from cat
+          # can turn a real match into "false" when cat is killed by SIGPIPE.
+          potential_infra_failure=$(grep -Eqi "gpu|cuda|device" err.log && echo true || echo false)
+          echo "potential_infra_failure=$potential_infra_failure" >> "$GITHUB_OUTPUT"
+
           exit $EXIT_CODE
 
       - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -4515,7 +4515,12 @@ jobs:
         if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' && env.SLACK_WEBHOOK != '' }}
         env:
           SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
+          # NOTE(review): no value is visible for SLACK_WEBHOOK_ADMIN in this
+          # patch; confirm the intended Slack mention before merging.
+          SLACK_WEBHOOK_ADMIN:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_ACTOR: ${{ github.actor }}
+          BRANCH: ${{ github.head_ref || github.ref_name }}
           REPOSITORY: ${{ github.repository }}
           RUN_ID: ${{ github.run_id }}
           PR_NUMBER: ${{ github.event.number }}
@@ -4571,13 +4576,19 @@ jobs:
               echo "* [$JOB_NAME]($JOB_URL)" | tee -a $GITHUB_STEP_SUMMARY
 
               LOGS=$(echo $JOB | yq '(.value.outputs.log | @base64d)' | tr -d '"')
+              # Fence the log tail only when one exists. Test the string itself:
+              # echo always appends a newline, so piping it to wc -c never yields 0.
+              LOGS=$([[ -n "$LOGS" ]] && echo -E "\`\`\`\n$LOGS\n\`\`\`" || echo "")
+              # cc the admins only for suspected infra failures, and only when a
+              # mention target is actually configured (avoids a dangling "cc: ").
+              LOGS=$([[ $(echo $JOB | yq '.value.outputs.potential_infra_failure') == "true" && -n "$SLACK_WEBHOOK_ADMIN" ]] && echo -E "$LOGS\n\ncc: $SLACK_WEBHOOK_ADMIN" || echo -E "$LOGS")
 
               SUMMARY=$(echo "$SUMMARY" | jq \
                 --arg pr "<$PR_URL|$PR_TITLE>" \
                 --arg job "<$JOB_URL|$JOB_NAME>" \
-                --arg logs "$LOGS" \
-                --arg author "" \
-                --arg branch ""\
+                --arg logs "$(echo -e "$LOGS")" \
+                --arg author "<https://github.com/$GITHUB_ACTOR|$GITHUB_ACTOR>" \
+                --arg branch "<https://github.com/$REPOSITORY/tree/$BRANCH|$BRANCH>" \
                 '. += [
                   {
                     "type": "section",
@@ -4588,8 +4599,7 @@ jobs:
                         + "\nJob: " + $job
                         + "\nAuthor: " + $author
                         + "\nBranch: " + $branch
-                        + "\nLogs:"
-                        + "```\n" + $logs + "\n```"
+                        + "\nLogs:" + $logs
                       )
                     }
                   }