diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
index 6bf16187..ef511319 100644
--- a/.github/workflows/e2e-nvidia-l4-x1.yml
+++ b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -154,19 +154,21 @@ jobs:
           # set preserve to true so we can retain the logs
           ./scripts/e2e-ci.sh -mp
 
+          # HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library.
+          # Therefore we must disable the upload of the training logs, as they will not exist in the same location.
           # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
           # and we know that it will be written into a directory created by `mktemp -d`.
           # Given this information, we can use the following command to find the file:
-          log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
-          mv "${log_file}" training-log.jsonl
+          # log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
+          # mv "${log_file}" training-log.jsonl
 
-      - name: Upload training logs
-        uses: actions/upload-artifact@v4
-        with:
-          name: training-log.jsonl
-          path: ./instructlab/training-log.jsonl
-          retention-days: 1
-          overwrite: true
+      # - name: Upload training logs
+      #   uses: actions/upload-artifact@v4
+      #   with:
+      #     name: training-log.jsonl
+      #     path: ./instructlab/training-log.jsonl
+      #     retention-days: 1
+      #     overwrite: true
 
   stop-medium-ec2-runner:
     needs:
@@ -195,39 +197,39 @@ jobs:
           label: ${{ needs.start-medium-ec2-runner.outputs.label }}
           ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
 
-      - name: Download loss data
-        id: download-logs
-        uses: actions/download-artifact@v4
-        with:
-          name: training-log.jsonl
-          path: downloaded-data
-
-      - name: Install dependencies
-        run: |
-          pip install -r requirements-dev.txt
+      # - name: Download loss data
+      #   id: download-logs
+      #   uses: actions/download-artifact@v4
+      #   with:
+      #     name: training-log.jsonl
+      #     path: downloaded-data
+
+      # - name: Install dependencies
+      #   run: |
+      #     pip install -r requirements-dev.txt
 
-      - name: Try to upload to s3
-        id: upload-s3
-        continue-on-error: true
-        run: |
-          output_file='./test.md'
-          python scripts/create-loss-graph.py \
-            --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
-            --output-file "${output_file}" \
-            --aws-region "${{ vars.AWS_REGION }}" \
-            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
-            --base-branch "${{ github.event.pull_request.base.ref }}" \
-            --pr-number "${{ github.event.pull_request.number }}" \
-            --head-sha "${{ github.event.pull_request.head.sha }}" \
-            --origin-repository "${{ github.repository }}"
-
-          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
-
-      - name: Check S3 upload status
-        if: steps.upload-s3.outcome == 'failure'
-        run: |
-          echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
-          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
+      # - name: Try to upload to s3
+      #   id: upload-s3
+      #   continue-on-error: true
+      #   run: |
+      #     output_file='./test.md'
+      #     python scripts/create-loss-graph.py \
+      #       --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
+      #       --output-file "${output_file}" \
+      #       --aws-region "${{ vars.AWS_REGION }}" \
+      #       --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+      #       --base-branch "${{ github.event.pull_request.base.ref }}" \
+      #       --pr-number "${{ github.event.pull_request.number }}" \
+      #       --head-sha "${{ github.event.pull_request.head.sha }}" \
+      #       --origin-repository "${{ github.repository }}"
+
+      #     cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
+
+      # - name: Check S3 upload status
+      #   if: steps.upload-s3.outcome == 'failure'
+      #   run: |
+      #     echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
+      #     echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
 
   e2e-medium-workflow-complete:
     # we don't want to block PRs on failed EC2 cleanup