Skip to content

Commit

Permalink
main stash
Browse files Browse the repository at this point in the history
  • Loading branch information
ranchodeluxe committed Dec 19, 2023
1 parent 0037139 commit e2282a8
Showing 1 changed file with 45 additions and 4 deletions.
49 changes: 45 additions & 4 deletions .github/workflows/job-runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ on:
default: '1'

jobs:
create-job-name:
job-name:
runs-on: ubuntu-latest
outputs:
repo_name: ${{ steps.string_manipulation.outputs.result }}
Expand All @@ -39,8 +39,8 @@ jobs:
repo_name=$(basename -s .git "${{ github.event.inputs.repo }}")
echo "::set-output name=result::$repo_name"
run-job:
name: Job ${{ needs.create-job-name.outputs.repo_name }}@${{ github.event.inputs.ref }}
needs: create-job-name
name: runner job ${{ needs.job-name.outputs.repo_name }}@${{ github.event.inputs.ref }}
needs: job-name
runs-on: ubuntu-latest
steps:

Expand Down Expand Up @@ -135,7 +135,7 @@ jobs:
kubectl get pod | grep "$JOB_NAME-task-manager" | head -n1 | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
# delete the flinkdeployment so we don't have old failures hanging around
kubectl get flinkdepoyment --no-headers | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
kubectl get flinkdeployment --no-headers | grep $JOB_NAME | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
# force GH action to show failed result
exit 128
Expand All @@ -151,3 +151,44 @@ jobs:
echo $JOB_ID
echo '############ FLINK DASHBOARD ################'
echo $FLINK_DASH
monitor-job:
  runs-on: ubuntu-latest
  name: monitor ${{ needs.job-name.outputs.repo_name }}@${{ github.event.inputs.ref }}
  needs: [job-name, run-job]
  timeout-minutes: 120
  continue-on-error: true
  steps:
    - name: monitor logs of job manager
      # id + continue-on-error so the cleanup step below can run after this
      # step's deliberate `exit 128` and inspect its outcome; the original
      # checked `steps.excutejob.outcome` — a step id that does not exist in
      # this job (copy-paste from run-job) — so cleanup could never fire
      id: monitorjob
      continue-on-error: true
      run: |
        # NOTE(review): $JOB_NAME is read but never set in this job —
        # presumably provided via workflow/job-level `env`; TODO confirm
        echo "find job status on the job manager logs..."
        # scrape the Flink job manager pod logs for the terminal
        # "ExecutionGraph ... Job BeamApp ..." status line
        get_run_status() {
          kubectl get pod --no-headers | grep -v manager | grep $JOB_NAME | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | grep "ExecutionGraph.*Job BeamApp.*" | head -n 1
        }
        run_status=$(get_run_status)
        while [[ -z "$run_status" ]]; do
          echo "still waiting for a status on the job manager logs..."
          sleep 1
          # re-query inside the loop: the original queried once before the
          # loop and then spun forever whenever the logs weren't ready yet
          run_status=$(get_run_status)
        done
        # last word before the trailing period is the job state
        status=$(echo "$run_status" | grep -oP '\b\w+(?=\.$)')
        # original used `[[ "$status"="FAILING" ]]` (no spaces around `=`),
        # which is a non-empty-string test and therefore always true;
        # a real comparison needs spaces around the operator
        if [[ "$status" = "FAILING" || "$status" = "FAILED" ]]; then
          echo "job failed, will dump the logs now..."
          # force exit so we can move to next step
          exit 128
        fi
    - name: cleanup if "pangeo-forge-runner bake" failed
      if: steps.monitorjob.outcome == 'failure'
      run: |
        # much easier to do in bash than in Python via subprocess
        echo "##################### OPERATOR ######################"
        kubectl get pod | grep operator | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
        echo "##################### JOB MANAGER ######################"
        kubectl get pod | grep -v manager | grep $JOB_NAME | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
        echo "##################### TASK MANAGER ######################"
        kubectl get pod | grep "$JOB_NAME-task-manager" | head -n1 | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
        # delete the flinkdeployment so we don't have old failures hanging around
        # (fixed typo: `flinkdepoyment` -> `flinkdeployment`; the misspelled
        # resource type made `kubectl get` fail, so nothing was ever deleted)
        kubectl get flinkdeployment --no-headers | grep $JOB_NAME | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
        # force GH action to show failed result
        exit 128

0 comments on commit e2282a8

Please sign in to comment.