Skip to content

Commit

Permalink
main stash
Browse files Browse the repository at this point in the history
  • Loading branch information
ranchodeluxe committed Dec 19, 2023
1 parent 0037139 commit e2282a8
Showing 1 changed file with 45 additions and 4 deletions.
49 changes: 45 additions & 4 deletions .github/workflows/job-runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ on:
default: '1'

jobs:
create-job-name:
job-name:
runs-on: ubuntu-latest
outputs:
repo_name: ${{ steps.string_manipulation.outputs.result }}
Expand All @@ -39,8 +39,8 @@ jobs:
repo_name=$(basename -s .git "${{ github.event.inputs.repo }}")
echo "::set-output name=result::$repo_name"
run-job:
name: Job ${{ needs.create-job-name.outputs.repo_name }}@${{ github.event.inputs.ref }}
needs: create-job-name
name: runner job ${{ needs.job-name.outputs.repo_name }}@${{ github.event.inputs.ref }}
needs: job-name
runs-on: ubuntu-latest
steps:

Expand Down Expand Up @@ -135,7 +135,7 @@ jobs:
kubectl get pod | grep "$JOB_NAME-task-manager" | head -n1 | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
# delete the flinkdeployment so we don't have old failures hanging around
kubectl get flinkdepoyment --no-headers | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
kubectl get flinkdeployment --no-headers | grep $JOB_NAME | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
# force GH action to show failed result
exit 128
Expand All @@ -151,3 +151,44 @@ jobs:
echo $JOB_ID
echo '############ FLINK DASHBOARD ################'
echo $FLINK_DASH
monitor-job:
  runs-on: ubuntu-latest
  name: monitor ${{ needs.job-name.outputs.repo_name }}@${{ github.event.inputs.ref }}
  needs: [job-name, run-job]
  timeout-minutes: 120
  continue-on-error: true
  steps:
    - name: monitor logs of job manager
      # id + continue-on-error so the cleanup step below can run after this
      # step's deliberate `exit 128` and inspect its outcome; the original
      # checked `steps.excutejob.outcome` — a step id that does not exist in
      # this job (copy-paste from run-job) — so cleanup could never fire
      id: monitorjob
      continue-on-error: true
      run: |
        # NOTE(review): $JOB_NAME is read but never set in this job —
        # presumably provided via workflow/job-level `env`; TODO confirm
        echo "find job status on the job manager logs..."
        # scrape the Flink job manager pod logs for the terminal
        # "ExecutionGraph ... Job BeamApp ..." status line
        get_run_status() {
          kubectl get pod --no-headers | grep -v manager | grep $JOB_NAME | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | grep "ExecutionGraph.*Job BeamApp.*" | head -n 1
        }
        run_status=$(get_run_status)
        while [[ -z "$run_status" ]]; do
          echo "still waiting for a status on the job manager logs..."
          sleep 1
          # re-query inside the loop: the original queried once before the
          # loop and then spun forever whenever the logs weren't ready yet
          run_status=$(get_run_status)
        done
        # last word before the trailing period is the job state
        status=$(echo "$run_status" | grep -oP '\b\w+(?=\.$)')
        # original used `[[ "$status"="FAILING" ]]` (no spaces around `=`),
        # which is a non-empty-string test and therefore always true;
        # a real comparison needs spaces around the operator
        if [[ "$status" = "FAILING" || "$status" = "FAILED" ]]; then
          echo "job failed, will dump the logs now..."
          # force exit so we can move to next step
          exit 128
        fi
    - name: cleanup if "pangeo-forge-runner bake" failed
      if: steps.monitorjob.outcome == 'failure'
      run: |
        # much easier to do in bash than in Python via subprocess
        echo "##################### OPERATOR ######################"
        kubectl get pod | grep operator | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
        echo "##################### JOB MANAGER ######################"
        kubectl get pod | grep -v manager | grep $JOB_NAME | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
        echo "##################### TASK MANAGER ######################"
        kubectl get pod | grep "$JOB_NAME-task-manager" | head -n1 | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
        # delete the flinkdeployment so we don't have old failures hanging around
        # (fixed typo: `flinkdepoyment` -> `flinkdeployment`; the misspelled
        # resource type made `kubectl get` fail, so nothing was ever deleted)
        kubectl get flinkdeployment --no-headers | grep $JOB_NAME | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
        # force GH action to show failed result
        exit 128

0 comments on commit e2282a8

Please sign in to comment.