Skip to content

dispatch job

dispatch job #17

Workflow file for this run

# Manually dispatched workflow. All parameters are supplied by the person
# triggering the run through the workflow_dispatch form.
name: dispatch job

on:
  workflow_dispatch:
    inputs:
      # Where the recipe lives.
      repo:
        description: 'The https github url for the recipe feedstock'
        required: true
      ref:
        description: 'The tag or branch to target in your recipe repo'
        required: true
        default: 'main'
      feedstock_subdir:
        description: 'The subdir of the feedstock directory in the repo'
        required: true
        default: 'feedstock'

      # Where the recipe cache and output are written.
      bucket:
        description: 'This job runner leverages s3fs.S3FileSystem for your recipe cache and output. Choices currently are: "default"'
        required: true
        default: 'default'

      # Run-size knobs. Both defaults are strings, not a YAML boolean/integer.
      prune:
        description: 'Only run the first two time steps'
        required: true
        default: 'False'
      parallelism:
        description: 'Number of task managers to spin up'
        required: true
        default: '1'

jobs:
  # Derives a short repository name (the URL basename without ".git") that the
  # other jobs use in their display names.
  job-name:
    runs-on: ubuntu-latest
    outputs:
      repo_name: ${{ steps.string_manipulation.outputs.result }}
    steps:
      - name: manipulate strings
        id: string_manipulation
        env:
          # User input is passed through the environment instead of being
          # interpolated into the script text, so it cannot inject shell code.
          REPO: ${{ github.event.inputs.repo }}
        run: |
          repo_name=$(basename -s .git "$REPO")
          # Step outputs are written to $GITHUB_OUTPUT (::set-output is deprecated).
          echo "result=$repo_name" >> "$GITHUB_OUTPUT"

  # Installs the runner tooling, connects to the EKS cluster and submits the
  # recipe with "pangeo-forge-runner bake". The Flink deployment name parsed
  # from the bake output is exposed as a job output for monitor-job.
  run-job:
    name: runner job ${{ needs.job-name.outputs.repo_name }}@${{ github.event.inputs.ref }}
    needs: job-name
    runs-on: ubuntu-latest
    outputs:
      # Variables written to $GITHUB_ENV do not cross job boundaries, so the
      # deployment name is handed to monitor-job as a job output.
      job_name: ${{ steps.executejob.outputs.job_name }}
    steps:
      - name: checkout repository
        uses: actions/checkout@v4

      - name: set up python 3.10
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML does not read it as the float 3.1.
          python-version: '3.10'

      - name: echo server
        env:
          REPO: ${{ github.event.inputs.repo }}
          REF: ${{ github.event.inputs.ref }}
          BUCKET: ${{ github.event.inputs.bucket }}
          PARALLELISM: ${{ github.event.inputs.parallelism }}
          PRUNE: ${{ github.event.inputs.prune }}
        run: |
          echo "Manually triggered workflow: $REPO $REF $BUCKET $PARALLELISM $PRUNE"

      - name: install deps
        run: |
          # TODO: move to requirements file
          python -m pip install --upgrade pip
          # Version specifiers are quoted: an unquoted ">=" is parsed by the
          # shell as an output redirect, which silently drops the constraint.
          pip install \
            fsspec \
            s3fs \
            'apache-beam==2.52.0' \
            'pangeo-forge-recipes>=0.10.0' \
            'pangeo-forge-runner>=0.9.1'

      - name: set up aws credentials for job runner user
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.GH_ACTIONS_AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.GH_ACTIONS_AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.GH_ACTIONS_AWS_REGION }}

      - name: install kubectl
        run: |
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          chmod +x ./kubectl
          sudo mv ./kubectl /usr/local/bin/kubectl

      - name: update kubeconfig with cluster
        run: |
          aws eks update-kubeconfig --name pangeo-forge-v3 --region "${{ secrets.GH_ACTIONS_AWS_REGION }}"

      - name: execute recipe on k8s cluster
        id: executejob
        # A failed bake must not stop the job here: the cleanup step below
        # inspects this step's outcome and then fails the job itself.
        continue-on-error: true
        env:
          REPO: ${{ github.event.inputs.repo }}
          REF: ${{ github.event.inputs.ref }}
          FEEDSTOCK_SUBDIR: ${{ github.event.inputs.feedstock_subdir }}
          PRUNE_OPTION: ${{ github.event.inputs.prune }}
          PARALLELISM_OPTION: ${{ github.event.inputs.parallelism }}
          S3_BUCKET: ${{ github.event.inputs.bucket }}
          S3_DEFAULT_AWS_ACCESS_KEY_ID: ${{ secrets.S3_DEFAULT_AWS_ACCESS_KEY_ID }}
          S3_DEFAULT_AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DEFAULT_AWS_SECRET_ACCESS_KEY }}
        run: |
          # Remember the bake exit status instead of aborting immediately, so
          # the log is still printed and the job name is still exported for
          # the cleanup step when the bake fails.
          bake_status=0
          pangeo-forge-runner \
            bake \
            --repo="$REPO" \
            --ref="$REF" \
            -f .github/workflows/config.py > execute.log || bake_status=$?
          # show all logs
          cat execute.log
          # export all the valuable information from the logs
          # (each pipeline ends in head, so a missing match yields an empty
          # value rather than aborting the script)
          JOB_NAME=$(grep -oP 'flinkdeployment\.flink\.apache\.org/\K[^ ]+' execute.log | head -n1)
          echo "JOB_NAME=$JOB_NAME" >> "$GITHUB_ENV"
          echo "job_name=$JOB_NAME" >> "$GITHUB_OUTPUT"
          JOB_ID=$(grep -oP 'Started Flink job as \K[^ ]+' execute.log | head -n1)
          echo "JOB_ID=$JOB_ID" >> "$GITHUB_ENV"
          FLINK_DASH=$(grep -oP "You can run '\K[^']+(?=')" execute.log | head -n1)
          echo "FLINK_DASH=$FLINK_DASH" >> "$GITHUB_ENV"
          # propagate the bake result so that `outcome` reflects it
          exit "$bake_status"

      - name: cleanup if "pangeo-forge-runner bake" failed
        # `outcome` is the step result before continue-on-error is applied.
        if: steps.executejob.outcome == 'failure'
        run: |
          echo "The previous 'bake' command failed or timed out. Running cleanup logic..."
          # much easier to do in bash than in Python via subprocess
          echo "##################### OPERATOR ######################"
          kubectl get pod | grep operator | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
          # Guard against an empty name: an empty grep pattern matches every
          # line, which would dump logs for and delete unrelated deployments.
          if [[ -n "$JOB_NAME" ]]; then
            echo "##################### JOB MANAGER ######################"
            kubectl get pod | grep -v manager | grep -F -- "$JOB_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
            echo "##################### TASK MANAGER ######################"
            # NOTE(review): confirm the "-task-manager" suffix matches the pod
            # names the Flink operator actually creates.
            kubectl get pod | grep -F -- "$JOB_NAME-task-manager" | head -n1 | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
            # delete the flinkdeployment so we don't have old failures hanging around
            kubectl get flinkdeployment --no-headers | grep -F -- "$JOB_NAME" | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
          else
            echo "No Flink deployment name was captured from the bake output; skipping job-specific cleanup."
          fi
          # force GH action to show failed result
          exit 128

      - name: report running job id for user
        run: |
          # TODO: we also need to report historyserver URL and flink dashboard URL
          # but this also requires us to think how we're going to have a thin
          # layer of authentication around these services so they aren't totally public
          echo '############ JOB NAME ################'
          echo "$JOB_NAME"
          echo '############ JOB ID ################'
          echo "$JOB_ID"
          echo '############ FLINK DASHBOARD ################'
          echo "$FLINK_DASH"

  # Waits for the submitted Flink job to report a state in the job manager
  # logs, and dumps logs / removes the deployment if that state is a failure.
  monitor-job:
    runs-on: ubuntu-latest
    name: monitor ${{ needs.job-name.outputs.repo_name }}@${{ github.event.inputs.ref }}
    needs: [job-name, run-job]
    timeout-minutes: 120
    continue-on-error: true
    env:
      # Deployment name captured by run-job; this job runs on a fresh runner
      # and does not inherit run-job's environment.
      JOB_NAME: ${{ needs.run-job.outputs.job_name }}
    steps:
      # A new runner has no cluster access, so credentials, kubectl and the
      # kubeconfig are set up again before any kubectl call.
      - name: set up aws credentials for job runner user
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.GH_ACTIONS_AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.GH_ACTIONS_AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.GH_ACTIONS_AWS_REGION }}

      - name: install kubectl
        run: |
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          chmod +x ./kubectl
          sudo mv ./kubectl /usr/local/bin/kubectl

      - name: update kubeconfig with cluster
        run: |
          aws eks update-kubeconfig --name pangeo-forge-v3 --region "${{ secrets.GH_ACTIONS_AWS_REGION }}"

      - name: monitor logs of job manager
        run: |
          if [[ -z "$JOB_NAME" ]]; then
            echo "no Flink deployment name was provided by run-job, nothing to monitor"
            exit 1
          fi
          echo "find job status on the job manager logs..."
          # Prints the first ExecutionGraph status line from the job manager
          # logs, or nothing if no such line exists yet.
          # NOTE(review): `head -n 1` returns the earliest state transition;
          # confirm that this, rather than the latest one, is what is wanted.
          find_run_status() {
            kubectl get pod --no-headers | grep -v manager | grep -F -- "$JOB_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | grep "ExecutionGraph.*Job BeamApp.*" | head -n 1
          }
          run_status=$(find_run_status)
          # Re-query on every iteration; checking a value computed only once
          # would spin until the job timeout.
          while [[ -z "$run_status" ]]; do
            echo "still waiting for a status on the job manager logs..."
            sleep 10
            run_status=$(find_run_status)
          done
          # The state is the last word before the trailing period of the line.
          status=$(echo "$run_status" | grep -oP '\b\w+(?=\.$)' || true)
          # Spaces around the operator are required: "$status"="FAILING" is a
          # single non-empty word and therefore always true.
          if [[ "$status" == "FAILING" || "$status" == "FAILED" ]]; then
            echo "job failed, will dump the logs now..."
            # force exit so we can move to next step
            exit 128
          fi

      - name: cleanup if the monitored job failed
        # Runs only when an earlier step of this job failed; the `steps`
        # context of run-job is not visible from here.
        if: failure()
        run: |
          # much easier to do in bash than in Python via subprocess
          echo "##################### OPERATOR ######################"
          kubectl get pod | grep operator | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
          # Guard against an empty name: an empty grep pattern matches every
          # line, which would dump logs for and delete unrelated deployments.
          if [[ -n "$JOB_NAME" ]]; then
            echo "##################### JOB MANAGER ######################"
            kubectl get pod | grep -v manager | grep -F -- "$JOB_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
            echo "##################### TASK MANAGER ######################"
            # NOTE(review): confirm the "-task-manager" suffix matches the pod
            # names the Flink operator actually creates.
            kubectl get pod | grep -F -- "$JOB_NAME-task-manager" | head -n1 | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
            # delete the flinkdeployment so we don't have old failures hanging around
            kubectl get flinkdeployment --no-headers | grep -F -- "$JOB_NAME" | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
          fi
          # force GH action to show failed result
          exit 128