dispatch job #17
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Manually triggered workflow; every input below is entered by the person
# starting the run and is read by the jobs via `github.event.inputs.<name>`.
name: dispatch job

on:
  workflow_dispatch:
    inputs:
      # Location of the recipe feedstock repository.
      repo:
        description: 'The https github url for the recipe feedstock'
        required: true

      # Git ref of the feedstock repository to run against.
      ref:
        description: 'The tag or branch to target in your recipe repo'
        default: 'main'
        required: true

      # Directory inside the feedstock repository that holds the feedstock.
      feedstock_subdir:
        description: 'The subdir of the feedstock directory in the repo'
        default: 'feedstock'
        required: true

      # Storage target selector; "default" is the only choice the description lists.
      bucket:
        description: 'This job runner leverages s3fs.S3FileSystem for your recipe cache and output. Choices currently are: "default"'
        default: 'default'
        required: true

      # Kept as the quoted string 'False' (not a YAML boolean) so the value
      # reaches the jobs exactly as written.
      prune:
        description: 'Only run the first two time steps'
        default: 'False'
        required: true

      # Quoted so the value stays a string rather than a YAML integer.
      parallelism:
        description: 'Number of task managers to spin up'
        default: '1'
        required: true
jobs:
  # Derives a short repository name from the `repo` input so the other jobs
  # can use it in their display names.
  job-name:
    runs-on: ubuntu-latest
    outputs:
      repo_name: ${{ steps.string_manipulation.outputs.result }}
    steps:
      - name: manipulate strings
        id: string_manipulation
        env:
          # Handed over through the environment instead of being interpolated
          # into the script, so the input can never be parsed as shell syntax.
          REPO: ${{ github.event.inputs.repo }}
        run: |
          repo_name=$(basename -s .git "$REPO")
          # `::set-output` is deprecated; step outputs are written to $GITHUB_OUTPUT.
          echo "result=$repo_name" >> "$GITHUB_OUTPUT"

  # Installs the runner tooling, submits the recipe with `pangeo-forge-runner bake`
  # and publishes the Flink deployment name for the monitor job.
  run-job:
    name: runner job ${{ needs.job-name.outputs.repo_name }}@${{ github.event.inputs.ref }}
    needs: job-name
    runs-on: ubuntu-latest
    outputs:
      # GITHUB_ENV is scoped to a single job, so the deployment name has to be
      # exposed as a job output for monitor-job to see it.
      job_name: ${{ steps.executejob.outputs.job_name }}
    steps:
      - name: checkout repository
        uses: actions/checkout@v4
      - name: set up python 3.10
        uses: actions/setup-python@v5
        with:
          # Quoted: an unquoted 3.10 would be parsed as the float 3.1.
          python-version: '3.10'
      - name: echo server
        env:
          REPO: ${{ github.event.inputs.repo }}
          REF: ${{ github.event.inputs.ref }}
          S3_BUCKET: ${{ github.event.inputs.bucket }}
          PARALLELISM_OPTION: ${{ github.event.inputs.parallelism }}
          PRUNE_OPTION: ${{ github.event.inputs.prune }}
        run: |
          echo "Manually triggered workflow: $REPO $REF $S3_BUCKET $PARALLELISM_OPTION $PRUNE_OPTION"
      - name: install deps
        run: |
          # TODO: move to requirements file
          python -m pip install --upgrade pip
          # The version specifiers must be quoted: an unquoted `>=0.10.0` is a
          # shell redirection to a file named `=0.10.0`, which drops the constraint.
          pip install \
            fsspec \
            s3fs \
            apache-beam==2.52.0 \
            'pangeo-forge-recipes>=0.10.0' \
            'pangeo-forge-runner>=0.9.1'
      - name: set up aws credentials for job runner user
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.GH_ACTIONS_AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.GH_ACTIONS_AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.GH_ACTIONS_AWS_REGION }}
      - name: install kubectl
        run: |
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          chmod +x ./kubectl
          sudo mv ./kubectl /usr/local/bin/kubectl
      - name: update kubeconfig with cluster
        run: |
          aws eks update-kubeconfig --name pangeo-forge-v3 --region ${{ secrets.GH_ACTIONS_AWS_REGION }}
      - name: execute recipe on k8s cluster
        id: executejob
        # A failure here must not stop the job: the cleanup step below keys off
        # this step's outcome.
        continue-on-error: true
        run: |
          # The default shell runs with `-e`, so a failing bake would abort the
          # script before JOB_NAME is exported. Capture the status instead and
          # re-raise it once the name has been recorded for the cleanup step.
          bake_status=0
          pangeo-forge-runner \
            bake \
            --repo="$REPO" \
            --ref="$REF" \
            -f .github/workflows/config.py > execute.log || bake_status=$?
          # show all logs
          cat execute.log
          # export all the valuable information from the logs
          JOB_NAME=$(grep -oP 'flinkdeployment\.flink\.apache\.org/\K[^ ]+' execute.log | head -n1 || true)
          echo "JOB_NAME=$JOB_NAME" >> "$GITHUB_ENV"
          echo "job_name=$JOB_NAME" >> "$GITHUB_OUTPUT"
          if [[ "$bake_status" -ne 0 ]]; then
            exit "$bake_status"
          fi
          # As before, a log without these markers fails the step (grep exits 1).
          JOB_ID=$(grep -oP 'Started Flink job as \K[^ ]+' execute.log)
          echo "JOB_ID=$JOB_ID" >> "$GITHUB_ENV"
          FLINK_DASH=$(grep -oP "You can run '\K[^']+(?=')" execute.log)
          echo "FLINK_DASH=$FLINK_DASH" >> "$GITHUB_ENV"
        env:
          # NOTE(review): apart from REPO/REF these are presumably consumed by
          # .github/workflows/config.py, which is not visible here - confirm.
          REPO: ${{ github.event.inputs.repo }}
          REF: ${{ github.event.inputs.ref }}
          FEEDSTOCK_SUBDIR: ${{ github.event.inputs.feedstock_subdir }}
          PRUNE_OPTION: ${{ github.event.inputs.prune }}
          PARALLELISM_OPTION: ${{ github.event.inputs.parallelism }}
          S3_BUCKET: ${{ github.event.inputs.bucket }}
          S3_DEFAULT_AWS_ACCESS_KEY_ID: ${{ secrets.S3_DEFAULT_AWS_ACCESS_KEY_ID }}
          S3_DEFAULT_AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_DEFAULT_AWS_SECRET_ACCESS_KEY }}
      - name: cleanup if "pangeo-forge-runner bake" failed
        # The id must match the step above exactly; a misspelled id evaluates to
        # an empty outcome and the cleanup would silently never run.
        if: steps.executejob.outcome == 'failure'
        run: |
          echo "The previous 'bake' command failed or timed out. Running cleanup logic..."
          # much easier to do in bash than in Python via subprocess
          echo "##################### OPERATOR ######################"
          kubectl get pod | grep operator | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
          # An empty name would turn the quoted grep pattern into "match
          # everything", i.e. dump every pod and delete every deployment.
          if [[ -n "${JOB_NAME:-}" ]]; then
            echo "##################### JOB MANAGER ######################"
            kubectl get pod | grep -v manager | grep "$JOB_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
            echo "##################### TASK MANAGER ######################"
            kubectl get pod | grep "${JOB_NAME}-task-manager" | head -n1 | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
            # delete the flinkdeployment so we don't have old failures hanging around
            kubectl get flinkdeployment --no-headers | grep "$JOB_NAME" | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
          else
            echo "JOB_NAME was not captured; skipping job-specific logs and flinkdeployment cleanup"
          fi
          # force GH action to show failed result
          exit 128
      - name: report running job id for user
        run: |
          # TODO: we also need to report historyserver URL and flink dashboard URL
          # but this also requires us to think how we're going to have a thin
          # layer of authentication around these services so they aren't totally public
          echo '############ JOB NAME ################'
          echo "$JOB_NAME"
          echo '############ JOB ID ################'
          echo "$JOB_ID"
          echo '############ FLINK DASHBOARD ################'
          echo "$FLINK_DASH"

  # Waits for the job manager to log a status for the submitted job and dumps
  # logs / removes the deployment when that status is a failure.
  monitor-job:
    runs-on: ubuntu-latest
    name: monitor ${{ needs.job-name.outputs.repo_name }}@${{ github.event.inputs.ref }}
    needs: [job-name, run-job]
    timeout-minutes: 120
    continue-on-error: true
    env:
      # Values written to GITHUB_ENV in run-job do not cross job boundaries, so
      # the deployment name is taken from run-job's output.
      JOB_NAME: ${{ needs.run-job.outputs.job_name }}
    steps:
      # Each job starts on a fresh runner: credentials and kubeconfig set up in
      # run-job are not available here and have to be configured again.
      - name: set up aws credentials for job runner user
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.GH_ACTIONS_AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.GH_ACTIONS_AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.GH_ACTIONS_AWS_REGION }}
      - name: install kubectl
        run: |
          curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          chmod +x ./kubectl
          sudo mv ./kubectl /usr/local/bin/kubectl
      - name: update kubeconfig with cluster
        run: |
          aws eks update-kubeconfig --name pangeo-forge-v3 --region ${{ secrets.GH_ACTIONS_AWS_REGION }}
      - name: monitor logs of job manager
        id: monitorjob
        # Let the cleanup step run after a failure; it inspects this step's outcome.
        continue-on-error: true
        run: |
          echo "find job status on the job manager logs..."
          if [[ -z "${JOB_NAME:-}" ]]; then
            echo "JOB_NAME is empty; nothing to monitor"
            exit 128
          fi
          # First ExecutionGraph status line of the job manager pod(s), or empty.
          # NOTE(review): `head -n 1` yields the earliest status line, not the
          # latest one - confirm that this is the intended signal.
          fetch_run_status() {
            kubectl get pod --no-headers | grep -v manager | grep "$JOB_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | grep "ExecutionGraph.*Job BeamApp.*" | head -n 1
          }
          run_status=$(fetch_run_status || true)
          # Re-query on every iteration; polling a value computed only once
          # before the loop can never terminate.
          while [[ -z "$run_status" ]]; do
            echo "still waiting for a status on the job manager logs..."
            sleep 1
            run_status=$(fetch_run_status || true)
          done
          status=$(echo "$run_status" | grep -oP '\b\w+(?=\.$)' || true)
          # Spaces around `==` are required: "$status"="FAILING" is a single
          # non-empty word and therefore always true.
          if [[ "$status" == "FAILING" || "$status" == "FAILED" ]]; then
            echo "job failed, will dump the logs now..."
            # force exit so we can move to next step
            exit 128
          fi
      - name: cleanup if the monitored job failed
        # The `steps` context only contains steps of this job, so the condition
        # refers to the monitor step above rather than to a step of run-job.
        if: steps.monitorjob.outcome == 'failure'
        run: |
          # much easier to do in bash than in Python via subprocess
          echo "##################### OPERATOR ######################"
          kubectl get pod | grep operator | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
          # An empty name would turn the quoted grep pattern into "match
          # everything", i.e. dump every pod and delete every deployment.
          if [[ -n "${JOB_NAME:-}" ]]; then
            echo "##################### JOB MANAGER ######################"
            kubectl get pod | grep -v manager | grep "$JOB_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
            echo "##################### TASK MANAGER ######################"
            kubectl get pod | grep "${JOB_NAME}-task-manager" | head -n1 | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} | tail -n 1000
            # delete the flinkdeployment so we don't have old failures hanging around
            kubectl get flinkdeployment --no-headers | grep "$JOB_NAME" | cut -d' ' -f1 | xargs -I{} kubectl delete flinkdeployment/{}
          else
            echo "JOB_NAME was not captured; skipping job-specific logs and flinkdeployment cleanup"
          fi
          # force GH action to show failed result
          exit 128