diff --git a/.gitignore b/.gitignore index 661a29223..4df28de3e 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,4 @@ clusters #Autogenerated licenses manifest licenses.manifest +registry.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md index 566be8163..aeb6badf1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -112,4 +112,4 @@ IMPROVEMENTS: FIXES: -* repair dependabot mayhem that brokes the builds and a tag removed from a 3rd party repository +* repair dependabot mayhem that broke the builds and a tag removed from a 3rd party repository diff --git a/Dockerfile_standalone b/Dockerfile_standalone index 59bd020e3..bdc4b35b6 100644 --- a/Dockerfile_standalone +++ b/Dockerfile_standalone @@ -38,8 +38,8 @@ RUN mkdir $GOPATH/bin && \ go get github.com/karlmutch/enumer && \ go get github.com/karlmutch/petname && \ go install github.com/karlmutch/petname/cmd/petname && \ - wget -q -O $GOPATH/bin/semver https://github.com/karlmutch/duat/releases/download/0.9.3/semver-linux-amd64 && \ - wget -q -O $GOPATH/bin/stencil https://github.com/karlmutch/duat/releases/download/0.9.3/stencil-linux-amd64 && \ + wget -q -O $GOPATH/bin/semver https://github.com/karlmutch/duat/releases/download/0.10.0/semver-linux-amd64 && \ + wget -q -O $GOPATH/bin/stencil https://github.com/karlmutch/duat/releases/download/0.10.0/stencil-linux-amd64 && \ chmod +x $GOPATH/bin/semver && \ chmod +x $GOPATH/bin/stencil && \ rm /usr/bin/nvidia-* diff --git a/README.md b/README.md index e02e87b38..71195704c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # studio-go-runner -Version: 0.9.11 +Version: 0.9.12 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/leaf-ai/studio-go-runner/blob/master/LICENSE) [![Go Report Card](https://goreportcard.com/badge/leaf-ai/studio-go-runner)](https://goreportcard.com/report/leaf-ai/studio-go-runner)[![DepShield Badge](https://depshield.sonatype.org/badges/leaf-ai/studio-go-runner/depshield.svg)](https://depshield.github.io) @@ 
-132,9 +132,9 @@ To install the tools on Ubuntu use the following commands: mkdir -p $GOPATH/bin go get github.com/karlmutch/petname go install github.com/karlmutch/petname/cmd/petname -wget -O $GOPATH/bin/semver https://github.com/karlmutch/duat/releases/download/0.9.3/semver-linux-amd64 -wget -O $GOPATH/bin/stencil https://github.com/karlmutch/duat/releases/download/0.9.3/stencil-linux-amd64 -wget -O $GOPATH/bin/github-release https://github.com/karlmutch/duat/releases/download/0.9.3/github-release-linux-amd64 +wget -O $GOPATH/bin/semver https://github.com/karlmutch/duat/releases/download/0.10.0/semver-linux-amd64 +wget -O $GOPATH/bin/stencil https://github.com/karlmutch/duat/releases/download/0.10.0/stencil-linux-amd64 +wget -O $GOPATH/bin/github-release https://github.com/karlmutch/duat/releases/download/0.10.0/github-release-linux-amd64 chmod +x $GOPATH/bin/semver chmod +x $GOPATH/bin/stencil chmod +x $GOPATH/bin/github-release diff --git a/build.go b/build.go index 7dcf32a76..a9c08bcef 100755 --- a/build.go +++ b/build.go @@ -552,7 +552,7 @@ func test(md *duat.MetaData) (outputs []string, errs []errors.Error) { if !sPod { opts = append(opts, "-test.short") } else { - opts = append(opts, "-test.timeout=15m") + opts = append(opts, "-test.timeout=30m") opts = append(opts, "--use-k8s") } diff --git a/build.sh b/build.sh index 67393a2bb..0af68d55a 100755 --- a/build.sh +++ b/build.sh @@ -117,6 +117,8 @@ travis_fold start "build.image" rm -f $working_file docker tag leafai/studio-go-runner-standalone-build:$GIT_BRANCH leafai/studio-go-runner-standalone-build docker tag leafai/studio-go-runner-standalone-build:$GIT_BRANCH localhost:32000/leafai/studio-go-runner-standalone-build + docker tag leafai/studio-go-runner-standalone-build:$GIT_BRANCH localhost:32000/leafai/studio-go-runner-standalone-build:$GIT_BRANCH + docker push localhost:32000/leafai/studio-go-runner-standalone-build:$GIT_BRANCH || true exit_code=$? 
if [ $exit_code -ne 0 ]; then exit $exit_code @@ -164,41 +166,13 @@ travis_fold start "image.push" if type docker 2>/dev/null ; then docker login docker.io if [ $? -eq 0 ]; then - docker tag leaf-ai/studio-go-runner/runner:$SEMVER leafai/studio-go-runner:$SEMVER docker tag leafai/studio-go-runner-dev-base:0.0.0 leafai/studio-go-runner-dev-base:$GIT_BRANCH - docker push leafai/studio-go-runner:$SEMVER docker push leafai/studio-go-runner-dev-base:0.0.0 docker push leafai/studio-go-runner-dev-base:$GIT_BRANCH docker push leafai/studio-go-runner-standalone-build:$GIT_BRANCH fi fi - if type aws 2>/dev/null ; then - `aws ecr get-login --no-include-email` - if [ $? -eq 0 ]; then - account=`aws sts get-caller-identity --output text --query Account` - if [ $? -eq 0 ]; then - docker tag leafai/studio-go-runner:$SEMVER $account.dkr.ecr.us-west-2.amazonaws.com/leafai/studio-go-runner/runner:$SEMVER - docker push $account.dkr.ecr.us-west-2.amazonaws.com/leafai/studio-go-runner/runner:$SEMVER - - docker tag leafai/studio-go-runner-standalone-build:$GIT_BRANCH $account.dkr.ecr.us-west-2.amazonaws.com/leafai/studio-go-runner/standalone-build:$GIT_BRANCH - docker push $account.dkr.ecr.us-west-2.amazonaws.com/leafai/studio-go-runner/standalone-build:$GIT_BRANCH - fi - fi - fi - if [ -z ${azure_registry_name+x} ]; then - : - else - if type az 2>/dev/null; then - if az acr login --name $azure_registry_name; then - docker tag leafai/studio-go-runner-standalone-build:$GIT_BRANCH $azure_registry_name.azurecr.io/leafai/studio-go-runner-standalone-build:$GIT_BRANCH - docker push $azure_registry_name.azurecr.io/leafai/studio-go-runner-standalone-build:$GIT_BRANCH - - docker tag leafai/studio-go-runner:$SEMVER $azure_registry_name.azurecr.io/leafai/studio-go-runner:$SEMVER - docker push $azure_registry_name.azurecr.io/leafai/studio-go-runner:$SEMVER - fi - fi - fi fi travis_time_finish travis_fold end "image.push" diff --git a/ci.sh b/ci.sh index 3576a029d..758275b06 100755 --- a/ci.sh +++ 
b/ci.sh @@ -87,7 +87,7 @@ working_file=$$.studio-go-runner-working rm -f $working_file trap Tidyup 1 2 3 15 -export GIT_BRANCH=`echo '{{.duat.gitBranch}}' | stencil - | tr '_' '-' | tr '\/' '-'` +export GIT_BRANCH=`echo '{{.duat.gitBranch | replace "/" "-" | replace "_" "-"}}' | stencil` export RUNNER_BUILD_LOG=build-$GIT_BRANCH.log exit_code=0 @@ -97,21 +97,41 @@ export travis_fold start "build.image" travis_time_start - set -o pipefail ; (go run build.go -r -dirs=internal && go run build.go -r -dirs=cmd && echo "Success" || echo "Failure") 2>&1 | tee $RUNNER_BUILD_LOG - exit_code=$? - if [ $exit_code -ne 0 ]; then - exit $exit_code - fi + set -o pipefail ; (go run build.go -r -dirs=internal && go run build.go -r -dirs=cmd ; exit_code=$?) 2>&1 | tee $RUNNER_BUILD_LOG + [[ exit_code == 0 ]] && echo "Success" || echo "Failure" travis_time_finish travis_fold end "build.image" +rm -rf /build/* + +if [ $exit_code -eq 0 ]; then + cd cmd/runner + rsync --recursive --relative . /build/ + cd - +fi + +ls /build -alcrt cleanup -echo "Starting the namespace injections etc" $K8S_POD_NAME -kubectl label deployment build keel.sh/policy=force --namespace=$K8S_NAMESPACE +echo "Scale testing dependencies to 0" $K8S_POD_NAME kubectl scale --namespace $K8S_NAMESPACE --replicas=0 rc/rabbitmq-controller kubectl scale --namespace $K8S_NAMESPACE --replicas=0 deployment/minio-deployment +if [ $exit_code -eq 0 ]; then + kubectl --namespace $K8S_NAMESPACE delete job/imagebuilder || true + echo "imagebuild-mounted starting" $K8S_POD_NAME +# Run the docker image build using Mikasu within the same namespace we are occupying and +# the context for the image build will be the /build mount + stencil -values Namespace=$K8S_NAMESPACE -input ci_containerize.yaml | kubectl --namespace $K8S_NAMESPACE create -f - + until kubectl --namespace $K8S_NAMESPACE get job/imagebuilder -o jsonpath='{.status.conditions[].status}' | grep True ; do sleep 3 ; done + echo "imagebuild-mounted complete" $K8S_POD_NAME + 
kubectl --namespace $K8S_NAMESPACE logs job/imagebuilder + kubectl --namespace $K8S_NAMESPACE delete job/imagebuilder +fi + +echo "Return pod back to the ready state for keel to begin monitoring for new images" $K8S_POD_NAME +kubectl label deployment build keel.sh/policy=force --namespace=$K8S_NAMESPACE + for (( ; ; )) do sleep 600 diff --git a/ci_containerize.yaml b/ci_containerize.yaml new file mode 100644 index 000000000..b7bb3cf34 --- /dev/null +++ b/ci_containerize.yaml @@ -0,0 +1,37 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: imagebuilder + namespace: {{ .Namespace }} +spec: + template: + spec: + restartPolicy: Never + containers: + - name: makisu + image: gcr.io/makisu-project/makisu:v0.1.9 + imagePullPolicy: IfNotPresent + args: + - build + - --push=index.docker.io + - --modifyfs=true + - -t=leafai/studio-go-runner:{{.duat.version}} + - --registry-config=/registry-config/registry.yaml + - /makisu-context + volumeMounts: + - name: context + mountPath: /makisu-context + - name: registry-config + mountPath: /registry-config + - name: storage + mountPath: /makisu-storage + volumes: + - name: context + persistentVolumeClaim: + # Name of the PVC created earlier + claimName: build-pv-claim + - name: registry-config + secret: + secretName: docker-registry-config + - name: storage + emptyDir: {} diff --git a/ci_keel.yaml b/ci_keel.yaml index 2ed49b30e..e9a7dea71 100644 --- a/ci_keel.yaml +++ b/ci_keel.yaml @@ -19,6 +19,24 @@ roleRef: apiGroup: rbac.authorization.k8s.io --- apiVersion: v1 +kind: Secret +metadata: + name: docker-registry-config + namespace: {{ .Namespace }} +type: Opaque +data: + registry.yaml: '{{ .Registry | b64enc }}' +--- +apiVersion: v1 +kind: Secret +metadata: + name: release-github-token + namespace: {{ .Namespace }} +type: Opaque +data: + github_token: '{{ expandenv "$GITHUB_TOKEN" | b64enc }}' +--- +apiVersion: v1 kind: ConfigMap metadata: name: build-env @@ -153,15 +171,6 @@ spec: mountPath: "/storage" --- apiVersion: v1 -kind: 
Secret -metadata: - name: release-github-token - namespace: {{ .Namespace }} -type: Opaque -data: - github_token: '{{ expandenv "$GITHUB_TOKEN" | b64enc }}' ---- -apiVersion: v1 kind: Service metadata: name: minio-service @@ -175,6 +184,25 @@ spec: selector: app: minio --- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + # This name uniquely identifies the PVC. Will be used in deployment below. + name: build-pv-claim + labels: + app: build-storage-claim + namespace: {{ .Namespace }} +spec: + # Read more about access modes here: https://kubernetes.io/docs/user-guide/persistent-volumes/#access-modes + accessModes: + - ReadWriteMany + resources: + # This is the request for storage. Should be available in the cluster. + requests: + storage: 10Gi + # Uncomment and add storageClass specific to your requirements below. Read more https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1 + #storageClassName: +--- # Run the integration build as a deployment, the lifecycle will be dealt with by the CMD entry apiVersion: extensions/v1beta1 kind: Deployment @@ -193,6 +221,10 @@ spec: app: build spec: volumes: + - name: build-storage + persistentVolumeClaim: + # Name of the PVC created earlier + claimName: build-pv-claim - name: podinfo downwardAPI: items: @@ -224,13 +256,16 @@ spec: envFrom: - configMapRef: name: build-env - image: quay.io/leaf_ai_dockerhub/studio-go-runner-standalone-build:{{ .duat.gitBranch | replace "/" "-" }} + image: {{ $branch := .duat.gitBranch | replace "/" "_" | replace "-" "_"}}{{ .Image | empty | ternary "quay.io/leaf_ai_dockerhub/studio-go-runner-standalone-build:" ""}}{{ .Image | empty | ternary $branch .Image }} imagePullPolicy: Always resources: limits: memory: "1024Mi" - cpu: 1 + cpu: 4 + nvidia.com/gpu: {{ expandenv "$NVIDIA_VISIBLE_DEVICES" | empty | ternary "0" "2" }} volumeMounts: + - name: build-storage # must match the volume name, above + mountPath: "/build" - name: podinfo mountPath: /etc/podinfo readOnly: false 
diff --git a/cmd/runner/Dockerfile b/cmd/runner/Dockerfile index 191917d08..c0a9ec658 100644 --- a/cmd/runner/Dockerfile +++ b/cmd/runner/Dockerfile @@ -1,8 +1,6 @@ FROM ubuntu:16.04 -LABEL maintainer "karlmutch@gmail.com" -RUN \ - apt-get update && \ +RUN apt-get update && \ apt-get install -y locales && \ apt-get install -y language-pack-en && \ update-locale "en_US.UTF-8" && \ @@ -64,8 +62,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libzmq3-dev \ pkg-config \ software-properties-common \ - unzip \ - && \ + unzip && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -73,8 +70,7 @@ RUN apt-get update && \ apt-get install -y python python-pip python3 python3-pip python3-dev python-dev git lshw && \ pip install --upgrade pip==9.0.3 setuptools -RUN \ - apt-get -y install libssl-dev libcurl4-openssl-dev libsm6 libxrender-dev libxext-dev && \ +RUN apt-get -y install libssl-dev libcurl4-openssl-dev libsm6 libxrender-dev libxext-dev && \ pip install tensorflow-gpu==1.4.1 && \ pip install tensorflow-gpu==1.8.0 && \ pip install tensorflow-gpu==1.9.0 && \ @@ -96,16 +92,16 @@ LABEL vendor="Sentient Technologies INC" \ ai.sentient.module.name=studio-go-runner # Add support for richer terminals to aid debugging etc -RUN mkdir -p /lib/terminfo/x -RUN mkdir -p /usr/local/share/terminfo/x +RUN mkdir -p /lib/terminfo/x && \ + mkdir -p /usr/local/share/terminfo/x COPY add-ons/termite.terminfo /usr/local/share/terminfo/x/xterm-termite COPY add-ons/termite.terminfo /lib/terminfo/x/xterm-termite # Prometheus instrumented port EXPOSE 9090 -COPY run.sh /runner/. -COPY bin/runner-linux-amd64 /runner/. -COPY bin/runner-linux-amd64-cpu /runner/. 
+COPY run.sh /runner/run.sh +COPY bin/runner-linux-amd64 /runner/runner-linux-amd64 +COPY bin/runner-linux-amd64-cpu /runner/runner-linux-amd64-cpu CMD /bin/bash -C ./run.sh diff --git a/cmd/runner/main.go b/cmd/runner/main.go index 1f681340e..baab2abf6 100644 --- a/cmd/runner/main.go +++ b/cmd/runner/main.go @@ -280,7 +280,7 @@ func EntryPoint(quitCtx context.Context, cancel context.CancelFunc, doneC chan s if runner.HasCUDA() { msg := fmt.Errorf("no available GPUs could be found using the nvidia management library") - if runner.CudaInitErr == nil { + if runner.CudaInitErr != nil { msg = *runner.CudaInitErr } err := errors.Wrap(msg).With("stack", stack.Trace().TrimRuntime()) diff --git a/cmd/runner/main_test.go b/cmd/runner/main_test.go index c3bbdd49d..3a57ce3b4 100644 --- a/cmd/runner/main_test.go +++ b/cmd/runner/main_test.go @@ -44,6 +44,10 @@ var ( // and their command line options for each case func init() { cleanupDirs = append(cleanupDirs, "/tmp/cache-runner") + + // Disable certain checks related to ECC validation for smaller cards that are used during testing + runner.CudaInTest = true + } func cleanup() { diff --git a/cmd/runner/metadata_test.go b/cmd/runner/metadata_test.go index 89dfa2d48..7bf55434f 100644 --- a/cmd/runner/metadata_test.go +++ b/cmd/runner/metadata_test.go @@ -46,7 +46,7 @@ func waitForMetaDataRun(ctx context.Context, qName string, queueType string, r * } // Wait for prometheus to show the task stopped for our specific queue, host, project and experiment ID - if runningCnt == 0 && finishedCnt == 2 { + if runningCnt == 0 && finishedCnt >= 2 { return nil } logger.Info("stats", "runner", runningCnt, "finished", finishedCnt) diff --git a/docs/ci.md b/docs/ci.md new file mode 100644 index 000000000..444abb216 --- /dev/null +++ b/docs/ci.md @@ -0,0 +1,130 @@ +# Continuous Integration Setup + +This document describes setting up a CI pipline that can be used to prepare releases for studio go runner. 
+ +studio go runner is designed to run in resource intensive environments using GPU enabled machines and so providing a free hosted pipeline for CI/CD is cost prohibitive. As an alternative, parties interested in studio go runner can make use of quay.io hosted images built automatically on github commit triggers to then trigger their own downstream build, test and deploy automation. Downstream automation can be hosted on a self provisioned Kubernetes provisioned cluster either within the cloud or on private infrastructure. This allows testing to be done using the CI pipeline on both local laptops, workstations and in cloud or data center environments. The choice of quay.io as the registry for the build images is due to its support of selectively exposing only public repositories from github accounts preserving privacy. + +This document contains instructions that can be used for hardware configurations that individual users to large scale enterprises can use without incurring monthly charges from third party providers. These instructions first detail how a quay.io trigger can be set up to trigger builds on github commits. Instructions then detail how to make use of Keel, https://keel.sh/, to pull CI images into a cluster and run the pipeline. Finally this document describes the use of Uber's Makisu to deliver production images to the docker.io image hosting service. docker is used as this is the most reliable of the image registries that Makisu supports, quay.io could not be made to work for this step. + + +Instructions within this document make use of the go based stencil tool. This tool can be obtained for Linux from the github release point, https://github.com/karlmutch/duat/releases/download/0.9.3/stencil-linux-amd64.
+ +``` +mkdir -p ~/bin +wget -O ~/bin/stencil https://github.com/karlmutch/duat/releases/download/0.9.3/stencil-linux-amd64 +chmod +x ~/bin/stencil +export PATH=~/bin:$PATH +``` + +# A word about privacy + +Many of the services that provide image hosting use Single Sign On and credentials management with your source code control platform of choice. As a consequence of this often these services will gain access to any and all repositories private or otherwise that you might have access to within your account. In order to preserve privacy and maintain fine grained control over the visibility of your private repositories it is recommended that when using quay.io and other services that you create a service account that has the minimal level of access to repositories as necessary to implement your CI/CD features. + +# CI Image building + +The studio go runner project uses Docker images to encapsulate builds within an immutable archive format. Using internet image registries it is possible to configure a registry to actively build an image from the git repository at that commit and to then host the resulting image. A number of internet registries offer hosting for open source projects for free, and also offer paid hosted plans for users requiring privacy. These instructions give a summary of what needs to be done in order to use the quay.io service to provision an image repository that auto-builds images from the studio go runner project, and then tests and delivers the result to the docker.io image registry. + +The first step is to create or login to an account on quay.io. When creating an account on quay.io it is best to ensure before starting that you have a browser window open to github.com using the account that you wish to use for accessing code on github to prevent any unintended accesses to private repositories.
As you create the account you can choose to link it automatically to github granting application access from quay to your github authorized applications. This is needed in order that quay can poll your projects for any changes in order to trigger image building. + +Having logged in you can now create a repository using the label at the top right corner of your web page underneath the account related drop down menu. + +The first screen will allow you to specify that you wish to create an image repository and assign it a name, also set the visibility to public, and to 'Link to a GitHub Repository Push', this indicates that any push of a commit or tag will result in a container build being triggered. +- +Pushing the next button will then cause the browser to request github to authorize access from quay to github and will prompt you to allow this authorization to be set up for future interactions between the two platforms. Again, be sure you are assuming the role of the most recently logged in github user and that the one being authorized is the one you intend to allow Quay to obtain access to. + +After the authorization is enabled, the next web page is displayed which allows the organization and account to be chosen from which the image will be built. Step through the next two screens to then select the repository that will be used and then push the continue button. + +You can then specify the branch(es) that can then be used for the builds to meet your own needs. Pushing continue will then allow you to select the Dockerfile that will act as your source for the new image. When using studio go runner a Dockerfile called Dockerfile\_standalone is versioned in the source code repository that will allow a fully standalone container to be created that can perform the entire build, test, release life cycle for the software. Using a slash indicates the top level of the go runner repo.
+ +Using continue will then prompt for the Context of the build which should be set to '/'. You can now click through the rest of the selections and will end up with a fully populated trigger for the repository. + +You can now trigger the first build and test cycle for the repository. Once the repository has been built you can proceed to setting up a Kubernetes test cluster than can pull the image(s) from the repository as they are updated via git commits followed by a git push. + +# Continuous Integration + +The presence of the quay.io image repository allows a suitably configured Kubernetes cluster to query for build images and to use these for testing and integration. + +The studio go runner standalone build image can be used within a go runner deployment to perform testing and validation against a live minio (s3 server) and a RabbitMQ (queue server) instances deployed within a single Kubernetes namespace. The definition of the deployment is stored within the source code repository, in the file ci\_keel.yaml. + +The build deployment contains an annotated deployment of the build image that when deployed concurrently with keel can react to freshly created build images to cycle automatically through build, test, deploy image cycles. + +Keel is documented at https://keel.sh/, installation instruction can also be found there, https://keel.sh/guide/installation.html. Once deploy keel can be left to run as a background service observing Kubernetes deployments that contain annotations it is designed to react to. Keel will watch for changes to image repositories that Deployments have annotations for and will automatically upgrade the Deployment pods as new images are seen. + +The studio go runner ci\_keel.yaml pods use Kubernetes annotations for the studio go runner istandalobe build deployment that the user should look into and configure to select the branches for which they want to watch and perform tests and releases for. 
The keel labels within the ci\_keel.yaml file dictate under what circumstances the keel server will trigger a new pod for the build and test to be created in response to the reference build image changing as git commit and push operations are performed. Information about these labels can be found at, https://keel.sh/v1/guide/documentation.html#Policies. + +The commands that you might performed to this point in order to deploy keel into an existing Kubernetes deploy might well appear as follows: + +``` +mkdir -p ~/project/src/github.com/keel-hq +cd ~/project/src/github.com/keel-hq +git clone https://github.com/keel-hq/keel.git +cd keel +kubectl create -f deployment-rbac.yaml +mkdir -p ~/project/src/github.com/leaf-ai +cd ~/project/src/github.com/leaf-ai +git clone https://github.com/leaf-ai/studio-go-runner.git +cd studio-go-runner +git checkout [branch name] +# Follow the instructions for setting up the Prerequisites for compilation in the main README.md file +``` + +The next step would be to edit the ci_keel.yaml file to reflect the branch name on which the development is being performed or the release prepared, and then deploy the integration stack. + +``` +stencil -input ci_keel.yaml -values Registry=xxx,Namespace=ci-go-runner | kubectl apply -f - +``` + +This will deploy a stack capable of builds and testing. As a build finishes the stack will scale down the dependencies it uses for queuing and storage and will keep the build container alive so that logs can be examined. The build activities will disable container upgrades while the build is running and will then open for upgrades once the build steps have completed to prevent premature termination. When the build, and test has completed and pushed commits have been seen for the code base then the pod will be shutdown for the latest build and a new pod created. + +If the env variable GITHUB\_TOKEN is present when deploying an integration stack it will be placed as a Kubernetes secret into the integration stack. 
If the secret is present then upon successful build and test cycles the running container will attempt to create and deploy a release using the github release pages. + +The Registry value, xxx, is used to pass your docker hub username, and password to keel orchestrated containers and the release image builder, Makisu, using a kubernetes secret. An example of how to set this value is included in the next section, continue on for more details. Currently only dockerhub is supported for pushing release images to. + +When the build completes the pods that are present that are only useful during the actual build and test steps will be scaled back to 0 instances. The CI script, ci.sh, will spin up and down specific kubernetes jobs and deployments when they are needed automatically by using the Kubernetes kubectl command. Bceuase of this your development and build cluster will need access to the Kubernetes API server to complete these tasks. The Kubernetes API access is enabled by the ci\_keel.yaml file when the standalone build container is initialized. + +Before using the registry setting you should copy registry-template.yaml to registry.yaml, and modify the contents. + +If the environment is shared between multiple people the namespace can be assigned using the petname tool, github.com/karlmutch/petname, as shown below. 
+ +``` +cat registry.yaml +index.docker.io: + .*: + security: + tls: + client: + disabled: false + basic: + username: docker_account_name + password: docker_account_password +export Registry=`cat registry.yaml` +export K8S_NAMESPACE=ci-go-runner-`petname` +stencil -input ci_keel.yaml -values Registry=${Registry},Namespace=$K8S_NAMESPACE | kubectl apply -f - + +export K8S_POD_NAME=`kubectl --namespace=$K8S_NAMESPACE get pods -o json | jq '.items[].metadata.name | select ( startswith("build-"))' --raw-output` +kubectl --namespace $K8S_NAMESPACE logs -f $K8S_POD_NAME +``` + +or + +``` +export Registry=`cat registry.yaml` +stencil -input ci_keel.yaml -values Namespace=ci-go-runner-`petname`| kubectl apply -f - +export K8S_NAMESPACE=`kubectl get ns -o json | jq --raw-output '.items[] | select(.metadata.name | startswith("ci-go-runner-")) | .metadata.name'` + +export K8S_POD_NAME=`kubectl --namespace=$K8S_NAMESPACE get pods -o json | jq '.items[].metadata.name | select ( startswith("build-"))' --raw-output` +kubectl --namespace $K8S_NAMESPACE logs -f $K8S_POD_NAME +``` + +# Locally deploy keel testing and CI + +These instructions will be useful to those using a locally deployed Kubernetes distribution such as microk8s. If you wish to use microk8s you should first deploy using the workstations instructions found in this souyrce code repository at docs/workstation.md. You can then return to this section for further information on deploying the keel based CI/CD within your microk8s environment. + +In the case that a test of a locally pushed docker image is needed you can build your image locally and then when the build.sh is run it will do a docker push to a microk8s cluster instance running on your workstation or laptop. In order for the keel deployment to select the locally hosted image registry you set the Image variable for stencil to substitute into the ci\_keel.yaml file. 
+ +When the release features are used the CI/CD system will make use of the Makisu image builder, authored by Uber. Makisu allows docker containers to build images entirely within an existing container with no specialized dependencies and also without needing dind (docker in docker), or access to a docker server socket. + +``` +./build.sh +stencil -input ci_keel.yaml -values Registry=${Registry},Image=localhost:32000/leafai/studio-go-runner-standalone-build:${GIT_BRANCH},Namespace=ci-go-runner-`petname`| kubectl apply -f - +``` diff --git a/docs/gpus.md b/docs/gpus.md index 0286f7562..46bbb1e25 100644 --- a/docs/gpus.md +++ b/docs/gpus.md @@ -17,7 +17,7 @@ However as the power of the cards deployed within your infrastructure increases |Slots|Card Types| |---|---| -|1|GTX 1050, GTX 1060| +|2|GTX 1050, GTX 1060| |2|GTX 1070, GTX 1080| |4|Titan X, Tesla P40| |8|Tesla P100| diff --git a/docs/quay_io_ci.md b/docs/quay_io_ci.md deleted file mode 100644 index dae0b3917..000000000 --- a/docs/quay_io_ci.md +++ /dev/null @@ -1,71 +0,0 @@ -# Continuous Integration Setup - -This document describes setting up a CI pipline that can be used to prepare releases for studio go runner. - -studio go runner is designed to run in resource intensive environments using GPU enabled machines and so providing a free hosted pipeline for CI/CD is cost prohibitive. As an alternative, parties interested in studio go runner can make use of quay.io hosted images built automatically on github commit triggers to then trigger their own downstream build, test and deploy automation. Downstream automation can be hosted on a self provisioned Kubernetes provisioned cluster either within the cloud or on private infrastructure. This allows testing to be done using the CI pipeline on both local laptops, workstations and in cloud or data center environments. 
- -This document contains instructions that can be used for hardware configurations that individual users to large scale enterprises can use without incuring monthly charges from third party providers. These instructions first detail how a quay.io trigger can be setup to trigger builds on github commits. Instructions then detail how to make use of Keel, https://keel.sh/, to pull CI images into a cluster and run the pipeline. - -# A word about privacy - -Many of the services that provide image hosting use Single Sign On and credentials management with your source code control platform of choice. As a consequence of this often these services will gain access to any and all repositories private or otherwise that you might have access to within your account. In order to preserve privacy and maintain fine grained control over the visibility of your private repositories it is recommended that when using quay.io and other services that you create a service account that has the minimal level of access to repositories as nessasary to implement your CI/CD features. - -# CI Image building - -The studio go runner project uses Docker images to encapsulate builds within an immutable archive format. Using internet accessible hosted registries it is possible to configure a registry to actively build an image from the git repository at that commit and to then host the resulting image. A number of internet registries offer hosting for open source projects for free, and also offer paid hosted plans for users requiring privacy. These instructions give a summary of what needs to be done in order to use the quay.io service to provision an image repository that auto-builds images from the studio go runner project. - -The first step is to create or login to an account on quay.io. 
When creating an account on quay.io it is best to ensure before starting that you have a browser window open to giuthub.com using the account that you wish to use for accessing code on github to prevent any unintended accesses to private repositories. As you create the account on you can choose to link it automatically to github granting application access from quay to your github authorized applications. This is needed in order that quay can poll your projects for any changes in order to trigger image building. - -Having logged in you can now create a repository using the label at the top right corner of your web page underneath the account related drop down menu. - -The first screen will allow you to specify tgar you wish to create an image repository and assign it a name, also set the visibility to public, and to 'Link to a GitHub Repository Push', this indicates that any push of a commit or tag will result in a container build being triggered. - -Pushing the next button will then cause the browser to request github to authorize access from quay to github and will prompt you to allow this authorization to be setup for future interactions between the two platform. Again, be sure you are assuming the role of the most recently logged in github user and that the one being authorized is the one you intend to allow Quay to obtain access to. - -After the auhtorization is enabled the next web page is displayed which allows the organization and account to be choosen from which the image will be built. Step through the next two screen to drill down to the repository that will be used and then push the continue button. - -You can then specify the branch(es) that can then be used for the builds to meet you own needs. Pushing con tinue will then allow you to select the Dockerfile that will act as your source for the new image. 
When using studio go runner a Dockerfile called Dockerfile_standalone is versioned in the source code repository that will allow a fully standalone container to be created that can perform the entire build, test, release life cycle for the software. Using a slash indicates the top level of the go runner repo. - -Using continue will then prompt for the Context of the build which should be set to '/'. You can now click through the rest of the selections and will end up with a fully populated trigger for the repository. - -You can now trigger the first build and test cycle for the repository. Once the repository has been built you can proceed to setting up a Kubernetes test cluster that can pull the image(s) from the repository as they are updated via git commits followed by a git push. - -# Continuous Integration - -The presence of a publicly accessible repository allows a suitably configured Kubernetes cluster to query for the presence of build images for testing and integration. - -The studio go runner standalone build image can be used within a go runner deployment to perform testing and validation against live minio (s3 server) and RabbitMQ (queue server) instances deployed within a single Kubernetes namespace. The definition of the deployment is stored within the source code repository, as ci_keel.yaml. - -The build deployment contains an annotated deployment of the build image that when deployed concurrently with http://keel.sh/ can react to freshly created build images to cycle through build, deploy, test cycles automatically. - -Keel is documented at https://keel.sh/, installation instructions can also be found there, https://keel.sh/guide/installation.html. Once deployed keel can be left to run as a background service observing Kubernetes deployments that contain annotations it is designed to react to.
Keel will watch for changes to image repositories that Deployments have annotations for and will automatically upgrade the Deployment pods as new images are seen. - -The studio go runner ci_keel.yaml contains annotations for a studio go runner Deployment that the user should look into and configure to select the branches which they want to watch and perform tests and releases for. The keel labels within the ci_keel.yaml file dictate under what circumstances the keel server will trigger a new pod for the build and test to be created in response to the reference build image changing as git commit and push operations are performed. Information about these labels can be found at, https://keel.sh/v1/guide/documentation.html#Policies. - -The commands that you might have performed to this point in order to deploy keel into an existing Kubernetes deploy might well appear as follows: - -``` -mkdir -p ~/project/src/github.com/keel-hq -cd ~/project/src/github.com/keel-hq -git clone https://github.com/keel-hq/keel.git -cd keel -kubectl create -f deployment-rbac.yaml -mkdir -p ~/project/src/github.com/leaf-ai -cd ~/project/src/github.com/leaf-ai -git clone https://github.com/leaf-ai/studio-go-runner.git -cd studio-go-runner -git checkout [branch name] -# Follow the instructions for setting up the Prerequisites for compilation in the main README.md file -``` - -The next step would be to edit the ci_keel.yaml file to reflect the branch name on which the development is being performed or the release prepared, and then deploy the integration stack. - -``` -stencil -input ci_keel.yaml -values Namespace=ci-go-runner | kubectl apply -f - -``` - -This will deploy a stack capable of builds and testing. As a build finishes the stack will scale down the dependencies it uses for queuing and storage and will keep the build container alive so that logs can be examined.
The build activities will disable container upgrades while the build is running and will then open for upgrades once the build steps have completed to prevent premature termination. When the build and test have completed and pushed commits have been seen for the code base then the pod will be shut down for the latest build and a new pod created. - -If the env variable GITHUB_TOKEN is present when deploying an integration stack it will be placed as a Kubernetes secret into the integration stack. If the secret is present then upon successful build and test cycles the running container will attempt to create and deploy a release using the github release pages. - -When the build completes the pods that are present that are only useful during the actual build and test steps will be scaled back to 0 instances. diff --git a/docs/workstation_k8s.md b/docs/workstation_k8s.md index 75ffa0b25..49bcd2daf 100644 --- a/docs/workstation_k8s.md +++ b/docs/workstation_k8s.md @@ -20,14 +20,14 @@ For laptops, and private workstations using Windows 10 Professional Edition, or For Ubuntu hosts a microk8s solution exists that implements a single host deployment, https://microk8s.io/. Use snap on Ubuntu to install this component to allow for management of the optional features of microk8s. -The following example details how to configure microk8s: +The following example details how to configure microk8s once it has been installed: ``` # Allow the containers within the cluster to communicate with the public internet.
Needed for rabbitMQ pkg to be fetched and installed sudo ufw default allow routed sudo iptables -P FORWARD ACCEPT sudo /snap/bin/microk8s.start -sudo /snap/bin/microk8s.enable dashboard dns ingress storage registry +sudo /snap/bin/microk8s.enable dashboard dns ingress storage registry gpu ``` ## Usage diff --git a/internal/runner/cuda.go b/internal/runner/cuda.go index 9a8ace6ab..763004167 100644 --- a/internal/runner/cuda.go +++ b/internal/runner/cuda.go @@ -93,6 +93,11 @@ var ( // CudaInitWarnings records warnings and errors that are deemed not be be fatal // to the ongoing CUDA library usage but are of importance CudaInitWarnings = []errors.Error{} + + // Used to check if the running process is a go test process, if so then + // this will disable certain types of checking when using very limited GPU + // Hardware + CudaInTest = false ) func init() { @@ -110,8 +115,16 @@ func init() { if len(devs) == 0 { devs = os.Getenv("NVIDIA_VISIBLE_DEVICES") } + visDevices := strings.Split(devs, ",") + if devs == "all" { + visDevices = make([]string, 0, len(gpuDevices.Devices)) + for _, device := range gpuDevices.Devices { + visDevices = append(visDevices, device.UUID) + } + } + gpuAllocs.Lock() defer gpuAllocs.Unlock() gpuAllocs.Allocs = make(map[string]*GPUTrack, len(visDevices)) @@ -158,15 +171,13 @@ func init() { track := &GPUTrack{ UUID: dev.UUID, Mem: dev.MemFree, - Slots: 1, - FreeSlots: 1, EccFailure: dev.EccFailure, Tracking: map[string]struct{}{}, } switch { case strings.Contains(dev.Name, "GTX 1050"), strings.Contains(dev.Name, "GTX 1060"): - track.Slots = 1 + track.Slots = 2 case strings.Contains(dev.Name, "GTX 1070"), strings.Contains(dev.Name, "GTX 1080"): track.Slots = 2 diff --git a/internal/runner/cuda_linux.go b/internal/runner/cuda_linux.go index 8e9a310d3..cc171c298 100644 --- a/internal/runner/cuda_linux.go +++ b/internal/runner/cuda_linux.go @@ -96,8 +96,6 @@ func getCUDAInfo() (outDevs cudaDevices, err errors.Error) { return outDevs, 
errors.Wrap(errGo).With("GPUID", uuid).With("stack", stack.Trace().TrimRuntime()) } - errEcc := dev.EccErrors() - runnerDev := device{ Name: name, UUID: uuid, @@ -108,7 +106,8 @@ func getCUDAInfo() (outDevs cudaDevices, err errors.Error) { MemFree: mem.Free, } // Dont use the ECC Error check on AWS as the NVML APIs do not appear to return the expected values - if isAWS, _ := IsAWS(); !isAWS { + if isAWS, _ := IsAWS(); !isAWS && !CudaInTest { + errEcc := dev.EccErrors() if errEcc != nil && errEcc.Error() != "nvmlDeviceGetMemoryErrorCounter is not supported on this hardware" { err := errors.Wrap(errEcc).With("stack", stack.Trace().TrimRuntime()) runnerDev.EccFailure = &err diff --git a/internal/runner/cuda_test.go b/internal/runner/cuda_test.go index ec9412a98..fdc6b230d 100644 --- a/internal/runner/cuda_test.go +++ b/internal/runner/cuda_test.go @@ -22,6 +22,10 @@ var ( errFormatIssue = errors.New("unexpected format, lines should be in the format x=y") ) +func init() { + CudaInTest = true +} + // This file contains an integration test implementation that submits a studio runner // task across an SQS queue and then validates is has completed successfully by // the go runner this test is running within diff --git a/registry-template.yaml b/registry-template.yaml new file mode 100644 index 000000000..fae5c8485 --- /dev/null +++ b/registry-template.yaml @@ -0,0 +1,9 @@ +index.docker.io: + .*: + security: + tls: + client: + disabled: false + basic: + username: [dockerhub user name] + password: [dockerhub user password]