Skip to content

Commit

Permalink
Merge pull request #208 from leaf-ai/feature/142_makisu_experiment
Browse files Browse the repository at this point in the history
Feature/142 makisu experiment
  • Loading branch information
karlmutch authored Mar 4, 2019
2 parents eb76f2c + 8b68b8e commit b309e3e
Show file tree
Hide file tree
Showing 21 changed files with 298 additions and 149 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ clusters

#Autogenerated licenses manifest
licenses.manifest
registry.yaml
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,4 +112,4 @@ IMPROVEMENTS:

FIXES:

* repair dependabot mayhem that brokes the builds and a tag removed from a 3rd party repository
* repair dependabot mayhem that broke the builds and a tag removed from a 3rd party repository
4 changes: 2 additions & 2 deletions Dockerfile_standalone
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ RUN mkdir $GOPATH/bin && \
go get github.com/karlmutch/enumer && \
go get github.com/karlmutch/petname && \
go install github.com/karlmutch/petname/cmd/petname && \
wget -q -O $GOPATH/bin/semver https://github.com/karlmutch/duat/releases/download/0.9.3/semver-linux-amd64 && \
wget -q -O $GOPATH/bin/stencil https://github.com/karlmutch/duat/releases/download/0.9.3/stencil-linux-amd64 && \
wget -q -O $GOPATH/bin/semver https://github.com/karlmutch/duat/releases/download/0.10.0/semver-linux-amd64 && \
wget -q -O $GOPATH/bin/stencil https://github.com/karlmutch/duat/releases/download/0.10.0/stencil-linux-amd64 && \
chmod +x $GOPATH/bin/semver && \
chmod +x $GOPATH/bin/stencil && \
rm /usr/bin/nvidia-*
Expand Down
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# studio-go-runner

Version: <repo-version>0.9.11</repo-version>
Version: <repo-version>0.9.12</repo-version>

[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/leaf-ai/studio-go-runner/blob/master/LICENSE) [![Go Report Card](https://goreportcard.com/badge/leaf-ai/studio-go-runner)](https://goreportcard.com/report/leaf-ai/studio-go-runner)[![DepShield Badge](https://depshield.sonatype.org/badges/leaf-ai/studio-go-runner/depshield.svg)](https://depshield.github.io)

Expand Down Expand Up @@ -132,9 +132,9 @@ To install the tools on Ubuntu use the following commands:
mkdir -p $GOPATH/bin
go get github.com/karlmutch/petname
go install github.com/karlmutch/petname/cmd/petname
wget -O $GOPATH/bin/semver https://github.com/karlmutch/duat/releases/download/0.9.3/semver-linux-amd64
wget -O $GOPATH/bin/stencil https://github.com/karlmutch/duat/releases/download/0.9.3/stencil-linux-amd64
wget -O $GOPATH/bin/github-release https://github.com/karlmutch/duat/releases/download/0.9.3/github-release-linux-amd64
wget -O $GOPATH/bin/semver https://github.com/karlmutch/duat/releases/download/0.10.0/semver-linux-amd64
wget -O $GOPATH/bin/stencil https://github.com/karlmutch/duat/releases/download/0.10.0/stencil-linux-amd64
wget -O $GOPATH/bin/github-release https://github.com/karlmutch/duat/releases/download/0.10.0/github-release-linux-amd64
chmod +x $GOPATH/bin/semver
chmod +x $GOPATH/bin/stencil
chmod +x $GOPATH/bin/github-release
Expand Down
2 changes: 1 addition & 1 deletion build.go
Original file line number Diff line number Diff line change
Expand Up @@ -552,7 +552,7 @@ func test(md *duat.MetaData) (outputs []string, errs []errors.Error) {
if !sPod {
opts = append(opts, "-test.short")
} else {
opts = append(opts, "-test.timeout=15m")
opts = append(opts, "-test.timeout=30m")
opts = append(opts, "--use-k8s")
}

Expand Down
30 changes: 2 additions & 28 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ travis_fold start "build.image"
rm -f $working_file
docker tag leafai/studio-go-runner-standalone-build:$GIT_BRANCH leafai/studio-go-runner-standalone-build
docker tag leafai/studio-go-runner-standalone-build:$GIT_BRANCH localhost:32000/leafai/studio-go-runner-standalone-build
docker tag leafai/studio-go-runner-standalone-build:$GIT_BRANCH localhost:32000/leafai/studio-go-runner-standalone-build:$GIT_BRANCH
docker push localhost:32000/leafai/studio-go-runner-standalone-build:$GIT_BRANCH || true
exit_code=$?
if [ $exit_code -ne 0 ]; then
exit $exit_code
Expand Down Expand Up @@ -164,41 +166,13 @@ travis_fold start "image.push"
if type docker 2>/dev/null ; then
docker login docker.io
if [ $? -eq 0 ]; then
docker tag leaf-ai/studio-go-runner/runner:$SEMVER leafai/studio-go-runner:$SEMVER
docker tag leafai/studio-go-runner-dev-base:0.0.0 leafai/studio-go-runner-dev-base:$GIT_BRANCH

docker push leafai/studio-go-runner:$SEMVER
docker push leafai/studio-go-runner-dev-base:0.0.0
docker push leafai/studio-go-runner-dev-base:$GIT_BRANCH
docker push leafai/studio-go-runner-standalone-build:$GIT_BRANCH
fi
fi
if type aws 2>/dev/null ; then
`aws ecr get-login --no-include-email`
if [ $? -eq 0 ]; then
account=`aws sts get-caller-identity --output text --query Account`
if [ $? -eq 0 ]; then
docker tag leafai/studio-go-runner:$SEMVER $account.dkr.ecr.us-west-2.amazonaws.com/leafai/studio-go-runner/runner:$SEMVER
docker push $account.dkr.ecr.us-west-2.amazonaws.com/leafai/studio-go-runner/runner:$SEMVER

docker tag leafai/studio-go-runner-standalone-build:$GIT_BRANCH $account.dkr.ecr.us-west-2.amazonaws.com/leafai/studio-go-runner/standalone-build:$GIT_BRANCH
docker push $account.dkr.ecr.us-west-2.amazonaws.com/leafai/studio-go-runner/standalone-build:$GIT_BRANCH
fi
fi
fi
if [ -z ${azure_registry_name+x} ]; then
:
else
if type az 2>/dev/null; then
if az acr login --name $azure_registry_name; then
docker tag leafai/studio-go-runner-standalone-build:$GIT_BRANCH $azure_registry_name.azurecr.io/leafai/studio-go-runner-standalone-build:$GIT_BRANCH
docker push $azure_registry_name.azurecr.io/leafai/studio-go-runner-standalone-build:$GIT_BRANCH

docker tag leafai/studio-go-runner:$SEMVER $azure_registry_name.azurecr.io/leafai/studio-go-runner:$SEMVER
docker push $azure_registry_name.azurecr.io/leafai/studio-go-runner:$SEMVER
fi
fi
fi
fi
travis_time_finish
travis_fold end "image.push"
Expand Down
36 changes: 28 additions & 8 deletions ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ working_file=$$.studio-go-runner-working
rm -f $working_file
trap Tidyup 1 2 3 15

export GIT_BRANCH=`echo '{{.duat.gitBranch}}' | stencil - | tr '_' '-' | tr '\/' '-'`
export GIT_BRANCH=`echo '{{.duat.gitBranch | replace "/" "-" | replace "_" "-"}}' | stencil`
export RUNNER_BUILD_LOG=build-$GIT_BRANCH.log

exit_code=0
Expand All @@ -97,21 +97,41 @@ export

travis_fold start "build.image"
travis_time_start
set -o pipefail ; (go run build.go -r -dirs=internal && go run build.go -r -dirs=cmd && echo "Success" || echo "Failure") 2>&1 | tee $RUNNER_BUILD_LOG
exit_code=$?
if [ $exit_code -ne 0 ]; then
exit $exit_code
fi
# Run both build stages inside a subshell so that their combined stdout and stderr can be
# copied into the build log. pipefail makes the pipeline report the status of the build
# commands rather than the status of tee.
set -o pipefail ; (go run build.go -r -dirs=internal && go run build.go -r -dirs=cmd) 2>&1 | tee $RUNNER_BUILD_LOG
# Capture the pipeline status in this shell. An assignment made inside the subshell would be
# discarded when the subshell exits, leaving exit_code at its initial value.
exit_code=$?
# Compare the value of the variable numerically, a bare word inside [[ ]] is a literal string
[[ $exit_code -eq 0 ]] && echo "Success" || echo "Failure"
travis_time_finish
travis_fold end "build.image"

rm -rf /build/*

if [ $exit_code -eq 0 ]; then
cd cmd/runner
rsync --recursive --relative . /build/
cd -
fi

ls /build -alcrt
cleanup

echo "Starting the namespace injections etc" $K8S_POD_NAME
kubectl label deployment build keel.sh/policy=force --namespace=$K8S_NAMESPACE
echo "Scale testing dependencies to 0" $K8S_POD_NAME
kubectl scale --namespace $K8S_NAMESPACE --replicas=0 rc/rabbitmq-controller
kubectl scale --namespace $K8S_NAMESPACE --replicas=0 deployment/minio-deployment

if [ $exit_code -eq 0 ]; then
kubectl --namespace $K8S_NAMESPACE delete job/imagebuilder || true
echo "imagebuild-mounted starting" $K8S_POD_NAME
# Run the docker image build using Makisu within the same namespace we are occupying and
# the context for the image build will be the /build mount
stencil -values Namespace=$K8S_NAMESPACE -input ci_containerize.yaml | kubectl --namespace $K8S_NAMESPACE create -f -
until kubectl --namespace $K8S_NAMESPACE get job/imagebuilder -o jsonpath='{.status.conditions[].status}' | grep True ; do sleep 3 ; done
echo "imagebuild-mounted complete" $K8S_POD_NAME
kubectl --namespace $K8S_NAMESPACE logs job/imagebuilder
kubectl --namespace $K8S_NAMESPACE delete job/imagebuilder
fi

echo "Return pod back to the ready state for keel to begin monitoring for new images" $K8S_POD_NAME
kubectl label deployment build keel.sh/policy=force --namespace=$K8S_NAMESPACE

for (( ; ; ))
do
sleep 600
Expand Down
37 changes: 37 additions & 0 deletions ci_containerize.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Kubernetes batch Job named imagebuilder that runs the Makisu image builder inside the
# namespace supplied through the Namespace template value.
apiVersion: batch/v1
kind: Job
metadata:
name: imagebuilder
namespace: {{ .Namespace }}
spec:
template:
spec:
# Containers in the pod are not restarted once they exit
restartPolicy: Never
containers:
- name: makisu
image: gcr.io/makisu-project/makisu:v0.1.9
imagePullPolicy: IfNotPresent
# Build the context found at /makisu-context, tag the result as leafai/studio-go-runner
# with the templated duat version, and push it to index.docker.io using the registry
# configuration file mounted at /registry-config/registry.yaml
args:
- build
- --push=index.docker.io
- --modifyfs=true
- -t=leafai/studio-go-runner:{{.duat.version}}
- --registry-config=/registry-config/registry.yaml
- /makisu-context
# Each mount name must match an entry in the volumes list that follows
volumeMounts:
- name: context
mountPath: /makisu-context
- name: registry-config
mountPath: /registry-config
- name: storage
mountPath: /makisu-storage
volumes:
# The build context is read from the persistent volume claim build-pv-claim
- name: context
persistentVolumeClaim:
# Name of the PVC created earlier
claimName: build-pv-claim
# The registry configuration is supplied by the docker-registry-config secret
- name: registry-config
secret:
secretName: docker-registry-config
# Storage mounted at /makisu-storage is an emptyDir volume
- name: storage
emptyDir: {}
57 changes: 46 additions & 11 deletions ci_keel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,24 @@ roleRef:
apiGroup: rbac.authorization.k8s.io
---
apiVersion: v1
kind: Secret
metadata:
name: docker-registry-config
namespace: {{ .Namespace }}
type: Opaque
data:
registry.yaml: '{{ .Registry | b64enc }}'
---
apiVersion: v1
kind: Secret
metadata:
name: release-github-token
namespace: {{ .Namespace }}
type: Opaque
data:
github_token: '{{ expandenv "$GITHUB_TOKEN" | b64enc }}'
---
apiVersion: v1
kind: ConfigMap
metadata:
name: build-env
Expand Down Expand Up @@ -153,15 +171,6 @@ spec:
mountPath: "/storage"
---
apiVersion: v1
kind: Secret
metadata:
name: release-github-token
namespace: {{ .Namespace }}
type: Opaque
data:
github_token: '{{ expandenv "$GITHUB_TOKEN" | b64enc }}'
---
apiVersion: v1
kind: Service
metadata:
name: minio-service
Expand All @@ -175,6 +184,25 @@ spec:
selector:
app: minio
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
# This name uniquely identifies the PVC. Will be used in deployment below.
name: build-pv-claim
labels:
app: build-storage-claim
namespace: {{ .Namespace }}
spec:
# Read more about access modes here: https://kubernetes.io/docs/user-guide/persistent-volumes/#access-modes
accessModes:
- ReadWriteMany
resources:
# This is the request for storage. Should be available in the cluster.
requests:
storage: 10Gi
# Uncomment and add storageClass specific to your requirements below. Read more https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1
#storageClassName:
---
# Run the integration build as a deployment, the lifecycle will be dealt with by the CMD entry
apiVersion: extensions/v1beta1
kind: Deployment
Expand All @@ -193,6 +221,10 @@ spec:
app: build
spec:
volumes:
- name: build-storage
persistentVolumeClaim:
# Name of the PVC created earlier
claimName: build-pv-claim
- name: podinfo
downwardAPI:
items:
Expand Down Expand Up @@ -224,13 +256,16 @@ spec:
envFrom:
- configMapRef:
name: build-env
image: quay.io/leaf_ai_dockerhub/studio-go-runner-standalone-build:{{ .duat.gitBranch | replace "/" "-" }}
image: {{ $branch := .duat.gitBranch | replace "/" "_" | replace "-" "_"}}{{ .Image | empty | ternary "quay.io/leaf_ai_dockerhub/studio-go-runner-standalone-build:" ""}}{{ .Image | empty | ternary $branch .Image }}
imagePullPolicy: Always
resources:
limits:
memory: "1024Mi"
cpu: 1
cpu: 4
nvidia.com/gpu: {{ expandenv "$NVIDIA_VISIBLE_DEVICES" | empty | ternary "0" "2" }}
volumeMounts:
- name: build-storage # must match the volume name, above
mountPath: "/build"
- name: podinfo
mountPath: /etc/podinfo
readOnly: false
Expand Down
20 changes: 8 additions & 12 deletions cmd/runner/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
FROM ubuntu:16.04
LABEL maintainer "[email protected]"

RUN \
apt-get update && \
RUN apt-get update && \
apt-get install -y locales && \
apt-get install -y language-pack-en && \
update-locale "en_US.UTF-8" && \
Expand Down Expand Up @@ -64,17 +62,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libzmq3-dev \
pkg-config \
software-properties-common \
unzip \
&& \
unzip && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

RUN apt-get update && \
apt-get install -y python python-pip python3 python3-pip python3-dev python-dev git lshw && \
pip install --upgrade pip==9.0.3 setuptools

RUN \
apt-get -y install libssl-dev libcurl4-openssl-dev libsm6 libxrender-dev libxext-dev && \
RUN apt-get -y install libssl-dev libcurl4-openssl-dev libsm6 libxrender-dev libxext-dev && \
pip install tensorflow-gpu==1.4.1 && \
pip install tensorflow-gpu==1.8.0 && \
pip install tensorflow-gpu==1.9.0 && \
Expand All @@ -96,16 +92,16 @@ LABEL vendor="Sentient Technologies INC" \
ai.sentient.module.name=studio-go-runner

# Add support for richer terminals to aid debugging etc
RUN mkdir -p /lib/terminfo/x
RUN mkdir -p /usr/local/share/terminfo/x
RUN mkdir -p /lib/terminfo/x && \
mkdir -p /usr/local/share/terminfo/x
COPY add-ons/termite.terminfo /usr/local/share/terminfo/x/xterm-termite
COPY add-ons/termite.terminfo /lib/terminfo/x/xterm-termite

# Prometheus instrumented port
EXPOSE 9090

COPY run.sh /runner/.
COPY bin/runner-linux-amd64 /runner/.
COPY bin/runner-linux-amd64-cpu /runner/.
COPY run.sh /runner/run.sh
COPY bin/runner-linux-amd64 /runner/runner-linux-amd64
COPY bin/runner-linux-amd64-cpu /runner/runner-linux-amd64-cpu

CMD /bin/bash -C ./run.sh
2 changes: 1 addition & 1 deletion cmd/runner/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ func EntryPoint(quitCtx context.Context, cancel context.CancelFunc, doneC chan s
if runner.HasCUDA() {

msg := fmt.Errorf("no available GPUs could be found using the nvidia management library")
if runner.CudaInitErr == nil {
if runner.CudaInitErr != nil {
msg = *runner.CudaInitErr
}
err := errors.Wrap(msg).With("stack", stack.Trace().TrimRuntime())
Expand Down
4 changes: 4 additions & 0 deletions cmd/runner/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ var (
// and their command line options for each case
func init() {
cleanupDirs = append(cleanupDirs, "/tmp/cache-runner")

// Disable certain checks related to ECC validation for smaller cards that are used during testing
runner.CudaInTest = true

}

func cleanup() {
Expand Down
2 changes: 1 addition & 1 deletion cmd/runner/metadata_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ func waitForMetaDataRun(ctx context.Context, qName string, queueType string, r *
}

// Wait for prometheus to show the task stopped for our specific queue, host, project and experiment ID
if runningCnt == 0 && finishedCnt == 2 {
if runningCnt == 0 && finishedCnt >= 2 {
return nil
}
logger.Info("stats", "runner", runningCnt, "finished", finishedCnt)
Expand Down
Loading

0 comments on commit b309e3e

Please sign in to comment.