Skip to content

Commit

Permalink
Merge branch 'master' into fix-notebook-controller
Browse files Browse the repository at this point in the history
  • Loading branch information
gabrielwen committed Jun 3, 2019
2 parents 5de0bf1 + 1d5dff8 commit eeca453
Show file tree
Hide file tree
Showing 7 changed files with 82 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,19 @@ type NotebookStatus struct {
}

type NotebookCondition struct {
// Type of the confition/
Type NotebookConditionType `json:"type"`
// Type is the type of the condition. Possible values are Running|Waiting|Terminated
Type string `json:"type"`
// Last time we probed the condition.
// +optional
LastProbeTime metav1.Time `json:"lastProbeTime,omitempty"`
// (brief) reason the container is in the current state
// +optional
Reason string `json:"reason,omitempty"`
// Message regarding why the container is in the current state.
// +optional
Message string `json:"message,omitempty"`
}

type NotebookConditionType string

// +genclient
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -241,19 +241,6 @@ func (r *ReconcileNotebook) Reconcile(request reconcile.Request) (reconcile.Resu
}
}

// Update the status if previous condition is not "Ready"
oldConditions := instance.Status.Conditions
if len(oldConditions) == 0 || oldConditions[0].Type != "Ready" {
newCondition := v1alpha1.NotebookCondition{
Type: "Ready",
}
instance.Status.Conditions = append([]v1alpha1.NotebookCondition{newCondition}, oldConditions...)
// Using context.Background as: https://book.kubebuilder.io/basics/status_subresource.html
err = r.Status().Update(context.Background(), instance)
if err != nil {
return reconcile.Result{}, err
}
}
// Update the readyReplicas if the status is changed
if foundStateful.Status.ReadyReplicas != instance.Status.ReadyReplicas {
log.Info("Updating Status", "namespace", instance.Namespace, "name", instance.Name)
Expand All @@ -268,7 +255,7 @@ func (r *ReconcileNotebook) Reconcile(request reconcile.Request) (reconcile.Resu
pod := &corev1.Pod{}
err = r.Get(context.TODO(), types.NamespacedName{Name: ss.Name + "-0", Namespace: ss.Namespace}, pod)
if err != nil && errors.IsNotFound(err) {
// This should be reconcile by the StatefulSet
// This should be reconciled by the StatefulSet
log.Info("Pod not found...")
} else if err != nil {
return reconcile.Result{}, err
Expand All @@ -277,7 +264,17 @@ func (r *ReconcileNotebook) Reconcile(request reconcile.Request) (reconcile.Resu
if len(pod.Status.ContainerStatuses) > 0 &&
pod.Status.ContainerStatuses[0].State != instance.Status.ContainerState {
log.Info("Updating container state: ", "namespace", instance.Namespace, "name", instance.Name)
instance.Status.ContainerState = pod.Status.ContainerStatuses[0].State
cs := pod.Status.ContainerStatuses[0].State
instance.Status.ContainerState = cs
oldConditions := instance.Status.Conditions
newCondition := getNextCondition(cs)
// Append new condition
if len(oldConditions) == 0 || oldConditions[0].Type != newCondition.Type ||
oldConditions[0].Reason != newCondition.Reason ||
oldConditions[0].Message != newCondition.Message {
log.Info("Appending to conditions: ", "namespace", instance.Namespace, "name", instance.Name, "type", newCondition.Type, "reason", newCondition.Reason, "message", newCondition.Message)
instance.Status.Conditions = append([]v1alpha1.NotebookCondition{newCondition}, oldConditions...)
}
err = r.Status().Update(context.Background(), instance)
if err != nil {
return reconcile.Result{}, err
Expand All @@ -288,6 +285,32 @@ func (r *ReconcileNotebook) Reconcile(request reconcile.Request) (reconcile.Resu
return reconcile.Result{}, nil
}

func getNextCondition(cs corev1.ContainerState) v1alpha1.NotebookCondition {
var nbtype = ""
var nbreason = ""
var nbmsg = ""

if cs.Running != nil {
nbtype = "Running"
} else if cs.Waiting != nil {
nbtype = "Waiting"
nbreason = cs.Waiting.Reason
nbmsg = cs.Waiting.Message
} else {
nbtype = "Terminated"
nbreason = cs.Terminated.Reason
nbmsg = cs.Terminated.Reason
}

newCondition := v1alpha1.NotebookCondition{
Type: nbtype,
LastProbeTime: metav1.Now(),
Reason: nbreason,
Message: nbmsg,
}
return newCondition

}
func generateStatefulSet(instance *v1alpha1.Notebook) *appsv1.StatefulSet {
ss := &appsv1.StatefulSet{
ObjectMeta: metav1.ObjectMeta{
Expand Down
2 changes: 1 addition & 1 deletion kubeflow/jupyter/tests/jupyter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def test_jupyter(env, namespace):
ks_util.setup_ks_app(app_dir, env, namespace, component, params)

util.run([ks_cmd, "apply", env, "-c", component], cwd=app_dir)
conditions = ["Ready"]
conditions = ["Running"]
results = util.wait_for_cr_condition(api_client, GROUP, PLURAL, VERSION,
namespace, name, conditions)

Expand Down
21 changes: 1 addition & 20 deletions kubeflow/jupyter/tests/test_app/components/jupyter.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -17,33 +17,14 @@ local jupyter = {
"spec": {
"containers": [
{
"image": "gcr.io/kubeflow-images-public/tensorflow-1.10.1-notebook-cpu:v0.3.0",
"image": "gcr.io/kubeflow-images-public/tensorflow-1.13.1-notebook-cpu:v0.5.0",
"name": "notebook",
args: [
"start.sh",
"jupyter",
"lab",
"--LabApp.token=''",
"--LabApp.allow_remote_access='True'",
"--LabApp.allow_root='True'",
"--LabApp.ip='*'",
"--LabApp.base_url=/" + env.namespace + "/" + params.name + "/",
"--port=8888",
"--no-browser",
],
env: [
{
name: "JUPYTER_ENABLE_LAB",
value: "true",
},
],
"resources": {
"requests": {
"cpu": "500m",
"memory": "1Gi"
}
},
"workingDir": "/home/jovyan"
}
],
}
Expand Down
3 changes: 2 additions & 1 deletion kubeflow/tf-training/prototypes/tf-job-operator.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
// @shortDescription A TensorFlow job operator.
// @param name string Name to give to each of the components
// @optionalParam cloud string null String identifying the cloud to customize the deployment for.
// @optionalParam tfJobImage string gcr.io/kubeflow-images-public/tf_operator:v0.5.1 The image for the TfJob controller.
// @optionalParam tfJobImage string gcr.io/kubeflow-images-public/tf_operator@sha256:03a591257d0762b83d010c8f29b8160afce4af05cb537449ececb177df049552 The image for the TfJob controller.
// @optionalParam tfDefaultImage string null The default image to use for TensorFlow.
// @optionalParam tfJobUiServiceType string ClusterIP The service type for the UI.
// @optionalParam deploymentScope string cluster The scope at which tf-job-operator should be deployed - valid values are cluster, namespace.
// @optionalParam deploymentNamespace string null The namespace to which tf-job-operator should be scoped. If deploymentScope is set to cluster, this is ignored.
// @optionalParam enableGangScheduling string false If set true, enable gang scheduling by kube-batch.
// @optionalParam injectIstio string false Whether to inject istio sidecar; should be true or false.
// @optionalParam clusterDomain string cluster.local DNS config to cluster domain.
// @optionalParam monitoringPort string 8443 Port for monitoring agent to scrape metrics from.

local tfJobOperator = import "kubeflow/tf-training/tf-job-operator.libsonnet";
local instance = tfJobOperator.new(env, params);
Expand Down
14 changes: 14 additions & 0 deletions kubeflow/tf-training/tests/tf-job_test.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ local paramsv1 = {
deploymentScope:: "cluster",
deploymentNamespace:: "null",
enableGangScheduling: "false",
monitoringPort: "8443",
};
local env = {
namespace: "test-kf-001",
Expand Down Expand Up @@ -109,9 +110,21 @@ std.assertEqual(
namespace: "test-kf-001",
},
spec: {
ports: [
{
name: "monitoring-port",
port: "8443",
targetPort: "8443"
}
],
replicas: 1,
template: {
metadata: {
annotations: {
"prometheus.io/path": "/metrics",
"prometheus.io/port": "8443",
"prometheus.io/scrape": "true"
},
labels: {
name: "tf-job-operator",
},
Expand All @@ -123,6 +136,7 @@ std.assertEqual(
"/opt/kubeflow/tf-operator.v1",
"--alsologtostderr",
"-v=1",
"--monitoring-port=8443"
],
env: [
{
Expand Down
15 changes: 15 additions & 0 deletions kubeflow/tf-training/tf-job-operator.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@
] else []
+ if util.toBool(params.enableGangScheduling) then [
"--enable-gang-scheduling",
] else []
+ if params.monitoringPort != null then [
"--monitoring-port=" + params.monitoringPort,
] else [],
env:
if params.deploymentScope == "namespace" && params.deploymentNamespace != null then [{
Expand Down Expand Up @@ -154,11 +157,23 @@
},
spec: {
replicas: 1,
ports: [
{
name: "monitoring-port",
port: params.monitoringPort,
targetPort: params.monitoringPort,
},
],
template: {
metadata: {
labels: {
name: "tf-job-operator",
},
annotations: {
"prometheus.io/scrape": 'true',
"prometheus.io/path": "/metrics",
"prometheus.io/port": params.monitoringPort,
},
},
spec: {
containers: [
Expand Down

0 comments on commit eeca453

Please sign in to comment.