Skip to content

Commit

Permalink
feat: Support debugging webhooks locally
Browse files Browse the repository at this point in the history
  • Loading branch information
shalousun committed Aug 31, 2024
1 parent 2044b41 commit f2f942d
Show file tree
Hide file tree
Showing 5 changed files with 122 additions and 117 deletions.
79 changes: 48 additions & 31 deletions cmd/training-operator.v1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ func main() {
var webhookServerPort int
var webhookServiceName string
var webhookSecretName string
var disableWebhook bool

flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
Expand Down Expand Up @@ -110,6 +111,7 @@ func main() {
flag.IntVar(&webhookServerPort, "webhook-server-port", 9443, "Endpoint port for the webhook server.")
flag.StringVar(&webhookServiceName, "webhook-service-name", "training-operator", "Name of the Service used as part of the DNSName")
flag.StringVar(&webhookSecretName, "webhook-secret-name", "training-operator-webhook-cert", "Name of the Secret to store CA and server certs")
flag.BoolVar(&disableWebhook, "disable-webhook", false, "Disable the webhook server for local debugging.")

opts := zap.Options{
Development: true,
Expand All @@ -129,14 +131,19 @@ func main() {
}
}

var webhookServer webhook.Server
if !disableWebhook {
webhookServer = webhook.NewServer(webhook.Options{
Port: webhookServerPort,
})
}

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Scheme: scheme,
Metrics: metricsserver.Options{
BindAddress: metricsAddr,
},
WebhookServer: webhook.NewServer(webhook.Options{
Port: webhookServerPort,
}),
WebhookServer: webhookServer,
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionID,
Expand All @@ -147,20 +154,24 @@ func main() {
os.Exit(1)
}

// Setup webhook server based on the disableWebhook flag

certsReady := make(chan struct{})
defer close(certsReady)
certGenerationConfig := cert.Config{
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
}
if err = cert.ManageCerts(mgr, certGenerationConfig, certsReady); err != nil {
setupLog.Error(err, "Unable to set up cert rotation")
os.Exit(1)
if !disableWebhook {
if err = cert.ManageCerts(mgr, certGenerationConfig, certsReady); err != nil {
setupLog.Error(err, "Unable to set up cert rotation")
os.Exit(1)
}
}

setupProbeEndpoints(mgr, certsReady)
setupProbeEndpoints(mgr, certsReady, disableWebhook)
// Set up controllers using goroutines to start the manager quickly.
go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads, certsReady)
go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads, certsReady, disableWebhook)

//+kubebuilder:scaffold:builder

Expand All @@ -171,10 +182,12 @@ func main() {
}
}

func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int, certsReady <-chan struct{}) {
setupLog.Info("Waiting for certificate generation to complete")
<-certsReady
setupLog.Info("Certs ready")
func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int, certsReady <-chan struct{}, disableWebhook bool) {
if !disableWebhook {
setupLog.Info("Waiting for certificate generation to complete")
<-certsReady
setupLog.Info("Certs ready")
}

setupLog.Info("registering controllers...")
// Prepare GangSchedulingSetupFunc
Expand Down Expand Up @@ -207,19 +220,21 @@ func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchem
setupLog.Error(errors.New(errMsg), "unable to create controller", "scheme", s)
os.Exit(1)
}
setupWebhookFunc, supportedWebhook := webhooks.SupportedSchemeWebhook[s]
if !supportedWebhook {
setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s)
os.Exit(1)
}
if err := setupWebhookFunc(mgr); err != nil {
setupLog.Error(errors.New(errMsg), "unable to start webhook server", "scheme", s)
os.Exit(1)
if !disableWebhook {
setupWebhookFunc, supportedWebhook := webhooks.SupportedSchemeWebhook[s]
if !supportedWebhook {
setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s)
os.Exit(1)
}
if err := setupWebhookFunc(mgr); err != nil {
setupLog.Error(errors.New(errMsg), "unable to start webhook server", "scheme", s)
os.Exit(1)
}
}
}
}

func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}) {
func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}, disableWebhook bool) {
defer setupLog.Info("Probe endpoints are configured on healthz and readyz")

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
Expand All @@ -230,20 +245,22 @@ func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}) {
// Wait for the webhook server to be listening before advertising the
// training-operator replica as ready. This allows users to wait with sending the first
// requests, requiring webhooks, until the training-operator deployment is available, so
// that the early requests are not rejected during the traininig-operator's startup.
// that the early requests are not rejected during the training-operator's startup.
// We wrap the call to GetWebhookServer in a closure to delay calling
// the function, otherwise a not fully-initialized webhook server (without
// ready certs) fails the start of the manager.
if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error {
select {
case <-certsReady:
return mgr.GetWebhookServer().StartedChecker()(req)
default:
return errors.New("certificates are not ready")
if !disableWebhook {
if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error {
select {
case <-certsReady:
return mgr.GetWebhookServer().StartedChecker()(req)
default:
return errors.New("certificates are not ready")
}
}); err != nil {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
}
}); err != nil {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
}
}

Expand Down
123 changes: 37 additions & 86 deletions docs/development/developer_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@ Kubeflow Training Operator is currently at v1.
- [Python](https://www.python.org/) (3.11 or later)
- [kustomize](https://kustomize.io/) (4.0.5 or later)
- [Kind](https://kind.sigs.k8s.io/) (0.22.0 or later)
- [Lima](https://github.com/lima-vm/lima?tab=readme-ov-file#adopters) (an alternative to DockerDesktop) (0.21.0 or
later)
- [Colima](https://github.com/abiosoft/colima) (Lima specifically for MacOS) (0.6.8 or later)
- [Lima](https://github.com/lima-vm/lima?tab=readme-ov-file#adopters) (an alternative to DockerDesktop) (0.21.0 or later)
- [Colima](https://github.com/abiosoft/colima) (Lima specifically for MacOS) (0.6.8 or later)
- [pre-commit](https://pre-commit.com/)

Note for Lima the link is to the Adopters, which supports several different container environments.
Expand Down Expand Up @@ -50,50 +49,38 @@ Running the operator locally (as opposed to deploying it on a K8s cluster) is co
First, you need to run a Kubernetes cluster locally. We recommend [Kind](https://kind.sigs.k8s.io).

You can create a `kind` cluster by running

```sh
kind create cluster
```

This will load your kubernetes config file with the new cluster.

After creating the cluster, you can check the nodes with the code below which should show you the kind-control-plane.

```sh
kubectl get nodes
```

The output should look something like below:

```
$ kubectl get nodes
NAME STATUS ROLES AGE VERSION
kind-control-plane Ready control-plane 32s v1.27.3
```

Note that the example job below (a PyTorchJob) uses the `kubeflow` namespace.

From here we can apply the manifests to the cluster.

```sh
kubectl apply -k "github.com/kubeflow/training-operator/manifests/overlays/standalone"
```

Then we can patch it with the latest operator image.

```sh
kubectl patch -n kubeflow deployments training-operator --type json -p '[{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value": "kubeflow/training-operator:latest"}]'
```

Then we can run the job with the following command.

```sh
kubectl apply -f https://raw.githubusercontent.com/kubeflow/training-operator/master/examples/pytorch/simple.yaml
```

And we can see the output of the job from the logs, which may take some time to produce but should look something like
below.

And we can see the output of the job from the logs, which may take some time to produce but should look something like below.
```
$ kubectl logs -n kubeflow -l training.kubeflow.org/job-name=pytorch-simple --follow
Defaulted container "pytorch" out of: pytorch, init-pytorch (init)
Expand All @@ -119,21 +106,51 @@ Defaulted container "pytorch" out of: pytorch, init-pytorch (init)
2024-04-19T19:00:57Z INFO Train Epoch: 1 [10240/60000 (17%)] loss=1.1650
```

## Running the Operator and Debugging Locally

If you need to develop, test, and debug the Operator on your local machine (such as a Mac) and want to disable webhook validation, you can follow the steps below.

### Configure KUBECONFIG and KUBEFLOW_NAMESPACE

We can configure the Operator to use the configuration available in your kubeconfig to communicate with a K8s cluster. Set your environment:

```sh
export KUBECONFIG=$(echo ~/.kube/config)
```

### Create the CRDs
After the cluster is up, create the CRDs on the cluster.
CRDs created using `./manifests/overlays/local` will skip webhook validation.

```bash
kubectl apply -k ./manifests/overlays/local
```

### Run Operator

Now we are ready to run the Operator locally with webhook validation disabled:

```sh
go run ./cmd/training-operator.v1/main.go -disable-webhook=true
```
- `-disable-webhook=true`: This flag disables webhook validation.


This way, the Operator will run locally and will not perform any webhook-related operations, making it easier for you to debug.


## Testing changes locally

Now that you confirmed you can spin up an operator locally, you can try to test your local changes to the operator.
You do this by building a new operator image and loading it into your kind cluster.

### Build Operator Image

```sh
make docker-build IMG=my-username/training-operator:my-pr-01
```

You can swap `my-username/training-operator:my-pr-01` with whatever you would like.

## Load docker image

```sh
kind load docker-image my-username/training-operator:my-pr-01
```
Expand All @@ -144,93 +161,27 @@ kind load docker-image my-username/training-operator:my-pr-01
cd ./manifests/overlays/standalone
kustomize edit set image my-username/training-operator=my-username/training-operator:my-pr-01
```

Update the `newTag` key in `./manifests/overlays/standalone/kustomization.yaml` with the new image.

Deploy the operator with:

```sh
kubectl apply -k ./manifests/overlays/standalone
```

And now we can submit jobs to the operator.

```sh
kubectl patch -n kubeflow deployments training-operator --type json -p '[{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value": "my-username/training-operator:my-pr-01"}]'
kubectl apply -f https://raw.githubusercontent.com/kubeflow/training-operator/master/examples/pytorch/simple.yaml
```

You should be able to see a pod for your training operator running in your namespace using

```
kubectl logs -n kubeflow -l training.kubeflow.org/job-name=pytorch-simple
```

## Testing changes locally without build image

Building and testing changes through container images can be time-consuming, so here is a simpler method that allows you
to start and test directly through the command line or your development tools. Note that this approach is effective only
for clusters created with Kind on your local machine (e.g., on a Mac).

### Install cert-manager and generate a certificate

Deploy cert-manager to manage the webhook's certificate:

```sh
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.15.3/cert-manager.yaml
```

To generate a certificate for local debugging of webhooks using cert-manager, create a certificate.yaml file with the
following content:

```yaml
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
name: selfsigned-issuer
namespace: kubeflow
spec:
selfSigned: { }

---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
name: serving-cert # This name should match the one appearing in kustomizeconfig.yaml
namespace: kubeflow
spec:
# $(SERVICE_NAME) and $(SERVICE_NAMESPACE) will be substituted by kustomize
dnsNames:
- $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc
- $(SERVICE_NAME).$(SERVICE_NAMESPACE).svc.cluster.local
- host.docker.internal # host.docker.internal is the hostname for Docker Desktop on macOS
ipAddresses: # New configuration about node IP addresses
- "172.17.0.1" # IP address for Docker on Linux
issuerRef:
kind: Issuer
name: selfsigned-issuer
secretName: webhook-server-cert # This secret will not be prefixed, since it's not managed by kustomize
```
Create the certificate:
```
kubectl apply -f certificate.yaml
```

The generated `tls.*` files need to be stored in the `/tmp/k8s-webhook-server/serving-certs` directory.

```sh
kubectl get secret -n kubeflow webhook-server-cert -o=jsonpath='{.data.tls\.key}' | base64 -d >${TMPDIR}/k8s-webhook-server/serving-certs/tls.key
kubectl get secret -n kubeflow webhook-server-cert -o=jsonpath='{.data.tls\.crt}' | base64 -d >${TMPDIR}/k8s-webhook-server/serving-certs/tls.crt

```

## Go version

On ubuntu the default go package appears to be gccgo-go which has problems
see [issue](https://github.com/golang/go/issues/15429) golang-go package is also really old so install from golang
tarballs instead.
On Ubuntu, the default Go package appears to be gccgo-go, which has problems (see this [issue](https://github.com/golang/go/issues/15429)). The golang-go package is also really old, so install Go from the golang tarballs instead.

## Generate Python SDK

Expand Down
20 changes: 20 additions & 0 deletions manifests/overlays/local/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kubeflow
resources:
- ../../base/
- namespace.yaml
images:
- name: kubeflow/training-operator
newName: kubeflow/training-operator
newTag: latest
secretGenerator:
- name: training-operator-webhook-cert
options:
disableNameSuffixHash: true
patches:
- path: validating_webhook_patch.yaml
target:
group: admissionregistration.k8s.io
kind: ValidatingWebhookConfiguration
version: v1
4 changes: 4 additions & 0 deletions manifests/overlays/local/namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: kubeflow
Loading

0 comments on commit f2f942d

Please sign in to comment.