Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support debugging webhooks locally #2227

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 48 additions & 31 deletions cmd/training-operator.v1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ func main() {
var webhookServerPort int
var webhookServiceName string
var webhookSecretName string
var disableWebhook bool

flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
Expand Down Expand Up @@ -110,6 +111,7 @@ func main() {
flag.IntVar(&webhookServerPort, "webhook-server-port", 9443, "Endpoint port for the webhook server.")
flag.StringVar(&webhookServiceName, "webhook-service-name", "training-operator", "Name of the Service used as part of the DNSName")
flag.StringVar(&webhookSecretName, "webhook-secret-name", "training-operator-webhook-cert", "Name of the Secret to store CA and server certs")
flag.BoolVar(&disableWebhook, "disable-webhook", false, "Disable the webhook server for local debugging.")

opts := zap.Options{
Development: true,
Expand All @@ -129,14 +131,19 @@ func main() {
}
}

var webhookServer webhook.Server
if !disableWebhook {
webhookServer = webhook.NewServer(webhook.Options{
Port: webhookServerPort,
})
}

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Scheme: scheme,
Metrics: metricsserver.Options{
BindAddress: metricsAddr,
},
WebhookServer: webhook.NewServer(webhook.Options{
Port: webhookServerPort,
}),
WebhookServer: webhookServer,
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionID,
Expand All @@ -147,20 +154,24 @@ func main() {
os.Exit(1)
}

// Setup webhook server based on the disableWebhook flag

certsReady := make(chan struct{})
defer close(certsReady)
certGenerationConfig := cert.Config{
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
}
if err = cert.ManageCerts(mgr, certGenerationConfig, certsReady); err != nil {
setupLog.Error(err, "Unable to set up cert rotation")
os.Exit(1)
if !disableWebhook {
if err = cert.ManageCerts(mgr, certGenerationConfig, certsReady); err != nil {
setupLog.Error(err, "Unable to set up cert rotation")
os.Exit(1)
}
}

setupProbeEndpoints(mgr, certsReady)
setupProbeEndpoints(mgr, certsReady, disableWebhook)
// Set up controllers using goroutines to start the manager quickly.
go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads, certsReady)
go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads, certsReady, disableWebhook)

//+kubebuilder:scaffold:builder

Expand All @@ -171,10 +182,12 @@ func main() {
}
}

func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int, certsReady <-chan struct{}) {
setupLog.Info("Waiting for certificate generation to complete")
<-certsReady
setupLog.Info("Certs ready")
func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int, certsReady <-chan struct{}, disableWebhook bool) {
if !disableWebhook {
setupLog.Info("Waiting for certificate generation to complete")
<-certsReady
setupLog.Info("Certs ready")
}

setupLog.Info("registering controllers...")
// Prepare GangSchedulingSetupFunc
Expand Down Expand Up @@ -207,19 +220,21 @@ func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchem
setupLog.Error(errors.New(errMsg), "unable to create controller", "scheme", s)
os.Exit(1)
}
setupWebhookFunc, supportedWebhook := webhooks.SupportedSchemeWebhook[s]
if !supportedWebhook {
setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s)
os.Exit(1)
}
if err := setupWebhookFunc(mgr); err != nil {
setupLog.Error(errors.New(errMsg), "unable to start webhook server", "scheme", s)
os.Exit(1)
if !disableWebhook {
setupWebhookFunc, supportedWebhook := webhooks.SupportedSchemeWebhook[s]
if !supportedWebhook {
setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s)
os.Exit(1)
}
if err := setupWebhookFunc(mgr); err != nil {
setupLog.Error(errors.New(errMsg), "unable to start webhook server", "scheme", s)
os.Exit(1)
}
}
}
}

func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}) {
func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}, disableWebhook bool) {
defer setupLog.Info("Probe endpoints are configured on healthz and readyz")

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
Expand All @@ -230,20 +245,22 @@ func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}) {
// Wait for the webhook server to be listening before advertising the
// training-operator replica as ready. This allows users to wait with sending the first
// requests, requiring webhooks, until the training-operator deployment is available, so
// that the early requests are not rejected during the traininig-operator's startup.
// that the early requests are not rejected during the training-operator's startup.
// We wrap the call to GetWebhookServer in a closure to delay calling
// the function, otherwise a not fully-initialized webhook server (without
// ready certs) fails the start of the manager.
if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error {
select {
case <-certsReady:
return mgr.GetWebhookServer().StartedChecker()(req)
default:
return errors.New("certificates are not ready")
if !disableWebhook {
if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error {
select {
case <-certsReady:
return mgr.GetWebhookServer().StartedChecker()(req)
default:
return errors.New("certificates are not ready")
}
}); err != nil {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
}
}); err != nil {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
}
}

Expand Down
36 changes: 36 additions & 0 deletions docs/development/developer_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,39 @@ Defaulted container "pytorch" out of: pytorch, init-pytorch (init)
2024-04-19T19:00:57Z INFO Train Epoch: 1 [10240/60000 (17%)] loss=1.1650
```

## Running the Operator and Debugging Locally

If you need to develop, test, and debug the Operator on your local machine (such as a Mac) and want to disable webhook validation, you can follow the steps below.

### Configure KUBECONFIG and KUBEFLOW_NAMESPACE

We can configure the Operator to use the configuration available in your kubeconfig to communicate with a K8s cluster. Set your environment:

```sh
export KUBECONFIG=$(echo ~/.kube/config)
```

### Create the CRDS
After the cluster is up, the CRDS should be created on the cluster.
The CRDS created using `/manifests/overlays/local` will ignore the webhook validation.

```bash
kubectl apply -k ./manifests/overlays/local
```

### Run Operator

Now we are ready to run the Operator locally and disable webhook validation,

```sh
go run ./cmd/training-operator.v1/main.go -disable-webhook=true
```
- `-disable-webhook=true`: This flag disables webhook validation.


This way, the Operator will run locally and will not perform any webhook-related operations, making it easier for you to debug.


## Testing changes locally

Now that you confirmed you can spin up an operator locally, you can try to test your local changes to the operator.
Expand Down Expand Up @@ -143,6 +176,9 @@ You should be able to see a pod for your training operator running in your names
```
kubectl logs -n kubeflow -l training.kubeflow.org/job-name=pytorch-simple
```



## Go version

On ubuntu the default go package appears to be gccgo-go which has problems see [issue](https://github.com/golang/go/issues/15429) golang-go package is also really old so install from golang tarballs instead.
Expand Down
20 changes: 20 additions & 0 deletions manifests/overlays/local/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kubeflow
resources:
- ../../base/
- namespace.yaml
images:
- name: kubeflow/training-operator
newName: kubeflow/training-operator
newTag: latest
secretGenerator:
- name: training-operator-webhook-cert
options:
disableNameSuffixHash: true
patches:
- path: validating_webhook_patch.yaml
target:
group: admissionregistration.k8s.io
kind: ValidatingWebhookConfiguration
version: v1
4 changes: 4 additions & 0 deletions manifests/overlays/local/namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: kubeflow
13 changes: 13 additions & 0 deletions manifests/overlays/local/validating_webhook_patch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
- op: replace
path: /webhooks/0/failurePolicy
value: Ignore
- op: replace
path: /webhooks/1/failurePolicy
value: Ignore
- op: replace
path: /webhooks/2/failurePolicy
value: Ignore
- op: replace
path: /webhooks/3/failurePolicy
value: Ignore