Skip to content

Commit

Permalink
Implement webhook validations for the PyTorchJob
Browse files Browse the repository at this point in the history
Signed-off-by: Yuki Iwai <[email protected]>
  • Loading branch information
tenzen-y committed Apr 5, 2024
1 parent a08246d commit 611c496
Show file tree
Hide file tree
Showing 22 changed files with 749 additions and 351 deletions.
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ help: ## Display this help.
##@ Development

manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
$(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=training-operator webhook paths="./pkg/..." output:crd:artifacts:config=manifests/base/crds output:rbac:artifacts:config=manifests/base/rbac
$(CONTROLLER_GEN) $(CRD_OPTIONS) rbac:roleName=training-operator webhook paths="./pkg/..." \
output:crd:artifacts:config=manifests/base/crds \
output:rbac:artifacts:config=manifests/base/rbac \
output:webhook:artifacts:config=manifests/base/webhook

generate: controller-gen ## Generate apidoc, sdk and code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
$(CONTROLLER_GEN) object:headerFile="hack/boilerplate/boilerplate.go.txt" paths="./pkg/apis/..."
Expand Down
91 changes: 72 additions & 19 deletions cmd/training-operator.v1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package main
import (
"errors"
"flag"
"net/http"
"os"
"strings"

Expand All @@ -40,9 +41,11 @@ import (
volcanoclient "volcano.sh/apis/pkg/client/clientset/versioned"

kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
"github.com/kubeflow/training-operator/pkg/cert"
"github.com/kubeflow/training-operator/pkg/config"
controllerv1 "github.com/kubeflow/training-operator/pkg/controller.v1"
"github.com/kubeflow/training-operator/pkg/controller.v1/common"
"github.com/kubeflow/training-operator/pkg/webhooks"
//+kubebuilder:scaffold:imports
)

Expand Down Expand Up @@ -72,8 +75,11 @@ func main() {
var enabledSchemes controllerv1.EnabledSchemes
var gangSchedulerName string
var namespace string
var webhookServerPort int
var controllerThreads int
var webhookServerPort int
var webhookServiceName string
var webhookSecretName string

flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
Expand All @@ -86,7 +92,6 @@ func main() {
" Note: If you set another scheduler name, the training-operator assumes it's the scheduler-plugins.")
flag.StringVar(&namespace, "namespace", os.Getenv(EnvKubeflowNamespace), "The namespace to monitor kubeflow jobs. If unset, it monitors all namespaces cluster-wide."+
"If set, it only monitors kubeflow jobs in the given namespace.")
flag.IntVar(&webhookServerPort, "webhook-server-port", 9443, "Endpoint port for the webhook server.")
flag.IntVar(&controllerThreads, "controller-threads", 1, "Number of worker threads used by the controller.")

// PyTorch related flags
Expand All @@ -101,6 +106,11 @@ func main() {
flag.StringVar(&config.Config.MPIKubectlDeliveryImage, "mpi-kubectl-delivery-image",
config.MPIKubectlDeliveryImageDefault, "The image for mpi launcher init container")

// Cert generation flags
flag.IntVar(&webhookServerPort, "webhook-server-port", 9443, "Endpoint port for the webhook server.")
flag.StringVar(&webhookServiceName, "webhook-service-name", "training-operator-service", "Name of the Service used as part of the DNSName")
flag.StringVar(&webhookSecretName, "webhook-secret-name", "training-operator-webhook-secret", "Name of the Secret to store CA and server certs")

opts := zap.Options{
Development: true,
StacktraceLevel: zapcore.DPanicLevel,
Expand All @@ -124,9 +134,9 @@ func main() {
Metrics: metricsserver.Options{
BindAddress: metricsAddr,
},
WebhookServer: &webhook.DefaultServer{Options: webhook.Options{
WebhookServer: webhook.NewServer(webhook.Options{
Port: webhookServerPort,
}},
}),
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionID,
Expand All @@ -137,30 +147,36 @@ func main() {
os.Exit(1)
}

// Set up controllers using goroutines to start the manager quickly.
go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads)

//+kubebuilder:scaffold:builder

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
os.Exit(1)
certsReady := make(chan struct{})
defer close(certsReady)
certGenerationConfig := cert.Config{
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
}
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up ready check")
if err = cert.ManageCerts(mgr, certGenerationConfig, certsReady); err != nil {
setupLog.Error(err, "Unable to set up cert rotation")
os.Exit(1)
}

setupProbeEndpoints(mgr, certsReady)
// Set up controllers using goroutines to start the manager quickly.
go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads, certsReady)

//+kubebuilder:scaffold:builder

setupLog.Info("starting manager")
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
setupLog.Error(err, "problem running manager")
os.Exit(1)
}
}

func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int) {
setupLog.Info("registering controllers...")
func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int, certsReady <-chan struct{}) {
setupLog.Info("Waiting for certificate generation to complete")
<-certsReady
setupLog.Info("Certs ready")

setupLog.Info("registering controllers...")
// Prepare GangSchedulingSetupFunc
gangSchedulingSetupFunc := common.GenNonGangSchedulerSetupFunc()
if strings.EqualFold(gangSchedulerName, string(common.GangSchedulerVolcano)) {
Expand All @@ -182,15 +198,52 @@ func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchem
}
errMsg := "failed to set up controllers"
for _, s := range enabledSchemes {
setupFunc, supported := controllerv1.SupportedSchemeReconciler[s]
if !supported {
setupReconcilerFunc, supportedReconciler := controllerv1.SupportedSchemeReconciler[s]
if !supportedReconciler {
setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s)
os.Exit(1)
}
if err := setupFunc(mgr, gangSchedulingSetupFunc, controllerThreads); err != nil {
if err := setupReconcilerFunc(mgr, gangSchedulingSetupFunc, controllerThreads); err != nil {
setupLog.Error(errors.New(errMsg), "unable to create controller", "scheme", s)
os.Exit(1)
}
setupWebhookFunc, supportedWebhook := webhooks.SupportedSchemeWebhook[s]
if !supportedWebhook {
setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s)
os.Exit(1)
}
if err := setupWebhookFunc(mgr); err != nil {
setupLog.Error(errors.New(errMsg), "unable to start webhook server", "scheme", s)
os.Exit(1)
}
}
}

func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}) {
defer setupLog.Info("Probe endpoints are configured on healthz and readyz")

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
os.Exit(1)
}

// Wait for the webhook server to be listening before advertising the
// training-operator replica as ready. This allows users to wait with sending the first
// requests, requiring webhooks, until the training-operator deployment is available, so
// that the early requests are not rejected during the traininig-operator's startup.
// We wrap the call to GetWebhookServer in a closure to delay calling
// the function, otherwise a not fully-initialized webhook server (without
// ready certs) fails the start of the manager.
if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error {
select {
case <-certsReady:
return mgr.GetWebhookServer().StartedChecker()(req)
default:
return errors.New("certificates are not ready")
}
}); err != nil {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
}
}

Expand Down
4 changes: 3 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@ require (
github.com/google/go-cmp v0.6.0
github.com/onsi/ginkgo/v2 v2.14.0
github.com/onsi/gomega v1.30.0
github.com/open-policy-agent/cert-controller v0.10.1
github.com/prometheus/client_golang v1.18.0
github.com/sirupsen/logrus v1.9.0
github.com/stretchr/testify v1.8.4
go.uber.org/zap v1.26.0
golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e
k8s.io/api v0.29.3
k8s.io/apimachinery v0.29.3
k8s.io/client-go v0.29.3
Expand Down Expand Up @@ -58,8 +60,8 @@ require (
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
go.uber.org/atomic v1.11.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e // indirect
golang.org/x/mod v0.14.0 // indirect
golang.org/x/net v0.19.0 // indirect
golang.org/x/oauth2 v0.12.0 // indirect
Expand Down
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ github.com/onsi/ginkgo/v2 v2.14.0 h1:vSmGj2Z5YPb9JwCWT6z6ihcUvDhuXLc3sJiqd3jMKAY
github.com/onsi/ginkgo/v2 v2.14.0/go.mod h1:JkUdW7JkN0V6rFvsHcJ478egV3XH9NxpD27Hal/PhZw=
github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8=
github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ=
github.com/open-policy-agent/cert-controller v0.10.1 h1:RXSYoyn8FdCenWecRP//UV5nbVfmstNpj4kHQFkvPK4=
github.com/open-policy-agent/cert-controller v0.10.1/go.mod h1:4uRbBLY5DsPOog+a9pqk3JLxuuhrWsbUedQW65HcLTI=
github.com/open-policy-agent/frameworks/constraint v0.0.0-20230822235116-f0b62fe1e4c4 h1:5dum5SLEz+95JDLkMls7Z7IDPjvSq3UhJSFe4f5einQ=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
Expand Down Expand Up @@ -113,6 +116,8 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
Expand Down Expand Up @@ -202,6 +207,7 @@ k8s.io/gengo v0.0.0-20230829151522-9cce18d56c01/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAE
k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y=
k8s.io/klog/v2 v2.110.1 h1:U/Af64HJf7FcwMcXyKm2RPM22WZzyR7OSpYj5tg3cL0=
k8s.io/klog/v2 v2.110.1/go.mod h1:YGtd1984u+GgbuZ7e08/yBuAfKLSO0+uR1Fhi6ExXjo=
k8s.io/kube-aggregator v0.28.1 h1:rvG4llYnQKHjj6YjjoBPEJxfD1uH0DJwkrJTNKGAaCs=
k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/AuzbMm96cd3YHRTU83I780=
k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00/go.mod h1:AsvuZPBlUDVuCdzJ87iajxtXuR9oktsTctW/R9wwouA=
k8s.io/utils v0.0.0-20230726121419-3b25d923346b h1:sgn3ZU783SCgtaSJjpcVVlRqd6GSnlTLKgpAAttJvpI=
Expand Down
12 changes: 12 additions & 0 deletions manifests/base/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ spec:
name: training-operator
ports:
- containerPort: 8080
- containerPort: 9443
name: webhook-server
protocol: TCP
env:
- name: MY_POD_NAMESPACE
valueFrom:
Expand All @@ -34,6 +37,10 @@ spec:
fieldPath: metadata.name
securityContext:
allowPrivilegeEscalation: false
volumeMounts:
- mountPath: /tmp/k8s-webhook-server/serving-certs
name: cert
readOnly: true
livenessProbe:
httpGet:
path: /healthz
Expand All @@ -50,3 +57,8 @@ spec:
timeoutSeconds: 3
serviceAccountName: training-operator
terminationGracePeriodSeconds: 10
volumes:
- name: cert
secret:
defaultMode: 420
secretName: training-operator-webhook-secret
2 changes: 2 additions & 0 deletions manifests/base/internalcert/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
resources:
- secret.yaml
4 changes: 4 additions & 0 deletions manifests/base/internalcert/secret.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Secret
metadata:
name: training-operator-webhook-secret
3 changes: 2 additions & 1 deletion manifests/base/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@ resources:
- ./rbac/cluster-role-binding.yaml
- ./rbac/role.yaml
- ./rbac/service-account.yaml
- service.yaml
- ./internalcert
- ./webhook
- deployment.yaml
18 changes: 18 additions & 0 deletions manifests/base/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@ rules:
- pods/exec
verbs:
- create
- apiGroups:
- ""
resources:
- secrets
verbs:
- get
- list
- update
- watch
- apiGroups:
- ""
resources:
Expand All @@ -62,6 +71,15 @@ rules:
- get
- list
- watch
- apiGroups:
- admissionregistration.k8s.io
resources:
- validatingwebhookconfigurations
verbs:
- get
- list
- update
- watch
- apiGroups:
- autoscaling
resources:
Expand Down
17 changes: 17 additions & 0 deletions manifests/base/webhook/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- manifests.yaml
- service.yaml
namePrefix: training-operator-
commonLabels:
control-plane: kubeflow-training-operator
patches:
- path: patch.yaml
target:
group: admissionregistration.k8s.io
version: v1
kind: ValidatingWebhookConfiguration

configurations:
- kustomizeconfig.yaml
10 changes: 10 additions & 0 deletions manifests/base/webhook/kustomizeconfig.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# the following config is for teaching kustomize where to look at when substituting vars.
# It requires kustomize v2.1.0 or newer to work properly.
namespace:
- kind: ValidatingWebhookConfiguration
group: admissionregistration.k8s.io
path: webhooks/clientConfig/service/namespace
create: true

varReference:
- path: metadata/annotations
26 changes: 26 additions & 0 deletions manifests/base/webhook/manifests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
name: validating-webhook-configuration
webhooks:
- admissionReviewVersions:
- v1
clientConfig:
service:
name: webhook-service
namespace: system
path: /validate-kubeflow-org-v1-pytorchjob
failurePolicy: Fail
name: validator.pytorchjob.training-operator.kubeflow.org
rules:
- apiGroups:
- kubeflow.org
apiVersions:
- v1
operations:
- CREATE
- UPDATE
resources:
- pytorchjobs
sideEffects: None
3 changes: 3 additions & 0 deletions manifests/base/webhook/patch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- op: replace
path: /webhooks/0/clientConfig/service/name
value: training-operator-service
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
---
apiVersion: v1
kind: Service
metadata:
Expand All @@ -8,12 +7,16 @@ metadata:
prometheus.io/port: "8080"
labels:
app: training-operator
name: training-operator
name: service
spec:
ports:
- name: monitoring-port
port: 8080
targetPort: 8080
- name: monitoring-port
port: 8080
targetPort: 8080
- name: webhook-server
port: 443
protocol: TCP
targetPort: 9443
selector:
control-plane: kubeflow-training-operator
type: ClusterIP
Loading

0 comments on commit 611c496

Please sign in to comment.