From ae255b2a6c2f865e09266c2a95fef9ed52c2bc46 Mon Sep 17 00:00:00 2001
From: Daniel Gutowski <danielgutowski@google.com>
Date: Tue, 13 Jun 2023 02:11:26 -0700
Subject: [PATCH] Introduce AEP with Provisioning Request CRD

---
 .../proposals/provisioning-request.md         | 293 ++++++++++++++++++
 1 file changed, 293 insertions(+)
 create mode 100644 cluster-autoscaler/proposals/provisioning-request.md

diff --git a/cluster-autoscaler/proposals/provisioning-request.md b/cluster-autoscaler/proposals/provisioning-request.md
new file mode 100644
index 000000000000..092b5ad3a98e
--- /dev/null
+++ b/cluster-autoscaler/proposals/provisioning-request.md
@@ -0,0 +1,293 @@
+# Provisioning Request CRD
+
+author: kisieland
+
+## Background
+
+Currently CA does not provide any way to express that a group of pods would like
+to have capacity available.  
+This is caused by the fact that each CA loop picks a group of unschedulable pods
+and works on provisioning capacity for them, meaning that the grouping is random
+(as it depends on the kube-scheduler and CA loop interactions).  
+This is especially problematic in a couple of cases:
+
+  - Users would like to have all-or-nothing semantics for their workloads.
+    Currently CA will try to provision this capacity and if it is partially
+    successful it will leave it in the cluster until the user removes the workload.
+  - Users would like to lower e2e scale-up latency for huge scale-ups (100+
+    nodes). Due to the nature of CA and kube-scheduler throughput, CA will create
+    partial scale-ups, e.g. `0->200->400->600` rather than one `0->600`. This
+    significantly increases the e2e latency as there is a non-negligible time tax
+    on each scale-up operation.
+
+## Proposal
+
+### High level
+
+Provisioning Request (abbr. ProvReq) is a new namespaced Custom Resource that
+aims to allow users to ask CA for capacity for groups of pods.  
+It allows users to express the fact that a group of pods is connected and should
+be treated as one entity.  
+This AEP proposes an API that can have multiple provisioning classes and can be
+extended by cloud provider specific ones.  
+This object is meant as a one-shot request to CA, so that if CA fails to provision
+the capacity it is up to users to retry (such retry functionality can be added
+later on).
+
+### ProvisioningRequest CRD
+
+The following code snippets assume [kubebuilder](https://book.kubebuilder.io/)
+is used to generate the CRD:
+
+```go
+// ProvisioningRequest is a way to express additional capacity
+// that we would like to provision in the cluster. Cluster Autoscaler (CA)
+// can use this information in its calculations and signal if the capacity
+// is available in the cluster or actively add capacity if needed.
+type ProvisioningRequest struct {
+	metav1.TypeMeta `json:",inline"`
+	// Standard object metadata. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata
+	//
+	// +optional
+	metav1.ObjectMeta `json:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
+	// Spec contains the specification of the ProvisioningRequest object.
+	// More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status
+	//
+	// +kubebuilder:validation:Required
+	Spec ProvisioningRequestSpec `json:"spec" protobuf:"bytes,2,name=spec"`
+	// Status of the ProvisioningRequest. CA constantly reconciles this field.
+	//
+	// +optional
+	Status ProvisioningRequestStatus `json:"status,omitempty" protobuf:"bytes,3,opt,name=status"`
+}
+
+// ProvisioningRequestList is an object holding a list of ProvisioningRequest.
+type ProvisioningRequestList struct {
+	metav1.TypeMeta `json:",inline"`
+	// Standard list metadata.
+	//
+	// +optional
+	metav1.ListMeta `json:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
+	// Items, list of ProvisioningRequest returned from API.
+	//
+	// +optional
+	Items []ProvisioningRequest `json:"items" protobuf:"bytes,2,rep,name=items"`
+}
+
+// ProvisioningRequestSpec is a specification of additional pods for which we
+// would like to provision additional resources in the cluster.
+type ProvisioningRequestSpec struct {
+	// PodSets lists groups of pods for which we would like to provision
+	// resources.
+	//
+	// +kubebuilder:validation:Required
+	// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="Value is immutable"
+	PodSets []PodSet `json:"podSets" protobuf:"bytes,1,rep,name=podSets"`
+
+	// ProvisioningClass describes the different modes of provisioning the resources.
+	// Supported values:
+	// * GenericCheckCapacity - check if current cluster state can fulfill this request
+	// * GenericAtomicScaleUp - provision the resources in an atomic manner
+	// * ... - potential other classes that are specific to the cloud providers
+	//
+	// +kubebuilder:validation:Required
+	// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="Value is immutable"
+	ProvisioningClass string `json:"provisioningClass" protobuf:"bytes,2,name=provisioningClass"`
+
+	// ValidUntilSeconds contains information on how long we can wait for the ProvReq to provide VMs.
+	// Field specific to GenericAtomicScaleUp provisioning class.
+	//
+	// +optional
+	// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="Value is immutable"
+	ValidUntilSeconds *int64 `json:"validUntilSeconds,omitempty" protobuf:"varint,3,opt,name=validUntilSeconds"`
+
+	// AdditionalParameters contains all other parameters custom classes may require.
+	//
+	// +optional
+	// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="Value is immutable"
+	AdditionalParameters map[string]string `json:"additionalParameters,omitempty" protobuf:"bytes,4,rep,name=additionalParameters"`
+}
+
+// PodSet describes a group of pods that share a single pod template.
+type PodSet struct {
+	// Template representing pods that will consume this reservation.
+	// Requirements for resources (CPU, RAM, GPUs, TPUs, storage) are
+	// necessary (must be non-zero). Users need to make sure that the
+	// tolerations and label selectors are consistent between this template
+	// and actual pods consuming the Provisioning Request.
+	Template apiv1.PodTemplateSpec `json:"template" protobuf:"bytes,1,name=template"`
+	// Count contains the number of pods that will be created with a given template.
+	// +kubebuilder:validation:Minimum=1
+	Count int32 `json:"count" protobuf:"varint,2,name=count"`
+}
+
+// ProvisioningRequestStatus represents the status of the resource reservation.
+type ProvisioningRequestStatus struct {
+	// Conditions represent the observations of a Provisioning Request's
+	// current state. Those will contain information whether the capacity
+	// was found/created or if there were any issues. The condition types
+	// may differ between different provisioning classes.
+	//
+	// +optional
+	Conditions []metav1.Condition `json:"conditions,omitempty" protobuf:"bytes,1,rep,name=conditions"`
+
+	// AdditionalStatus contains all other status values custom provisioning classes may require.
+	//
+	// +optional
+	AdditionalStatus map[string]string `json:"additionalStatus,omitempty" protobuf:"bytes,2,rep,name=additionalStatus"`
+}
+```
+
+### Provisioning Classes
+
+#### GenericCheckCapacity class
+
+The `GenericCheckCapacity` class is a one-off check to verify that there is
+enough capacity in the cluster to provision a given set of pods.
+
+Note: If two such objects are created around the same time, CA will consider
+them independently and place no guards for the capacity.
+Also, the capacity is not reserved in any manner, so it may be scaled down.
+
+#### GenericAtomicScaleUp class
+
+The `GenericAtomicScaleUp` aims to provision the resources required for the
+specified pods in an atomic way. The proposed logic is to:
+1. Try to provision required VMs in one loop.
+2. If it failed, remove the partially provisioned VMs and back-off.
+3. Stop the back-off after a given duration (optional), which would be passed
+   via the `ValidUntilSeconds` field as the number of seconds for which we
+   should retry (measured since creation of the CR).
+
+Note: the VMs created in this mode are subject to the scale-down logic.
+So the duration during which users need to create the Pods is equal to the
+value of the `--scale-down-unneeded-time` flag.
+
+### Adding pods that consume given ProvisioningRequest
+
+To avoid generating double scale-ups and exclude pods that are meant to consume
+given capacity CA should be able to differentiate those from all other pods.  
+To do so users need to specify the following pod annotation (it is not required
+in ProvReq’s template, though it can be specified):
+
+```yaml
+annotations:
+    "cluster-autoscaler.kubernetes.io/consume-provisioning-request": "provreq-name"
+```
+
+Note: CA will match all pods with this annotation to a corresponding ProvReq and
+ignore them when executing a scale-up loop (so it is up to users to make sure
+that the ProvReq count matches the number of created pods).
+
+### CRD lifecycle
+
+1.  A ProvReq will be created either by the end user or by a framework.
+2.  CA will pick it up, choose a nodepool (or create a new one if NAP is
+    enabled), and try to create nodes.
+3.  If CA successfully creates capacity, ProvReq will receive information about
+    this fact in `Conditions` field.
+4.  At this moment, users can create pods that will consume the ProvReq (in
+    the same namespace); those will be scheduled on the capacity that was
+    created by the CA.
+5.  Once all of the pods are scheduled users can delete the ProvReq object,
+    otherwise it will be garbage collected after some time.
+6.  When pods finish the work and nodes become unused the CA will scale them
+    down.
+
+Note: Users can create a ProvReq and pods consuming them at the same time (in a
+"fire and forget" manner), but this may result in the pods being unschedulable
+and triggering user configured alerts.
+
+### Conditions
+
+The following Condition states should encode the states of the ProvReq:
+
+  - Provisioned - VMs were created successfully (Atomic class)
+  - CapacityAvailable - cluster contains enough capacity to schedule pods (Check
+    class)
+    * `CapacityAvailable=true` will denote that cluster contains enough capacity to schedule pods
+    * `CapacityAvailable=false` will denote that cluster does not contain enough capacity to schedule pods
+  - Failed - failed to create or check capacity (both classes)
+
+The Reasons and Messages will contain more details about why the specific
+condition was triggered.
+
+### CA implementation details
+
+The proposed implementation is to handle each ProvReq in a separate scale-up
+loop. This will require changes in multiple parts of CA:
+
+1.  Listing unschedulable pods where:
+      - pods that consume ProvReq need to be filtered out
+      - pods that are represented by the ProvReq need to be injected (we need to
+        ensure those are treated as one group by the sharding logic)
+2.  Scale-up logic, which as of now has no notion of atomicity and grouping of
+    pods. This is simplified as the ScaleUp logic was recently put [behind an
+    interface](https://github.com/kubernetes/autoscaler/pull/5597).
+      - This is a place where the biggest part of the change will be made. Here
+        many parts of the logic are assuming best-effort semantics and the scale
+        up size is lowered in many situations:
+          - Estimation logic, which stops after some time-out or number of
+            pods/nodes.
+          - Size limiting, which caps the scale-up to match the size
+            restrictions (on node group or cluster level).
+3.  Node creation, which needs to support atomic resize. Either via native cloud
+    provider APIs or best effort with node removal if CA is unable to fulfill
+    the scale-up.
+      - This is also quite a substantial change; we can provide a generic
+        best-effort implementation that will try to scale up and clean up nodes
+        if it is unsuccessful, but it is up to cloud providers to integrate with
+        provider specific APIs.
+4.  Scale down path is not expected to change much. But users should follow
+    [best
+    practices](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#what-types-of-pods-can-prevent-ca-from-removing-a-node)
+    to avoid CA disturbing their workloads.
+
+## Testing
+
+The following e2e test scenarios will be created to check whether ProvReq
+handling works as expected:
+
+1.  A new ProvReq with `GenericCheckCapacity` provisioning class is created, CA
+    checks if there is enough capacity in cluster to provision specified pods.
+2.  A new ProvReq with `GenericAtomicScaleUp` provisioning class is created, CA
+    picks an appropriate node group and scales it up atomically.
+3.  A new atomic ProvReq is created for which NAP needs to provision a new
+    node group. NAP creates it and CA scales it up atomically.
+      - Here we should cover some of the different reasons why NAP may be
+        required.
+4.  An atomic ProvReq fails due to node group size limits and NAP CPU and/or RAM
+    limits.
+5.  Scalability tests.
+      - Scenario in which many small ProvReqs are created (strain on the number
+        of scale-up loops).
+      - Scenario in which big ProvReq is created (strain on a single scale-up
+        loop).
+
+## Future Expansions
+
+### ProvisioningClass CRD
+
+One possible expansion of this approach is to introduce the ProvisioningClass CRD,
+which follows the same approach as
+[StorageClass object](https://kubernetes.io/docs/concepts/storage/storage-classes/).
+Such an approach would allow administrators of the cluster to introduce a list of allowed
+ProvisioningClasses. Such a CRD can also contain a preset configuration, e.g.
+administrators may set that `GenericAtomicScaleUp` would retry up to `2h`.
+
+Possible CRD definition:
+```go
+// ProvisioningClass is a way to express provisioning classes available in the cluster.
+type ProvisioningClass struct {
+	// Name denotes the name of the object, which is to be used in the ProvisioningClass
+	// field in Provisioning Request CRD.
+	//
+	// +kubebuilder:validation:Required
+	Name string `json:"name" protobuf:"bytes,1,name=name"`
+
+	// AdditionalParameters contains all other parameters custom classes may require.
+	//
+	// +optional
+	AdditionalParameters map[string]string `json:"additionalParameters,omitempty" protobuf:"bytes,2,rep,name=additionalParameters"`
+}
+```