Skip to content
This repository has been archived by the owner on Jul 17, 2024. It is now read-only.

Commit

Permalink
feat: log pod start time
Browse files Browse the repository at this point in the history
Signed-off-by: cutecutecat <[email protected]>
  • Loading branch information
cutecutecat committed Oct 19, 2023
1 parent 0928cf2 commit e034eb6
Show file tree
Hide file tree
Showing 16 changed files with 139 additions and 56 deletions.
6 changes: 0 additions & 6 deletions agent/api/types/const.go

This file was deleted.

15 changes: 9 additions & 6 deletions agent/api/types/event.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@ package types
import "time"

const (
DeploymentCreateEvent = "deployment-create"
DeploymentUpdateEvent = "deployment-update"
DeploymentDeleteEvent = "deployment-delete"
DeploymentScaleUpEvent = "deployment-scale-up"
DeploymentScaleDownEvent = "deployment-scale-down"
DeploymentScaleBlockEvent = "deployment-scale-block"
DeploymentCreateEvent = "deployment-create"
DeploymentUpdateEvent = "deployment-update"
DeploymentDeleteEvent = "deployment-delete"
DeploymentScaleUpEvent = "deployment-scale-up"
DeploymentScaleDownEvent = "deployment-scale-down"
DeploymentScaleBlockEvent = "deployment-scale-block"
DeploymentStartBeginEvent = "deployment-start-begin"
DeploymentStartFinishEvent = "deployment-start-finish"
DeploymentStartTimeoutEvent = "deployment-start-timeout"
)

type DeploymentEvent struct {
Expand Down
6 changes: 0 additions & 6 deletions agent/pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,6 @@ package consts
import "time"

const (
LabelBuildName = "ai.tensorchord.build"
LabelName = "ai.tensorchord.name"
LabelServerResource = "ai.tensorchord.server-resource"
AnnotationControlPlaneKey = "ai.tensorchord.control-plane"
ModelzAnnotationValue = "modelz"

Domain = "modelz.live"
DefaultPrefix = "modelz-"
APIKEY_PREFIX = "mzi-"
Expand Down
38 changes: 21 additions & 17 deletions agent/pkg/k8s/convert_inference.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,23 +58,7 @@ func AsInferenceDeployment(inf *v2alpha1.Inference, item *appsv1.Deployment) *ty
res.Status.CreatedAt = &item.CreationTimestamp.Time
res.Status.InvocationCount = 0
res.Status.AvailableReplicas = item.Status.AvailableReplicas

res.Status.Phase = types.PhaseNotReady
for _, c := range item.Status.Conditions {
if c.Type == appsv1.DeploymentAvailable && c.Status == v1.ConditionTrue {
res.Status.Phase = types.PhaseReady
} else if c.Type == appsv1.DeploymentProgressing && c.Status == v1.ConditionFalse {
res.Status.Phase = types.PhaseScaling
}
}

if item.Spec.Replicas != nil && *item.Spec.Replicas == 0 {
res.Status.Phase = types.PhaseNoReplicas
}

if item.DeletionTimestamp != nil {
res.Status.Phase = types.PhaseTerminating
}
res.Status.Phase = AsStatusPhase(item)
}
return res
}
Expand All @@ -98,3 +82,23 @@ func AsResourceList(resources v1.ResourceList) types.ResourceList {
}
return res
}

func AsStatusPhase(item *appsv1.Deployment) types.Phase {
phase := types.PhaseNotReady
for _, c := range item.Status.Conditions {
if c.Type == appsv1.DeploymentAvailable && c.Status == v1.ConditionTrue {
phase = types.PhaseReady
} else if c.Type == appsv1.DeploymentProgressing && c.Status == v1.ConditionFalse {
phase = types.PhaseScaling
}
}

if item.Spec.Replicas != nil && *item.Spec.Replicas == 0 {
phase = types.PhaseNoReplicas
}

if item.DeletionTimestamp != nil {
phase = types.PhaseTerminating
}
return phase
}
2 changes: 1 addition & 1 deletion agent/pkg/k8s/generate_image_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ import (

kubefledged "github.com/senthilrch/kube-fledged/pkg/apis/kubefledged/v1alpha3"
"github.com/tensorchord/openmodelz/agent/api/types"
"github.com/tensorchord/openmodelz/agent/pkg/consts"
modelzetes "github.com/tensorchord/openmodelz/modelzetes/pkg/apis/modelzetes/v2alpha1"
"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
Expand Down
7 changes: 3 additions & 4 deletions agent/pkg/k8s/generate_job.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ import (
"k8s.io/apimachinery/pkg/runtime/schema"

"github.com/tensorchord/openmodelz/agent/api/types"
"github.com/tensorchord/openmodelz/agent/pkg/consts"
"github.com/tensorchord/openmodelz/modelzetes/pkg/apis/modelzetes/v2alpha1"
mzconsts "github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
)

func MakeBuild(req types.Build, inference *v2alpha1.Inference, builderImage, buildkitdAddr, buildctlBin, secret string) (*batchv1.Job, error) {
Expand Down Expand Up @@ -70,8 +69,8 @@ func MakeBuild(req types.Build, inference *v2alpha1.Inference, builderImage, bui
Namespace: req.Spec.Namespace,
OwnerReferences: ownerReference,
Labels: map[string]string{
consts.LabelBuildName: req.Spec.Name,
mzconsts.AnnotationBuilding: "true",
consts.LabelBuildName: req.Spec.Name,
consts.AnnotationBuilding: "true",
},
},
Spec: batchv1.JobSpec{
Expand Down
2 changes: 2 additions & 0 deletions agent/pkg/metrics/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
e.metricOptions.ServiceAvailableReplicasGauge.Reset()
e.metricOptions.ServiceTargetLoad.Reset()

e.metricOptions.PodStartHistogram.Collect(ch)

for _, service := range e.services {
var serviceName string
if len(service.Spec.Namespace) > 0 {
Expand Down
9 changes: 9 additions & 0 deletions agent/pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ type MetricOptions struct {
ServiceReplicasGauge *prometheus.GaugeVec
ServiceAvailableReplicasGauge *prometheus.GaugeVec
ServiceTargetLoad *prometheus.GaugeVec

PodStartHistogram *prometheus.HistogramVec
}

// ServiceMetricOptions provides RED metrics
Expand Down Expand Up @@ -108,6 +110,12 @@ func BuildMetricsOptions() MetricOptions {
[]string{"inference_name"},
)

podStartHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "pod_start_seconds",
Help: "Pod start time taken",
Buckets: []float64{5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 150.0, 300.0},
}, []string{"inference_name", "source_image"})

metricsOptions := MetricOptions{
GatewayInferencesHistogram: gatewayInferencesHistogram,
GatewayInferenceInvocation: gatewayInferenceInvocation,
Expand All @@ -116,6 +124,7 @@ func BuildMetricsOptions() MetricOptions {
ServiceTargetLoad: serviceTargetLoad,
GatewayInferenceInvocationStarted: gatewayInferenceInvocationStarted,
GatewayInferenceInvocationInflight: gatewayInferenceInvocationInflight,
PodStartHistogram: podStartHistogram,
}

return metricsOptions
Expand Down
8 changes: 4 additions & 4 deletions agent/pkg/runtime/inference_create.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"

localconsts "github.com/tensorchord/openmodelz/agent/pkg/consts"
ingressv1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/apis/modelzetes/v1"
v2alpha1 "github.com/tensorchord/openmodelz/modelzetes/pkg/apis/modelzetes/v2alpha1"
"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
Expand All @@ -14,7 +15,6 @@ import (
"github.com/tensorchord/openmodelz/agent/api/types"
"github.com/tensorchord/openmodelz/agent/errdefs"
"github.com/tensorchord/openmodelz/agent/pkg/config"
localconsts "github.com/tensorchord/openmodelz/agent/pkg/consts"
)

func (r generalRuntime) InferenceCreate(ctx context.Context,
Expand All @@ -37,7 +37,7 @@ func (r generalRuntime) InferenceCreate(ctx context.Context,
// Create the ingress
// TODO(gaocegege): Check if the domain is already used.
if r.ingressEnabled {
name := req.Spec.Labels[localconsts.LabelName]
name := req.Spec.Labels[consts.LabelName]

if r.ingressAnyIPToDomain {
// Get the service with type=loadbalancer.
Expand Down Expand Up @@ -208,8 +208,8 @@ func makeIngress(request types.InferenceDeployment, cfg config.IngressConfig) (*
}

annotation := map[string]string{}
if value, exist := request.Spec.Annotations[localconsts.AnnotationControlPlaneKey]; exist {
annotation[localconsts.AnnotationControlPlaneKey] = value
if value, exist := request.Spec.Annotations[consts.AnnotationControlPlaneKey]; exist {
annotation[consts.AnnotationControlPlaneKey] = value
}
ingress.Annotations = annotation

Expand Down
6 changes: 3 additions & 3 deletions agent/pkg/runtime/namespace.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@ import (
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/tensorchord/openmodelz/agent/api/types"
"github.com/tensorchord/openmodelz/agent/errdefs"
"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
)

func (r generalRuntime) NamespaceList(ctx context.Context) ([]string, error) {
ns, err := r.kubeClient.CoreV1().Namespaces().List(ctx,
metav1.ListOptions{
LabelSelector: fmt.Sprintf("%s=true", types.LabelNamespace),
LabelSelector: fmt.Sprintf("%s=true", consts.LabelNamespace),
})
if err != nil {
if k8serrors.IsNotFound(err) {
Expand All @@ -37,7 +37,7 @@ func (r generalRuntime) NamespaceCreate(ctx context.Context, name string) error
ObjectMeta: metav1.ObjectMeta{
Name: name,
Labels: map[string]string{
types.LabelNamespace: "true",
consts.LabelNamespace: "true",
},
},
}
Expand Down
2 changes: 1 addition & 1 deletion agent/pkg/runtime/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import (
"context"

"github.com/sirupsen/logrus"
"github.com/tensorchord/openmodelz/agent/pkg/consts"
"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand Down
1 change: 1 addition & 0 deletions agent/pkg/runtime/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"

"github.com/gin-gonic/gin"

"github.com/sirupsen/logrus"
apicorev1 "k8s.io/api/core/v1"
appsv1 "k8s.io/client-go/informers/apps/v1"
Expand Down
72 changes: 72 additions & 0 deletions agent/pkg/server/server_init_kubernetes.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
package server

import (
"context"
"fmt"
"strings"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kubeinformers "k8s.io/client-go/informers"
kubeinformersv1 "k8s.io/client-go/informers/core/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/clientcmd"

kubefledged "github.com/senthilrch/kube-fledged/pkg/client/clientset/versioned"
"github.com/tensorchord/openmodelz/agent/api/types"
"github.com/tensorchord/openmodelz/agent/pkg/event"
"github.com/tensorchord/openmodelz/agent/pkg/k8s"
"github.com/tensorchord/openmodelz/agent/pkg/log"
"github.com/tensorchord/openmodelz/agent/pkg/runtime"
Expand Down Expand Up @@ -78,6 +87,7 @@ func (s *Server) initKubernetesResources() error {
}

pods := kubeInformerFactory.Core().V1().Pods()
s.podStartWatch(pods, kubeClient)
go pods.Informer().Run(stopCh)
if ok := cache.WaitForNamedCacheSync(
fmt.Sprintf("%s:pods", consts.ProviderName),
Expand Down Expand Up @@ -120,3 +130,65 @@ func (s *Server) initKubernetesResources() error {
}
return nil
}

// podStartWatch log event when pod start began and finished
func (s *Server) podStartWatch(pods kubeinformersv1.PodInformer, client *kubernetes.Clientset) {
pods.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) {
new := obj.(*v1.Pod)
if !strings.HasPrefix(new.Namespace, "modelz-") {
return
}
podWatchEventLog(s.eventRecorder, new, types.DeploymentStartBeginEvent)
start := time.Now()

// Ticker will keep watching until pod start or timeout
ticker := time.NewTicker(time.Second * 2)
timeout := time.After(5 * time.Minute)
go func() {
for {
select {
case <-timeout:
podWatchEventLog(s.eventRecorder, new, types.DeploymentStartTimeoutEvent)
return
case <-ticker.C:
pod, err := client.CoreV1().Pods(new.Namespace).Get(context.TODO(), new.Name, metav1.GetOptions{})
if err != nil {
logrus.WithFields(logrus.Fields{
"namespace": pod.Namespace,
"deployment": pod.Labels["app"],
"name": pod.Name,
}).Errorf("failed to get pod: %s", err)
}
for _, c := range pod.Status.Conditions {
if c.Type == v1.PodReady && c.Status == v1.ConditionTrue {
podWatchEventLog(s.eventRecorder, new, types.DeploymentStartFinishEvent)
label := prometheus.Labels{
"inference_name": new.Labels["app"],
"source_image": new.Annotations[consts.AnnotationDockerImage]}
s.metricsOptions.PodStartHistogram.With(label).
Observe(time.Since(start).Seconds())
return
}
}
break
}
}
}()
},
})
}

// log status for pod watch status transfer
func podWatchEventLog(recorder event.Interface, obj *v1.Pod, event string) {
deployment := obj.Labels["app"]
err := recorder.CreateDeploymentEvent(obj.Namespace, deployment, event, obj.Name)
if err != nil {
logrus.WithFields(logrus.Fields{
"namespace": obj.Namespace,
"deployment": deployment,
"name": obj.Name,
"event": event,
}).Errorf("failed to create deployment event: %s", err)
}
}
6 changes: 2 additions & 4 deletions ingress-operator/pkg/consts/consts.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package consts

const (
KeyCert = "cert"
EnvironmentPrefix = "MODELZ"
AnnotationControlPlaneKey = "ai.tensorchord.control-plane"
ModelzAnnotationValue = "modelz"
KeyCert = "cert"
EnvironmentPrefix = "MODELZ"
)
5 changes: 2 additions & 3 deletions ingress-operator/pkg/controller/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@ import (
faasscheme "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/clientset/versioned/scheme"
v1 "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/informers/externalversions/modelzetes/v1"
listers "github.com/tensorchord/openmodelz/ingress-operator/pkg/client/listers/modelzetes/v1"
"github.com/tensorchord/openmodelz/ingress-operator/pkg/consts"
mdzconsts "github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
"github.com/tensorchord/openmodelz/modelzetes/pkg/consts"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
Expand Down Expand Up @@ -257,7 +256,7 @@ func MakeAnnotations(fni *faasv1.InferenceIngress, host string) map[string]strin
annotations := make(map[string]string)

annotations["ai.tensorchord.spec"] = string(specJSON)
inferenceNamespace := fni.Labels[mdzconsts.LabelInferenceNamespace]
inferenceNamespace := fni.Labels[consts.LabelInferenceNamespace]

if !fni.Spec.BypassGateway {
switch class {
Expand Down
10 changes: 9 additions & 1 deletion modelzetes/pkg/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,16 @@ const (

LabelInferenceName = "inference"
LabelInferenceNamespace = "inference-namespace"
LabelBuildName = "ai.tensorchord.build"
LabelName = "ai.tensorchord.name"
LabelNamespace = "modelz.tensorchord.ai/namespace"
LabelServerResource = "ai.tensorchord.server-resource"

AnnotationBuilding = "ai.tensorchord.building"
AnnotationBuilding = "ai.tensorchord.building"
AnnotationDockerImage = "ai.tensorchord.docker.image"
AnnotationControlPlaneKey = "ai.tensorchord.control-plane"

ModelzAnnotationValue = "modelz"

TolerationGPU = "ai.tensorchord.gpu"
TolerationNvidiaGPUPresent = "nvidia.com/gpu"
Expand Down

0 comments on commit e034eb6

Please sign in to comment.