From 70ea48d761d70496707572a30ac897dfd9f1ab99 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jose=20Ramon=20Ma=C3=B1es?= <jose@celestia.org>
Date: Tue, 14 Nov 2023 10:59:07 +0100
Subject: [PATCH] feat(torch): merge watchdog - update docs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Jose Ramon Mañes <jose@celestia.org>
---
 README.md               |  67 +++++++++++++++++++-
 pkg/http/server.go      |  99 +++++++++++++++++++++++-------
 pkg/k8s/services.go     | 131 ++++++++++++++++++++++++++++++++++++++++
 pkg/k8s/statefulsets.go |  21 +++++--
 pkg/metrics/metrics.go  |  61 +++++++++++++++++--
 pkg/nodes/consensus.go  |  21 +++----
 6 files changed, 357 insertions(+), 43 deletions(-)
 create mode 100644 pkg/k8s/services.go

diff --git a/README.md b/README.md
index 28d72c7..d88a191 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,11 @@ You can use Torch to manage the nodes connections from a config file and Torch w
 
 Torch uses the Kubernetes API to manage the nodes, it gets their multi addresses information and stores them in a Redis instance, also, it provides some metrics to expose the node's IDs through the `/metrics` endpoint.
 
+Torch automatically detects Load Balancer resources in a Kubernetes cluster and exposes metrics related to these Load Balancers.
+The service uses OpenTelemetry to instrument the metrics and Prometheus to expose them.
+It uses the Kubernetes API server with a watcher to receive events from it. Then filters the list to include only services of type **LoadBalancer**. 
+For each LoadBalancer service found, it retrieves the LoadBalancer public IP and name and generates metrics with custom labels. These metrics are then exposed via a Prometheus endpoint, making them available for monitoring and visualization in Grafana or other monitoring tools.
+
 ---
 
 ## Workflow
@@ -243,6 +248,66 @@ Torch uses [Redis](https://redis.io/) as a DB, so to use Torch, you need to have
 
 We are using Redis in two different ways:
 - Store the Nodes IDs and reuse them.
-- As a message broker, where Torch uses Producer & Consumer approach to process data async.
+- As a message broker, Torch uses the Producer & Consumer approach to process data async.
+
+---
+
+## Metrics
+
+### Multi Address
+
+Custom metrics to expose the nodes multi-address:
+
+- `multiaddr`: This metric represents the nodes Multi Address:
+  - `service_name`: The service name. In this case, it is set to **torch**.
+  - `node_name`: The name of the node.
+  - `multiaddress`: Node Multi Address.
+  - `namespace`: The namespace in which the torch is deployed.
+  - `value`: The value of the metric. In this example, it is set to 1.
+
+### BlockHeight
+
+Custom metrics to expose the first block height of the chain:
+
+- `block_height_1`: Name of the metric to represent the first block height of the chain:
+  - `service_name`: The service name. In this case, it is set to **torch**.
+  - `block_height_1`: First block id generated
+  - `earliest_block_time`: Timestamp when the chain was created.
+  - `days_running`: Number of days that the chain is running.
+  - `namespace`: The namespace in which the torch is deployed.
+  - `value`: The value of the metric. In this example, it is set to 1.
+
+### Load Balancer
+
+Custom metrics to expose the LoadBalancer public IPs:
+
+- `load_balancer`: This metric represents the LoadBalancer resource and includes the following labels:
+  - `service_name`: The service name. In this case, it is set to **torch**.
+  - `load_balancer_name`: The name of the LoadBalancer service.
+  - `load_balancer_ip`: The IP address of the LoadBalancer.
+  - `namespace`: The namespace in which the LoadBalancer is deployed.
+  - `value`: The value of the metric. In this example, it is set to 1, but it can be customized to represent different load balancing states.
+
+  
+---
+  
+## Monitoring and Visualization
+
+Torch exposes some custom metrics through the Prometheus endpoint.
+You can use Grafana to connect to Prometheus and create custom dashboards to visualize these metrics.
+
+To access the Prometheus and Grafana dashboards and view the metrics, follow these steps:
+
+1. Access the Prometheus dashboard:
+- Open a web browser and navigate to the Prometheus server's URL (e.g., `http://prometheus-server:9090`).
+- In the Prometheus web interface, you can explore and query the metrics collected by the Service Torch.
+
+2. Access the Grafana dashboard:
+- Open a web browser and navigate to the Grafana server's URL (e.g., `http://grafana-server:3000`).
+- Log in to Grafana using your credentials.
+- Create a new dashboard or import an existing one to visualize the LoadBalancer metrics from Prometheus.
+- Use the `load_balancer` metric and its labels to filter and display the relevant information.
+
+Customizing dashboards and setting up alerts in Grafana will help you monitor the performance and health of your LoadBalancer resources effectively.
 
 ---
diff --git a/pkg/http/server.go b/pkg/http/server.go
index 1c5682a..75d7811 100644
--- a/pkg/http/server.go
+++ b/pkg/http/server.go
@@ -19,6 +19,10 @@ import (
 	"github.com/celestiaorg/torch/pkg/nodes"
 )
 
+const (
+	retryInterval = 10 * time.Second // retryInterval Retry interval in seconds to generate the consensus metric.
+)
+
 // GetHttpPort GetPort retrieves the namespace where the service will be deployed
 func GetHttpPort() string {
 	port := os.Getenv("HTTP_PORT")
@@ -57,13 +61,6 @@ func Run(cfg config.MutualPeersConfig) {
 		return
 	}
 
-	// generate the metric from the Genesis Hash data
-	notOk := GenerateHashMetrics(cfg, err)
-	if notOk {
-		log.Error("Error registering metric block_height_1")
-		return
-	}
-
 	// Create the server
 	server := &http.Server{
 		Addr:    ":" + httpPort,
@@ -81,6 +78,10 @@ func Run(cfg config.MutualPeersConfig) {
 	log.Info("Server Started...")
 	log.Info("Listening on port: " + httpPort)
 
+	// check if Torch has to generate the metric or not.
+	BackgroundGenerateHashMetric(cfg)
+	BackgroundGenerateLBMetric()
+
 	// Initialize the goroutine to check the nodes in the queue.
 	log.Info("Initializing queues to process the nodes...")
 	// Create a new context without timeout as we want to keep this goroutine running forever, if we specify a timeout,
@@ -127,23 +128,79 @@ func Run(cfg config.MutualPeersConfig) {
 	log.Info("Server Exited Properly")
 }
 
-func GenerateHashMetrics(cfg config.MutualPeersConfig, err error) bool {
-	// Get the genesisHash
-	// check if the config has the consensusNode field defined
+// BackgroundGenerateHashMetric check if we have defined the consensus in the config, if so, it creates a goroutine
+// to generate the metric
+func BackgroundGenerateHashMetric(cfg config.MutualPeersConfig) {
+	// Check if the config has the consensusNode field defined to generate the metric from the Genesis Hash data.
 	if cfg.MutualPeers[0].ConsensusNode != "" {
-		blockHash, earliestBlockTime := nodes.GenesisHash(cfg)
-		err = metrics.WithMetricsBlockHeight(
-			blockHash,
-			earliestBlockTime,
-			cfg.MutualPeers[0].ConsensusNode,
-			os.Getenv("POD_NAMESPACE"),
-		)
-		if err != nil {
-			log.Errorf("Error registering metric block_height_1: %v", err)
-			return true
+		log.Info("Initializing goroutine to generate the metric: block_height_1")
+		// Initialise the goroutine to generate the metric in the background, only if we specify the node in the config.
+		go func() {
+			log.Info("Consensus node defined to get the first block")
+			for {
+				err := GenerateHashMetrics(cfg)
+				// check if err is nil, if so, that means that Torch was able to generate the metric.
+				if err == nil {
+					log.Info("Metric generated for the first block...")
+					// The metric was successfully generated, stop the retries.
+					break
+				}
+
+				// Wait for the retry interval before the next execution
+				time.Sleep(retryInterval)
+			}
+		}()
+	}
+}
+
+// BackgroundGenerateLBMetric initializes a goroutine to generate the load_balancer metric.
+func BackgroundGenerateLBMetric() {
+	log.Info("Initializing goroutine to generate the metric: load_balancer ")
+
+	// Retrieve the list of Load Balancers
+	_, err := k8s.RetrieveAndGenerateMetrics()
+	if err != nil {
+		log.Printf("Failed to update metrics: %v", err)
+	}
+
+	// Start watching for changes to the services in a separate goroutine
+	done := make(chan error)
+	go k8s.WatchServices(done)
+
+	// Handle errors from WatchServices
+	for {
+		select {
+		case err := <-done:
+			if err != nil {
+				log.Error("Error in WatchServices: ", err)
+			}
 		}
 	}
-	return false
+}
+
+// GenerateHashMetrics generates the metric by getting the first block and calculating the days.
+func GenerateHashMetrics(cfg config.MutualPeersConfig) error {
+	log.Info("Generating the metric for the first block generated...")
+
+	// Get the genesisHash
+	blockHash, earliestBlockTime, err := nodes.GenesisHash(cfg.MutualPeers[0].ConsensusNode)
+	if err != nil {
+		return err
+	}
+
+	// check if earliestBlockTime is not empty, otherwise torch skips this process for now.
+	err = metrics.WithMetricsBlockHeight(
+		blockHash,
+		earliestBlockTime,
+		cfg.MutualPeers[0].ConsensusNode,
+		os.Getenv("POD_NAMESPACE"),
+	)
+	if err != nil {
+		log.Error("Error registering metric block_height_1: ", err)
+		return err
+	}
+
+	return nil
 }
 
 // RegisterMetrics generates and registers the metrics for all nodes in case they already exist in the DB.
diff --git a/pkg/k8s/services.go b/pkg/k8s/services.go
new file mode 100644
index 0000000..e368e56
--- /dev/null
+++ b/pkg/k8s/services.go
@@ -0,0 +1,131 @@
+package k8s
+
+import (
+	"context"
+	"fmt"
+	log "github.com/sirupsen/logrus"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+
+	"github.com/celestiaorg/torch/pkg/metrics"
+)
+
+// RetrieveAndGenerateMetrics retrieves the list of Load Balancers and generates metrics
+func RetrieveAndGenerateMetrics() ([]metrics.LoadBalancer, error) {
+	log.Info("Retrieving the list of Load Balancers")
+
+	// Get list of LBs
+	svc, err := ListServices()
+	if err != nil {
+		log.Printf("Failed to retrieve the LoadBalancers: %v", err)
+		return nil, err
+	}
+
+	// Get the list of the LBs
+	loadBalancers := GetLoadBalancers(svc)
+
+	// Generate the metrics with the LBs
+	err = metrics.WithMetricsLoadBalancer(loadBalancers)
+	if err != nil {
+		log.Printf("Failed to update metrics: %v", err)
+		return nil, err
+	}
+
+	return loadBalancers, nil
+}
+
+// ListServices retrieves the list of services in a namespace
+func ListServices() (*corev1.ServiceList, error) {
+	// Authentication in cluster - using Service Account, Role, RoleBinding
+	config, err := rest.InClusterConfig()
+	if err != nil {
+		log.Error("ERROR: ", err)
+		return nil, err
+	}
+
+	// Create the Kubernetes clientSet
+	clientSet, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		log.Error("ERROR: ", err)
+		return nil, err
+	}
+
+	// Get all services in the namespace
+	services, err := clientSet.CoreV1().Services(GetCurrentNamespace()).List(context.Background(), metav1.ListOptions{})
+	if err != nil {
+		log.Error("ERROR: ", err)
+		return nil, err
+	}
+
+	return services, nil
+}
+
+// GetLoadBalancers filters the list of services to include only Load Balancers and returns a list of them
+func GetLoadBalancers(svc *corev1.ServiceList) []metrics.LoadBalancer {
+	var loadBalancers []metrics.LoadBalancer
+
+	for _, svc := range svc.Items {
+		if svc.Spec.Type == corev1.ServiceTypeLoadBalancer {
+			for _, ingress := range svc.Status.LoadBalancer.Ingress {
+				log.Info(fmt.Sprintf("Updating metrics for service: [%s] with IP: [%s]", svc.Name, ingress.IP))
+
+				// Create a LoadBalancer struct and append it to the loadBalancers list
+				loadBalancer := metrics.LoadBalancer{
+					ServiceName:      "torch",
+					LoadBalancerName: svc.Name,
+					LoadBalancerIP:   ingress.IP,
+					Namespace:        svc.Namespace,
+					Value:            1, // Set the value of the metric here (e.g., 1)
+				}
+				loadBalancers = append(loadBalancers, loadBalancer)
+			}
+		}
+	}
+
+	return loadBalancers
+}
+
+// WatchServices watches for changes to the services in the specified namespace and updates the metrics accordingly
+func WatchServices(done chan<- error) {
+	defer close(done)
+
+	// Authentication in cluster - using Service Account, Role, RoleBinding
+	config, err := rest.InClusterConfig()
+	if err != nil {
+		log.Error("Failed to get in-cluster config: ", err)
+		done <- err
+		return
+	}
+
+	// Create the Kubernetes clientSet
+	clientSet, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		log.Error("Failed to create Kubernetes clientSet: ", err)
+		done <- err
+		return
+	}
+
+	// Create a service watcher
+	watcher, err := clientSet.CoreV1().Services(GetCurrentNamespace()).Watch(context.Background(), metav1.ListOptions{})
+	if err != nil {
+		log.Error("Failed to create service watcher: ", err)
+		done <- err
+		return
+	}
+
+	// Watch for events on the watcher channel
+	for event := range watcher.ResultChan() {
+		if service, ok := event.Object.(*corev1.Service); ok {
+			if service.Spec.Type == corev1.ServiceTypeLoadBalancer {
+				loadBalancers := GetLoadBalancers(&corev1.ServiceList{Items: []corev1.Service{*service}})
+				if err := metrics.WithMetricsLoadBalancer(loadBalancers); err != nil {
+					log.Error("Failed to update metrics with load balancers: ", err)
+					done <- err
+					return
+				}
+			}
+		}
+	}
+}
diff --git a/pkg/k8s/statefulsets.go b/pkg/k8s/statefulsets.go
index b556847..9defb87 100644
--- a/pkg/k8s/statefulsets.go
+++ b/pkg/k8s/statefulsets.go
@@ -13,7 +13,10 @@ import (
 	"github.com/celestiaorg/torch/pkg/db/redis"
 )
 
-const queueK8SNodes = "k8s"
+const (
+	queueK8SNodes = "k8s" // queueK8SNodes name of the queue.
+	daNodePrefix  = "da"  // daNodePrefix name prefix that Torch will use to filter the StatefulSets.
+)
 
 // WatchStatefulSets watches for changes to the StatefulSets in the specified namespace and updates the metrics accordingly
 func WatchStatefulSets() error {
@@ -42,11 +45,11 @@ func WatchStatefulSets() error {
 
 	// Watch for events on the watcher channel
 	for event := range watcher.ResultChan() {
+		// Check if the event object is of type *v1.StatefulSet
 		if statefulSet, ok := event.Object.(*v1.StatefulSet); ok {
-			//log.Info("StatefulSet containers: ", statefulSet.Spec.Template.Spec.Containers)
-
-			// check if the node is DA, if so, send it to the queue to generate the multi address
-			if strings.HasPrefix(statefulSet.Name, "da") {
+			// Check if the StatefulSet is valid based on the conditions
+			if isStatefulSetValid(statefulSet) {
+				// Perform necessary actions, such as adding the node to the Redis queue
 				err := redis.Producer(statefulSet.Name, queueK8SNodes)
 				if err != nil {
 					log.Error("ERROR adding the node to the queue: ", err)
@@ -58,3 +61,11 @@ func WatchStatefulSets() error {
 
 	return nil
 }
+
+// isStatefulSetValid validates the StatefulSet received.
+// checks if the StatefulSet name contains the daNodePrefix, and if the StatefulSet is in the "Running" state.
+func isStatefulSetValid(statefulSet *v1.StatefulSet) bool {
+	return strings.HasPrefix(statefulSet.Name, daNodePrefix) &&
+		statefulSet.Status.CurrentReplicas > 0 &&
+		statefulSet.Status.Replicas == statefulSet.Status.ReadyReplicas
+}
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index ba712ce..c9b2e3d 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -77,6 +77,14 @@ func WithMetricsBlockHeight(blockHeight, earliestBlockTime, serviceName, namespa
 		log.Fatalf(err.Error())
 		return err
 	}
+
+	// Calculate the days that the chain is live.
+	daysRunning, err := calculateDaysDifference(earliestBlockTime)
+	if err != nil {
+		log.Error("ERROR: ", err)
+		return err
+	}
+
 	callback := func(ctx context.Context, observer metric.Observer) error {
 		// Define the callback function that will be called periodically to observe metrics.
 		// Create labels with attributes for each block_height_1.
@@ -84,7 +92,7 @@ func WithMetricsBlockHeight(blockHeight, earliestBlockTime, serviceName, namespa
 			attribute.String("service_name", serviceName),
 			attribute.String("block_height_1", blockHeight),
 			attribute.String("earliest_block_time", earliestBlockTime),
-			attribute.Int("days_running", CalculateDaysDifference(earliestBlockTime)),
+			attribute.Int("days_running", daysRunning),
 			attribute.String("namespace", namespace),
 		)
 		// Observe the float64 value for the current block_height_1 with the associated labels.
@@ -98,18 +106,61 @@ func WithMetricsBlockHeight(blockHeight, earliestBlockTime, serviceName, namespa
 	return err
 }
 
-// CalculateDaysDifference based on the date received, returns the number of days since this day.
-func CalculateDaysDifference(inputTimeString string) int {
+// calculateDaysDifference based on the date received, returns the number of days since this day.
+func calculateDaysDifference(inputTimeString string) (int, error) {
 	layout := "2006-01-02T15:04:05.999999999Z"
 	inputTime, err := time.Parse(layout, inputTimeString)
 	if err != nil {
 		log.Error("Error parsing time: [", inputTimeString, "]", err)
-		return -1
+		return -1, err
 	}
 
 	currentTime := time.Now()
 	timeDifference := currentTime.Sub(inputTime)
 	daysDifference := int(timeDifference.Hours() / 24)
 
-	return daysDifference
+	return daysDifference, nil
+}
+
+// LoadBalancer represents the information for a load balancer.
+type LoadBalancer struct {
+	ServiceName      string  // ServiceName Name of the service associated with the load balancer.
+	LoadBalancerName string  // LoadBalancerName Name of the load balancer.
+	LoadBalancerIP   string  // LoadBalancerIP IP address of the load balancer.
+	Namespace        string  // Namespace where the service is deployed.
+	Value            float64 // Value to be observed for the load balancer.
+}
+
+// WithMetricsLoadBalancer creates a callback function to observe metrics for multiple load balancers.
+func WithMetricsLoadBalancer(loadBalancers []LoadBalancer) error {
+	// Create a Float64ObservableGauge named "load_balancer" with a description for the metric.
+	loadBalancersGauge, err := meter.Float64ObservableGauge(
+		"load_balancer",
+		metric.WithDescription("Torch - Load Balancers"),
+	)
+	if err != nil {
+		log.Fatalf(err.Error())
+		return err
+	}
+
+	// Define the callback function that will be called periodically to observe metrics.
+	callback := func(ctx context.Context, observer metric.Observer) error {
+		for _, lb := range loadBalancers {
+			// Create labels with attributes for each load balancer.
+			labels := metric.WithAttributes(
+				attribute.String("service_name", lb.ServiceName),
+				attribute.String("load_balancer_name", lb.LoadBalancerName),
+				attribute.String("load_balancer_ip", lb.LoadBalancerIP),
+				attribute.String("namespace", lb.Namespace),
+			)
+			// Observe the float64 value for the current load balancer with the associated labels.
+			observer.ObserveFloat64(loadBalancersGauge, lb.Value, labels)
+		}
+
+		return nil
+	}
+
+	// Register the callback with the meter and the Float64ObservableGauge.
+	_, err = meter.RegisterCallback(callback, loadBalancersGauge)
+	return err
 }
diff --git a/pkg/nodes/consensus.go b/pkg/nodes/consensus.go
index 03d3a18..a265be2 100644
--- a/pkg/nodes/consensus.go
+++ b/pkg/nodes/consensus.go
@@ -15,7 +15,7 @@ import (
 var (
 	consContainerSetupName = "consensus-setup"         // consContainerSetupName initContainer that we use to configure the nodes.
 	consContainerName      = "consensus"               // consContainerName container name which the pod runs.
-	namespace              = k8s.GetCurrentNamespace() // ns namespace of the node.
+	namespace              = k8s.GetCurrentNamespace() // namespace of the node.
 )
 
 // SetConsNodeDefault sets all the default values in case they are empty
@@ -34,26 +34,25 @@ func SetConsNodeDefault(peer config.Peer) config.Peer {
 
 // GenesisHash connects to the node specified in: config.MutualPeersConfig.ConsensusNode
 // makes a request to the API and gets the info about the genesis and return it
-func GenesisHash(pods config.MutualPeersConfig) (string, string) {
-	consensusNode := pods.MutualPeers[0].ConsensusNode
+func GenesisHash(consensusNode string) (string, string, error) {
 	url := fmt.Sprintf("http://%s:26657/block?height=1", consensusNode)
 
 	response, err := http.Get(url)
 	if err != nil {
-		log.Error("Error making GET request:", err)
-		return "", ""
+		log.Error("Error making the request to the node [", consensusNode, "] - ", err)
+		return "", "", err
 	}
 	defer response.Body.Close()
 
 	if response.StatusCode != http.StatusOK {
 		log.Error("Non-OK response:", response.Status)
-		return "", ""
+		return "", "", err
 	}
 
 	bodyBytes, err := ioutil.ReadAll(response.Body)
 	if err != nil {
 		log.Error("Error reading response body:", err)
-		return "", ""
+		return "", "", err
 	}
 
 	bodyString := string(bodyBytes)
@@ -64,26 +63,26 @@ func GenesisHash(pods config.MutualPeersConfig) (string, string) {
 	err = json.Unmarshal([]byte(bodyString), &jsonResponse)
 	if err != nil {
 		log.Error("Error parsing JSON:", err)
-		return "", ""
+		return "", "", err
 	}
 
 	// Access and print the .block_id.hash field
 	blockIDHash, ok := jsonResponse["result"].(map[string]interface{})["block_id"].(map[string]interface{})["hash"].(string)
 	if !ok {
 		log.Error("Unable to access .block_id.hash")
-		return "", ""
+		return "", "", err
 	}
 
 	// Access and print the .block.header.time field
 	blockTime, ok := jsonResponse["result"].(map[string]interface{})["block"].(map[string]interface{})["header"].(map[string]interface{})["time"].(string)
 	if !ok {
 		log.Error("Unable to access .block.header.time")
-		return "", ""
+		return "", "", err
 	}
 
 	log.Info("Block ID Hash: ", blockIDHash)
 	log.Info("Block Time: ", blockTime)
 	log.Info("Full output: ", bodyString)
 
-	return blockIDHash, blockTime
+	return blockIDHash, blockTime, nil
 }