Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/gke refactor nodepools #120

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion client/.eslintrc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
"extends": "react-app",
"plugins": ["prettier"],
"rules": {
"prettier/prettier": "error"
"prettier/prettier": [
"error",
{
"endOfLine": "auto"
}
]
}
}
2 changes: 1 addition & 1 deletion gcp/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def change_status(self, to_status, tagkey, tagvalue):
Returns:

"""
tag_filter = "labels." + tagkey + "=" + tagvalue
tag_filter = "labels." + tagkey + "=" + tagvalue + " NOT labels.goog-gke-node=*"
logging.debug("Filter %s", filter)
for zone in gcp.get_zones():
try:
Expand Down
140 changes: 87 additions & 53 deletions gcp/gke.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
"""Interactions with GKE."""

import logging

import backoff
from google.cloud import ndb
from googleapiclient import discovery
Expand All @@ -11,88 +9,123 @@

CREDENTIALS = None


class Gke(object):
"""GKE engine actions."""

def __init__(self, project):
    """Remember the target project and build the Container (GKE) v1 API client."""
    self.project = project
    # cache_discovery=False avoids the oauth2client file-cache warning in GAE.
    self.gke = discovery.build("container", "v1", cache_discovery=False)

def change_status(self, to_status, tagkey, tagvalue):
    """Scale labeled GKE node pools up or down.

    Clusters are selected by resource label ``tagkey == tagvalue``. When
    sizing down, each node pool's current size is saved in Datastore and
    the pool is resized to 0; when sizing up, the saved size is restored
    and the Datastore record deleted.

    Args:
        to_status: 1 (or "1") to size up to the saved node count;
            any other value sizes the pools down to zero.
        tagkey: resource label key marking managed clusters.
        tagvalue: resource label value marking managed clusters.

    Returns:
        tuple: ("ok", 200) on success, ("Error", 500) on an API error.
    """
    logging.info("GKE change_status")
    client = ndb.Client()
    with client.context():
        try:
            # All clusters in the project, across every location.
            clusters = self.list_clusters()
            for cluster in clusters:
                logging.info("Cluster location %s", cluster["location"])
                # Only touch clusters explicitly labeled for scheduling.
                if (
                    "resourceLabels" in cluster
                    and tagkey in cluster["resourceLabels"]
                    and cluster["resourceLabels"][tagkey] == tagvalue
                ):
                    logging.info(
                        "GKE change_status cluster %s %s %s",
                        cluster,
                        cluster["resourceLabels"],
                        cluster["resourceLabels"][tagkey],
                    )
                    for nodePool in cluster["nodePools"]:
                        # Datastore key: one saved size per project/cluster/pool.
                        ndb_identifier = "%s_%s_%s" % (
                            self.project,
                            cluster["name"],
                            nodePool["name"],
                        )
                        logging.info("ndb_identifier %s", ndb_identifier)
                        if int(to_status) == 1:
                            # Sizing up: restore the node count saved on the
                            # way down.
                            logging.info(
                                "Sizing up node pool %s in cluster %s "
                                "tagkey %s tagvalue %s",
                                nodePool["name"],
                                cluster["name"],
                                tagkey,
                                tagvalue,
                            )
                            res = GkeNodePoolModel.query(
                                GkeNodePoolModel.Name == ndb_identifier
                            ).get()
                            logging.info(res)
                            # No saved record means nothing to restore.
                            if not res:
                                continue
                            gcp.resize_node(
                                self.project,
                                cluster["name"],
                                nodePool["name"],
                                cluster["location"],
                                res.NumberOfNodes,
                            )
                            # Clear the record once the size is restored.
                            res.key.delete()
                        else:
                            # Sizing down: save the current size, then scale
                            # the pool to zero.
                            logging.info(
                                "Sizing down node pool %s in cluster %s "
                                "tagkey %s tagvalue %s",
                                nodePool["name"],
                                cluster["name"],
                                tagkey,
                                tagvalue,
                            )
                            node_pool_model = GkeNodePoolModel()
                            node_pool_model.Name = ndb_identifier
                            # Total nodes = sum of the sizes of the pool's
                            # instance groups (one per zone for regional
                            # clusters).
                            no_of_nodes = 0
                            for url in nodePool["instanceGroupUrls"]:
                                no_of_nodes += (
                                    gcp.get_instancegroup_no_of_nodes_from_url(url)
                                )
                            logging.info(no_of_nodes)
                            # The setSize API expects nodes *per zone*; for a
                            # regional cluster divide the total by the number
                            # of zones (cluster["locations"] lists the zones),
                            # otherwise the pool would grow on size-up.
                            if gcp.is_regional(cluster["location"]):
                                num_zones = len(cluster["locations"])
                                no_of_nodes = int(no_of_nodes / num_zones)
                                logging.info(no_of_nodes)
                            # Already empty or scaled down - nothing to save.
                            if no_of_nodes == 0:
                                continue
                            logging.info("number of nodes %s", no_of_nodes)
                            node_pool_model.NumberOfNodes = no_of_nodes
                            node_pool_model.key = ndb.Key(
                                "GkeNodePoolModel", ndb_identifier
                            )
                            node_pool_model.put()
                            # Resize to zero; the saved record allows undo.
                            gcp.resize_node(
                                self.project,
                                cluster["name"],
                                nodePool["name"],
                                cluster["location"],
                                0,
                            )
        except HttpError as http_error:
            logging.error(http_error)
            return "Error", 500
    return "ok", 200

@backoff.on_exception(backoff.expo, HttpError, max_tries=8, giveup=utils.fatal_code)
def list_clusters(self):
    """
    List all clusters of the project, across all locations ("-").

    Returns:
        list: cluster dicts from the Container API, or [] when the
        project has no clusters.
    """
    parent = "projects/%s/locations/-" % self.project
    result = (
    Expand All @@ -102,3 +135,4 @@ def list_clusters(self):
    return result["clusters"]
    else:
    return []

75 changes: 46 additions & 29 deletions util/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import backoff
import googleapiclient.discovery
import time
from googleapiclient.errors import HttpError
from util import utils

Expand Down Expand Up @@ -62,32 +63,48 @@ def get_instancegroup_no_of_nodes_from_url(url):


@backoff.on_exception(backoff.expo, HttpError, max_tries=8, giveup=utils.fatal_code)
def resize_node_pool(size, url):
"""
resize a node pool
Args:
size: requested size
url: instance group url

Returns:

"""
compute = googleapiclient.discovery.build("compute", "v1", cache_discovery=False)
url = url[47:]
project = url[: url.find("/")]
zone = url[url.find("zones") + 6 : url.find("instanceGroupManagers") - 1]
instance_group_manager = url[url.rfind("/") + 1 :]
try:
res = (
compute.instanceGroupManagers()
.resize(
project=project,
zone=zone,
instanceGroupManager=instance_group_manager,
size=size,
)
.execute()
)
except Exception as e:
logging.error(e)
return res
def resize_node(project_id, cluster_id, node_pool_id, location, node_pool_size):
    """Set the node count of a GKE node pool, retrying while the cluster is busy.

    GCP rejects a setSize call while another operation (e.g. resizing a
    sibling node pool of the same cluster) is in flight, so the call is
    retried up to 7 times with a 3-minute wait between attempts. That
    specific "currently operating on cluster" error is logged as a warning
    (it is expected with multiple node pools); anything else as an error.
    API: https://cloud.google.com/kubernetes-engine/docs/reference/rest/v1/projects.locations.clusters.nodePools/setSize

    Args:
        project_id: GCP project id.
        cluster_id: cluster name.
        node_pool_id: node pool name.
        location: cluster zone or region.
        node_pool_size: desired node count (per zone for regional clusters).

    Returns:
        dict: the API operation response, or None if every attempt failed.
    """
    service = googleapiclient.discovery.build(
        "container", "v1", cache_discovery=False
    )

    # Parameters for the API call.
    name = "projects/%s/locations/%s/clusters/%s/nodePools/%s" % (
        project_id,
        location,
        cluster_id,
        node_pool_id,
    )
    body = {"nodeCount": node_pool_size}

    max_retry = 7
    response = None
    for attempt in range(max_retry):
        try:
            response = (
                service.projects()
                .locations()
                .clusters()
                .nodePools()
                .setSize(name=name, body=body)
                .execute()
            )
            break
        except Exception as e:
            # Only HttpError carries .content; guard so a non-HTTP exception
            # doesn't raise AttributeError inside the handler.
            content = str(getattr(e, "content", ""))
            busy_msg = (
                "currently operating on cluster "
                + cluster_id
                + ". Please wait and try again once it is done."
            )
            if busy_msg in content:
                logging.warning(e)
            else:
                logging.error(e)
            # Wait for the in-flight operation to finish before retrying,
            # but don't sleep pointlessly after the final attempt.
            if attempt < max_retry - 1:
                time.sleep(180)
    return response


def is_regional(location):
    """Return True if *location* names a GCP region, False for a zone.

    Every GCP region name ends with a digit (e.g. "europe-west1") while
    every zone name ends with a letter (e.g. "europe-west1-b"), so the
    last character decides.

    Args:
        location: a cluster["location"] value - region or zone name.

    Returns:
        bool: True for a region, False for a zone (or empty string).
    """
    # str.endswith accepts a tuple of suffixes; return the result directly
    # instead of an if/else that returns literal True/False.
    return location.endswith(tuple("0123456789"))