Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/gke refactor nodepools #120

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion client/.eslintrc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
"extends": "react-app",
"plugins": ["prettier"],
"rules": {
"prettier/prettier": "error"
"prettier/prettier": [
"error",
{
"endOfLine": "auto"
}
]
}
}
2 changes: 1 addition & 1 deletion gcp/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def change_status(self, to_status, tagkey, tagvalue):
Returns:

"""
tag_filter = "labels." + tagkey + "=" + tagvalue
tag_filter = "labels." + tagkey + "=" + tagvalue + " NOT labels.goog-gke-node=*"
logging.debug("Filter %s", filter)
for zone in gcp.get_zones():
try:
Expand Down
140 changes: 87 additions & 53 deletions gcp/gke.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
"""Interactions with GKE."""

import logging

import backoff
from google.cloud import ndb
from googleapiclient import discovery
Expand All @@ -11,88 +9,123 @@

CREDENTIALS = None


class Gke(object):
"""GKE engine actions."""

def __init__(self, project):
    """Remember the target project and build the Container (GKE) v1 API client."""
    self.project = project
    # cache_discovery=False avoids the oauth2client file-cache warning in GAE.
    self.gke = discovery.build("container", "v1", cache_discovery=False)

def change_status(self, to_status, tagkey, tagvalue):
    """Scale labeled GKE node pools up or down.

    Clusters are selected by resource label ``tagkey == tagvalue``. When
    sizing down, each node pool's current size is saved in Datastore and
    the pool is resized to 0; when sizing up, the saved size is restored
    and the Datastore record deleted.

    Args:
        to_status: 1 (or "1") to size up to the saved node count;
            any other value sizes the pools down to zero.
        tagkey: resource label key marking managed clusters.
        tagvalue: resource label value marking managed clusters.

    Returns:
        tuple: ("ok", 200) on success, ("Error", 500) on an API error.
    """
    logging.info("GKE change_status")
    client = ndb.Client()
    with client.context():
        try:
            # All clusters in the project, across every location.
            clusters = self.list_clusters()
            for cluster in clusters:
                logging.info("Cluster location %s", cluster["location"])
                # Only touch clusters explicitly labeled for scheduling.
                if (
                    "resourceLabels" in cluster
                    and tagkey in cluster["resourceLabels"]
                    and cluster["resourceLabels"][tagkey] == tagvalue
                ):
                    logging.info(
                        "GKE change_status cluster %s %s %s",
                        cluster,
                        cluster["resourceLabels"],
                        cluster["resourceLabels"][tagkey],
                    )
                    for nodePool in cluster["nodePools"]:
                        # Datastore key: one saved size per project/cluster/pool.
                        ndb_identifier = "%s_%s_%s" % (
                            self.project,
                            cluster["name"],
                            nodePool["name"],
                        )
                        logging.info("ndb_identifier %s", ndb_identifier)
                        if int(to_status) == 1:
                            # Sizing up: restore the node count saved on the
                            # way down.
                            logging.info(
                                "Sizing up node pool %s in cluster %s "
                                "tagkey %s tagvalue %s",
                                nodePool["name"],
                                cluster["name"],
                                tagkey,
                                tagvalue,
                            )
                            res = GkeNodePoolModel.query(
                                GkeNodePoolModel.Name == ndb_identifier
                            ).get()
                            logging.info(res)
                            # No saved record means nothing to restore.
                            if not res:
                                continue
                            gcp.resize_node(
                                self.project,
                                cluster["name"],
                                nodePool["name"],
                                cluster["location"],
                                res.NumberOfNodes,
                            )
                            # Clear the record once the size is restored.
                            res.key.delete()
                        else:
                            # Sizing down: save the current size, then scale
                            # the pool to zero.
                            logging.info(
                                "Sizing down node pool %s in cluster %s "
                                "tagkey %s tagvalue %s",
                                nodePool["name"],
                                cluster["name"],
                                tagkey,
                                tagvalue,
                            )
                            node_pool_model = GkeNodePoolModel()
                            node_pool_model.Name = ndb_identifier
                            # Total nodes = sum of the sizes of the pool's
                            # instance groups (one per zone for regional
                            # clusters).
                            no_of_nodes = 0
                            for url in nodePool["instanceGroupUrls"]:
                                no_of_nodes += (
                                    gcp.get_instancegroup_no_of_nodes_from_url(url)
                                )
                            logging.info(no_of_nodes)
                            # The setSize API expects nodes *per zone*; for a
                            # regional cluster divide the total by the number
                            # of zones (cluster["locations"] lists the zones),
                            # otherwise the pool would grow on size-up.
                            if gcp.is_regional(cluster["location"]):
                                num_zones = len(cluster["locations"])
                                no_of_nodes = int(no_of_nodes / num_zones)
                                logging.info(no_of_nodes)
                            # Already empty or scaled down - nothing to save.
                            if no_of_nodes == 0:
                                continue
                            logging.info("number of nodes %s", no_of_nodes)
                            node_pool_model.NumberOfNodes = no_of_nodes
                            node_pool_model.key = ndb.Key(
                                "GkeNodePoolModel", ndb_identifier
                            )
                            node_pool_model.put()
                            # Resize to zero; the saved record allows undo.
                            gcp.resize_node(
                                self.project,
                                cluster["name"],
                                nodePool["name"],
                                cluster["location"],
                                0,
                            )
        except HttpError as http_error:
            logging.error(http_error)
            return "Error", 500
    return "ok", 200

@backoff.on_exception(backoff.expo, HttpError, max_tries=8, giveup=utils.fatal_code)
def list_clusters(self):
    """
    List all clusters of the project, across all locations ("-").

    Returns:
        list: cluster dicts from the Container API, or [] when the
        project has no clusters.
    """
    parent = "projects/%s/locations/-" % self.project
    result = (
    Expand All @@ -102,3 +135,4 @@ def list_clusters(self):
    return result["clusters"]
    else:
    return []

75 changes: 46 additions & 29 deletions util/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import backoff
import googleapiclient.discovery
import time
from googleapiclient.errors import HttpError
from util import utils

Expand Down Expand Up @@ -62,32 +63,48 @@ def get_instancegroup_no_of_nodes_from_url(url):


@backoff.on_exception(backoff.expo, HttpError, max_tries=8, giveup=utils.fatal_code)
def resize_node_pool(size, url):
"""
resize a node pool
Args:
size: requested size
url: instance group url

Returns:

"""
compute = googleapiclient.discovery.build("compute", "v1", cache_discovery=False)
url = url[47:]
project = url[: url.find("/")]
zone = url[url.find("zones") + 6 : url.find("instanceGroupManagers") - 1]
instance_group_manager = url[url.rfind("/") + 1 :]
try:
res = (
compute.instanceGroupManagers()
.resize(
project=project,
zone=zone,
instanceGroupManager=instance_group_manager,
size=size,
)
.execute()
)
except Exception as e:
logging.error(e)
return res
def resize_node(project_id, cluster_id, node_pool_id, location, node_pool_size):
    """Set the node count of a GKE node pool, retrying while the cluster is busy.

    GCP rejects a setSize call while another operation (e.g. resizing a
    sibling node pool of the same cluster) is in flight, so the call is
    retried up to 7 times with a 3-minute wait between attempts. That
    specific "currently operating on cluster" error is logged as a warning
    (it is expected with multiple node pools); anything else as an error.
    API: https://cloud.google.com/kubernetes-engine/docs/reference/rest/v1/projects.locations.clusters.nodePools/setSize

    Args:
        project_id: GCP project id.
        cluster_id: cluster name.
        node_pool_id: node pool name.
        location: cluster zone or region.
        node_pool_size: desired node count (per zone for regional clusters).

    Returns:
        dict: the API operation response, or None if every attempt failed.
    """
    service = googleapiclient.discovery.build(
        "container", "v1", cache_discovery=False
    )

    # Parameters for the API call.
    name = "projects/%s/locations/%s/clusters/%s/nodePools/%s" % (
        project_id,
        location,
        cluster_id,
        node_pool_id,
    )
    body = {"nodeCount": node_pool_size}

    max_retry = 7
    response = None
    for attempt in range(max_retry):
        try:
            response = (
                service.projects()
                .locations()
                .clusters()
                .nodePools()
                .setSize(name=name, body=body)
                .execute()
            )
            break
        except Exception as e:
            # Only HttpError carries .content; guard so a non-HTTP exception
            # doesn't raise AttributeError inside the handler.
            content = str(getattr(e, "content", ""))
            busy_msg = (
                "currently operating on cluster "
                + cluster_id
                + ". Please wait and try again once it is done."
            )
            if busy_msg in content:
                logging.warning(e)
            else:
                logging.error(e)
            # Wait for the in-flight operation to finish before retrying,
            # but don't sleep pointlessly after the final attempt.
            if attempt < max_retry - 1:
                time.sleep(180)
    return response


def is_regional(location):
    """Return True if *location* names a GCP region, False for a zone.

    Every GCP region name ends with a digit (e.g. "europe-west1") while
    every zone name ends with a letter (e.g. "europe-west1-b"), so the
    last character decides.

    Args:
        location: a cluster["location"] value - region or zone name.

    Returns:
        bool: True for a region, False for a zone (or empty string).
    """
    # str.endswith accepts a tuple of suffixes; return the result directly
    # instead of an if/else that returns literal True/False.
    return location.endswith(tuple("0123456789"))