Commit

add the microservice anomaly detection and explanation implementation
IndikaKuma committed Sep 12, 2021
1 parent 9ed2251 commit 41c4658
Showing 52 changed files with 125,496 additions and 0 deletions.
1,719 changes: 1,719 additions & 0 deletions made/Data_Scraping_Preparation.ipynb

Large diffs are not rendered by default.

497 changes: 497 additions & 0 deletions made/Data_Scraping_Preparation.py

Large diffs are not rendered by default.

260 changes: 260 additions & 0 deletions made/Deployment.py
@@ -0,0 +1,260 @@
# Import the required packages; the non-standard ones (pyyaml, ruamel.yaml) are installed first.

import os

os.system('pip3 install pyyaml')
os.system('pip3 install ruamel.yaml')

import yaml
import ruamel.yaml
import copy
from ruamel.yaml.scalarstring import (DoubleQuotedScalarString as dq,
                                      SingleQuotedScalarString as sq)

# Create GCP firewall rules for ingress and egress directions
os.system('gcloud compute firewall-rules create ingressrule --allow tcp:30000-33000,tcp:8000-11000,tcp:443,tcp:80,tcp:24226 --direction INGRESS')
os.system('gcloud compute firewall-rules create egressrule --allow tcp:30000-33000,tcp:8000-11000,tcp:443,tcp:80 --direction EGRESS')

# Request a table of hostnames and external IPs for all compute instances
nodeinfo_list = os.popen("gcloud compute instances list --format='table(NAME,EXTERNAL_IP)'").read().split()


# Build a list of {'name', 'ip'} dictionaries, one per GKE cluster node; the
# step of 1 is safe here because only node names (never IPs) contain 'gke'
nodeinfo_list = [{'name': nodeinfo_list[i], 'ip': nodeinfo_list[i + 1]} for i in range(0, len(nodeinfo_list), 1) if 'gke' in nodeinfo_list[i]]
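# Illustrative example of the resulting structure (entries are hypothetical):
# [{'name': 'gke-testcluster-default-pool-01550e5a-2h29', 'ip': '35.246.242.101'}, ...]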

# label nodes 1 to 6
for node_id in range(1, len(nodeinfo_list) + 1):
    os.system('kubectl label nodes ' + nodeinfo_list[node_id - 1]['name'] + ' nodeid=' + str(node_id))

# Get working directory
directory = os.getcwd()

# Download Istio if necessary (the isdir check strips the trailing 'made'
# from the working directory, i.e. it looks in the parent directory)
if not os.path.isdir(os.getcwd()[:-4] + 'istio-1.9.2'):
    os.system('curl -L https://istio.io/downloadIstio | ISTIO_VERSION=1.9.2 TARGET_ARCH=x86_64 sh -')

# Each os.system() call runs in its own shell, so 'cd' and 'export' would not
# persist across calls; extend PATH for this process and its children instead
os.environ['PATH'] = os.path.join(os.getcwd(), 'istio-1.9.2', 'bin') + os.pathsep + os.environ['PATH']

# install istio
os.system('istioctl install --set profile=demo -y')

# annotate default namespace for sidecar injection
os.system('kubectl label namespace default istio-injection=enabled')

# deploy teastore microservices and label them as teastore
os.system('kubectl apply -f Infrastructure/Teastore/teastore-clusterip.yaml')
os.system('kubectl label deploy --all app=teastore -n default')

# create ingressgateway for service mesh
os.system('kubectl apply -f Infrastructure/Istio/istioGateway.yml')

# install litmus operator
os.system('kubectl apply -f https://litmuschaos.github.io/litmus/litmus-operator-v1.13.6.yaml')

# install litmus experiments
os.system('kubectl apply -f https://hub.litmuschaos.io/api/chaos/1.13.7?file=charts/generic/experiments.yaml -n default')

# deploy litmus admin service account
os.system('kubectl apply -f Infrastructure/Litmus/adminrbac.yml -n default')

# deploy litmus chaos exporter
os.system('kubectl apply -f Infrastructure/Litmus/chaos-exporter.yml -n litmus')

# create monitoring namespace
os.system('kubectl create ns monitoring')

# deploy skydive analyzer and agents
os.system('kubectl apply -f Infrastructure/Skydive/skydive_agents_analyzer.yml')

# Point prom_skydive_con.yml at an existing k8s cluster node IP (in two places)
file_path = 'Infrastructure/Skydive/prom_skydive_con.yml'
with open(file_path, 'r') as f:
    deployment_dict = list(yaml.safe_load_all(f))

deployment_dict[0]['spec']['template']['spec']['containers'][0]['env'][0]['value'] = str(nodeinfo_list[0]['ip']) + ":32505"
deployment_dict[0]['spec']['template']['spec']['containers'][0]['env'][2]['value'] = "ws://" + str(nodeinfo_list[0]['ip']) + ":32505/ws/subscriber/flow"

with open(file_path, 'w') as f:
    yaml.safe_dump_all(deployment_dict, f, default_flow_style=False)

# deploy prometheus_skydive_connector
os.system('kubectl apply -f Infrastructure/Skydive/prom_skydive_con.yml -n monitoring')

# TODO: run the skydive gremlin query to start the packet capture
#os.system('sudo docker run --net=host -e SKYDIVE_ANALYZERS=35.246.242.101:32505 skydive/skydive client capture create --type pcap --gremlin "G.V().Has(\'Type\', \'device\', \'Name\', \'eth0\')"')
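# A hedged sketch of the intended capture call, derived from the commented
# command above but using the first node's IP from nodeinfo_list instead of a
# hard-coded analyzer address (assumes the analyzer nodePort 32505 set above):
#
# analyzer = str(nodeinfo_list[0]['ip']) + ':32505'
# os.system('sudo docker run --net=host -e SKYDIVE_ANALYZERS=' + analyzer +
#           ' skydive/skydive client capture create --type pcap'
#           ' --gremlin "G.V().Has(\'Type\', \'device\', \'Name\', \'eth0\')"')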

# deploy kube-state-metric
os.system('kubectl apply -f Infrastructure/Kubernetes/kube-state-metrics-configs/')

# Change the IPs in the Prometheus config: point the skydive connector and
# litmus exporter scrape targets at the right node IPs
testyaml = ruamel.yaml.YAML()
with open('Infrastructure/Prometheus/kubernetes-prometheus/config-map.yaml', 'r') as read_file:
    prom_config = list(testyaml.load_all(read_file))

prom_config[0]['data']['prometheus.yml'] = prom_config[0]['data']['prometheus.yml'].replace('skydive_ip_port', str(nodeinfo_list[0]['ip']) + ':32222')
prom_config[0]['data']['prometheus.yml'] = prom_config[0]['data']['prometheus.yml'].replace('litmus_ip_port', str(nodeinfo_list[3]['ip']) + ':31111')


with open('Infrastructure/Prometheus/kubernetes-prometheus/config-map.yaml', 'w') as created_file:
    testyaml.default_flow_style = False
    testyaml.dump(prom_config[0], created_file)
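# For reference, the config template is assumed to contain placeholder scrape
# targets along these lines (hypothetical excerpt, not part of this commit):
#
#   - job_name: 'skydive-connector'
#     static_configs:
#       - targets: ['skydive_ip_port']
#   - job_name: 'litmus'
#     static_configs:
#       - targets: ['litmus_ip_port']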

# deploy prometheus
os.system('kubectl apply -f Infrastructure/Prometheus/kubernetes-prometheus/ -n monitoring')

# deploy prometheus node exporters
os.system('kubectl apply -f Infrastructure/Prometheus/kubernetes-node-exporter/ -n monitoring')

# Define the experiment parameters based on the specific experiment type.
# env is a shorthand for the experiment's env list; its entries are mutated in
# place and the whole document is deep-copied once per parameter combination.
def define_exp_parameters(experiment, exp_doc, node_list):

    deployment_list = []
    env = exp_doc[0]['spec']['experiments'][0]['spec']['components']['env']

    # Node CPU hog parameters
    if ('cpu' in experiment) and ('node' in experiment):
        for node in node_list:
            for duration in range(50, 81):
                for cpu_cores in range(2, 5):
                    env[0]['value'] = str(duration)
                    env[1]['value'] = str(cpu_cores)
                    env[2]['value'] = sq(node)
                    deployment_list.append(copy.deepcopy(exp_doc))

    # Pod CPU hog parameters
    elif ('cpu' in experiment) and ('pod' in experiment):
        for duration in range(50, 81):
            for cpu_cores in range(2, 5):
                env[0]['value'] = str(cpu_cores)
                env[1]['value'] = str(duration)
                deployment_list.append(copy.deepcopy(exp_doc))

    # Node memory hog parameters
    elif ('mem' in experiment) and ('node' in experiment):
        for node in node_list:
            for duration in range(50, 81):
                for ram_perc in range(20, 41):
                    env[0]['value'] = str(duration)
                    env[1]['value'] = str(ram_perc)
                    env[2]['value'] = sq(node)
                    deployment_list.append(copy.deepcopy(exp_doc))

    # Pod memory hog parameters
    elif ('mem' in experiment) and ('pod' in experiment):
        for duration in range(50, 81):
            for ram in range(1500, 3000, 50):
                env[0]['value'] = str(ram)
                env[1]['value'] = str(duration)
                deployment_list.append(copy.deepcopy(exp_doc))

    # Node IO stress parameters
    elif ('iostress' in experiment) and ('node' in experiment):
        for node in node_list:
            for duration in range(50, 81):
                for fsu_perc in range(20, 41):
                    for nr_workers in range(3, 6):
                        for nr_cpu in [1, 2]:
                            env[0]['value'] = str(duration)
                            env[1]['value'] = str(fsu_perc)
                            env[2]['value'] = str(nr_cpu)
                            env[3]['value'] = str(nr_workers)
                            env[4]['value'] = sq(node)
                            deployment_list.append(copy.deepcopy(exp_doc))

    # Pod IO stress parameters
    elif ('iostress' in experiment) and ('pod' in experiment):
        for duration in range(50, 81):
            for fsu_perc in range(20, 41):
                for nr_workers in range(3, 6):
                    env[0]['value'] = str(duration)
                    env[1]['value'] = str(fsu_perc)
                    env[2]['value'] = str(nr_workers)
                    deployment_list.append(copy.deepcopy(exp_doc))

    # Pod delete parameters
    elif 'del' in experiment:
        for duration in range(12, 20):
            for interval in range(3, 7):
                env[0]['value'] = str(interval)
                env[1]['value'] = str(duration)
                deployment_list.append(copy.deepcopy(exp_doc))

    # Network latency parameters (env[0], env[3] and env[4] keep their
    # template values but are re-wrapped as single-quoted scalars)
    elif 'netlat' in experiment:
        for duration in range(50, 81):
            for latency in range(1600, 2800, 50):
                env[0]['value'] = sq(env[0]['value'])
                env[1]['value'] = str(latency)
                env[2]['value'] = str(duration)
                env[3]['value'] = sq(env[3]['value'])
                env[4]['value'] = sq(env[4]['value'])
                deployment_list.append(copy.deepcopy(exp_doc))

    # Network loss parameters
    elif 'netloss' in experiment:
        for duration in range(50, 81):
            for packet_loss_perc in range(80, 101):
                env[0]['value'] = sq(env[0]['value'])
                env[1]['value'] = str(packet_loss_perc)
                env[2]['value'] = str(duration)
                env[3]['value'] = sq(env[3]['value'])
                env[4]['value'] = sq(env[4]['value'])
                deployment_list.append(copy.deepcopy(exp_doc))

    # Network corruption parameters
    elif 'netcorr' in experiment:
        for duration in range(50, 81):
            for packet_corr_perc in range(80, 101):
                env[0]['value'] = sq(env[0]['value'])
                env[1]['value'] = str(duration)
                env[2]['value'] = str(packet_corr_perc)
                env[3]['value'] = sq(env[3]['value'])
                env[4]['value'] = sq(env[4]['value'])
                deployment_list.append(copy.deepcopy(exp_doc))

    return deployment_list


# Load the template files, generate the different parameter combinations, and
# dump them as separate YAML deployment files (disk experiments are skipped;
# the per-experiment output directories are assumed to exist)
template_dir = 'Experiments/templates'
base_dir = 'Experiments/'
experiment_list = os.listdir(template_dir)
node_list = [node['name'] for node in nodeinfo_list]
testyaml = ruamel.yaml.YAML()
for experiment in [e for e in experiment_list if 'disk' not in e]:

    with open(template_dir + "/" + experiment, 'r') as read_file:
        deployments = list(testyaml.load_all(read_file))
    deployments = define_exp_parameters(experiment, deployments, node_list)

    i = 1
    for deployment in deployments:
        with open(base_dir + experiment.split('.')[0] + "/" + experiment.split('.')[0] + "_" + str(i) + ".yml", 'w') as created_file:
            testyaml.default_flow_style = False
            testyaml.dump(deployment[0], created_file)
        i += 1
3 changes: 3 additions & 0 deletions made/Experiments/README.txt
@@ -0,0 +1,3 @@
Randomly running experiments using runExperiments.py:

python3 runExperiments.py
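
The script repeatedly picks a random experiment category (skipping the
templates and pod-delete directories), applies a randomly chosen configuration
from it with kubectl, and sleeps 90-110 seconds before the next pick; a
category is not reused until every category has been exhausted.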
69 changes: 69 additions & 0 deletions made/Experiments/runExperiments.py
@@ -0,0 +1,69 @@
import os
import time
import random

experimentCategories = []

i = 0
limit = 10000
while i < limit:

    # Refill the category list once every category has been used; the
    # templates and pod-delete directories are excluded
    if not experimentCategories:
        experimentCategories = [x[0] for x in os.walk(os.path.dirname(os.path.realpath(__file__)))
                                if 'templates' not in x[0] and 'del' not in x[0]][1:]
        print(experimentCategories)

    i = i + 1
    print("Iteration: " + str(i))

    if i == limit:
        break

    # Pick a random category, then a random experiment config inside it
    choiceExperiment = random.randint(0, len(experimentCategories) - 1)
    print(experimentCategories[choiceExperiment])

    experiments = os.listdir(experimentCategories[choiceExperiment])
    choiceExperimentConfig = random.randint(0, len(experiments) - 1)
    experimentConfig = os.path.join(experimentCategories[choiceExperiment], experiments[choiceExperimentConfig])

    # Remove the chosen category so it is not reused until the list is refilled
    del experimentCategories[choiceExperiment]

    print("Choice: " + experiments[choiceExperimentConfig])

    os.system('kubectl apply -f ' + experimentConfig + ' -n default')

    # Wait 90-110 seconds before the next experiment
    sleepTime = random.randint(90, 110)
    time.sleep(sleepTime)

print('Done')
42 changes: 42 additions & 0 deletions made/Experiments/templates/node-cpuhog.yml
@@ -0,0 +1,42 @@
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  name: teastore-chaos
  namespace: default
spec:
  appinfo:
    appns: "default"
    applabel: "app=teastore"
  # It can be true/false
  annotationCheck: 'false'
  # It can be active/stop
  engineState: 'active'
  # ex. values: ns1:name=percona,ns2:run=nginx
  auxiliaryAppInfo: ''
  chaosServiceAccount: litmus-admin
  # monitoring: 'false'
  components:
    runner:
      runnerAnnotations:
        sidecar.istio.io/inject: 'false'
  # It can be delete/retain
  jobCleanUpPolicy: 'delete'
  experiments:
    - name: node-cpu-hog
      spec:
        components:
          experimentAnnotations:
            sidecar.istio.io/inject: 'false'
          env:
            # set chaos duration (in sec) as desired
            - name: TOTAL_CHAOS_DURATION
              value: '60'

            # number of node CPU cores to be stressed
            - name: NODE_CPU_CORE
              value: '2'

            # ENTER THE COMMA SEPARATED TARGET NODES NAME
            - name: TARGET_NODES
              value: 'gke-testcluster-default-pool-01550e5a-2h29'
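
# NOTE: Deployment.py overwrites the env values above by positional index
# (0: chaos duration, 1: cpu cores, 2: target nodes), so the order of the
# entries must not change.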