add leader election retry #66
Changes from 24 commits
In the scaler's `compareScales`:

@@ -79,7 +79,7 @@ func (s *scaler) compareScales(current, desired int32) {
 		s.desiredScale = desired
 	}
 
-	if s.currentScale == -1 || s.desiredScale == -1 {
+	if s.currentScale == -1 || s.desiredScale == -1 || desired == -1 {
 		// Nothing to compare if we only have partial information
 		return
 	}

Review comment on the changed condition: good catch 👍

Reply: It doesn't actually solve issue #67, so I might revert this.
In the leader election code:

@@ -10,6 +10,7 @@ import (
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/leaderelection"
 	"k8s.io/client-go/tools/leaderelection/resourcelock"
+	"k8s.io/client-go/util/flowcontrol"
 )
 
 func NewElection(clientset kubernetes.Interface, id, namespace string) *Election {
@@ -35,11 +36,11 @@ func NewElection(clientset kubernetes.Interface, id, namespace string) *Election
 		RetryPeriod: 2 * time.Second,
 		Callbacks: leaderelection.LeaderCallbacks{
 			OnStartedLeading: func(ctx context.Context) {
-				log.Println("Started leading")
+				log.Printf("%v started leading", id)
 				isLeader.Store(true)
 			},
 			OnStoppedLeading: func() {
-				log.Println("Stopped leading")
+				log.Printf("%v stopped leading", id)
 				isLeader.Store(false)
 			},
 			OnNewLeader: func(identity string) {

Review comment on the new log line: nit: `%q`?

Reply: I didn't know about `%q`, that might actually be nice here.
@@ -54,14 +55,28 @@ func NewElection(clientset kubernetes.Interface, id, namespace string) *Election
 	return &Election{
 		IsLeader: isLeader,
 		config:   config,
+		ID:       id,
 	}
 }
 
 type Election struct {
 	config   leaderelection.LeaderElectionConfig
 	IsLeader *atomic.Bool
+	ID       string
 }
 
-func (le *Election) Start(ctx context.Context) {
-	leaderelection.RunOrDie(ctx, le.config)
+func (le *Election) Start(ctx context.Context) error {
+	backoff := flowcontrol.NewBackOff(1*time.Second, 15*time.Second)
+	const backoffID = "lingo-leader-election"
+	for {
+		leaderelection.RunOrDie(ctx, le.config)
+		backoff.Next(backoffID, backoff.Clock.Now())
+		delay := backoff.Get(backoffID)
+		log.Printf("Leader election stopped on %v, retrying in %v", le.ID, delay)
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(delay):
+		}
+	}
 }
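For reference, a small standalone sketch (not part of the PR) of how the `flowcontrol` backoff used in `Start` behaves: each consecutive restart roughly doubles the retry delay, starting at 1s and capping at 15s.

```go
package main

import (
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

func main() {
	// Same construction as in Start(): base delay 1s, capped at 15s, no jitter.
	backoff := flowcontrol.NewBackOff(1*time.Second, 15*time.Second)
	const id = "lingo-leader-election"

	for i := 0; i < 6; i++ {
		backoff.Next(id, backoff.Clock.Now()) // record an attempt and bump the delay
		fmt.Println(backoff.Get(id))          // expected: 1s, 2s, 4s, 8s, 15s, 15s
	}
}
```

Because the retry loop selects on `ctx.Done()`, cancelling the context during the wait makes `Start` return `ctx.Err()`, which is what the review discussion at the end of this page is about.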
In the end-to-end test script:

@@ -6,6 +6,9 @@ set -xe
 HOST=127.0.0.1
 PORT=30080
 BASE_URL="http://$HOST:$PORT/v1"
+REPLICAS=${REPLICAS:-3}
+REQUESTS=60
+EXPECTED_REPLICAS=1
 
 
 if kind get clusters | grep -q substratus-test; then

@@ -42,6 +45,9 @@ if ! kubectl get deployment lingo; then
   skaffold run
 fi
 
+kubectl patch deployment lingo --patch "{\"spec\": {\"replicas\": $REPLICAS}}"
+
 kubectl logs -f deployment/lingo &
 
 kubectl wait --for=condition=available --timeout=30s deployment/lingo
@@ -77,53 +83,42 @@ pip3 install openai==1.2.3
 
 # Send 60 requests in parallel to stapi backend using openai python client and threading
 python3 $SCRIPT_DIR/test_openai_embedding.py \
-  --requests 60 --timeout 300 --base-url "${BASE_URL}" \
+  --requests ${REQUESTS} --timeout 300 --base-url "${BASE_URL}" \
   --model text-embedding-ada-002
 
 # Ensure replicas has been scaled up to 1 after sending 60 requests
 replicas=$(kubectl get deployment stapi-minilm-l6-v2 -o jsonpath='{.spec.replicas}')
-if [ "$replicas" -eq 1 ]; then
-  echo "Test passed: Expected 1 replica after sending requests 60 requests"
+if [ "$replicas" -ge "${EXPECTED_REPLICAS}" ]; then
+  echo "Test passed: Expected ${EXPECTED_REPLICAS} or more replicas and got ${replicas} after sending requests ${REQUESTS} requests"
 else
-  echo "Test failed: Expected 1 replica after sending requests 60 requests, got $replicas"
+  echo "Test failed: Expected ${EXPECTED_REPLICAS} or more replicas after sending requests ${REQUESTS} requests, got ${replicas}"
   exit 1
 fi
 
-echo "Waiting for deployment to scale down back to 0 within 2 minutes"
-for i in {1..15}; do
-  if [ "$i" -eq 15 ]; then
+# Verify that leader election works by forcing a 120 second apiserver outage
+KIND_NODE=$(kind get nodes --name=substratus-test)
+docker exec ${KIND_NODE} iptables -I INPUT -p tcp --dport 6443 -j DROP
+sleep 120
+docker exec ${KIND_NODE} iptables -D INPUT -p tcp --dport 6443 -j DROP
+
+until kubectl get pods; do
+  echo "Waiting for apiserver to be back up, waiting for 1 second and trying again"
+  sleep 1
+done
+
+# rerun kubectl logs because previous one got killed when apiserver was down
+kubectl logs --tail=500 -f deployment/lingo &
+
+echo "Waiting for deployment to scale down back to 0 within ~2 minutes"
+for i in {1..30}; do
+  if [ "$i" -eq 30 ]; then
     echo "Test failed: Expected 0 replica after not having requests for more than 1 minute, got $replicas"
     exit 1
   fi
-  replicas=$(kubectl get deployment stapi-minilm-l6-v2 -o jsonpath='{.spec.replicas}')
+  replicas=$(kubectl get deployment stapi-minilm-l6-v2 -o jsonpath='{.spec.replicas}' || true)
   if [ "$replicas" -eq 0 ]; then
    echo "Test passed: Expected 0 replica after not having requests for more than 1 minute"
    break
   fi
-  sleep 8
+  sleep 6
 done
 
-echo "Patching stapi deployment to sleep on startup"
-cat <<EOF | kubectl patch deployment stapi-minilm-l6-v2 --type merge --patch "$(cat)"
-spec:
-  template:
-    spec:
-      initContainers:
-      - name: sleep
-        image: busybox
-        command: ["sh", "-c", "sleep 10"]
-EOF
-
-requests=300
-echo "Send $requests requests in parallel to stapi backend using openai python client and threading"
-python3 $SCRIPT_DIR/test_openai_embedding.py \
-  --requests $requests --timeout 600 --base-url "${BASE_URL}" \
-  --model text-embedding-ada-002
-
-replicas=$(kubectl get deployment stapi-minilm-l6-v2 -o jsonpath='{.spec.replicas}')
-if [ "$replicas" -ge 2 ]; then
-  echo "Test passed: Expected 2 or more replicas after sending more than $requests requests, got $replicas"
-else
-  echo "Test failed: Expected 2 or more replicas after sending more than $requests requests, got $replicas"
-  exit 1
-fi

Review comment on the iptables DROP rule: Is …

Reply: Yep, and credit to ChatGPT for coming up with the high-level idea to simply block the traffic.

Review comment on the `until kubectl get pods` loop: What is …

Reply: Even after the iptables rule is removed, it sometimes takes some time for Kubernetes to recover. This `until` loop waits until `kubectl get pods` starts working again, i.e. it only continues once `kubectl get pods` returns exit code 0.

Review comment on `kubectl logs --tail=500`: I think you might want …

Reply: I didn't want to get all the logs, which may already have 1000+ entries, only the last 500. I tried it: the last 100 wasn't enough, but the last 500 was more than enough.
Review comment: When the context is canceled on a graceful shutdown (SIGTERM), `Start()` would return the "cancel" error, which results in an `Exit(1)`. This error handling strategy is used in other places in this file as well. Instead, we could wrap the context in L110 with `ctx, cancel := context.WithCancel(ctrl.SetupSignalHandler())` and call `cancel()` instead. WDYT?

Follow-up: As the error comes from the context already, it may be simpler to do nothing and let the goroutine complete.
Reply: I didn't fully understand how to change it. Could you elaborate? Are you saying instead of calling `os.Exit()` just call `cancel()`? I didn't see any other example of using cancel on the context in main.go. I tried to follow the same pattern as the other start functions.
Review comment: The only error returned from `Start()` is coming from the context now, thanks to your for loop. Calling cancel on the context again is not needed. I would not return an error in `Start()` and would let the Go routine complete; the main thread should handle the shutdown. I should have deleted my first comment, sorry for the confusion. To elaborate, for a different use case where other errors can happen, this is an example — note that the error is not returned to the main thread.
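A rough sketch of the pattern being described (assumed wiring and names, not the reviewer's original snippet): the election runs in a goroutine, non-context errors are handled there rather than returned, and the main thread owns the shutdown via the signal-handler context mentioned earlier.

```go
package leader // assuming the package that defines Election

import (
	"context"
	"errors"
	"log"

	ctrl "sigs.k8s.io/controller-runtime"
)

// RunForever is a hypothetical helper, not from the PR: it starts the
// election in a goroutine and never reports errors back to the caller.
func RunForever(le *Election) context.Context {
	// ctrl.SetupSignalHandler() returns a context canceled on SIGTERM/SIGINT.
	ctx := ctrl.SetupSignalHandler()

	go func() {
		// With the retry loop, Start only returns once ctx is done, so a
		// context error is just a normal shutdown; anything else would be a
		// real failure that is handled here and never reaches the main thread.
		if err := le.Start(ctx); err != nil && !errors.Is(err, context.Canceled) {
			log.Printf("leader election stopped: %v", err)
		}
	}()

	return ctx // the main thread can wait on ctx.Done() to drive shutdown
}
```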
Reply: Thanks for that example! I think I get it now. To me it feels cleaner to just directly exit instead of cancelling a context which may then, at some later point, cause an exit. I favor being explicit and going to the exit path as soon as it makes sense.