Skip to content

Commit

Permalink
Add sidekiq backoff service to handle OS Places outages
Browse files Browse the repository at this point in the history
- When OS Places has a slowdown or outage, we can avoid our own alerting problems by backing off calls (since they're not time-critical).
- Add a service which is initialised on startup (relevant mainly to worker processes) which adjusts the scheduled interval of ProcessPostcodeWorker creation (currently once per second). We record OS Places API failures, and with each failure we double the interval, until we reach a max of 180s. When we record a successful call, we reduce the interval by 1s, so the schedule backs off quickly if many errors occur, and slowly creeps back to full speed when the errors are over.
  • Loading branch information
KludgeKML committed Oct 12, 2023
1 parent 809cc00 commit fb6f7e6
Show file tree
Hide file tree
Showing 4 changed files with 151 additions and 0 deletions.
2 changes: 2 additions & 0 deletions app/workers/process_postcode_worker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ class ProcessPostcodeWorker

# Updates a single postcode and reports the outcome to the scheduler backoff
# service registered on the Rails application config.
#
# A successful update is recorded as a success. An OsPlacesApi::ClientError is
# reported through GovukError.notify and recorded as a failure instead of
# being re-raised.
#
# @param postcode the postcode handed to PostcodeManager#update_postcode
def perform(postcode)
  # Looked up lazily so the config is only touched once the outcome is known.
  backoff_service = -> { Rails.application.config.sidekiq_scheduler_backoff_service }

  begin
    PostcodeManager.new.update_postcode(postcode)
    backoff_service.call.record_success
  rescue OsPlacesApi::ClientError => error
    GovukError.notify(error)
    backoff_service.call.record_failure
  end
end
end
8 changes: 8 additions & 0 deletions config/initializers/sidekiq.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
require "sidekiq-unique-jobs"
require "sidekiq_scheduler_backoff_service"

Sidekiq.configure_server do |config|
config.client_middleware do |chain|
Expand All @@ -17,3 +18,10 @@
chain.add SidekiqUniqueJobs::Middleware::Client
end
end

# Register the backoff service on the application config so it can be reached
# from anywhere as Rails.application.config.sidekiq_scheduler_backoff_service.
# It targets the "queue_oldest_postcodes_for_updating" schedule entry and keeps
# that entry's interval between 1 second (full speed) and 180 seconds (slowest,
# reached when lots of errors have been recorded).
Rails.application.config.sidekiq_scheduler_backoff_service = SidekiqSchedulerBackoffService.new(
name: "queue_oldest_postcodes_for_updating",
min_interval: 1,
max_interval: 180,
)
33 changes: 33 additions & 0 deletions lib/sidekiq_scheduler_backoff_service.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Speeds up or slows down a named entry in the Sidekiq schedule in response to
# recorded successes and failures.
#
# Each failure doubles the entry's interval and each success shortens it by
# one second. The resulting interval is always kept within
# [min_interval, max_interval], even if the stored interval had drifted
# outside that range beforehand.
#
# The entry's "every" value is expected to be an array whose first element is
# a string such as "30s": the final character is dropped and the remainder is
# parsed as an integer, so units other than seconds are not understood.
class SidekiqSchedulerBackoffService
  attr_reader :name, :min_interval, :max_interval

  # @param name [String, Symbol] key of the entry in Sidekiq.get_schedule
  # @param min_interval [Integer] shortest allowed interval, in seconds
  # @param max_interval [Integer] longest allowed interval, in seconds
  def initialize(name:, min_interval:, max_interval:)
    @name = name.to_s
    @min_interval = min_interval
    @max_interval = max_interval
  end

  # Shortens the interval by one second (never below min_interval).
  def record_success
    adjust_interval { |seconds| seconds - 1 }
  end

  # Doubles the interval (never above max_interval).
  def record_failure
    adjust_interval { |seconds| seconds * 2 }
  end

private

  # Yields the current interval, clamps the block's result to the configured
  # bounds and rewrites the schedule entry only when the value changed.
  #
  # Clamping against BOTH bounds matters: a success on an interval far above
  # max_interval must land on max_interval, and a failure on an interval far
  # below min_interval must land on min_interval.
  def adjust_interval
    # Read the entry once so the parsed interval and the rewritten entry come
    # from the same snapshot of the schedule.
    schedule = Sidekiq.get_schedule[name]
    initial_interval = current_interval(schedule)
    target_interval = yield(initial_interval).clamp(min_interval, max_interval)
    restart_schedule(schedule, target_interval) if target_interval != initial_interval
  end

  # Parses the number of seconds out of an "every" value such as ["30s"].
  # Integer() raises ArgumentError if the remainder is not a whole number.
  def current_interval(schedule)
    Integer(schedule["every"].first.chop)
  end

  # Stores the entry again with a new "every" value, keeping its other keys.
  def restart_schedule(schedule, target_interval)
    Sidekiq.set_schedule(name, schedule.merge("every" => ["#{target_interval}s"]))
  end
end
108 changes: 108 additions & 0 deletions spec/lib/sidekiq_scheduler_backoff_service_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
require "spec_helper"

# Exercises the backoff service against the real Sidekiq schedule store:
# each example seeds the schedule entry with a known interval, records a
# success or failure, then checks the interval that was written back.
#
# Terminology used in the descriptions: "maximum speed" is the shortest
# interval (min_interval) and "minimum speed" is the longest (max_interval).
RSpec.describe SidekiqSchedulerBackoffService do
  subject { described_class.new(name:, min_interval:, max_interval:) }

  let(:min_interval) { 2 }
  let(:max_interval) { 180 }
  let(:name) { :queue_oldest_postcodes_for_updating }

  # Helpers live inside the example group so they are not defined as global
  # methods on Object, and both use the same `name` as the subject.

  # Seeds the schedule entry with an interval of `interval` seconds.
  def set_scheduled_interval(interval)
    Sidekiq.set_schedule(name.to_s, { "every" => ["#{interval}s"], "class" => "PostcodesCollectionWorker" })
  end

  # Returns the "every" value currently stored for the schedule entry.
  def scheduled_interval
    Sidekiq.get_schedule[name.to_s]["every"]
  end

  describe "#record_failure" do
    context "when the scheduler is going faster than maximum speed" do
      before do
        set_scheduled_interval(min_interval - 1)
      end

      it "sets the scheduler to maximum speed and reloads the schedule" do
        subject.record_failure
        expect(scheduled_interval).to eq(["#{min_interval}s"])
      end
    end

    context "when the scheduler is going faster than minimum speed" do
      before do
        set_scheduled_interval(max_interval / 2)
      end

      it "halves the scheduler speed and reloads the schedule" do
        subject.record_failure
        expect(scheduled_interval).to eq(["#{max_interval}s"])
      end
    end

    context "when the scheduler is going at minimum speed" do
      before do
        set_scheduled_interval(max_interval)
      end

      it "does nothing" do
        subject.record_failure
        expect(scheduled_interval).to eq(["#{max_interval}s"])
      end
    end

    context "when the scheduler is going slower than minimum speed" do
      before do
        set_scheduled_interval(max_interval * 2)
      end

      it "sets the scheduler to minimum speed and reloads the schedule" do
        subject.record_failure
        expect(scheduled_interval).to eq(["#{max_interval}s"])
      end
    end
  end

  describe "#record_success" do
    context "when the scheduler is going faster than maximum speed" do
      before do
        set_scheduled_interval(min_interval - 1)
      end

      it "sets the scheduler to maximum speed and reloads the schedule" do
        subject.record_success
        expect(scheduled_interval).to eq(["#{min_interval}s"])
      end
    end

    context "when the scheduler is going at maximum speed" do
      before do
        set_scheduled_interval(min_interval)
      end

      it "does nothing" do
        subject.record_success
        expect(scheduled_interval).to eq(["#{min_interval}s"])
      end
    end

    context "when the scheduler is going slower than maximum speed" do
      before do
        set_scheduled_interval(min_interval * 4)
      end

      it "decrements the scheduler interval by 1 second and reloads the schedule" do
        subject.record_success
        expect(scheduled_interval).to eq(["#{(min_interval * 4) - 1}s"])
      end
    end

    context "when the scheduler is going slower than minimum speed" do
      before do
        set_scheduled_interval(max_interval + 1)
      end

      it "sets the scheduler to minimum speed and reloads the schedule" do
        subject.record_success
        expect(scheduled_interval).to eq(["#{max_interval}s"])
      end
    end
  end
end

0 comments on commit fb6f7e6

Please sign in to comment.