Skip to content

Commit

Permalink
Add rate limiting (#4365)
Browse files Browse the repository at this point in the history
Add rate limiting for tasks.
Don't allow a task to be run too many times within a 6 hour window,
especially if it's erroring.
Also minor refactor of commands.py
  • Loading branch information
jonathanmetzman authored Nov 7, 2024
1 parent da28b8b commit c26e353
Show file tree
Hide file tree
Showing 18 changed files with 276 additions and 34 deletions.
2 changes: 1 addition & 1 deletion src/appengine/handlers/upload_testcase.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@
from clusterfuzz._internal import fuzzing
from clusterfuzz._internal.base import external_users
from clusterfuzz._internal.base import memoize
from clusterfuzz._internal.base import task_utils
from clusterfuzz._internal.base import tasks
from clusterfuzz._internal.base import utils
from clusterfuzz._internal.base.tasks import task_utils
from clusterfuzz._internal.crash_analysis.stack_parsing import stack_analyzer
from clusterfuzz._internal.datastore import data_handler
from clusterfuzz._internal.datastore import data_types
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@

from clusterfuzz._internal.base import external_tasks
from clusterfuzz._internal.base import persistent_cache
from clusterfuzz._internal.base import task_utils
from clusterfuzz._internal.base import utils
from clusterfuzz._internal.base.tasks import task_utils
from clusterfuzz._internal.config import local_config
from clusterfuzz._internal.datastore import data_types
from clusterfuzz._internal.datastore import ndb_utils
Expand Down
102 changes: 102 additions & 0 deletions src/clusterfuzz/_internal/base/tasks/task_rate_limiting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Task rate limiting."""

import datetime

from clusterfuzz._internal.datastore import data_types
from clusterfuzz._internal.datastore import ndb_utils
from clusterfuzz._internal.metrics import logs
from clusterfuzz._internal.system import environment


def _get_datetime_now():
return datetime.datetime.now()


# Things that are sometimes run as tasks by commands.py but are really portions
# of actual tasks.
_UTASK_PSEUDO_TASKS = {'uworker_main', 'postprocess', 'preprocess'}


class TaskRateLimiter:
"""Rate limiter for tasks. This limits tasks to 100 erroneous runs or 2000
succesful runs in 6 hours. It keeps track of task completion when record_task
is called at the end of every task."""
TASK_RATE_LIMIT_WINDOW = datetime.timedelta(hours=6)
TASK_RATE_LIMIT_MAX_ERRORS = 100
# TODO(metzman): Reevaluate this number, it's probably too high.
TASK_RATE_LIMIT_MAX_COMPLETIONS = 2000

def __init__(self, task_name, task_argument, job_name):
self.task_name = task_name
self.task_argument = task_argument
self.job_name = job_name

def __str__(self):
return ' '.join([self.task_name, self.task_argument, self.job_name])

def record_task(self, success: bool) -> None:
"""Records a task and whether it completed succesfully."""
if self.task_name in _UTASK_PSEUDO_TASKS:
# Don't rate limit these fake uworker tasks.
return
if success:
status = data_types.TaskState.FINISHED
else:
status = data_types.TaskState.ERROR
window_task = data_types.WindowRateLimitTask(
task_name=self.task_name,
task_argument=self.task_argument,
job_name=self.job_name,
status=status)
window_task.put()

def is_rate_limited(self) -> bool:
"""Checks if the given task is rate limited."""
if self.task_name in _UTASK_PSEUDO_TASKS:
# Don't rate limit these fake tasks.
return False
if environment.get_value('COMMAND_OVERRIDE'):
# A user wants to run this task.
return False
window_start = _get_datetime_now() - self.TASK_RATE_LIMIT_WINDOW
query = data_types.WindowRateLimitTask.query(
data_types.WindowRateLimitTask.task_name == self.task_name,
data_types.WindowRateLimitTask.task_argument == self.task_argument,
data_types.WindowRateLimitTask.job_name == self.job_name,
data_types.WindowRateLimitTask.timestamp >= window_start)
tasks = ndb_utils.get_all_from_query(query)
completed_count = 0
error_count = 0
for task in tasks:
# Limit based on completions.
completed_count += 1
if completed_count > self.TASK_RATE_LIMIT_MAX_COMPLETIONS:
logs.warning(
f'{str(self)} rate limited. '
f'It ran at least {self.TASK_RATE_LIMIT_MAX_COMPLETIONS} in window.'
)
return True

# Limit based on errors.
if task.status == data_types.TaskState.ERROR:
error_count += 1
if error_count > self.TASK_RATE_LIMIT_MAX_ERRORS:
logs.warning(
f'{str(self)} rate limited. '
f'It errored at least {self.TASK_RATE_LIMIT_MAX_ERRORS} in window.')
return True

return False
42 changes: 21 additions & 21 deletions src/clusterfuzz/_internal/bot/tasks/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from clusterfuzz._internal.base import errors
from clusterfuzz._internal.base import tasks
from clusterfuzz._internal.base import utils
from clusterfuzz._internal.base.tasks import task_rate_limiting
from clusterfuzz._internal.bot.tasks import blame_task
from clusterfuzz._internal.bot.tasks import impact_task
from clusterfuzz._internal.bot.tasks import task_types
Expand Down Expand Up @@ -190,12 +191,8 @@ def start_web_server_if_needed():
logs.error('Failed to start web server, skipping.')


def run_command(task_name,
task_argument,
job_name,
uworker_env,
preprocess=False):
"""Run the command."""
def run_command(task_name, task_argument, job_name, uworker_env):
"""Runs the command."""
task = COMMAND_MAP.get(task_name)
if not task:
logs.error("Unknown command '%s'" % task_name)
Expand All @@ -211,23 +208,32 @@ def run_command(task_name,
raise AlreadyRunningError

result = None
rate_limiter = task_rate_limiting.TaskRateLimiter(task_name, task_argument,
job_name)
if rate_limiter.is_rate_limited():
logs.error(f'Rate limited task: {task_name} {task_argument} {job_name}')
if task_name == 'fuzz':
# Wait 10 seconds. We don't want to try again immediately because if we
# tried to run a fuzz task then there is no other task to run.
time.sleep(environment.get_value('FAIL_WAIT'))
return None
try:
if not preprocess:
result = task.execute(task_argument, job_name, uworker_env)
else:
result = task.preprocess(task_argument, job_name, uworker_env)
result = task.execute(task_argument, job_name, uworker_env)
except errors.InvalidTestcaseError:
# It is difficult to try to handle the case where a test case is deleted
# during processing. Rather than trying to catch by checking every point
# where a test case is reloaded from the datastore, just abort the task.
logs.warning('Test case %s no longer exists.' % task_argument)
rate_limiter.record_task(success=False)
except BaseException:
# On any other exceptions, update state to reflect error and re-raise.
if should_update_task_status(task_name):
data_handler.update_task_status(task_state_name,
data_types.TaskState.ERROR)

rate_limiter.record_task(success=False)
raise
else:
rate_limiter.record_task(success=True)

# Task completed successfully.
if should_update_task_status(task_name):
Expand All @@ -254,12 +260,8 @@ def _get_task_id(task_name, task_argument, job_name):
# pylint: disable=too-many-nested-blocks
# TODO(mbarbella): Rewrite this function to avoid nesting issues.
@set_task_payload
def process_command_impl(task_name,
task_argument,
job_name,
high_end,
is_command_override,
preprocess=False):
def process_command_impl(task_name, task_argument, job_name, high_end,
is_command_override):
"""Implementation of process_command."""
uworker_env = None
environment.set_value('TASK_NAME', task_name)
Expand Down Expand Up @@ -320,8 +322,7 @@ def process_command_impl(task_name,
logs.error('Failed to fix platform and re-add task.')

# Add a wait interval to avoid overflowing task creation.
failure_wait_interval = environment.get_value('FAIL_WAIT')
time.sleep(failure_wait_interval)
time.sleep(environment.get_value('FAIL_WAIT'))
return None

if task_name != 'fuzz':
Expand Down Expand Up @@ -441,8 +442,7 @@ def process_command_impl(task_name,
start_web_server_if_needed()

try:
return run_command(task_name, task_argument, job_name, uworker_env,
preprocess)
return run_command(task_name, task_argument, job_name, uworker_env)
finally:
# Final clean up.
cleanup_task_state()
2 changes: 1 addition & 1 deletion src/clusterfuzz/_internal/bot/tasks/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@

from clusterfuzz._internal.base import dates
from clusterfuzz._internal.base import errors
from clusterfuzz._internal.base import task_utils
from clusterfuzz._internal.base import tasks
from clusterfuzz._internal.base import utils
from clusterfuzz._internal.base.tasks import task_utils
from clusterfuzz._internal.bot import testcase_manager
from clusterfuzz._internal.bot.tasks.utasks import uworker_handle_errors
from clusterfuzz._internal.bot.tasks.utasks import uworker_io
Expand Down
2 changes: 1 addition & 1 deletion src/clusterfuzz/_internal/bot/tasks/task_creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
# limitations under the License.
"""Common functions for task creation for test cases."""
from clusterfuzz._internal.base import bisection
from clusterfuzz._internal.base import task_utils
from clusterfuzz._internal.base import tasks
from clusterfuzz._internal.base import utils
from clusterfuzz._internal.base.tasks import task_utils
from clusterfuzz._internal.bot.tasks import task_types
from clusterfuzz._internal.build_management import build_manager
from clusterfuzz._internal.datastore import data_handler
Expand Down
2 changes: 1 addition & 1 deletion src/clusterfuzz/_internal/bot/tasks/task_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
base/tasks.py depends on this module and many things commands.py imports depend
on base/tasks.py (i.e. avoiding circular imports)."""
from clusterfuzz._internal import swarming
from clusterfuzz._internal.base import task_utils
from clusterfuzz._internal.base import tasks
from clusterfuzz._internal.base.tasks import task_utils
from clusterfuzz._internal.bot.tasks import utasks
from clusterfuzz._internal.google_cloud_utils import batch
from clusterfuzz._internal.metrics import logs
Expand Down
2 changes: 1 addition & 1 deletion src/clusterfuzz/_internal/bot/tasks/utasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from google.protobuf import timestamp_pb2

from clusterfuzz._internal import swarming
from clusterfuzz._internal.base import task_utils
from clusterfuzz._internal.base.tasks import task_utils
from clusterfuzz._internal.bot.tasks.utasks import uworker_io
from clusterfuzz._internal.bot.webserver import http_server
from clusterfuzz._internal.metrics import logs
Expand Down
2 changes: 1 addition & 1 deletion src/clusterfuzz/_internal/bot/tasks/utasks/uworker_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from google.protobuf import timestamp_pb2
import google.protobuf.message

from clusterfuzz._internal.base import task_utils
from clusterfuzz._internal.base.tasks import task_utils
from clusterfuzz._internal.google_cloud_utils import storage
from clusterfuzz._internal.metrics import logs
from clusterfuzz._internal.protos import uworker_msg_pb2
Expand Down
13 changes: 13 additions & 0 deletions src/clusterfuzz/_internal/datastore/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -1114,6 +1114,19 @@ class TaskStatus(Model):
time = ndb.DateTimeProperty()


class WindowRateLimitTask(Model):
"""Records the completion of a task. This cannot be part of TaskStatus because
it will have a different lifecycle (it's not needed after the window
completes). This should have a TTL as TASK_RATE_LIMIT_WINDOW in
task_rate_limiting.py (6 hours)."""
# TODO(metzman): Consider using task_id.
timestamp = ndb.DateTimeProperty(auto_now_add=True, indexed=True)
task_name = ndb.StringProperty(indexed=True)
task_argument = ndb.StringProperty(indexed=True)
job_name = ndb.StringProperty(indexed=True)
status = ndb.StringProperty(choices=[TaskState.ERROR, TaskState.FINISHED])


class BuildMetadata(Model):
"""Metadata associated with a particular archived build."""
# Job type that this build belongs to.
Expand Down
2 changes: 1 addition & 1 deletion src/clusterfuzz/_internal/google_cloud_utils/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
from google.cloud import batch_v1 as batch

from clusterfuzz._internal.base import retry
from clusterfuzz._internal.base import task_utils
from clusterfuzz._internal.base import utils
from clusterfuzz._internal.base.tasks import task_utils
from clusterfuzz._internal.config import local_config
from clusterfuzz._internal.datastore import data_types
from clusterfuzz._internal.metrics import logs
Expand Down
13 changes: 13 additions & 0 deletions src/clusterfuzz/_internal/tests/core/base/tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Loading

0 comments on commit c26e353

Please sign in to comment.