Skip to content

Commit

Permalink
Merge pull request #584 from DocNow/rate_limit_handling
Browse files Browse the repository at this point in the history
Catch 1r/s rate limit and sleep less, instrument overlong sleeps
  • Loading branch information
SamHames authored Jan 6, 2022
2 parents 4bff06b + 11995ce commit adec782
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 39 deletions.
8 changes: 7 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,13 @@
python_requires=">=3.3",
install_requires=dependencies,
setup_requires=["pytest-runner"],
tests_require=["pytest", "pytest-black", "python-dotenv", "pytz"],
tests_require=[
"pytest",
"pytest-black",
"python-dotenv",
"pytz",
"tomli<2.0.0,>=0.2.6",
],
entry_points={
"console_scripts": [
"twarc = twarc.command:main",
Expand Down
14 changes: 3 additions & 11 deletions twarc/client2.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,25 +226,17 @@ def _search(
place_fields=place_fields,
)

count = 0
made_call = time.monotonic()

for response in self.get_paginated(url, params=params):

# Calculate the amount of time to sleep, accounting for any
# processing time used by the rest of the application.
# This is to satisfy the 1 request / 1 second rate limit
# on the search/all endpoint.
# Note that we're ensuring the appropriate amount of sleep is
# taken before yielding every item. This ensures that we won't
# exceed the rate limit even in cases where a response generator
# is not completely consumed.
time.sleep(max(0, sleep_between - (time.monotonic() - made_call)))
made_call = time.monotonic()
# is not completely consumed. This might be more conservative
# than necessary.
time.sleep(sleep_between)

# can't return without 'data' if there are no results
if "data" in response:
count += len(response["data"])
yield response

else:
Expand Down
79 changes: 53 additions & 26 deletions twarc/decorators2.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,36 +29,63 @@ def new_f(*args, **kwargs):
errors = 0
return resp
elif resp.status_code == 429:
reset = int(resp.headers["x-rate-limit-reset"])
now = time.time()

# The time to sleep depends on having an accurate system time,
# so check to see if there's something really bad happening
# to warn the user.
target_sleep_seconds = reset - now

# Never sleep longer than 15 minutes, as that is the basis for
# all of the read time based rate limits in the Twitter API
seconds = min(901, max(10, (target_sleep_seconds + 10)))

if target_sleep_seconds >= 900:
# If we need to sleep for more than a rate limit period, the
# system clock could be wrong.
log.warning(
"Detected overlong sleep interval - is your system clock accurate? "
"An accurate system time is needed to calculate how long to sleep for, "
"and data collection might be slowed."
)
elif target_sleep_seconds < 0:
# If we need to sleep for negative time something weird might be up.
# Check the headers, and try to infer why we're hitting the
# rate limit. Because the search/all endpoints also have a
# 1r/s rate limit that isn't obvious in the headers, we need
# to infer the reason for the rate limit. Note that this is
# included to help debug problems with multiple concurrent
# clients - this shouldn't be hit in normal operation of a
# single twarc client.
remaining = int(resp.headers["x-rate-limit-remaining"])

# If we have a 429 rate limit, but there are remaining calls for
# this endpoint, we've probably hit the 1r/s limit.
if remaining:
log.warning(
"Detected negative sleep interval - is your system clock accurate? "
"If your system time is running fast, rate limiting may not be "
"effective."
"Hit the 1 request/second rate limit, sleeping for 10 seconds. "
"This shouldn't happen with normal usage of twarc, and may indicate "
"multiple clients interacting with the Twitter API at the "
"same time."
)
time.sleep(10)
continue

# Just a regular 15 minute window rate limit.
else:
reset = int(resp.headers["x-rate-limit-reset"])
now = time.time()

# The time to sleep depends on having an accurate system time,
# so check to see if there's something really bad happening
# to warn the user.
target_sleep_seconds = reset - now

# Never sleep longer than 15 minutes, as that is the basis for
# all of the read time based rate limits in the Twitter API
seconds = min(901, max(10, (target_sleep_seconds + 10)))

if target_sleep_seconds >= 900:
# If we need to sleep for more than a rate limit period, the
# system clock could be wrong.
log.warning(
"Detected overlong sleep interval - is your system clock accurate? "
"An accurate system time is needed to calculate how long to sleep for, "
"and data collection might be slowed. "
f"The rate limit resets at {reset} and the current time is {now}."
)
elif target_sleep_seconds < 0:
# If we need to sleep for negative time something weird might be up.
log.warning(
"Detected negative sleep interval - is your system clock accurate? "
"If your system time is running fast, rate limiting may not be "
"effective. "
f"The rate limit resets at {reset} and the current time is {now}."
)

log.warning("rate limit exceeded: sleeping %s secs", seconds)
time.sleep(seconds)

log.warning("rate limit exceeded: sleeping %s secs", seconds)
time.sleep(seconds)
elif resp.status_code >= 500:
errors += 1
if errors > tries:
Expand Down
2 changes: 1 addition & 1 deletion twarc/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
version = "2.8.2"
version = "2.8.3"

0 comments on commit adec782

Please sign in to comment.