From c2153e65a303ab4090ef9bd404fc7a35956592a2 Mon Sep 17 00:00:00 2001 From: Sam Hames Date: Wed, 5 Jan 2022 10:19:54 +1000 Subject: [PATCH 1/5] Catch 1r/s rate limit and sleep less, instrument overlong sleeps --- twarc/decorators2.py | 79 +++++++++++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 26 deletions(-) diff --git a/twarc/decorators2.py b/twarc/decorators2.py index aefa3f6b..24e40370 100644 --- a/twarc/decorators2.py +++ b/twarc/decorators2.py @@ -29,36 +29,63 @@ def new_f(*args, **kwargs): errors = 0 return resp elif resp.status_code == 429: - reset = int(resp.headers["x-rate-limit-reset"]) - now = time.time() - # The time to sleep depends on having an accurate system time, - # so check to see if there's something really bad happening - # to warn the user. - target_sleep_seconds = reset - now - - # Never sleep longer than 15 minutes, as that is the basis for - # all of the read time based rate limits in the Twitter API - seconds = min(901, max(10, (target_sleep_seconds + 10))) - - if target_sleep_seconds >= 900: - # If we need to sleep for more than a rate limit period, the - # system clock could be wrong. - log.warning( - "Detected overlong sleep interval - is your system clock accurate? " - "An accurate system time is needed to calculate how long to sleep for, " - "and data collection might be slowed." - ) - elif target_sleep_seconds < 0: - # If we need to sleep for negative time something weird might be up. + # Check the headers, and try to infer why we're hitting the + # rate limit. Because the search/all endpoints also have a + # 1r/s rate limit that isn't obvious in the headers, we need + # to infer the reason for the rate limit. Note that this is + # included to help debug problems with multiple concurrent + # clients - this shouldn't be hit in normal operation of a + # single twarc client. 
+ remaining = int(resp.headers["x-rate-limit-remaining"]) + + # If we have a 429 rate limit, but there are remaining calls for + # this endpoint, we've probably hit the 1r/s limit. + if remaining: log.warning( - "Detected negative sleep interval - is your system clock accurate? " - "If your system time is running fast, rate limiting may not be " - "effective." + "Hit the 1 request/second rate limit, sleeping for 10 seconds. " + "This shouldn't happen with normal usage of twarc, and may indicate " + "multiple clients interacting with the Twitter API at the " + "same time." ) + time.sleep(10) + continue + + # Just a regular 15 minute window rate limit. + else: + reset = int(resp.headers["x-rate-limit-reset"]) + now = time.time() + + # The time to sleep depends on having an accurate system time, + # so check to see if there's something really bad happening + # to warn the user. + target_sleep_seconds = reset - now + + # Never sleep longer than 15 minutes, as that is the basis for + # all of the read time based rate limits in the Twitter API + seconds = min(901, max(10, (target_sleep_seconds + 10))) + + if target_sleep_seconds >= 900: + # If we need to sleep for more than a rate limit period, the + # system clock could be wrong. + log.warning( + "Detected overlong sleep interval - is your system clock accurate? " + "An accurate system time is needed to calculate how long to sleep for, " + "and data collection might be slowed. " + f"The rate limit resets at {reset} and the current time is {now}." + ) + elif target_sleep_seconds < 0: + # If we need to sleep for negative time something weird might be up. + log.warning( + "Detected negative sleep interval - is your system clock accurate? " + "If your system time is running fast, rate limiting may not be " + "effective. " + f"The rate limit resets at {reset} and the current time is {now}." 
+ ) + + log.warning("rate limit exceeded: sleeping %s secs", seconds) + time.sleep(seconds) - log.warning("rate limit exceeded: sleeping %s secs", seconds) - time.sleep(seconds) elif resp.status_code >= 500: errors += 1 if errors > tries: From 38e3fec69c9f2503aa9e554380451f537775ade3 Mon Sep 17 00:00:00 2001 From: Sam Hames Date: Wed, 5 Jan 2022 10:43:41 +1000 Subject: [PATCH 2/5] Temporary workaround for black dependency issue? --- setup.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e4507864..9b5e6a17 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,13 @@ python_requires=">=3.3", install_requires=dependencies, setup_requires=["pytest-runner"], - tests_require=["pytest", "pytest-black", "python-dotenv", "pytz"], + tests_require=[ + "pytest", + "pytest-black", + "python-dotenv", + "pytz", + "tomli<2.0.0,>=0.2.6", + ], entry_points={ "console_scripts": [ "twarc = twarc.command:main", From 836c80ae203b8fd4f9a1bca08c6751949594cff9 Mon Sep 17 00:00:00 2001 From: Sam Hames Date: Wed, 5 Jan 2022 10:54:09 +1000 Subject: [PATCH 3/5] Create a prerelease version Can't skip CI if this needs to be released... --- twarc/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twarc/version.py b/twarc/version.py index de337e7d..070acd9d 100644 --- a/twarc/version.py +++ b/twarc/version.py @@ -1 +1 @@ -version = "2.8.2" +version = "2.8.3.dev0" From b84e2a3cb4a197a091d0ccdd16bdd40165aa9e01 Mon Sep 17 00:00:00 2001 From: Sam Hames Date: Wed, 5 Jan 2022 14:52:43 +1000 Subject: [PATCH 4/5] Always sleep for 1.05 s between calls to search/all Avoid trying to calculate the right amount of sleep, and just wait the necessary amount of time between each page. 
--- twarc/client2.py | 14 +++----------- twarc/version.py | 2 +- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/twarc/client2.py b/twarc/client2.py index fc1dc40a..1d7e8a21 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -226,25 +226,17 @@ def _search( place_fields=place_fields, ) - count = 0 - made_call = time.monotonic() - for response in self.get_paginated(url, params=params): - # Calculate the amount of time to sleep, accounting for any - # processing time used by the rest of the application. - # This is to satisfy the 1 request / 1 second rate limit - # on the search/all endpoint. # Note that we're ensuring the appropriate amount of sleep is # taken before yielding every item. This ensures that we won't # exceed the rate limit even in cases where a response generator - # is not completely consumed. - time.sleep(max(0, sleep_between - (time.monotonic() - made_call))) - made_call = time.monotonic() + # is not completely consumed. This might be more conservative + # than necessary. + time.sleep(sleep_between) # can't return without 'data' if there are no results if "data" in response: - count += len(response["data"]) yield response else: diff --git a/twarc/version.py b/twarc/version.py index 070acd9d..91b956fe 100644 --- a/twarc/version.py +++ b/twarc/version.py @@ -1 +1 @@ -version = "2.8.3.dev0" +version = "2.8.3.dev1" From 11995ce1c01a291152844864aaddc4cd52f4ad66 Mon Sep 17 00:00:00 2001 From: Sam Hames Date: Thu, 6 Jan 2022 11:55:27 +1000 Subject: [PATCH 5/5] Prep for real release --- twarc/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twarc/version.py b/twarc/version.py index 91b956fe..6b1d15a7 100644 --- a/twarc/version.py +++ b/twarc/version.py @@ -1 +1 @@ -version = "2.8.3.dev1" +version = "2.8.3"