diff --git a/setup.py b/setup.py index e4507864..9b5e6a17 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,13 @@ python_requires=">=3.3", install_requires=dependencies, setup_requires=["pytest-runner"], - tests_require=["pytest", "pytest-black", "python-dotenv", "pytz"], + tests_require=[ + "pytest", + "pytest-black", + "python-dotenv", + "pytz", + "tomli<2.0.0,>=0.2.6", + ], entry_points={ "console_scripts": [ "twarc = twarc.command:main", diff --git a/twarc/client2.py b/twarc/client2.py index fc1dc40a..1d7e8a21 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -226,25 +226,17 @@ def _search( place_fields=place_fields, ) - count = 0 - made_call = time.monotonic() - for response in self.get_paginated(url, params=params): - # Calculate the amount of time to sleep, accounting for any - # processing time used by the rest of the application. - # This is to satisfy the 1 request / 1 second rate limit - # on the search/all endpoint. # Note that we're ensuring the appropriate amount of sleep is # taken before yielding every item. This ensures that we won't # exceed the rate limit even in cases where a response generator - # is not completely consumed. - time.sleep(max(0, sleep_between - (time.monotonic() - made_call))) - made_call = time.monotonic() + # is not completely consumed. This might be more conservative + # than necessary. + time.sleep(sleep_between) # can't return without 'data' if there are no results if "data" in response: - count += len(response["data"]) yield response else: diff --git a/twarc/decorators2.py b/twarc/decorators2.py index aefa3f6b..24e40370 100644 --- a/twarc/decorators2.py +++ b/twarc/decorators2.py @@ -29,36 +29,63 @@ def new_f(*args, **kwargs): errors = 0 return resp elif resp.status_code == 429: - reset = int(resp.headers["x-rate-limit-reset"]) - now = time.time() - # The time to sleep depends on having an accurate system time, - # so check to see if there's something really bad happening - # to warn the user. - target_sleep_seconds = reset - now - - # Never sleep longer than 15 minutes, as that is the basis for - # all of the read time based rate limits in the Twitter API - seconds = min(901, max(10, (target_sleep_seconds + 10))) - - if target_sleep_seconds >= 900: - # If we need to sleep for more than a rate limit period, the - # system clock could be wrong. - log.warning( - "Detected overlong sleep interval - is your system clock accurate? " - "An accurate system time is needed to calculate how long to sleep for, " - "and data collection might be slowed." - ) - elif target_sleep_seconds < 0: - # If we need to sleep for negative time something weird might be up. + # Check the headers, and try to infer why we're hitting the + # rate limit. Because the search/all endpoints also have a + # 1r/s rate limit that isn't obvious in the headers, we need + # to infer the reason for the rate limit. Note that this is + # included to help debug problems with multiple concurrent + # clients - this shouldn't be hit in normal of operation of a + # single twarc client. + remaining = int(resp.headers["x-rate-limit-remaining"]) + + # If we have a 429 rate limit, but there are remaining calls for + # this endpoint, we've probably hit the 1r/s limit. + if remaining: log.warning( - "Detected negative sleep interval - is your system clock accurate? " - "If your system time is running fast, rate limiting may not be " - "effective." + "Hit the 1 request/second rate limit, sleeping for 10 seconds. " + "This shouldn't happen with normal usage of twarc, and may indicate " + "multiple clients interacting with the Twitter API at the " + "same time." ) + time.sleep(10) + continue + + # Just a regular 15 minute window rate limit. + else: + reset = int(resp.headers["x-rate-limit-reset"]) + now = time.time() + + # The time to sleep depends on having an accurate system time, + # so check to see if there's something really bad happening + # to warn the user. + target_sleep_seconds = reset - now + + # Never sleep longer than 15 minutes, as that is the basis for + # all of the read time based rate limits in the Twitter API + seconds = min(901, max(10, (target_sleep_seconds + 10))) + + if target_sleep_seconds >= 900: + # If we need to sleep for more than a rate limit period, the + # system clock could be wrong. + log.warning( + "Detected overlong sleep interval - is your system clock accurate? " + "An accurate system time is needed to calculate how long to sleep for, " + "and data collection might be slowed. " + f"The rate limit resets at {reset} and the current time is {now}." + ) + elif target_sleep_seconds < 0: + # If we need to sleep for negative time something weird might be up. + log.warning( + "Detected negative sleep interval - is your system clock accurate? " + "If your system time is running fast, rate limiting may not be " + "effective. " + f"The rate limit resets at {reset} and the current time is {now}." + ) + + log.warning("rate limit exceeded: sleeping %s secs", seconds) + time.sleep(seconds) - log.warning("rate limit exceeded: sleeping %s secs", seconds) - time.sleep(seconds) elif resp.status_code >= 500: errors += 1 if errors > tries: diff --git a/twarc/version.py b/twarc/version.py index de337e7d..6b1d15a7 100644 --- a/twarc/version.py +++ b/twarc/version.py @@ -1 +1 @@ -version = "2.8.2" +version = "2.8.3"