From 7013274ec27b02c67300544e585fe6578715569b Mon Sep 17 00:00:00 2001 From: Sourabh Gandhi <105213416+sgandhi1311@users.noreply.github.com> Date: Mon, 15 May 2023 17:15:02 +0530 Subject: [PATCH] TDL-22921 Fix the api limit error (#190) * to avoid api rate limit error, tap will sleep for the seconds mentioned in header - X-RateLimit-Remaining * recursively call the function(afterwards) if the tap is paused for sometime. * fix the existing unit tests * fixed pylint issue * update comments * setup and changelog --- CHANGELOG.md | 3 +++ setup.py | 2 +- tap_github/client.py | 27 +++++++++++++-------------- tests/unittests/test_rate_limit.py | 27 +++++---------------------- 4 files changed, 22 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f9cd9ed..040bfa3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +# 2.0.2 + * Make the tap sleep for `X-RateLimit-Reset` + `2` seconds, whenever the API rate limit is hit [#187](https://github.com/singer-io/tap-github/pull/187) + # 2.0.1 * Allow `commits` stream sync to continue when we hit an empty repo [#187](https://github.com/singer-io/tap-github/pull/187) diff --git a/setup.py b/setup.py index de812020..6d9ba88d 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-github', - version='2.0.1', + version='2.0.2', description='Singer.io tap for extracting data from the GitHub API', author='Stitch', url='http://singer.io', diff --git a/tap_github/client.py b/tap_github/client.py index c7f2a217..fe6b3de4 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -6,7 +6,6 @@ from singer import metrics LOGGER = singer.get_logger() -DEFAULT_SLEEP_SECONDS = 600 DEFAULT_DOMAIN = "https://api.github.com" # Set default timeout of 300 seconds @@ -136,24 +135,23 @@ def calculate_seconds(epoch): current = time.time() return int(round((epoch - current), 0)) -def rate_throttling(response, max_sleep_seconds): +def rate_throttling(response): """ For rate limit errors, get the remaining time before retrying and calculate the time to sleep before making a new request. """ if 'X-RateLimit-Remaining' in response.headers: if int(response.headers['X-RateLimit-Remaining']) == 0: seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset'])) - - if seconds_to_sleep > max_sleep_seconds: - message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep) - raise RateLimitExceeded(message) from None - LOGGER.info("API rate limit exceeded. Tap will retry the data collection after %s seconds.", seconds_to_sleep) - time.sleep(seconds_to_sleep) - else: - # Raise an exception if `X-RateLimit-Remaining` is not found in the header. - # API does include this key header if provided base URL is not a valid github custom domain. - raise GithubException("The API call using the specified base url was unsuccessful. Please double-check the provided base URL.") + # add the buffer 2 seconds + time.sleep(seconds_to_sleep + 2) + #returns True if tap sleeps + return True + return False + + # Raise an exception if `X-RateLimit-Remaining` is not found in the header. + # API does include this key header if provided base URL is not a valid github custom domain. + raise GithubException("The API call using the specified base url was unsuccessful. Please double-check the provided base URL.") class GithubClient: """ @@ -163,7 +161,6 @@ def __init__(self, config): self.config = config self.session = requests.Session() self.base_url = config['base_url'] if config.get('base_url') else DEFAULT_DOMAIN - self.max_sleep_seconds = self.config.get('max_sleep_seconds', DEFAULT_SLEEP_SECONDS) self.set_auth_in_session() self.not_accessible_repos = set() @@ -199,10 +196,12 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True) with metrics.http_request_timer(source) as timer: self.session.headers.update(headers) resp = self.session.request(method='get', url=url, timeout=self.get_request_timeout()) + if rate_throttling(resp): + # If the API rate limit is reached, the function will be recursively + self.authed_get(source, url, headers, stream, should_skip_404) if resp.status_code != 200: raise_for_error(resp, source, stream, self, should_skip_404) timer.tags[metrics.Tag.http_status_code] = resp.status_code - rate_throttling(resp, self.max_sleep_seconds) if resp.status_code in {404, 409}: # Return an empty response body since we're not raising a NotFoundException diff --git a/tests/unittests/test_rate_limit.py b/tests/unittests/test_rate_limit.py index 987c60a0..a10525d8 100644 --- a/tests/unittests/test_rate_limit.py +++ b/tests/unittests/test_rate_limit.py @@ -19,7 +19,7 @@ class TestRateLimit(unittest.TestCase): def test_rate_limt_wait(self, mocked_sleep): """ - Test `rate_throttling` for 'sleep_time' less than `MAX_SLEEP_SECONDS` + Test `rate_throttling` for 'sleep_time' """ mocked_sleep.side_effect = None @@ -28,30 +28,13 @@ def test_rate_limt_wait(self, mocked_sleep): resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 120 resp.headers["X-RateLimit-Remaining"] = 0 - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + rate_throttling(resp) # Verify `time.sleep` is called with expected seconds in response - mocked_sleep.assert_called_with(120) + mocked_sleep.assert_called_with(122) self.assertTrue(mocked_sleep.called) - def test_rate_limit_exception(self, mocked_sleep): - """ - Test `rate_throttling` for 'sleep_time' greater than `MAX_SLEEP_SECONDS` - """ - - mocked_sleep.side_effect = None - - resp = api_call() - resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 601 - resp.headers["X-RateLimit-Remaining"] = 0 - - # Verify exception is raised with proper message - with self.assertRaises(tap_github.client.RateLimitExceeded) as e: - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) - self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 601 seconds.") - - def test_rate_limit_not_exceeded(self, mocked_sleep): """ Test `rate_throttling` if sleep time does not exceed limit @@ -63,7 +46,7 @@ def test_rate_limit_not_exceeded(self, mocked_sleep): resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 10 resp.headers["X-RateLimit-Remaining"] = 5 - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + rate_throttling(resp) # Verify that `time.sleep` is not called self.assertFalse(mocked_sleep.called) @@ -76,7 +59,7 @@ def test_rate_limt_header_not_found(self, mocked_sleep): resp.headers={} with self.assertRaises(GithubException) as e: - rate_throttling(resp, DEFAULT_SLEEP_SECONDS) + rate_throttling(resp) # Verifying the message formed for the invalid base URL self.assertEqual(str(e.exception), "The API call using the specified base url was unsuccessful. Please double-check the provided base URL.")