Skip to content

Commit

Permalink
TDL-22921 Fix the api limit error (#190)
Browse files Browse the repository at this point in the history
* to avoid api rate limit error, tap will sleep for the seconds mentioned in header - X-RateLimit-Remaining

* recursively call the function(afterwards) if the tap is paused for sometime.

* fix the existing unit tests

* fixed pylint issue

* update comments

* setup and changelog
  • Loading branch information
sgandhi1311 authored May 15, 2023
1 parent 099df74 commit 7013274
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 37 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Changelog

# 2.0.2
* Make the tap sleep for `X-RateLimit-Reset` + `2` seconds, whenever the API rate limit is hit [#187](https://github.com/singer-io/tap-github/pull/187)

# 2.0.1
* Allow `commits` stream sync to continue when we hit an empty repo [#187](https://github.com/singer-io/tap-github/pull/187)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from setuptools import setup, find_packages

setup(name='tap-github',
version='2.0.1',
version='2.0.2',
description='Singer.io tap for extracting data from the GitHub API',
author='Stitch',
url='http://singer.io',
Expand Down
27 changes: 13 additions & 14 deletions tap_github/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from singer import metrics

LOGGER = singer.get_logger()
DEFAULT_SLEEP_SECONDS = 600
DEFAULT_DOMAIN = "https://api.github.com"

# Set default timeout of 300 seconds
Expand Down Expand Up @@ -136,24 +135,23 @@ def calculate_seconds(epoch):
current = time.time()
return int(round((epoch - current), 0))

def rate_throttling(response, max_sleep_seconds):
def rate_throttling(response):
"""
For rate limit errors, get the remaining time before retrying and calculate the time to sleep before making a new request.
"""
if 'X-RateLimit-Remaining' in response.headers:
if int(response.headers['X-RateLimit-Remaining']) == 0:
seconds_to_sleep = calculate_seconds(int(response.headers['X-RateLimit-Reset']))

if seconds_to_sleep > max_sleep_seconds:
message = "API rate limit exceeded, please try after {} seconds.".format(seconds_to_sleep)
raise RateLimitExceeded(message) from None

LOGGER.info("API rate limit exceeded. Tap will retry the data collection after %s seconds.", seconds_to_sleep)
time.sleep(seconds_to_sleep)
else:
# Raise an exception if `X-RateLimit-Remaining` is not found in the header.
# API does include this key header if provided base URL is not a valid github custom domain.
raise GithubException("The API call using the specified base url was unsuccessful. Please double-check the provided base URL.")
# add the buffer 2 seconds
time.sleep(seconds_to_sleep + 2)
#returns True if tap sleeps
return True
return False

# Raise an exception if `X-RateLimit-Remaining` is not found in the header.
# API does include this key header if provided base URL is not a valid github custom domain.
raise GithubException("The API call using the specified base url was unsuccessful. Please double-check the provided base URL.")

class GithubClient:
"""
Expand All @@ -163,7 +161,6 @@ def __init__(self, config):
self.config = config
self.session = requests.Session()
self.base_url = config['base_url'] if config.get('base_url') else DEFAULT_DOMAIN
self.max_sleep_seconds = self.config.get('max_sleep_seconds', DEFAULT_SLEEP_SECONDS)
self.set_auth_in_session()
self.not_accessible_repos = set()

Expand Down Expand Up @@ -199,10 +196,12 @@ def authed_get(self, source, url, headers={}, stream="", should_skip_404 = True)
with metrics.http_request_timer(source) as timer:
self.session.headers.update(headers)
resp = self.session.request(method='get', url=url, timeout=self.get_request_timeout())
if rate_throttling(resp):
# If the API rate limit is reached, the function will be recursively
self.authed_get(source, url, headers, stream, should_skip_404)
if resp.status_code != 200:
raise_for_error(resp, source, stream, self, should_skip_404)
timer.tags[metrics.Tag.http_status_code] = resp.status_code
rate_throttling(resp, self.max_sleep_seconds)
if resp.status_code in {404, 409}:
# Return an empty response body since we're not raising a NotFoundException

Expand Down
27 changes: 5 additions & 22 deletions tests/unittests/test_rate_limit.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class TestRateLimit(unittest.TestCase):

def test_rate_limt_wait(self, mocked_sleep):
"""
Test `rate_throttling` for 'sleep_time' less than `MAX_SLEEP_SECONDS`
Test `rate_throttling` for 'sleep_time'
"""

mocked_sleep.side_effect = None
Expand All @@ -28,30 +28,13 @@ def test_rate_limt_wait(self, mocked_sleep):
resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 120
resp.headers["X-RateLimit-Remaining"] = 0

rate_throttling(resp, DEFAULT_SLEEP_SECONDS)
rate_throttling(resp)

# Verify `time.sleep` is called with expected seconds in response
mocked_sleep.assert_called_with(120)
mocked_sleep.assert_called_with(122)
self.assertTrue(mocked_sleep.called)


def test_rate_limit_exception(self, mocked_sleep):
"""
Test `rate_throttling` for 'sleep_time' greater than `MAX_SLEEP_SECONDS`
"""

mocked_sleep.side_effect = None

resp = api_call()
resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 601
resp.headers["X-RateLimit-Remaining"] = 0

# Verify exception is raised with proper message
with self.assertRaises(tap_github.client.RateLimitExceeded) as e:
rate_throttling(resp, DEFAULT_SLEEP_SECONDS)
self.assertEqual(str(e.exception), "API rate limit exceeded, please try after 601 seconds.")


def test_rate_limit_not_exceeded(self, mocked_sleep):
"""
Test `rate_throttling` if sleep time does not exceed limit
Expand All @@ -63,7 +46,7 @@ def test_rate_limit_not_exceeded(self, mocked_sleep):
resp.headers["X-RateLimit-Reset"] = int(round(time.time(), 0)) + 10
resp.headers["X-RateLimit-Remaining"] = 5

rate_throttling(resp, DEFAULT_SLEEP_SECONDS)
rate_throttling(resp)

# Verify that `time.sleep` is not called
self.assertFalse(mocked_sleep.called)
Expand All @@ -76,7 +59,7 @@ def test_rate_limt_header_not_found(self, mocked_sleep):
resp.headers={}

with self.assertRaises(GithubException) as e:
rate_throttling(resp, DEFAULT_SLEEP_SECONDS)
rate_throttling(resp)

# Verifying the message formed for the invalid base URL
self.assertEqual(str(e.exception), "The API call using the specified base url was unsuccessful. Please double-check the provided base URL.")

0 comments on commit 7013274

Please sign in to comment.