Merge pull request #604 from DocNow/workaround_counts_API_zero_count
Work around a limitation where a long-running Twitter API counts request can return an empty page before all results are delivered
edsu authored Mar 4, 2022
2 parents eb0ced8 + 23c977f commit 79a35fa
Showing 3 changed files with 84 additions and 12 deletions.
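
For context, the symptom this merge addresses: a long-running historical counts request could return an empty page part way through, which previously ended iteration before the requested start_time was reached. A minimal sketch of the code path being fixed, assuming a Twarc2 client with Academic Research Product Track access (the query and window mirror the new test below):

    import datetime
    from twarc import Twarc2

    # Assumes a bearer token with Academic Research access.
    T = Twarc2(bearer_token="REPLACE_ME")

    found_counts = 0
    for page in T.counts_all(
        "beans",
        start_time=datetime.datetime(2006, 3, 21),
        end_time=datetime.datetime(2006, 6, 1),
        granularity="day",
    ):
        # With this fix, empty pages are logged and skipped rather than
        # yielded, so "data" is always present on yielded pages.
        found_counts += len(page["data"])

    print(found_counts)  # expected: 72 daily count buckets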
1 change: 1 addition & 0 deletions .github/workflows/main.yml
@@ -37,6 +37,7 @@ jobs:
          ACCESS_TOKEN: ${{ secrets.access_token }}
          ACCESS_TOKEN_SECRET: ${{ secrets.access_token_secret }}
          BEARER_TOKEN: ${{ secrets.bearer_token }}
          SKIP_ACADEMIC_PRODUCT_TRACK: true
        run: python setup.py test

      - name: Ensure packages can be built
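Note that the test suite checks only for this variable's presence, not its value, so any value (here "true") triggers the skip. A minimal illustration of the check used below:

    import os

    # True when the variable is set at all, regardless of its value.
    skip_academic = os.environ.get("SKIP_ACADEMIC_PRODUCT_TRACK") is not None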
20 changes: 20 additions & 0 deletions test_twarc2.py
@@ -127,6 +127,26 @@ def test_counts_recent():
    assert 7 <= found_counts <= 8


@pytest.mark.skipif(
    os.environ.get("SKIP_ACADEMIC_PRODUCT_TRACK") is not None,
    reason="No Academic Research Product Track access",
)
def test_counts_empty_page():

    found_counts = 0

    for response_page in T.counts_all(
        "beans",
        start_time=datetime.datetime(2006, 3, 21),
        end_time=datetime.datetime(2006, 6, 1),
        granularity="day",
    ):
        counts = response_page["data"]
        found_counts += len(counts)

    assert found_counts == 72


def test_search_times():
    found = False
    now = datetime.datetime.now(tz=pytz.timezone("Australia/Melbourne"))
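The expected total of 72 follows from the query window itself: with day granularity, and assuming the API treats end_time as exclusive (which the expected total suggests), the buckets span the 11 remaining days of March plus 30 in April and 31 in May 2006:

    import datetime

    delta = datetime.datetime(2006, 6, 1) - datetime.datetime(2006, 3, 21)
    print(delta.days)  # 72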
75 changes: 63 additions & 12 deletions twarc/client2.py
@@ -215,6 +215,9 @@ def _search(
        if granularity:
            # Do not specify anything else when calling counts endpoint
            params["granularity"] = granularity
            # Mark that we're using counts, to work around a limitation of
            # the Twitter API with long running counts.
            using_counts = True
        else:
            params = self._prepare_params(
                **params,
@@ -225,22 +228,69 @@
                poll_fields=poll_fields,
                place_fields=place_fields,
            )
            using_counts = False

        # Workaround for observed odd behaviour in the Twitter counts
        # functionality.
        if using_counts:
            while True:
                for response in self.get_paginated(url, params=params):

                    # Note that we're ensuring the appropriate amount of sleep is
                    # taken before yielding every item. This ensures that we won't
                    # exceed the rate limit even in cases where a response generator
                    # is not completely consumed. This might be more conservative
                    # than necessary.
                    time.sleep(sleep_between)

                    # can't return without 'data' if there are no results
                    if "data" in response:
                        last_time_start = response["data"][0]["start"]
                        yield response

                    else:
                        log.info("Retrieved an empty page of results.")

                # Check that we've actually reached the end, and restart if necessary.
                # Note we need to exactly match the Twitter format, which is a little
                # fiddly because Python doesn't let you specify milliseconds only for
                # strftime.
                if (
                    start_time is None
                    or (start_time.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z")
                    == last_time_start
                ):
                    break
                else:
                    # Note that we're passing the Twitter start_time straight
                    # back to it - this avoids parsing and reformatting the date.
                    params["end_time"] = last_time_start

                    # Remove the next_token reference, we're restarting the search.
                    if "next_token" in params:
                        del params["next_token"]

                    log.info(
                        "Detected incomplete counts, restarting with "
                        f"{last_time_start} as the new end_time"
                    )

        else:
            for response in self.get_paginated(url, params=params):

                # Note that we're ensuring the appropriate amount of sleep is
                # taken before yielding every item. This ensures that we won't
                # exceed the rate limit even in cases where a response generator
                # is not completely consumed. This might be more conservative
                # than necessary.
                time.sleep(sleep_between)

                # can't return without 'data' if there are no results
                if "data" in response:
                    yield response

                else:
                    log.info("Retrieved an empty page of results.")

        log.info(f"No more results for search {query}.")

@@ -1200,6 +1250,7 @@ def get_paginated(self, *args, **kwargs):
        token_param = "next_token"

        while "meta" in page and "next_token" in page["meta"]:

            if "params" in kwargs:
                kwargs["params"][token_param] = page["meta"]["next_token"]
            else:
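For reference, the loop above implements standard cursor pagination: keep re-requesting with meta.next_token until a page arrives without one. A self-contained sketch of the pattern, with a hypothetical fetch_page callable standing in for the HTTP request:

    from typing import Any, Callable, Dict, Iterator

    def paginate(
        fetch_page: Callable[[Dict[str, Any]], Dict[str, Any]],
        params: Dict[str, Any],
    ) -> Iterator[Dict[str, Any]]:
        # Thread each page's next_token back into the request parameters
        # until the API stops returning one.
        page = fetch_page(params)
        yield page
        while "meta" in page and "next_token" in page["meta"]:
            params["next_token"] = page["meta"]["next_token"]
            page = fetch_page(params)
            yield page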