Merge pull request #604 from DocNow/workaround_counts_API_zero_count
Work around a limitation where a long-running Twitter API counts request can return an empty page before all results are delivered
edsu authored Mar 4, 2022
2 parents eb0ced8 + 23c977f commit 79a35fa
Showing 3 changed files with 84 additions and 12 deletions.
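
For context, the symptom this merge addresses: a long-running historical counts request could return an empty page part way through, which previously ended iteration before the requested start_time was reached. A minimal sketch of the code path being fixed, assuming a Twarc2 client with Academic Research Product Track access (the query and window mirror the new test below):

    import datetime
    from twarc import Twarc2

    # Assumes a bearer token with Academic Research access.
    T = Twarc2(bearer_token="REPLACE_ME")

    found_counts = 0
    for page in T.counts_all(
        "beans",
        start_time=datetime.datetime(2006, 3, 21),
        end_time=datetime.datetime(2006, 6, 1),
        granularity="day",
    ):
        # With this fix, empty pages are logged and skipped rather than
        # yielded, so "data" is always present on yielded pages.
        found_counts += len(page["data"])

    print(found_counts)  # expected: 72 daily count buckets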
1 change: 1 addition & 0 deletions .github/workflows/main.yml
@@ -37,6 +37,7 @@ jobs:
          ACCESS_TOKEN: ${{ secrets.access_token }}
          ACCESS_TOKEN_SECRET: ${{ secrets.access_token_secret }}
          BEARER_TOKEN: ${{ secrets.bearer_token }}
          SKIP_ACADEMIC_PRODUCT_TRACK: true
        run: python setup.py test

      - name: Ensure packages can be built
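Note that the test suite checks only for this variable's presence, not its value, so any value (here "true") triggers the skip. A minimal illustration of the check used below:

    import os

    # True when the variable is set at all, regardless of its value.
    skip_academic = os.environ.get("SKIP_ACADEMIC_PRODUCT_TRACK") is not None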
20 changes: 20 additions & 0 deletions test_twarc2.py
@@ -127,6 +127,26 @@ def test_counts_recent():
    assert 7 <= found_counts <= 8


@pytest.mark.skipif(
    os.environ.get("SKIP_ACADEMIC_PRODUCT_TRACK") is not None,
    reason="No Academic Research Product Track access",
)
def test_counts_empty_page():

    found_counts = 0

    for response_page in T.counts_all(
        "beans",
        start_time=datetime.datetime(2006, 3, 21),
        end_time=datetime.datetime(2006, 6, 1),
        granularity="day",
    ):
        counts = response_page["data"]
        found_counts += len(counts)

    assert found_counts == 72


def test_search_times():
    found = False
    now = datetime.datetime.now(tz=pytz.timezone("Australia/Melbourne"))
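The expected total of 72 follows from the query window itself: with day granularity, and assuming the API treats end_time as exclusive (which the expected total suggests), the buckets span the 11 remaining days of March plus 30 in April and 31 in May 2006:

    import datetime

    delta = datetime.datetime(2006, 6, 1) - datetime.datetime(2006, 3, 21)
    print(delta.days)  # 72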
75 changes: 63 additions & 12 deletions twarc/client2.py
@@ -215,6 +215,9 @@ def _search(
        if granularity:
            # Do not specify anything else when calling counts endpoint
            params["granularity"] = granularity
            # Mark that we're using counts, to work around a limitation of
            # the Twitter API with long running counts.
            using_counts = True
        else:
            params = self._prepare_params(
                **params,
@@ -225,22 +228,69 @@
                poll_fields=poll_fields,
                place_fields=place_fields,
            )
            using_counts = False

        # Workaround for observed odd behaviour in the Twitter counts
        # functionality.
        if using_counts:
            while True:
                for response in self.get_paginated(url, params=params):

                    # Note that we're ensuring the appropriate amount of sleep is
                    # taken before yielding every item. This ensures that we won't
                    # exceed the rate limit even in cases where a response generator
                    # is not completely consumed. This might be more conservative
                    # than necessary.
                    time.sleep(sleep_between)

                    # can't return without 'data' if there are no results
                    if "data" in response:
                        last_time_start = response["data"][0]["start"]
                        yield response

                    else:
                        log.info("Retrieved an empty page of results.")

                # Check that we've actually reached the end, and restart if necessary.
                # Note we need to exactly match the Twitter format, which is a little
                # fiddly because Python doesn't let you specify milliseconds only for
                # strftime.
                if (
                    start_time is None
                    or (start_time.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z")
                    == last_time_start
                ):
                    break
                else:
                    # Note that we're passing the Twitter start_time straight
                    # back to it - this avoids parsing and reformatting the date.
                    params["end_time"] = last_time_start

                    # Remove the next_token reference, we're restarting the search.
                    if "next_token" in params:
                        del params["next_token"]

                    log.info(
                        "Detected incomplete counts, restarting with "
                        f"{last_time_start} as the new end_time"
                    )

        else:
            for response in self.get_paginated(url, params=params):

                # Note that we're ensuring the appropriate amount of sleep is
                # taken before yielding every item. This ensures that we won't
                # exceed the rate limit even in cases where a response generator
                # is not completely consumed. This might be more conservative
                # than necessary.
                time.sleep(sleep_between)

                # can't return without 'data' if there are no results
                if "data" in response:
                    yield response

                else:
                    log.info("Retrieved an empty page of results.")

        log.info(f"No more results for search {query}.")

@@ -1200,6 +1250,7 @@ def get_paginated(self, *args, **kwargs):
        token_param = "next_token"

        while "meta" in page and "next_token" in page["meta"]:

            if "params" in kwargs:
                kwargs["params"][token_param] = page["meta"]["next_token"]
            else:
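For reference, the loop above implements standard cursor pagination: keep re-requesting with meta.next_token until a page arrives without one. A self-contained sketch of the pattern, with a hypothetical fetch_page callable standing in for the HTTP request:

    from typing import Any, Callable, Dict, Iterator

    def paginate(
        fetch_page: Callable[[Dict[str, Any]], Dict[str, Any]],
        params: Dict[str, Any],
    ) -> Iterator[Dict[str, Any]]:
        # Thread each page's next_token back into the request parameters
        # until the API stops returning one.
        page = fetch_page(params)
        yield page
        while "meta" in page and "next_token" in page["meta"]:
            params["next_token"] = page["meta"]["next_token"]
            page = fetch_page(params)
            yield page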