Fix pagination issue

Manual skip parameters are disabled for now. The code instead detects the next page supplied by the API and uses that to load the next page of data.
pvdmeulen · Dec 23, 2024 · 30965b3 · 30965b3
1 parent c066024
commit 30965b3
Show file tree

Hide file tree

Showing 9 changed files with 85 additions and 37 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: uktrade
 Type: Package
 Title: Accessing Data Through HMRC's API
-Version: 0.7.1
+Version: 0.7.2
 Authors@R: person("van der Meulen", "Peter", email = "[email protected]", role = c("aut", "cre"))
 Description: Convenient functions to load HMRC Overseas Trade Statistics, Regional Trade Statistics, and custom URLs using HMRC's API.
 License: MIT + file LICENSE
@@ -17,4 +17,4 @@ Imports:
   stringr,
   dplyr,
   rlang
-RoxygenNote: 7.2.1
+RoxygenNote: 7.3.2
diff --git a/R/load_custom.R b/R/load_custom.R
@@ -6,7 +6,7 @@
 #' @param base_url Base URL for use in API. Defaults to https://api.uktradeinfo.com.
 #' @param endpoint Endpoint for use in API. Takes a single character string with no default.
 #' @param custom_search Custom query. Takes a single character string with no default.
-#' @param skip_interval A non-negative integer value showing the skip interval for paginated results. Defaults to 40,000 rows.
+#' @param skip_interval Indicates the skip interval for paginated results. Defaults to NULL; this uses the skip parameter automatically generated by the API. Setting this to a different non-negative integer value is currently not supported.
 #' @param output A character specifying if a tibble ("tibble") or dataframe ("df") should be returned. Defaults to "tibble".
 #' @param request A non-negative integer value to keep track of the starting number of requests made. Defaults to 1. This can be increased in case you are making multiple requests using this function in succession and do not want to exceed the API limit (60 requests per minute).
 #' @param timer A non-negative integer value (seconds) to keep track of the time taken so far. Defaults to NULL. This can be increased in case you are making multiple requests using this function in succession and do not want to exceed the API limit (60 requests per minute).
@@ -48,15 +48,31 @@ load_custom <- function(base_url = "https://api.uktradeinfo.com",
                         endpoint,
                         custom_search = "",
                         request = 1,
-                        skip_interval = 4e4,
                         timer = NULL,
+                        skip_interval = NULL,
                         output = "tibble",
                         print_url = FALSE,
                         debug = FALSE,
                         use_proxy = FALSE,
                         ...
 ){
 
+  # Hard code the following variables for now - HMRC API does not paginate when
+  # setting custom skip and top variables. Resetting them to NULL below forces
+  # the code to use the pagination set by the API itself when a request exceeds
+  # some predefined amount of rows.
+
+  if(!is.null(skip_interval)){
+    warning("Setting skip_interval to NULL. This uses the skip parameter automatically generated by the API. A manual skip paramater is currently not supported.")
+  }
+
+  skip_interval <- NULL
+  top <- NULL
+
+  max_page_size <- 4e4 # Used for a warning when skip_interval is greater
+
+  # Continue with request:
+
   check_internet()
 
   done <- FALSE
@@ -66,6 +82,14 @@ load_custom <- function(base_url = "https://api.uktradeinfo.com",
   skip <- 0
   page <- 1
 
+  if(!is.null(skip_interval)){
+    if(skip_interval > max_page_size){
+      warning(paste0("Manual skip interval of ", skip_interval,
+                     " exceeds the maximum page size of ",
+                     max_page_size, ". Consider setting a lower skip interval."))
+    }
+  }
+
   # Create timer reference point:
   timer <- if(is.null(timer)){ proc.time() } else { timer }
 
@@ -83,8 +107,20 @@ load_custom <- function(base_url = "https://api.uktradeinfo.com",
                                    ), " seconds"))}
 
     # Construct skip suffix:
-    skip_suffix <- if(skip == 0) { NULL } else {
-      paste0("&$skip=", format(skip, scientific = FALSE)) }
+    skip_suffix <- if(skip == 0 & is.null(skip_interval)) {
+      # If we are on the first page and we're not using a custom skip interval:
+      NULL
+    } else if(skip > 0 & is.null(skip_interval)){
+      # If skip > 0 and we're not using a custom skip interval:
+      paste0("&$skip=", format(skip, scientific = FALSE))
+    } else if(skip == 0 & !is.null(skip_interval)){
+      # If we are on the first page and we're using a custom skip interval:
+      paste0("&$top=", format(top, scientific = FALSE))
+    } else{
+      # If skip > 0 and we're using a custom skip interval:
+      paste0("&$skip=", format(skip, scientific = FALSE), "&$top=",
+             format(top, scientific = FALSE))
+    }
 
     # Construct URL:
     url <- paste0(base_url, "/", endpoint, custom_search, skip_suffix)
@@ -135,7 +171,17 @@ load_custom <- function(base_url = "https://api.uktradeinfo.com",
     # If the response has no next-page link we are done; otherwise continue:
     if(length(names(content)) < 3) { done <- TRUE } else {
 
-      skip <- skip + skip_interval
+      if(is.null(skip_interval)){
+
+        skip <- as.numeric(
+          gsub(x = content[[3]], pattern = ".*\\$skip=", replacement = "")
+        )
+
+      } else {
+
+        skip <- skip + skip_interval
+
+      }
 
       page <- page + 1
 

diff --git a/R/load_ots.R b/R/load_ots.R
@@ -14,7 +14,7 @@
 #' @param join_lookup A logical value indicating whether results should be joined with lookups from the API. Defaults to TRUE. Setting to FALSE will return a smaller but less human-readable dataframe containing only codes.
 #' @param print_url A logical. Defaults to FALSE. Setting this to TRUE will print the URL(s) used to load the trade data to the console.
 #' @param output A character specifying if a tibble ("tibble") or dataframe ("df") should be returned. Defaults to "tibble".
-#' @param skip_interval Passed to load_custom(). A non-negative integer value showing the skip interval for paginated results. Defaults to 40,000 rows.
+#' @param skip_interval Passed to load_custom(). Indicates the skip interval for paginated results. Defaults to NULL; this uses the skip parameter automatically generated by the API. Setting this to a different non-negative integer value is currently not supported.
 #' @param use_proxy A logical. Defaults to FALSE. Setting this to TRUE will allow the use of a proxy connection using `use_proxy()` from `httr`.
 #' @param ... Optional arguments to be passed along to `use_proxy()` when using a proxy connection (by setting use_proxy to TRUE). See the `httr` documentation for more details.
 #'
@@ -53,7 +53,7 @@ load_ots <- function(month = NULL,
                      join_lookup = TRUE,
                      print_url = FALSE,
                      output = "tibble",
-                     skip_interval = 4e4,
+                     skip_interval = NULL,
                      use_proxy = FALSE,
                      ...
                      ){

diff --git a/R/load_rts.R b/R/load_rts.R
@@ -12,7 +12,7 @@
 #' @param join_lookup A logical value indicating whether results should be joined with lookups from the API. Defaults to TRUE. Setting to FALSE will return a smaller but less human-readable dataframe containing only codes.
 #' @param print_url A logical. Defaults to FALSE. Setting this to TRUE will print the URL(s) used to load the trade data to the console.
 #' @param output A character specifying if a tibble ("tibble") or dataframe ("df") should be returned. Defaults to "tibble".
-#' @param skip_interval Passed to load_custom(). A non-negative integer value showing the skip interval for paginated results. Defaults to 40,000 rows.
+#' @param skip_interval Passed to load_custom(). Indicates the skip interval for paginated results. Defaults to NULL; this uses the skip parameter automatically generated by the API. Setting this to a different non-negative integer value is currently not supported.
 #' @param use_proxy A logical. Defaults to FALSE. Setting this to TRUE will allow the use of a proxy connection using `use_proxy()` from `httr`.
 #' @param ... Optional arguments to be passed along to `use_proxy()` when using a proxy connection (by setting use_proxy to TRUE). See the `httr` documentation for more details.
 #'
@@ -51,18 +51,18 @@ load_rts <- function(month = NULL,
                      join_lookup = TRUE,
                      print_url = FALSE,
                      output = "tibble",
-                     skip_interval = 4e4,
+                     skip_interval = NULL,
                      use_proxy = FALSE,
                      ...
-                     ){
+){
 
   # If no commodities are chosen, load all (detailed):
 
   if(is.null(sitc)){
 
     message("Loading trade data for all commodities. This may take a while.")
 
-    }
+  }
 
   # Check for internet:
   check_internet()
@@ -85,12 +85,12 @@ load_rts <- function(month = NULL,
 
     NULL
 
-    } else {
+  } else {
 
     unique(c(country_region_lookup[is.element(
       country_region_lookup$CountryCodeAlpha, country), "CountryId"]))
 
-      }
+  }
 
   chosen_region_id <- if(is.null(region)){
 
@@ -101,14 +101,15 @@ load_rts <- function(month = NULL,
     unique(c(country_region_lookup[is.element(
       country_region_lookup$Area1a, region), "RegionId"]))
 
-    }
+  }
 
   # UK country lookup ----------------------------------------------------------
 
   ukcountry_lookup <- load_custom(endpoint = "Region",
                                   output = output,
                                   request = request,
-                                  timer = timer, skip_interval = skip_interval,
+                                  timer = timer,
+                                  skip_interval = skip_interval,
                                   print_url = FALSE,
                                   use_proxy = use_proxy, ...)
 
@@ -124,7 +125,7 @@ load_rts <- function(month = NULL,
     unique(c(ukcountry_lookup[is.element(
       ukcountry_lookup$GovRegionGroupName, uk_country), "GovRegionId"]))
 
-    }
+  }
 
   # Build filter ---------------------------------------------------------------
 
@@ -155,8 +156,8 @@ load_rts <- function(month = NULL,
                           custom_search = paste0("?$filter=", filter),
                           output = output,
                           request = request, timer = timer,
-                          skip_interval = skip_interval, print_url = print_url,
-                          use_proxy = use_proxy, ...)
+                          skip_interval = skip_interval,
+                          print_url = print_url, use_proxy = use_proxy, ...)
 
   if(join_lookup == FALSE) { return(rts_data) } else {
 
@@ -169,22 +170,22 @@ load_rts <- function(month = NULL,
 
       paste0("?$filter=", paste0(stringr::str_extract_all(
         filter, "\\(CommoditySitc2Id[^()]+\\)"
-        )[[1]], collapse = " or "))
+      )[[1]], collapse = " or "))
 
     }
 
     # Rename filter:
     sitc_filter <- stringr::str_replace_all(
       sitc_filter,"CommoditySitc2Id", "CommoditySitcId"
-      )
+    )
 
     sitc_lookup <- load_custom(endpoint = "SITC",
                                custom_search = sitc_filter,
                                output = output,
                                request = request,
-                               timer = timer, skip_interval = skip_interval,
-                               print_url = FALSE,
-                               use_proxy = use_proxy, ...)
+                               timer = timer,
+                               skip_interval = skip_interval,
+                               print_url = FALSE, use_proxy = use_proxy, ...)
 
     # Remove potential odata column:
 
@@ -199,17 +200,17 @@ load_rts <- function(month = NULL,
 
       paste0("?$filter=", paste0(stringr::str_extract_all(
         filter, "\\(FlowTypeId[^()]+\\)"
-        )[[1]], collapse = " and "))
+      )[[1]], collapse = " and "))
 
     }
 
     flow_lookup <- load_custom(endpoint = "FlowType",
                                custom_search = flow_filter,
                                output = output,
                                request = request,
-                               timer = timer, skip_interval = skip_interval,
-                               print_url = FALSE,
-                               use_proxy = use_proxy, ...)
+                               timer = timer,
+                               skip_interval = skip_interval,
+                               print_url = FALSE, use_proxy = use_proxy, ...)
 
     # Remove potential odata column:
 
@@ -251,7 +252,7 @@ load_rts <- function(month = NULL,
 
       dplyr::as_tibble(rts_data)
 
-      }
+    }
 
     return(rts_data)
 

diff --git a/man/load_custom.Rd b/man/load_custom.Rd
diff --git a/man/load_ots.Rd b/man/load_ots.Rd
diff --git a/man/load_rts.Rd b/man/load_rts.Rd
diff --git a/uktrade.Rproj b/uktrade.Rproj
@@ -1,4 +1,5 @@
 Version: 1.0
+ProjectId: b2c19a9c-1a7c-4b3e-b7a2-5b6fa65f4194
 
 RestoreWorkspace: Default
 SaveWorkspace: Default