Skip to content

Commit

Permalink
Fix paginating issue
Browse files Browse the repository at this point in the history
Manual skip parameters are disabled for now. The code instead detects the next page supplied by the API and uses that to load the next page of data.
  • Loading branch information
pvdmeulen committed Dec 23, 2024
1 parent c066024 commit 30965b3
Show file tree
Hide file tree
Showing 9 changed files with 85 additions and 37 deletions.
Binary file modified .DS_Store
Binary file not shown.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: uktrade
Type: Package
Title: Accessing Data Through HMRC's API
Version: 0.7.1
Version: 0.7.2
Authors@R: person("van der Meulen", "Peter", email = "[email protected]", role = c("aut", "cre"))
Description: Convenient functions to load HMRC Overseas Trade Statistics, Regional Trade Statistics, and custom URLs using HMRC's API.
License: MIT + file LICENSE
Expand All @@ -17,4 +17,4 @@ Imports:
stringr,
dplyr,
rlang
RoxygenNote: 7.2.1
RoxygenNote: 7.3.2
56 changes: 51 additions & 5 deletions R/load_custom.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#' @param base_url Base URL for use in API. Defaults to https://api.uktradeinfo.com.
#' @param endpoint Endpoint for use in API. Takes a single character string with no default.
#' @param custom_search Custom query. Takes a single character string with no default.
#' @param skip_interval A non-negative integer value showing the skip interval for paginated results. Defaults to 40,000 rows.
#' @param skip_interval Indicates the skip interval for paginated results. Defaults to NULL; this uses the skip parameter automatically generated by the API. Setting this to a different non-negative integer value is currently not supported.
#' @param output A character specifying if a tibble ("tibble") or dataframe ("df") should be returned. Defaults to "tibble".
#' @param request A non-negative integer value to keep track of the starting number of requests made. Defaults to zero. This can be increased in case you are making multiple requests using this function in succession and do not want to exceed the API limit (60 requests per minute).
#' @param timer A non-negative integer value (seconds) to keep track of the time taken so far. Defaults to NULL. This can be increased in case you are making multiple requests using this function in succession and do not want to exceed the API limit (60 requests per minute).
Expand Down Expand Up @@ -48,15 +48,31 @@ load_custom <- function(base_url = "https://api.uktradeinfo.com",
endpoint,
custom_search = "",
request = 1,
skip_interval = 4e4,
timer = NULL,
skip_interval = NULL,
output = "tibble",
print_url = FALSE,
debug = FALSE,
use_proxy = FALSE,
...
){

# Hard-code the following variables for now: the HMRC API does not paginate
# when custom skip and top values are set. Overriding them below forces the
# code to use the pagination set by the API itself when a request exceeds
# some predefined number of rows.

if(!is.null(skip_interval)){
warning("Setting skip_interval to NULL. This uses the skip parameter automatically generated by the API. A manual skip parameter is currently not supported.")
}

skip_interval <- NULL
top <- NULL

max_page_size <- 4e4 # Used for a warning when skip_interval is greater

# Continue with request:

check_internet()

done <- FALSE
Expand All @@ -66,6 +82,14 @@ load_custom <- function(base_url = "https://api.uktradeinfo.com",
skip <- 0
page <- 1

if(!is.null(skip_interval)){
if(skip_interval > max_page_size){
warning(paste0("Manual skip interval of ", skip_interval,
" exceeds the maximum page size of ",
max_page_size, ". Consider setting a lower skip interval."))
}
}

# Create timer reference point:
timer <- if(is.null(timer)){ proc.time() } else { timer }

Expand All @@ -83,8 +107,20 @@ load_custom <- function(base_url = "https://api.uktradeinfo.com",
), " seconds"))}

# Construct skip suffix:
skip_suffix <- if(skip == 0) { NULL } else {
paste0("&$skip=", format(skip, scientific = FALSE)) }
skip_suffix <- if(skip == 0 & is.null(skip_interval)) {
# If we are on the first page and we're not using a custom skip interval:
NULL
} else if(skip > 0 & is.null(skip_interval)){
# If skip > 0 and we're not using a custom skip interval:
paste0("&$skip=", format(skip, scientific = FALSE))
} else if(skip == 0 & !is.null(skip_interval)){
# If we are on the first page and we're using a custom skip interval:
paste0("&$top=", format(top, scientific = FALSE))
} else{
# If skip > 0 and we're using a custom skip interval:
paste0("&$skip=", format(skip, scientific = FALSE), "&$top=",
format(top, scientific = FALSE))
}

# Construct URL:
url <- paste0(base_url, "/", endpoint, custom_search, skip_suffix)
Expand Down Expand Up @@ -135,7 +171,17 @@ load_custom <- function(base_url = "https://api.uktradeinfo.com",
# If response indicates there's a next page:
if(length(names(content)) < 3) { done <- TRUE } else {

skip <- skip + skip_interval
if(is.null(skip_interval)){

skip <- as.numeric(
gsub(x = content[[3]], pattern = ".*\\$skip=", replacement = "")
)

} else {

skip <- skip + skip_interval

}

page <- page + 1

Expand Down
4 changes: 2 additions & 2 deletions R/load_ots.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#' @param join_lookup A logical value indicating whether results should be joined with lookups from the API. Defaults to TRUE. Setting to FALSE will return a smaller but less human-readable dataframe containing only codes.
#' @param print_url A logical. Defaults to FALSE. Setting this to TRUE will print the URL(s) used to load the trade data to the console.
#' @param output A character specifying if a tibble ("tibble") or dataframe ("df") should be returned. Defaults to "tibble".
#' @param skip_interval Passed to load_custom(). A non-negative integer value showing the skip interval for paginated results. Defaults to 40,000 rows.
#' @param skip_interval Passed to load_custom(). Indicates the skip interval for paginated results. Defaults to NULL; this uses the skip parameter automatically generated by the API. Setting this to a different non-negative integer value is currently not supported.
#' @param use_proxy A logical. Defaults to FALSE. Setting this to TRUE will allow the use of a proxy connection using `use_proxy()` from `httr`.
#' @param ... Optional arguments to be passed along to `use_proxy()` when using a proxy connection (by setting use_proxy to TRUE). See the `httr` documentation for more details.
#'
Expand Down Expand Up @@ -53,7 +53,7 @@ load_ots <- function(month = NULL,
join_lookup = TRUE,
print_url = FALSE,
output = "tibble",
skip_interval = 4e4,
skip_interval = NULL,
use_proxy = FALSE,
...
){
Expand Down
43 changes: 22 additions & 21 deletions R/load_rts.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#' @param join_lookup A logical value indicating whether results should be joined with lookups from the API. Defaults to TRUE. Setting to FALSE will return a smaller but less human-readable dataframe containing only codes.
#' @param print_url A logical. Defaults to FALSE. Setting this to TRUE will print the URL(s) used to load the trade data to the console.
#' @param output A character specifying if a tibble ("tibble") or dataframe ("df") should be returned. Defaults to "tibble".
#' @param skip_interval Passed to load_custom(). A non-negative integer value showing the skip interval for paginated results. Defaults to 40,000 rows.
#' @param skip_interval Passed to load_custom(). Indicates the skip interval for paginated results. Defaults to NULL; this uses the skip parameter automatically generated by the API. Setting this to a different non-negative integer value is currently not supported.
#' @param use_proxy A logical. Defaults to FALSE. Setting this to TRUE will allow the use of a proxy connection using `use_proxy()` from `httr`.
#' @param ... Optional arguments to be passed along to `use_proxy()` when using a proxy connection (by setting use_proxy to TRUE). See the `httr` documentation for more details.
#'
Expand Down Expand Up @@ -51,18 +51,18 @@ load_rts <- function(month = NULL,
join_lookup = TRUE,
print_url = FALSE,
output = "tibble",
skip_interval = 4e4,
skip_interval = NULL,
use_proxy = FALSE,
...
){
){

# If no commodities are chosen, load all (detailed):

if(is.null(sitc)){

message("Loading trade data for all commodities. This may take a while.")

}
}

# Check for internet:
check_internet()
Expand All @@ -85,12 +85,12 @@ load_rts <- function(month = NULL,

NULL

} else {
} else {

unique(c(country_region_lookup[is.element(
country_region_lookup$CountryCodeAlpha, country), "CountryId"]))

}
}

chosen_region_id <- if(is.null(region)){

Expand All @@ -101,14 +101,15 @@ load_rts <- function(month = NULL,
unique(c(country_region_lookup[is.element(
country_region_lookup$Area1a, region), "RegionId"]))

}
}

# UK country lookup ----------------------------------------------------------

ukcountry_lookup <- load_custom(endpoint = "Region",
output = output,
request = request,
timer = timer, skip_interval = skip_interval,
timer = timer,
skip_interval = skip_interval,
print_url = FALSE,
use_proxy = use_proxy, ...)

Expand All @@ -124,7 +125,7 @@ load_rts <- function(month = NULL,
unique(c(ukcountry_lookup[is.element(
ukcountry_lookup$GovRegionGroupName, uk_country), "GovRegionId"]))

}
}

# Build filter ---------------------------------------------------------------

Expand Down Expand Up @@ -155,8 +156,8 @@ load_rts <- function(month = NULL,
custom_search = paste0("?$filter=", filter),
output = output,
request = request, timer = timer,
skip_interval = skip_interval, print_url = print_url,
use_proxy = use_proxy, ...)
skip_interval = skip_interval,
print_url = print_url, use_proxy = use_proxy, ...)

if(join_lookup == FALSE) { return(rts_data) } else {

Expand All @@ -169,22 +170,22 @@ load_rts <- function(month = NULL,

paste0("?$filter=", paste0(stringr::str_extract_all(
filter, "\\(CommoditySitc2Id[^()]+\\)"
)[[1]], collapse = " or "))
)[[1]], collapse = " or "))

}

# Rename filter:
sitc_filter <- stringr::str_replace_all(
sitc_filter,"CommoditySitc2Id", "CommoditySitcId"
)
)

sitc_lookup <- load_custom(endpoint = "SITC",
custom_search = sitc_filter,
output = output,
request = request,
timer = timer, skip_interval = skip_interval,
print_url = FALSE,
use_proxy = use_proxy, ...)
timer = timer,
skip_interval = skip_interval,
print_url = FALSE, use_proxy = use_proxy, ...)

# Remove potential odata column:

Expand All @@ -199,17 +200,17 @@ load_rts <- function(month = NULL,

paste0("?$filter=", paste0(stringr::str_extract_all(
filter, "\\(FlowTypeId[^()]+\\)"
)[[1]], collapse = " and "))
)[[1]], collapse = " and "))

}

flow_lookup <- load_custom(endpoint = "FlowType",
custom_search = flow_filter,
output = output,
request = request,
timer = timer, skip_interval = skip_interval,
print_url = FALSE,
use_proxy = use_proxy, ...)
timer = timer,
skip_interval = skip_interval,
print_url = FALSE, use_proxy = use_proxy, ...)

# Remove potential odata column:

Expand Down Expand Up @@ -251,7 +252,7 @@ load_rts <- function(month = NULL,

dplyr::as_tibble(rts_data)

}
}

return(rts_data)

Expand Down
6 changes: 3 additions & 3 deletions man/load_custom.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/load_ots.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions man/load_rts.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions uktrade.Rproj
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
Version: 1.0
ProjectId: b2c19a9c-1a7c-4b3e-b7a2-5b6fa65f4194

RestoreWorkspace: Default
SaveWorkspace: Default
Expand Down

0 comments on commit 30965b3

Please sign in to comment.