From ee22389b559e2c4d85aac9b2113d0428816068a9 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 7 Sep 2023 12:52:26 +0200 Subject: [PATCH 1/3] apply toupper() on unique strings and match them --- R/countrycode.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/countrycode.R b/R/countrycode.R index 7a8a147..8af4fa2 100644 --- a/R/countrycode.R +++ b/R/countrycode.R @@ -198,7 +198,12 @@ countrycode <- function(sourcevar, origin, destination, warn = TRUE, nomatch = N if(is.null(custom_dict)){ # only for built-in dictionary # unicode.symbol breaks uppercase on Windows R-devel 2022-02-02; rejected by CRAN if(inherits(origin_vector, 'character') & !grepl('country|unicode.symbol', origin)){ - origin_vector = toupper(origin_vector) + # only apply toupper() on unique values and match after. + # much faster than applying toupper() on the whole vector when vector is very large + uniques = unique(origin_vector) + uppercase = toupper(uniques) + names(uppercase) = uniques + origin_vector = unname(uppercase[match(origin_vector, names(uppercase))]) } } From 43e49703e1af4c7c4cae39da78258d8e26ee616a Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:29:37 +0200 Subject: [PATCH 2/3] save some memory --- R/countrycode.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/countrycode.R b/R/countrycode.R index 8af4fa2..eb716ae 100644 --- a/R/countrycode.R +++ b/R/countrycode.R @@ -199,11 +199,11 @@ countrycode <- function(sourcevar, origin, destination, warn = TRUE, nomatch = N # unicode.symbol breaks uppercase on Windows R-devel 2022-02-02; rejected by CRAN if(inherits(origin_vector, 'character') & !grepl('country|unicode.symbol', origin)){ # only apply toupper() on unique values and match after. - # much faster than applying toupper() on the whole vector when vector is very large + # much faster than applying toupper() on the whole vector + # when vector is very large uniques = unique(origin_vector) uppercase = toupper(uniques) - names(uppercase) = uniques - origin_vector = unname(uppercase[match(origin_vector, names(uppercase))]) + origin_vector = unname(uppercase[match(origin_vector, uniques)]) } } From efa0ff6342473332792dc13e3648a1b62ce387d2 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:45:36 +0200 Subject: [PATCH 3/3] Update R/countrycode.R Co-authored-by: CJ Yetman --- R/countrycode.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/countrycode.R b/R/countrycode.R index eb716ae..24e8b0a 100644 --- a/R/countrycode.R +++ b/R/countrycode.R @@ -201,9 +201,9 @@ countrycode <- function(sourcevar, origin, destination, warn = TRUE, nomatch = N # only apply toupper() on unique values and match after. # much faster than applying toupper() on the whole vector # when vector is very large - uniques = unique(origin_vector) - uppercase = toupper(uniques) - origin_vector = unname(uppercase[match(origin_vector, uniques)]) + uniques <- unique(origin_vector) + uppercase <- toupper(uniques) + origin_vector <- unname(uppercase[match(origin_vector, uniques)]) } }