From f219ebca90e636cf73cdc9572d13b5856a56e422 Mon Sep 17 00:00:00 2001 From: Bruce Delo Date: Wed, 14 Feb 2024 09:58:18 -0400 Subject: [PATCH 1/4] Added code and functions to get the worms AphiaID. Still gotta add comments and properly packagify it. --- R/get_aphiaIDs.R | 30 ++++++++++++++++++++++++++++++ R/otn_imos_column_map.R | 5 ++++- 2 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 R/get_aphiaIDs.R diff --git a/R/get_aphiaIDs.R b/R/get_aphiaIDs.R new file mode 100644 index 0000000..5da79ad --- /dev/null +++ b/R/get_aphiaIDs.R @@ -0,0 +1,30 @@ +#We can use the worrms library to get aphiaIDs, but since we could be dealing with very large datasets, we don't want to query the worms DB as part of the mutate/column mapping process, since we'd be doing a query for +#every row and therefore doing a lot of duplicate work and slowing down the code (I tried this- setting the 'WORMS_species_aphia_id' column equal to an lapply of wm_name2id on the scientific name column- and even on a dataset +#of only a few hundred rows, it bloated the time quite badly.) What we're going to do instead is collect and pre-query aphiaIDs for only the unique scientific names, for which we'll create a lookup table that we can then +#use to build out the column a little more quickly. + +#get a table of unique scientific names and aphiaIDs. +get_unique_aphiaids <- function(scinames) { + #Get the unique names. + unique_names <- unique(scinames) + + #Create an empty list to hold our name/value pairs. + aphia_ids <- list() + + #Build the dict + for (name in unique_names) { + #wm_name2id is a worrms function for associating a scientific name with its Aphia ID. + aphia_ids[[name]] <- worrms::wm_name2id(name) + } + + #return the table. + return(aphia_ids) +} + +get_aphiaid_from_lookup <- function(sciname, lookup) { + #Get the aphiaID from the lookup table. + aphiaid <- lookup[[sciname]] + #Strip off the 'named' part (this is what isn't working) + #aphiaid <- as.character(unname(aphiaid)) + return(aphiaid) +} \ No newline at end of file diff --git a/R/otn_imos_column_map.R b/R/otn_imos_column_map.R index c18234c..d44d2a3 100644 --- a/R/otn_imos_column_map.R +++ b/R/otn_imos_column_map.R @@ -57,6 +57,9 @@ otn_imos_column_map <- function(det_dataframe, rcvr_dataframe = NULL, tag_datafr tag_return <- derive_tag_from_det(det_dataframe) } + #Construct a little lookup table for the aphiaIDs. This keeps us from having to query the WORMS database over and over again (for example, the data I tested on had 300 entries for 'blue shark')- lot of redundant querying there. + lookup <- get_unique_aphiaids(det_dataframe$scientificname) + # Start by mapping the Detections dataframe. det_return <- det_dataframe %>% select( @@ -82,7 +85,7 @@ otn_imos_column_map <- function(det_dataframe, rcvr_dataframe = NULL, tag_datafr mutate( cleandate = ymd(as_date(datecollected)), CAAB_species_id = NA, - WORMS_species_aphia_id = NA, + WORMS_species_aphia_id = sapply(det_dataframe$scientificname, USE.NAMES=FALSE, FUN=get_aphiaid_from_lookup, lookup=lookup), animal_sex = NA, receiver_name = NA, receiver_project_name = NA, From 783c7fd74ab33e072700cab0f524bb1881eca78d Mon Sep 17 00:00:00 2001 From: jackVanish Date: Wed, 14 Feb 2024 14:00:36 +0000 Subject: [PATCH 2/4] Style code (GHA) --- R/get_aphiaIDs.R | 34 +++++++++++++++++----------------- R/otn_imos_column_map.R | 6 +++--- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/R/get_aphiaIDs.R b/R/get_aphiaIDs.R index 5da79ad..76df616 100644 --- a/R/get_aphiaIDs.R +++ b/R/get_aphiaIDs.R @@ -1,30 +1,30 @@ -#We can use the worrms library to get aphiaIDs, but since we could be dealing with very large datasets, we don't want to query the worms DB as part of the mutate/column mapping process, since we'd be doing a query for -#every row and therefore doing a lot of duplicate work and slowing down the code (I tried this- setting the 'WORMS_species_aphia_id' column equal to an lapply of wm_name2id on the scientific name column- and even on a dataset -#of only a few hundred rows, it bloated the time quite badly.) What we're going to do instead is collect and pre-query aphiaIDs for only the unique scientific names, for which we'll create a lookup table that we can then -#use to build out the column a little more quickly. +# We can use the worrms library to get aphiaIDs, but since we could be dealing with very large datasets, we don't want to query the worms DB as part of the mutate/column mapping process, since we'd be doing a query for +# every row and therefore doing a lot of duplicate work and slowing down the code (I tried this- setting the 'WORMS_species_aphia_id' column equal to an lapply of wm_name2id on the scientific name column- and even on a dataset +# of only a few hundred rows, it bloated the time quite badly.) What we're going to do instead is collect and pre-query aphiaIDs for only the unique scientific names, for which we'll create a lookup table that we can then +# use to build out the column a little more quickly. -#get a table of unique scientific names and aphiaIDs. +# get a table of unique scientific names and aphiaIDs. get_unique_aphiaids <- function(scinames) { - #Get the unique names. + # Get the unique names. unique_names <- unique(scinames) - - #Create an empty list to hold our name/value pairs. + + # Create an empty list to hold our name/value pairs. aphia_ids <- list() - - #Build the dict + + # Build the dict for (name in unique_names) { - #wm_name2id is a worrms function for associating a scientific name with its Aphia ID. + # wm_name2id is a worrms function for associating a scientific name with its Aphia ID. aphia_ids[[name]] <- worrms::wm_name2id(name) } - - #return the table. + + # return the table. return(aphia_ids) } get_aphiaid_from_lookup <- function(sciname, lookup) { - #Get the aphiaID from the lookup table. + # Get the aphiaID from the lookup table. aphiaid <- lookup[[sciname]] - #Strip off the 'named' part (this is what isn't working) - #aphiaid <- as.character(unname(aphiaid)) + # Strip off the 'named' part (this is what isn't working) + # aphiaid <- as.character(unname(aphiaid)) return(aphiaid) -} \ No newline at end of file +} diff --git a/R/otn_imos_column_map.R b/R/otn_imos_column_map.R index d44d2a3..8fcd827 100644 --- a/R/otn_imos_column_map.R +++ b/R/otn_imos_column_map.R @@ -57,9 +57,9 @@ otn_imos_column_map <- function(det_dataframe, rcvr_dataframe = NULL, tag_datafr tag_return <- derive_tag_from_det(det_dataframe) } - #Construct a little lookup table for the aphiaIDs. This keeps us from having to query the WORMS database over and over again (for example, the data I tested on had 300 entries for 'blue shark')- lot of redundant querying there. + # Construct a little lookup table for the aphiaIDs. This keeps us from having to query the WORMS database over and over again (for example, the data I tested on had 300 entries for 'blue shark')- lot of redundant querying there. lookup <- get_unique_aphiaids(det_dataframe$scientificname) - + # Start by mapping the Detections dataframe. det_return <- det_dataframe %>% select( @@ -85,7 +85,7 @@ otn_imos_column_map <- function(det_dataframe, rcvr_dataframe = NULL, tag_datafr mutate( cleandate = ymd(as_date(datecollected)), CAAB_species_id = NA, - WORMS_species_aphia_id = sapply(det_dataframe$scientificname, USE.NAMES=FALSE, FUN=get_aphiaid_from_lookup, lookup=lookup), + WORMS_species_aphia_id = sapply(det_dataframe$scientificname, USE.NAMES = FALSE, FUN = get_aphiaid_from_lookup, lookup = lookup), animal_sex = NA, receiver_name = NA, receiver_project_name = NA, From 9361ee47481febf1eba25f5178b26e60a0b7da73 Mon Sep 17 00:00:00 2001 From: Bruce Delo Date: Wed, 14 Feb 2024 10:18:58 -0400 Subject: [PATCH 3/4] All the Roxygen metadata for the aphiaID stuff. --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/get_aphiaIDs.R | 29 +++++++++++++++++++++++++---- man/get_aphiaid_from_lookup.Rd | 19 +++++++++++++++++++ man/get_unique_aphiaids.Rd | 21 +++++++++++++++++++++ 5 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 man/get_aphiaid_from_lookup.Rd create mode 100644 man/get_unique_aphiaids.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 76caeeb..a4b57aa 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,7 +14,7 @@ Description: Surimi takes as input data files representing acoustic telemetry, w License: GPL (>= 3) Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1.9000 Imports: dplyr, lubridate, diff --git a/NAMESPACE b/NAMESPACE index ecc3aa8..7b86ea3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,3 +18,4 @@ importFrom(lubridate,as_date) importFrom(lubridate,ymd) importFrom(tidyr,separate) importFrom(tidyr,unite) +importFrom(worrms,wm_name2id) diff --git a/R/get_aphiaIDs.R b/R/get_aphiaIDs.R index 5da79ad..efc1232 100644 --- a/R/get_aphiaIDs.R +++ b/R/get_aphiaIDs.R @@ -1,7 +1,17 @@ -#We can use the worrms library to get aphiaIDs, but since we could be dealing with very large datasets, we don't want to query the worms DB as part of the mutate/column mapping process, since we'd be doing a query for -#every row and therefore doing a lot of duplicate work and slowing down the code (I tried this- setting the 'WORMS_species_aphia_id' column equal to an lapply of wm_name2id on the scientific name column- and even on a dataset -#of only a few hundred rows, it bloated the time quite badly.) What we're going to do instead is collect and pre-query aphiaIDs for only the unique scientific names, for which we'll create a lookup table that we can then -#use to build out the column a little more quickly. +##' @title Get AphiaIDs for scientific names +##' +##' @description Takes a column of scientific names and creates a lookup table (read: named list) of the unique scientific names +##' against their aphia IDs. We can use worrms to query the WORMS REST service for the aphiaIDs, but doing it for every row is +##' time intensive in a way we don't want. This way, we can create the lookup client-side and then do all the querying only as +##' we need to. +##' +##' @param scinames A vector (dataframe column) containing the list of scientific names from a detection extract dataframe in +##' Surimi. +##' +##' @importFrom worrms wm_name2id +##' +##' @return Returns a named list with the scientific name as the key and the aphiaID as the value. +##' #get a table of unique scientific names and aphiaIDs. get_unique_aphiaids <- function(scinames) { @@ -21,6 +31,17 @@ get_unique_aphiaids <- function(scinames) { return(aphia_ids) } + +##' @title Consult a lookup table for the aphiaID. +##' +##' @description This is the helper function that we use in the sapply when mutating the WORMS_species_aphia_id into existence. +##' +##' @param sciname A Scientific name as a string. +##' @param lookup The named list containing key-value pairs of scientific names and aphiaIDs. +##' +##' @return Returns the appropriate aphiaID corresponding to the sciname. +##' + get_aphiaid_from_lookup <- function(sciname, lookup) { #Get the aphiaID from the lookup table. aphiaid <- lookup[[sciname]] diff --git a/man/get_aphiaid_from_lookup.Rd b/man/get_aphiaid_from_lookup.Rd new file mode 100644 index 0000000..aa90409 --- /dev/null +++ b/man/get_aphiaid_from_lookup.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_aphiaIDs.R +\name{get_aphiaid_from_lookup} +\alias{get_aphiaid_from_lookup} +\title{Consult a lookup table for the aphiaID.} +\usage{ +get_aphiaid_from_lookup(sciname, lookup) +} +\arguments{ +\item{sciname}{A Scientific name as a string.} + +\item{lookup}{The named list containing key-value pairs of scientific names and aphiaIDs.} +} +\value{ +Returns the appropriate aphiaID corresponding to the sciname. +} +\description{ +This is the helper function that we use in the sapply when mutating the WORMS_species_aphia_id into existence. +} diff --git a/man/get_unique_aphiaids.Rd b/man/get_unique_aphiaids.Rd new file mode 100644 index 0000000..8b4261c --- /dev/null +++ b/man/get_unique_aphiaids.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_aphiaIDs.R +\name{get_unique_aphiaids} +\alias{get_unique_aphiaids} +\title{Get AphiaIDs for scientific names} +\usage{ +get_unique_aphiaids(scinames) +} +\arguments{ +\item{scinames}{A vector (dataframe column) containing the list of scientific names from a detection extract dataframe in +Surimi.} +} +\value{ +Returns a named list with the scientific name as the key and the aphiaID as the value. +} +\description{ +Takes a column of scientific names and creates a lookup table (read: named list) of the unique scientific names +against their aphia IDs. We can use worrms to query the WORMS REST service for the aphiaIDs, but doing it for every row is +time intensive in a way we don't want. This way, we can create the lookup client-side and then do all the querying only as +we need to. +} From 3900e2c8183cfcb42ae9850162e52b0b2e36ba09 Mon Sep 17 00:00:00 2001 From: jackVanish Date: Wed, 14 Feb 2024 14:22:04 +0000 Subject: [PATCH 4/4] Style code (GHA) --- R/get_aphiaIDs.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/get_aphiaIDs.R b/R/get_aphiaIDs.R index 5dfdd18..2904463 100644 --- a/R/get_aphiaIDs.R +++ b/R/get_aphiaIDs.R @@ -6,11 +6,11 @@ ##' we need to. ##' ##' @param scinames A vector (dataframe column) containing the list of scientific names from a detection extract dataframe in -##' Surimi. +##' Surimi. ##' ##' @importFrom worrms wm_name2id ##' -##' @return Returns a named list with the scientific name as the key and the aphiaID as the value. +##' @return Returns a named list with the scientific name as the key and the aphiaID as the value. ##' # get a table of unique scientific names and aphiaIDs. @@ -34,13 +34,13 @@ get_unique_aphiaids <- function(scinames) { ##' @title Consult a lookup table for the aphiaID. ##' -##' @description This is the helper function that we use in the sapply when mutating the WORMS_species_aphia_id into existence. +##' @description This is the helper function that we use in the sapply when mutating the WORMS_species_aphia_id into existence. ##' ##' @param sciname A Scientific name as a string. -##' @param lookup The named list containing key-value pairs of scientific names and aphiaIDs. +##' @param lookup The named list containing key-value pairs of scientific names and aphiaIDs. +##' +##' @return Returns the appropriate aphiaID corresponding to the sciname. ##' -##' @return Returns the appropriate aphiaID corresponding to the sciname. -##' get_aphiaid_from_lookup <- function(sciname, lookup) { # Get the aphiaID from the lookup table.