diff --git a/R/get_aphiaIDs.R b/R/get_aphiaIDs.R new file mode 100644 index 0000000..5da79ad --- /dev/null +++ b/R/get_aphiaIDs.R @@ -0,0 +1,30 @@ +#We can use the worrms library to get aphiaIDs, but since we could be dealing with very large datasets, we don't want to query the worms DB as part of the mutate/column mapping process, since we'd be doing a query for +#every row and therefore doing a lot of duplicate work and slowing down the code (I tried this- setting the 'WORMS_species_aphia_id' column equal to an lapply of wm_name2id on the scientific name column- and even on a dataset +#of only a few hundred rows, it bloated the time quite badly.) What we're going to do instead is collect and pre-query aphiaIDs for only the unique scientific names, for which we'll create a lookup table that we can then +#use to build out the column a little more quickly. + +#get a table of unique scientific names and aphiaIDs. +get_unique_aphiaids <- function(scinames) { + #Get the unique names. + unique_names <- unique(scinames) + + #Create an empty list to hold our name/value pairs. + aphia_ids <- list() + + #Build the dict + for (name in unique_names) { + #wm_name2id is a worrms function for associating a scientific name with its Aphia ID. + aphia_ids[[name]] <- worrms::wm_name2id(name) + } + + #return the table. + return(aphia_ids) +} + +get_aphiaid_from_lookup <- function(sciname, lookup) { + #Get the aphiaID from the lookup table. + aphiaid <- lookup[[sciname]] + #Strip off the 'named' part (this is what isn't working) + #aphiaid <- as.character(unname(aphiaid)) + return(aphiaid) +} \ No newline at end of file diff --git a/R/otn_imos_column_map.R b/R/otn_imos_column_map.R index c18234c..d44d2a3 100644 --- a/R/otn_imos_column_map.R +++ b/R/otn_imos_column_map.R @@ -57,6 +57,9 @@ otn_imos_column_map <- function(det_dataframe, rcvr_dataframe = NULL, tag_datafr tag_return <- derive_tag_from_det(det_dataframe) } + #Construct a little lookup table for the aphiaIDs. This keeps us from having to query the WORMS database over and over again (for example, the data I tested on had 300 entries for 'blue shark')- lot of redundant querying there. + lookup <- get_unique_aphiaids(det_dataframe$scientificname) + # Start by mapping the Detections dataframe. det_return <- det_dataframe %>% select( @@ -82,7 +85,7 @@ otn_imos_column_map <- function(det_dataframe, rcvr_dataframe = NULL, tag_datafr mutate( cleandate = ymd(as_date(datecollected)), CAAB_species_id = NA, - WORMS_species_aphia_id = NA, + WORMS_species_aphia_id = sapply(det_dataframe$scientificname, USE.NAMES=FALSE, FUN=get_aphiaid_from_lookup, lookup=lookup), animal_sex = NA, receiver_name = NA, receiver_project_name = NA,