diff --git a/.github/workflows/update-ad-publications.yaml b/.github/workflows/update-publications.yaml similarity index 78% rename from .github/workflows/update-ad-publications.yaml rename to .github/workflows/update-publications.yaml index a008e3a..e0b5c8c 100644 --- a/.github/workflows/update-ad-publications.yaml +++ b/.github/workflows/update-publications.yaml @@ -1,8 +1,9 @@ -name: "Update AD Publications" +name: "Update Publications" on: schedule: - cron: "0 0 1 * *" + workflow_dispatch: env: RETICULATE_AUTOCONFIGURE: 'FALSE' @@ -10,18 +11,19 @@ env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} jobs: - update-ad-publications: + update-publications: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-r@master + - uses: r-lib/actions/setup-r@v2 with: - r-version: '4.0' + r-version: '4.1.3' - name: Query dependencies run: | install.packages('remotes') + install.packages("synapser") saveRDS(remotes::dev_package_deps(dependencies = TRUE), "depends.Rds", version = 2) shell: Rscript {0} @@ -48,11 +50,11 @@ jobs: run: | Rscript -e "reticulate::py_discover_config()" Rscript -e "reticulate::py_install(c('pandas', 'numpy', 'boto3', 'synapseclient'), pip = TRUE)" - + - name: Install porTools run: | - Rscript -e "remotes::install_github('Sage-Bionetworks/porTools')" - + Rscript -e "remotes::install_github('eliteportal/publication_scraper')" + - name: Query PubMed and upload results run: | - Rscript ./inst/scripts/update-publications-ad.R --grant_table syn17024229 --parent syn20463015 --pub_table syn20448807 --auth_token ${{ secrets.SYNAPSE_PAT }} + Rscript ./inst/scripts/query-pubmed-grants.R --grant_table syn51209786 --parent syn51400816 --pub_table syn51407023 --auth_token ${{ secrets.SYNAPSE_PAT }} diff --git a/.gitignore b/.gitignore index 5b6a065..b3879be 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,6 @@ .Rhistory .RData .Ruserdata +.synapseConfig +.DS_Store +publications_pmid_list.txt diff --git 
a/R/global-hard-coded-variables.R b/R/global-hard-coded-variables.R new file mode 100644 index 0000000..45e13fe --- /dev/null +++ b/R/global-hard-coded-variables.R @@ -0,0 +1,12 @@ +# IDs of Synapse folders and tables that are used throughout the package for gathering grant IDs and uploading annotations and entities. +# Edit the following for the relevant project + +root_dir <- "~/Documents/Projects/ELITE/ELITE-porTools" +sid_project <- "syn27229419" +sid_studies_table <- "syn51210771" +sid_studies_fv <- "syn51523775" +sid_projects_table <- "syn51209786" # ELITE Portal Projects Table +sid_pub_table <- "syn51407023" +sid_pub_folder <- "syn51317180" +sid_people_table <- "syn51209684" +sid_pmid_file <- "syn52227331" diff --git a/R/md-converter.R b/R/md-converter.R new file mode 100644 index 0000000..cd19d62 --- /dev/null +++ b/R/md-converter.R @@ -0,0 +1,2 @@ +# Convert vignette to R script +knitr::purl("~/Documents/Projects/ELITE/ELITE-porTools/vignettes/query-pubmed-grants.Rmd") diff --git a/R/pubmed.R b/R/pubmed.R index dd0ed55..42cca3e 100644 --- a/R/pubmed.R +++ b/R/pubmed.R @@ -106,7 +106,7 @@ pub_query <- function(pub_pmids_list) { names(pub_summary_list) <- names(pub_pmids_list) # collapse list of dataframes into a single df - dplyr::bind_rows(pub_summary_list, .id = "grantSerialNumber") + dplyr::bind_rows(pub_summary_list, .id = "result") } #' Parse Summary Obj @@ -214,7 +214,7 @@ make_entity_name <- function(dat){ # Need to leave space for year and pubmed ID # Arbitrarily set to 200 characters short_name <- stringr::str_trunc( - glue::glue("{first_author} {dat$fulljournalname}"), + glue::glue("{first_author} {dat$journal}"), width = 200 ) diff --git a/R/setup_env.R b/R/setup_env.R new file mode 100644 index 0000000..3fd4550 --- /dev/null +++ b/R/setup_env.R @@ -0,0 +1,24 @@ +# setup env +# Package names +packages <- c("librarian", "knitr") + +# Install packages not yet installed +installed_packages <- packages %in% rownames(installed.packages()) +if 
(any(installed_packages == FALSE)) { + install.packages(packages[!installed_packages]) +} + +# install.packages("synapser", repos=c("http://ran.synapse.org", "http://cran.fhcrc.org")) + +librarian::shelf( + optparse, + rentrez, + rmarkdown, + reticulate, + janitor, + dplyr, + readr, + stringr, + reticulate, + easyPubMed +) diff --git a/R/synapseLogin.R b/R/synapseLogin.R new file mode 100644 index 0000000..a4857df --- /dev/null +++ b/R/synapseLogin.R @@ -0,0 +1,25 @@ +library("optparse") + +# nolint start +option_list <- list( + make_option( + "--auth_token", + action = "store", + default = NA, + type = "character", + help = "Synapse Personal Access Token. If not given, assumes local .synapseConfig." + ) +) + +opts <- parse_args(OptionParser(option_list = option_list)) +# nolint end + +## Synapse client and logging in +synapseclient <- reticulate::import("synapseclient") +syntab <- reticulate::import("synapseclient.table") +syn <- synapseclient$Synapse() +if(!is.na(opts$auth_token)) { + syn$login(authToken = opts$auth_token) +} else { + syn$login() +} diff --git a/R/text-cleaning.R b/R/text-cleaning.R index 7be5e93..1c8cc9f 100644 --- a/R/text-cleaning.R +++ b/R/text-cleaning.R @@ -53,6 +53,7 @@ remove_unacceptable_characters <- function(text) { conv <- gsub(",", "", conv) conv <- gsub("\\]", "", conv) conv <- gsub("\\[", "", conv) + conv <- gsub("=", "-", conv) return(conv) } #' Clean up funky text diff --git a/README.md b/README.md index 92adb03..63dc912 100644 --- a/README.md +++ b/README.md @@ -6,3 +6,9 @@ Sage portals require content management of publications, people, data, studies a [[[[work in-progress]]]] `devtools::install_github('Sage-Bionetworks/porTools')` + + +## Updates +**2023-10-10** +- If a grant serial number overlaps with another (for example, `UH2AG064706` and `UH3AG064706`), a different call must be made to get the search results, and the previously developed functions do not work +- Found the NIH library for R is much faster than python 
diff --git a/inst/scripts/curate-portal-people-table-pec.R b/inst/scripts/curate-portal-people-table-pec.R index 3ef5eea..b8fcb5a 100644 --- a/inst/scripts/curate-portal-people-table-pec.R +++ b/inst/scripts/curate-portal-people-table-pec.R @@ -1,9 +1,11 @@ library(tidyverse) library(purrr) -synapseclient <- reticulate::import("synapseclient") -syntab <- reticulate::import("synapseclient.table") -syn <- synapseclient$Synapse() -syn$login() + +# Login to synapse +source("~/Projects/ELITE/porTools/R/synapseLogin.R") + +### Hard coded variables (defines sid_people_table used below) +source("~/Projects/ELITE/porTools/R/global-hard-coded-variables.R") ## functions update_synapse_table <- function(table_id, update_df, syn, syntab) { @@ -14,6 +16,7 @@ update_synapse_table <- function(table_id, update_df, syn, syntab) { update_rows <- syntab$Table(table_id, tmpfile) syn$store(update_rows) } + make_df <- function(list, column_name) { df <- tibble::enframe(list) %>% tidyr::unnest(cols = c(value), keep_empty = TRUE) @@ -22,7 +25,7 @@ make_df <- function(list, column_name) { df } ### -people <- read_csv(syn$tableQuery("Select * from syn22096112")$filepath) +people <- read_csv(syn$tableQuery(glue::glue("SELECT * from {sid_people_table}"))$filepath) # table to portal - people team <- syn$getTeamMembers("3323356") list <- reticulate::iterate(team) member <- map(list, ~.$get("member")) @@ -57,4 +60,4 @@ update <- update %>% mutate_all(function(x) ifelse(is.na(x),"",x)) update$ROW_ID <- "" -update_synapse_table("syn22096112", update, syn, syntab) +update_synapse_table(sid_people_table, update, syn, syntab) diff --git a/inst/scripts/curate-portal-studies-table-pec.R b/inst/scripts/curate-portal-studies-table-pec.R index 2038c97..64116fe 100644 --- a/inst/scripts/curate-portal-studies-table-pec.R +++ b/inst/scripts/curate-portal-studies-table-pec.R @@ -1,40 +1,44 @@ library(tidyverse) -synapseclient <- reticulate::import("synapseclient") -syntab <- reticulate::import("synapseclient.table") -syn <- synapseclient$Synapse() 
-syn$login() + +# Login to synapse +source("~/Projects/ELITE/porTools/R/synapseLogin.R") # Once study folders are annotated, this script will find those annotations and merge them # into the studies table that creates the study cards in the portal. +### Hard coded variables (defines the sid_* Synapse IDs used below) +source("~/Projects/ELITE/porTools/R/global-hard-coded-variables.R") + ### functions -coalesceJoin <- function(x, y, - by = NULL, suffix = c(".x", ".y"), - join = dplyr::left_join, ...) { +coalesceJoin <- function(x, + y, + by = NULL, + suffix = c(".x", ".y"), + join = dplyr::left_join, + ...) { joined <- join(x, y, by = by, suffix = suffix, ...) # names of desired output cols <- union(names(x), names(y)) to_coalesce <- names(joined)[!names(joined) %in% cols] - suffix_used <- suffix[ifelse(endsWith(to_coalesce, suffix[1]), 1, 2)] + suffix_used <- + suffix[ifelse(endsWith(to_coalesce, suffix[1]), 1, 2)] # remove suffixes and de-duplicate - to_coalesce <- unique(substr( - to_coalesce, - 1, - nchar(to_coalesce) - nchar(suffix_used) - )) - - coalesced <- purrr::map_dfc(to_coalesce, ~dplyr::coalesce( - joined[[paste0(.x, suffix[1])]], - joined[[paste0(.x, suffix[2])]] - )) + to_coalesce <- unique(substr(to_coalesce, + 1, + nchar(to_coalesce) - nchar(suffix_used))) + + coalesced <- purrr::map_dfc(to_coalesce, ~ dplyr::coalesce(joined[[paste0(.x, suffix[1])]], + joined[[paste0(.x, suffix[2])]])) + names(coalesced) <- to_coalesce dplyr::bind_cols(joined, coalesced)[cols] } update_synapse_table <- function(table_id, update_df, syn, syntab) { - current_rows <- syn$tableQuery(glue::glue("SELECT * FROM {table_id}")) + current_rows <- + syn$tableQuery(glue::glue("SELECT * FROM {table_id}")) syn$delete(current_rows) tmpfile <- fs::file_temp("rows.csv") write_csv(update_df, tmpfile) @@ -45,32 +49,57 @@ update_synapse_table <- function(table_id, update_df, syn, syntab) { # update studies table # force view to rebuild -trigger <- syn$tableQuery("select * from syn21990011") +trigger <- 
syn$tableQuery(glue::glue("SELECT * FROM {sid_studies_fv}")) -table <- dccvalidator::get_synapse_table("syn21783965", syn) -fv <- dccvalidator::get_synapse_table("syn21990011", syn) +table <- dccvalidator::get_synapse_table(sid_studies_table, syn) + +# Why do we need a file view? +fv <- dccvalidator::get_synapse_table(sid_studies_fv, syn) # studies view for portal # Parse rows from file view that contain annotations to be captured in the # PEC studies table -to_update <- fv[!is.na(fv$studyDescription),] +to_update <- fv[!is.na(fv$studyDescription), ] to_update <- rename(to_update, key = id, studyName = name) + table <- rename(table, key = study) # join on synId -updated <- coalesceJoin(to_update, table, by = "key", join = full_join) +updated <- + coalesceJoin(to_update, table, by = "key", join = full_join) # change to required schema updated <- rename(updated, study = key) # NAs must be changed to empty strings -dat <- updated %>% mutate_all(function(x) ifelse(is.na(x),"",x)) +dat <- updated %>% mutate_all(function(x) + ifelse(is.na(x), "", x)) # order cards alphabetically -dat <- dat[order(dat$studyName),] +dat <- dat[order(dat$studyName), ] #order schema -dat <- dplyr::select(dat, studyType, isModelSystem, numberOfIndividuals, species, study, studyDescription, studyName, nucleicAcidSource, contributingInstitution, dataTypes, diagnosis, grants, phase, methods, relatedStudies, tissue) - -update_synapse_table("syn21783965", dat, syn, syntab) +dat <- + dplyr::select( + dat, + studyType, + isModelSystem, + numberOfIndividuals, + species, + study, + studyDescription, + studyName, + nucleicAcidSource, + contributingInstitution, + dataTypes, + diagnosis, + grants, + phase, + methods, + relatedStudies, + tissue + ) + +# update the portal studies table +update_synapse_table(sid_studies_table, dat, syn, syntab) diff --git a/inst/scripts/query-pubmed-grants.R b/inst/scripts/query-pubmed-grants.R new file mode 100755 index 0000000..45c920f --- /dev/null +++ 
b/inst/scripts/query-pubmed-grants.R @@ -0,0 +1,324 @@ +## ----setup, include = FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------- +install.packages('librarian') +install.packages("synapser", repos=c("http://ran.synapse.org", "http://cran.fhcrc.org")) + +librarian::shelf( + optparse, + rmarkdown, + reticulate, + janitor, + dplyr, + readr, + stringr, + reticulate, + synapser, + easyPubMed, + comprehenr, + easyPubMed, + httr, + tidyr, + dplyr +) + +library('synapser') + +# nolint start +option_list <- list( + make_option( + "--auth_token", + action = "store", + default = NA, + type = "character", + help = "Synapse Personal Access Token. If not given, assumes local .synapseConfig." + ), + make_option( + "--grant_table", + action = "store", + default = NA, + type = "character", + help = "Synapse synID for table with grants to query for. Requires columns `grant`, `grantSerialNumber`, `Program`. grants are queried by serial number." + ), + make_option( + "--parent", + action = "store", + default = NA, + type = "character", + help = "Synapse synID of parent folder to store publication entities to." + ), + make_option( + "--pub_table", + action = "store", + default = NA, + type = "character", + help = "Synapse synID of file view scoped to publication folder (`parent`)." 
+ ) +) +opts <- parse_args(OptionParser(option_list = option_list)) + +# get the base working directory to make it work on other systems +base_dir <- gsub('vignettes', '', getwd()) +source(glue::glue("{base_dir}/R/pubmed.R")) +source(glue::glue("{base_dir}/R/text-cleaning.R")) +source(glue::glue("{base_dir}/R/annotation.R")) +source(glue::glue("{base_dir}/R/global-hard-coded-variables.R")) + +# Login to synapse +## Synapse client and logging in +synapseclient <- reticulate::import("synapseclient") +syntab <- reticulate::import("synapseclient.table") +syn <- synapseclient$Synapse() +if (!is.na(opts$auth_token)) { + syn$login(authToken = opts$auth_token) +} else { + syn$login() +} + +## ----functions------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +hacky_cleaning <- function(text) { + conv <- convert_to_ascii(text = text) + conv <- remove_hmtl_formatting(text = conv) + conv <- gsub("&|amp;", "and", conv) + conv <- gsub("&#39;|&apos;|&quot;|\"", "'", conv) # quote entities and literal double quotes become ' + conv <- gsub(">", "Greater Than ", conv) + conv <- gsub("[[:punct:]]+", "", conv) + conv <- gsub("\\s+", " ", conv) + conv <- str_trunc(conv, width = 500) # truncate the cleaned text, not the raw input + return(conv) +} + + +## ----vars, echo=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------- +# table_id <- "syn51209786" # ELITE Portal Projects Table + +# Gather list of grants from synapse +grants <- + syn$tableQuery(glue::glue("SELECT grant, program, name FROM {sid_projects_table}"))$asDataFrame() + +# expand rows with multiple grants +grants <- tidyr::unnest(grants, cols = grant) + +grant_list <- grants$grant + +## ----scrape pubmed ids from grant numbers--------------------------------------------------------------------------------------------------------------------------------------------- +get_pub_details <- function(request_body) { + # 
Make the POST request + response <- + POST( + url = API_URL, + headers = headers, + body = request_body, + encode = "json" + ) + return (response) +} + +process_response <- function(response) { + if (response$status_code == 200) { + # Success! + data <- content(response, "parsed") + + df <- as.data.frame(do.call(rbind, data$results)) + + return (df) + } +} + +# works for project Numbers instead of project serial numbers +# Set the API URL +API_URL <- "https://api.reporter.nih.gov/v2/publications/search" + +# Set the headers +headers <- list(accept = "application/json", + "Content-Type" = "application/json") + +# Set the request body +request_body <- list( + criteria = list(core_project_nums = grant_list), + offset = 0, + limit = 50, + sort_field = "core_project_nums", + sort_order = "desc" +) + +# Make the POST request +response <- + POST( + url = API_URL, + headers = headers, + body = request_body, + encode = "json" + ) + +# Check the response status code +if (response$status_code == 200) { + # Success! 
+ pmids <- list() + + # get results as dataframe + pmids_temp <- process_response(response) + + data <- content(response, "parsed") + + total <- data$meta$total + + results <- process_response(response) + + pmids[[length(pmids) + 1]] <- results + + request_body$offset <- request_body$offset + request_body$limit + + while (!is.null(results) && nrow(results) > 0) { + response <- get_pub_details(request_body) + + results <- process_response(response) + + # extend pmids list + pmids[[length(pmids) + 1]] <- results + + # update offset in request + request_body$offset <- + request_body$offset + request_body$limit + } +} else { + # Request failed: abort, because everything below needs `pmids` + stop("Error: ", response$status_code) +} + +# create dataframe with pmids +pmids_df <- do.call(rbind, pmids) + +pmids_df <- pmids_df %>% rename('grant' = 'coreproject') + +# for joining +pmids_df$grant <- as.character(pmids_df$grant) + +# remove existing entities in portal +pubs_exisiting <- + syn$tableQuery( + glue::glue( + "SELECT id, Name, PubmedId, PMID, Title, grant, Program FROM {sid_pub_table}" + ) + )$asDataFrame() + +# remove duplicate no name entities +# for (i in as.list(pubs_exisiting[grep("NA NA NA", pubs_exisiting$Name), "id"])) { +# print(i) +# tryCatch({ +# syn$delete(i) +# }, +# error = function(e) { +# print(glue::glue('error deleting {i}')) +# }) +# } + +# collapse rows by grouping by pmids since some publications can be associated with multiple grants +pmids_df <- pmids_df %>% group_by(pmid) %>% reframe( + grant = paste0(grant, collapse = ","), + applid = paste0(unique(applid), collapse = ",") +) + +# Take only pmids not in the portal already +pmids_df <- + pmids_df[!(pmids_df$pmid %in% pubs_exisiting$PubmedId),] + +## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# one eternity later.... 
+pmid_metadata <- pub_query(pmids_df$pmid) + +## ----query---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# create complete dataset +dat <- dplyr::right_join(grants, pmids_df, by = "grant") + +dat$pmid <- as.character(dat$pmid) + +dat <- dplyr::inner_join(dat, pmid_metadata, by = "pmid") + +# clean column names +dat <- janitor::clean_names(dat, "lower_camel") + +## ----hacky---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# Included in hacky_cleaning is conversion to ascii and removing html formatting +dat$year <- stringr::str_extract(dat$pubdate, "\\d{4}") +dat$year <- as.integer(dat$year) +dat$title <- hacky_cleaning(dat$title) +dat$authors <- hacky_cleaning(dat$authors) +dat$journal <- remove_unacceptable_characters(dat$fulljournalname) + +# dat$abstract <- hacky_cleaning(dat$abstract) + +# drop unnecessary columns +dat <- dat %>% select(-c('applid', 'result', 'pubdate')) + +cat( + 'Total rows: ', + nrow(dat), + '\n', + 'Duplicates: ', + sum(dat %>% duplicated()), + '\n', + 'Rows after duplicate remove: ', + nrow(dat) - sum(dat %>% duplicated()) +) + +# Need to remove duplicates, but keep all grants and consortium +# Includes some renaming and dropping of columns (the collapsed `grant` column is kept for the annotations) +dat <- dat %>% + group_by(pmid) %>% + mutate(grant = glue::glue_collapse(unique(.data$`grant`), ", ")) %>% + mutate(consortium = glue::glue_collapse(unique(.data$program), ", ")) %>% + mutate(name = glue::glue_collapse(unique(.data$name), ", ")) %>% + select(!program) %>% + rename( + pubmed_id = pmid, + DOI = doi, + program = consortium, + study = name + ) %>% + distinct() + +dat <- dat %>% rename('pmid' = 'pubmed_id') +dat$entity_name <- make_entity_name(dat) +dat$Name <- make_entity_name(dat) + + +#Using rename() +dat <- dat %>% rename( + 
"Authors" = "authors", + "Journal" = "journal", + "PubmedId" = "pmid", + "Title" = "title", + "Year" = "year", + "Program" = "program", +) + +# Remove common, unallowed characters from entity name; includes hacky_cleaning +dat$entity_name <- remove_unacceptable_characters(dat$entity_name) + +## ----columns-------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +dat <- set_up_multiannotations(dat, "grant") +dat <- set_up_multiannotations(dat, "Program") +dat <- set_up_multiannotations(dat, "Authors") + + +## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +store_as_annotations <- function(parent, list) { + entity <- purrr::map( + list, + ~ synapseclient$File( + path = glue::glue("http://doi.org/{.$DOI}"), + name = .$entity_name, + parent = parent, + synapseStore = FALSE, + annotations = . 
+ ) + ) + # entity + purrr::map(entity, ~ syn$store(., forceVersion = FALSE)) +} + +## ----store, message=FALSE, echo=FALSE------------------------------------------------------------------------------------------------------------------------------------------------- +# parent = "syn51317180" # ELITE publications folder +dat_list <- purrr::transpose(dat) + +# another eternity +store_as_annotations(parent = sid_pub_folder, dat_list) diff --git a/inst/scripts/set-folder-annotations-pec.R b/inst/scripts/set-folder-annotations-pec.R index 3f80335..6446798 100644 --- a/inst/scripts/set-folder-annotations-pec.R +++ b/inst/scripts/set-folder-annotations-pec.R @@ -4,9 +4,10 @@ library(easyPubMed) library(readr) library(reticulate) library(porTools) -synapseclient <- reticulate::import("synapseclient") -syn <- synapseclient$Synapse() -syn$login() + +# Login to synapse +source("~/Projects/ELITE/porTools/R/synapseLogin.R") + # The study folders get annotated with phase, grants, tissue, species, diagnosis, study type, study description, nucleic acid source and contributing institution. # # - The keys *grants*, *tissue*, *species*, *diagnosis* and *nucleicAcidSource* follow the constrained vocabulary of the [synapseAnnotations repo](https://github.com/sage-bionetworks/synapseannotations). 
diff --git a/inst/scripts/update-publications.R b/inst/scripts/update-publications.R index 03904b3..7632c8e 100644 --- a/inst/scripts/update-publications.R +++ b/inst/scripts/update-publications.R @@ -1,12 +1,12 @@ #!/usr/bin/Rscript ####################################################### -## Update AD Knowledge Portal Publications ## +## Update ELITE Portal Publications ## ## ## ## Description: ## ## Query PubMed for publications and upload ## ## results to Synapse in the format required by ## -## the AD Knowledge Portal ## +## the ELITE Portal ## ## ## ## Usage: ## ## Rscript update-publications-ad.R \ ## @@ -19,16 +19,21 @@ ## Libraries ------------------------------------------------------------------- -library("dplyr") -library("optparse") -library("porTools") -library("rentrez") -library("purrr") -library("stringr") -## Required, but not fully loaded -## readr, reticulate, glue, easyPubMed, dccvalidator - -## Setup ----------------------------------------------------------------------- +librarian::shelf( + optparse, + rmarkdown, + reticulate, + janitor, + dplyr, + readr, + stringr, + reticulate, + easyPubMed, + synapser, + httr, + tidyr, + dplyr +) # nolint start option_list <- list( @@ -62,98 +67,263 @@ option_list <- list( ) ) opts <- parse_args(OptionParser(option_list = option_list)) -# nolint end +# get the base working directory to make it work on others systems +base_dir <- gsub('vignettes', '', getwd()) +source(glue::glue("{base_dir}/R/pubmed.R")) +source(glue::glue("{base_dir}/R/text-cleaning.R")) +source(glue::glue("{base_dir}/R/annotation.R")) +source(glue::glue("{base_dir}/R/global-hard-coded-variables.R")) + +# Login to synapse ## Synapse client and logging in synapseclient <- reticulate::import("synapseclient") syntab <- reticulate::import("synapseclient.table") syn <- synapseclient$Synapse() -if(!is.na(opts$auth_token)) { +if (!is.na(opts$auth_token)) { syn$login(authToken = opts$auth_token) } else { syn$login() } -## Grab grants 
------------------------------------------------------ +## ----functions------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +hacky_cleaning <- function(text) { + conv <- convert_to_ascii(text = text) + conv <- remove_hmtl_formatting(text = conv) + conv <- gsub("&|amp;", "and", conv) + conv <- gsub("&#39;|&apos;|&quot;|\"", "'", conv) # quote entities and literal double quotes become ' + conv <- gsub(">", "Greater Than ", conv) + conv <- gsub("[[:punct:]]+", "", conv) + conv <- gsub("\\s+", " ", conv) + conv <- str_trunc(conv, width = 500) # truncate the cleaned text, not the raw input + return(conv) +} + + +## ----vars, echo=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------- +# table_id <- "syn51209786" # ELITE Portal Projects Table + +# Gather list of grants from synapse +grants <- + syn$tableQuery(glue::glue("SELECT grantNumber, program, name FROM {sid_projects_table}"))$asDataFrame() + +# convert grant numbers into string +library(comprehenr) +grantNumbers <- + to_list(for (g in grants$grantNumber) + for (y in g) + y) + +# expand rows with multiple grantNumbers +grants$grantNumber <- + purrr::map(grants$grantNumber, function(x) { + paste(unlist(x), collapse = ",") + }) + +grants <- grants %>% + separate_rows(grantNumber) + +## ----scrape pubmed ids from grant numbers--------------------------------------------------------------------------------------------------------------------------------------------- +get_pub_details <- function(request_body) { + # Make the POST request + response <- + POST( + url = API_URL, + headers = headers, + body = request_body, + encode = "json" + ) + return (response) +} + +process_response <- function(response) { + if (response$status_code == 200) { + # Success! 
+ data <- content(response, "parsed") + + df <- as.data.frame(do.call(rbind, data$results)) -# qury synapse -grants <- syn$tableQuery( - glue::glue( - "SELECT \"Grant Number\", grantSerialNumber, Program ", - "FROM {opts$grant_table}" + return (df) + } +} + +# works for project Numbers instead of project serial numbers +# Set the API URL +API_URL <- "https://api.reporter.nih.gov/v2/publications/search" + +# Set the headers +headers <- list(accept = "application/json", + "Content-Type" = "application/json") + +# Set the request body +request_body <- list( + criteria = list(core_project_nums = grantNumbers), + offset = 0, + limit = 50, + sort_field = "core_project_nums", + sort_order = "desc" +) + +# Make the POST request +response <- + POST( + url = API_URL, + headers = headers, + body = request_body, + encode = "json" ) -)$asDataFrame() -# remove rows that have NaN or NA or empty string for the serial number -grants <- grants[!(grants$grantSerialNumber %in% c(NaN, NA, "")), ] +# Check the response status code +if (response$status_code == 200) { + # Success! 
+ pmids <- list() + + # get results as dataframe + pmids_temp <- process_response(response) + + data <- content(response, "parsed") + + total <- data$meta$total + + results <- process_response(response) + + pmids[[length(pmids) + 1]] <- results + + request_body$offset <- request_body$offset + request_body$limit + + while (!is.null(results) && nrow(results) > 0) { + response <- get_pub_details(request_body) -## Query PubMed ----------------------------------------------------- + results <- process_response(response) -# unlist list of grant serial numbers into a vector -grant_serial_nums <- unlist(grants$grantSerialNumber) + # extend pmids list + pmids[[length(pmids) + 1]] <- results -# run all grant serial numbers through query pubmed -# returns a tibble - # each row is a publication - # columns include grantserialnumber, pubmed id, publication date, title, full journal name, doi, authors -dat <- query_pubmed(grant_serial_nums) + # update offset in request + request_body$offset <- + request_body$offset + request_body$limit + } +} else { + # Request failed: abort, because everything below needs `pmids` + stop("Error: ", response$status_code) +} + +# create dataframe with pmids +pmids_df <- do.call(rbind, pmids) + +pmids_df <- pmids_df %>% rename('grantNumber' = 'coreproject') + +# for joining +pmids_df$grantNumber <- as.character(pmids_df$grantNumber) + + +## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# one eternity later.... 
+pmid_metadata <- pub_query(pmids_df$pmid) -## Clean up --------------------------------------------------------- -# munge pubmed query results -# this function pulls out the year from pubdate and adds entity_name column -dat <- munge_pubmed(dat) +## ----query---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# create complete dataset +dat <- dplyr::right_join(grants, pmids_df, by = "grantNumber") -# For some reason, grantSerialNumber isn't always a character -grants$grantSerialNumber <- as.character(grants$grantSerialNumber) +dat$pmid <- as.character(dat$pmid) + +dat <- dplyr::left_join(dat, pmid_metadata, by = "pmid") + +# clean column names +dat <- janitor::clean_names(dat, "lower_camel") + + + +## ----hacky---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +# Included in hacky_cleaning is conversion to ascii and removing html formatting +dat$year <- stringr::str_extract(dat$pubdate, "\\d{4}") +dat$year <- as.integer(dat$year) +dat$title <- hacky_cleaning(dat$title) +dat$authors <- hacky_cleaning(dat$authors) +dat$journal <- remove_unacceptable_characters(dat$fulljournalname) -# Join dat and grants table by grantSerialNumber -dat <- dplyr::right_join(grants, dat, by = "grantSerialNumber") +# dat$abstract <- hacky_cleaning(dat$abstract) -# Some pubmedIDs show up multiple times under different grants -# Need to capture this information in a single row of information so it isn't duplicated +# drop unnecessary columns +dat <- dat %>% select(-c('applid', 'result', 'pubdate')) +cat( + 'Total rows: ', + nrow(dat), + '\n', + 'Duplicates: ', + sum(dat %>% duplicated()), + '\n', + 'Rows after duplicate remove: ', + nrow(dat) - sum(dat %>% duplicated()) +) + +# Need to remove duplicates, but keep all grants and consortium +# 
Includes some renaming and dropping of columns dat <- dat %>% - # for each pubmedID group_by(pmid) %>% - mutate( - # Create a new column that captures all grants that duplicate pmid is associated with - grant = glue::glue_collapse(unique(.data$`Grant Number`), ", ") + mutate(grant = glue::glue_collapse(unique(.data$`grantNumber`), ", ")) %>% + mutate(consortium = glue::glue_collapse(unique(.data$program), ", ")) %>% + select(!c(grantNumber, program)) %>% + rename( + pubmed_id = pmid, + DOI = doi, + program = consortium, + study = name ) %>% - # Create a new column that captures all programs that duplicate pmid is associated with - mutate(consortium = glue::glue_collapse(unique(.data$Program), ", ")) %>% - # drop Grant Number, Program, and GrantSerialNumber cols - select(!c(`Grant Number`, Program, grantSerialNumber)) %>% - # rename some columns - rename(pubmed_id = pmid, DOI = doi, Program = consortium, journal = fulljournalname) %>% - # keep only distinct rows distinct() -# Hacky cleaning -## Included in hacky_cleaning is conversion to ascii and removing html formatting -dat$title <- hacky_cleaning(dat$title) -dat$authors <- hacky_cleaning(dat$authors) -dat$journal <- hacky_cleaning(dat$journal) +dat <- dat %>% rename('pmid' = 'pubmed_id') +dat$entity_name <- make_entity_name(dat) +dat$Name <- make_entity_name(dat) + + +#Using rename() +dat <- dat %>% rename( + "Authors" = "authors", + "Journal" = "journal", + "PubmedId" = "pmid", + "Title" = "title", + "Year" = "year", + "Grant" = "grant", + "Program" = "program", +) # Remove common, unallowed characters from entity name; includes hacky_cleaning dat$entity_name <- remove_unacceptable_characters(dat$entity_name) -## Remove row of NA -# See this (https://github.com/Sage-Bionetworks/porTools/issues/10#issuecomment-1083441995) issue comment for more info -# TODO: Figure out why this is happening -# capture cases where pubmed ID is NA -idx <- is.na(dat$pubmed_id) -# only keep cases where pubmed ID is not NA -dat <- 
dat[!idx,] -# Set up multi-annotation columns correctly -dat <- set_up_multiannotations(dat, "grant") + +## ----columns-------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +dat <- set_up_multiannotations(dat, "Grant") dat <- set_up_multiannotations(dat, "Program") +dat <- set_up_multiannotations(dat, "Authors") + + +## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +store_as_annotations <- function(parent, list) { + entity <- purrr::map( + list, + ~ synapseclient$File( + path = glue::glue("http://doi.org/{.$DOI}"), + name = .$entity_name, + parent = parent, + synapseStore = FALSE, + annotations = . + ) + ) + # entity + purrr::map(entity, ~ syn$store(., forceVersion = FALSE)) +} -## Store publications -------------------------------------------- +## ----store, message=FALSE, echo=FALSE------------------------------------------------------------------------------------------------------------------------------------------------- +# parent = "syn51317180" # ELITE publications folder dat_list <- purrr::transpose(dat) -store_as_annotations(parent = opts$parent, dat_list) + +# another eternity +store_as_annotations(parent = sid_pub_folder, dat_list) ## Force file view update if (!is.na(opts$pub_table)) { diff --git a/vignettes/query-pubmed-grants.Rmd b/vignettes/query-pubmed-grants.Rmd index 12ac1b9..deb3409 100644 --- a/vignettes/query-pubmed-grants.Rmd +++ b/vignettes/query-pubmed-grants.Rmd @@ -1,44 +1,93 @@ --- title: "Query Pubmed by Grant" -author: "Nicole Kauer", "Kelsey Montgomery" +author: "Nicole Kauer", "Kelsey Montgomery", "Nicholas Lee" date: "3/24/2020" +edited: "10/10/2023" output: html_document --- ```{r setup, include = FALSE} +rm(list=ls()); gc() + knitr::opts_chunk$set(echo = TRUE) -library(dccvalidator) 
-library(dplyr) -library(easyPubMed) -library(readr) -library(reticulate) -library(porTools) -synapseclient <- reticulate::import("synapseclient") -syntab <- reticulate::import("synapseclient.table") -syn <- synapseclient$Synapse() -syn$login() +# library(dccvalidator) +# library(janitor) +# library(dplyr) +# library(readr) +# library(stringr) +# library(reticulate) +# +# library(easyPubMed) +# library(synapser) +# library(porTools) + +librarian::shelf( + optparse, + rmarkdown, + reticulate, + janitor, + dplyr, + readr, + stringr, + reticulate, + easyPubMed, + synapser, + httr, + tidyr, + dplyr +) + +# get the base working directory to make it work on others systems +base_dir <- gsub('vignettes', '', getwd()) +source(glue::glue("{base_dir}R/pubmed.R")) +source(glue::glue("{base_dir}R/text-cleaning.R")) +source(glue::glue("{base_dir}R/annotation.R")) +source(glue::glue("{base_dir}R/global-hard-coded-variables.R")) + +# Login to synapse +source(glue::glue("{base_dir}R/synapseLogin.R")) +``` + +```{r functions} +hacky_cleaning <- function(text) { # returns `text` cleaned step by step below and truncated to 500 characters + conv <- convert_to_ascii(text = text) + conv <- remove_hmtl_formatting(text = conv) + conv <- gsub("&amp;|amp;", "and", conv) + conv <- gsub("&#39;|&quot;", "'", conv) + conv <- gsub("&gt;", "Greater Than ", conv) + conv <- gsub("[[:punct:]]+", "", conv) + conv <- gsub("\\s+", " ", conv) + conv <- str_trunc(conv, width = 500) # truncate the cleaned text (was `text`, which discarded every step above); NOTE(review): punctuation, incl. commas in author lists, is now actually stripped - confirm intended + return(conv) +} ``` ## Query Pubmed and store data as file annotations The data needed for these steps are *Grant Number*, *grantSerialNumber* and *Program*. Theses functions take a list of grant serial numbers and queries Pubmed to download title, abstract, authors, journal name, year and DOI. Theses annotations are visible in the [AD Knowledge Portal - Publications View](https://www.synapse.org/#!Synapse:syn20448807/tables/). See the [Explore Publications module](https://adknowledgeportal.synapse.org/Explore/Publications) for a visual of how this data is surfaced on the portal. 
-```{r ex_format, echo = FALSE} -tribble(~`Grant Number`, ~grantSerialNumber, ~Program, - "U01AG046139", "AG046139", "AMP-AD" - ) -``` - Import the grants with their respective programs and serial numbers. - +Serial number is the characters after the first three characters till the end e.g. "U19AG063893" -> "AG063893" is the serial number. ```{r vars, echo=FALSE} -table_id <- "syn17024229" +table_id <- "syn51209786" # ELITE Portal Projects Table + +# Gather list of grants from synapse grants <- syn$tableQuery( - glue::glue("SELECT \"Grant Number\", grantSerialNumber, Program FROM {table_id}") + glue::glue("SELECT grantNumber, program, name FROM {table_id}") )$asDataFrame() -# Remove rows that have NaN or NA or empty string for the serial number -grants <- grants[!(grants$grantSerialNumber %in% c(NaN, NA, "")), ] + +# convert grant numbers into string +library(comprehenr) +grantNumbers <- to_list(for (g in grants$grantNumber) for (y in g) y) + +# expand rows with multiple grantNumbers +grants$grantNumber <- purrr::map(grants$grantNumber, function(x){paste(unlist(x),collapse=",")}) + +grants <- grants %>% + separate_rows(grantNumber) + ``` + ## Run the code Any character vector can be passed to `query_list_general`. This function wraps several functions: @@ -50,59 +99,197 @@ Any character vector can be passed to `query_list_general`. This function wraps - leaves out grants that were not associated with a PubmedId - creates a *query* column to associate the PubmedId with a specific query -```{r query, message=FALSE, warning=FALSE} -dat <- query_list_general(grants$grantSerialNumber) +```{r scrape pubmed ids from grant numbers} +get_pub_details <- function(request_body) { + # Make the POST request + response <- + POST( + url = API_URL, + headers = headers, + body = request_body, + encode = "json" + ) + return (response) +} + +process_response <- function(response){ + if (response$status_code == 200) { + # Success! 
+ data <- content(response, "parsed") + + df <- as.data.frame(do.call(rbind, data$results)) + + return (df) + } +} + +# works for project Numbers instead of project serial numbers +# Set the API URL +API_URL <- "https://api.reporter.nih.gov/v2/publications/search" + +# Set the headers +headers <- list( + accept = "application/json", + "Content-Type" = "application/json" +) + +# Set the request body +request_body <- list( + criteria = list( + core_project_nums = grantNumbers + ), + offset = 0, + limit = 50, + sort_field = "core_project_nums", + sort_order = "desc" +) + +# Make the POST request +response <- POST(url = API_URL, headers = headers, body = request_body, encode = "json") + +# Check the response status code +if (response$status_code == 200) { + # Success! + pmids <- list() + + # get results as dataframe + pmids_temp <- process_response(response) + + data <- content(response, "parsed") + + total <- data$meta$total + + results <- process_response(response) + + pmids[[length(pmids) + 1]] <- results + + request_body$offset <- request_body$offset + request_body$limit + + while (nrow(results) > 0){ + + response <- get_pub_details(request_body) + + results <- process_response(response) + + # extend pmids list + pmids[[length(pmids) + 1]] <- results + + # update offset in request + request_body$offset <- request_body$offset + request_body$limit + } +} else { + # Something went wrong + stop("Error: NIH RePORTER publications search failed with status ", response$status_code) # fail here: print("Error:", code) passed the code as `digits`, and `pmids` is never created on this path +} + +# create dataframe with pmids +pmids_df <- do.call(rbind, pmids) + +pmids_df <- pmids_df %>% rename( + 'grantNumber' = 'coreproject' +) + +# for joining +pmids_df$grantNumber <- as.character(pmids_df$grantNumber) ``` -Join the grants to the Pubmed queries and clean up. +Gathers pmids from pubmed +```{r} +# one eternity later.... +pmid_metadata <- pub_query(pmids_df$pmid) +``` + +Join the grants to the Pubmed queries and clean up. 
```{r query} -dat <- dat %>% - rename(grantSerialNumber = query) -# For some reason, grantSerialNumber isn't always a character -grants$grantSerialNumber <- as.character(grants$grantSerialNumber) -dat <- dplyr::right_join(grants, dat, by = "grantSerialNumber") -# Need to remove duplicates, but keep all grants and consortium -# Includes some renaming and dropping of columns -dat <- dat %>% - group_by(pmid) %>% - mutate(grant = glue::glue_collapse(unique(.data$`Grant Number`), ", ")) %>% - mutate(consortium = glue::glue_collapse(unique(.data$Program), ", ")) %>% - select(!c(`Grant Number`, Program, grantSerialNumber)) %>% - rename(pubmed_id = pmid, DOI = doi, Program = consortium) %>% - distinct() +# create complete dataset +dat <- dplyr::right_join(grants, pmids_df, by = "grantNumber") + +dat$pmid <- as.character(dat$pmid) + +dat <- dplyr::left_join(dat, pmid_metadata, by = "pmid") + +# clean column names +dat <- janitor::clean_names(dat, "lower_camel") + ``` The following has fixes for some of the formatting issues found. It also updates the entity name to remove common, unallowed characters. 
```{r hacky} # Included in hacky_cleaning is conversion to ascii and removing html formatting +dat$year <- stringr::str_extract(dat$pubdate, "\\d{4}") +dat$year <- as.integer(dat$year) dat$title <- hacky_cleaning(dat$title) dat$authors <- hacky_cleaning(dat$authors) -dat$journal <- hacky_cleaning(dat$journal) -dat$abstract <- hacky_cleaning(dat$abstract) +dat$journal <- remove_unacceptable_characters(dat$fulljournalname) + +# dat$abstract <- hacky_cleaning(dat$abstract) + +# drop unnecessary columns +dat <- dat %>% select(-c('applid', 'result', 'pubdate')) + +cat('Total rows: ', nrow(dat), '\n', 'Duplicates: ', sum(dat %>% duplicated()), '\n', 'Rows after duplicate remove: ', nrow(dat)-sum(dat %>% duplicated())) + +# Need to remove duplicates, but keep all grants and consortium +# Includes some renaming and dropping of columns +dat <- dat %>% + group_by(pmid) %>% + mutate(grant = glue::glue_collapse(unique(.data$`grantNumber`), ", ")) %>% + mutate(consortium = glue::glue_collapse(unique(.data$program), ", ")) %>% + select(!c(grantNumber, program)) %>% + rename(pubmed_id = pmid, DOI = doi, program = consortium, study = name) %>% + distinct() + +dat <- dat %>% rename('pmid' = 'pubmed_id') +dat$entity_name <- make_entity_name(dat) +dat$Name <- dat$title + + +#Using rename() +dat <- dat %>% rename( + "Authors" = "authors", + "Journal"= "journal", + "PubmedId" = "pmid", + "Title"= "title", + "Year"="year", + "Grant"="grant", + "Program"="program", +) + # Remove common, unallowed characters from entity name; includes hacky_cleaning dat$entity_name <- remove_unacceptable_characters(dat$entity_name) + ``` + `set_up_multiannotations` parses comma-separated lists to be stored correctly in Synapse as multi-value annotations. Before setting up the multiannotations, add extra columns that are needed for working with the Portal. The additional, redundant columns will be removed in the future. 
Should keep `grant` and `Program`, and remove `long_amp_ad_grants`, `doi`, and `consortium`. ```{r columns} -dat <- set_up_multiannotations(dat, "grant") +dat <- set_up_multiannotations(dat, "Grant") dat <- set_up_multiannotations(dat, "Program") +dat <- set_up_multiannotations(dat, "Authors") ``` The final data is transposed so that it can be iterated over by `purrr` and stored in Synapse under the `parent` folder. +```{r} +store_as_annotations <- function(parent, list) { + entity <- purrr::map(list, ~ synapseclient$File( + path = glue::glue("http://doi.org/{.$DOI}"), + name = .$entity_name, + parent = parent, + synapseStore = FALSE, + annotations = . + )) + # entity + purrr::map(entity, ~ syn$store(., forceVersion = FALSE)) +} +``` ```{r store, message=FALSE, echo=FALSE} -parent = "syn20463015" +parent = "syn51317180" # ELITE publications folder dat_list <- purrr::transpose(dat) -store_as_annotations(parent = parent, dat_list) -``` -Query the publications table to force an update. - -```{r query, message=FALSE, echo=FALSE} -pub_table <- "syn20448807" -syn$tableQuery(glue::glue("SELECT * FROM {pub_table} LIMIT 1")) -``` +# another eternity +store_as_annotations(parent = parent, dat_list) +``` diff --git a/vignettes/query-pubmed.Rmd b/vignettes/query-pubmed.Rmd index 70fc89b..a74490f 100644 --- a/vignettes/query-pubmed.Rmd +++ b/vignettes/query-pubmed.Rmd @@ -1,39 +1,52 @@ --- title: "Query Pubmed" -author: "Kelsey Montgomery" +author: "Kelsey Montgomery, Nicholas Lee" date: "9/18/2020" +edited: "04/28/2023" +notes: "Adapted for the ELITE portal" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) -library(dccvalidator) +# library(dccvalidator) library(dplyr) library(easyPubMed) library(readr) library(reticulate) -synapseclient <- reticulate::import("synapseclient") -syntab <- reticulate::import("synapseclient.table") -syn <- synapseclient$Synapse() -syn$login() +library(synapser) + 
+source("~/Documents/Projects/ELITE/ELITE-porTools/R/global-hard-coded-variables.R") +source("~/Documents/Projects/ELITE/ELITE-porTools/R/pubmed.R") + +synLogin() # NOTE(review): chunks below still call syn$...; confirm `syn` is defined now that login goes through synapser ``` ## Query Pubmed and store data as file annotations The schema for these steps is *pubmedId*, *grants* and *study*. Theses functions take -a list of PubmedIds and queries the site to pull down title, abstract, authors, journal name, year and DOI. Theses annotations are visible in the [PEC Portal - Publications View](https://www.synapse.org/#!Synapse:syn22095937/tables/). See the [Explore Publications module](https://psychencode.synapse.org/Explore/Publications) for a visual of how this data is surfaced on the portal. +a list of PubmedIds and queries the site to pull down title, abstract, authors, journal name, year and DOI. These annotations are visible in the [ELITE Portal - Publications View](https://www.synapse.org/#!Synapse:syn51209321/tables/). See the [Explore Publications module](https://psychencode.synapse.org/Explore/Publications) for a visual of how this data is surfaced on the portal. + +```{r query synapse for grants} +grant_list <- + readr::read_csv(syn$tableQuery(paste( + "SELECT * FROM ", sid_studies_table, sep = "" + ))$filepath, + col_types = readr::cols(.default = "c")) -```{r ex_format, echo = FALSE} -tribble(~pubmedId, ~grants, ~study, - "24057217", "R21MH103877", c("study1,study2") - ) +# cleanup grant list +grant_list$grantNumber <- gsub('\\[|\\]|\\"',"", as.character(grant_list$grantNumber)) + +# take only grant number as the grants +grants <- grant_list$grantNumber ``` Import the list of Pubmed Ids and define the Synapse parentId where the file entities will be stored with the Pubmed-relevant annotations. 
- ```{r vars, echo=FALSE} -parent <- "syn22235314" -pmids <- readr::read_tsv(syn$get("syn22080024")$path, +parent <- "syn51317180" # ELITE Portal backend folder for publications + +# read pmids file +pmids <- readr::read_tsv(syn$get(sid_pmid_file)$path, col_types = readr::cols(.default = "c")) ``` @@ -46,7 +59,7 @@ Any character vector can be passed to `query_list_pmids`. This function wraps se - creates one row per PubmedId ```{r query} -dat <- query_list_pmids(pmids$pubmedId) +dat <- query_list_pmids(pmids$pubmed_id) # `pmid_map` is not defined in this vignette; NOTE(review): assumes the pmid file read above has a `pubmed_id` column - confirm ``` I am keeping an eye out for weird edge cases. These (hacky) steps clean some missing values and remove extraneous characters. diff --git a/vignettes/update-static-table.Rmd b/vignettes/update-static-table.Rmd new file mode 100644 index 0000000..3a830c8 --- /dev/null +++ b/vignettes/update-static-table.Rmd @@ -0,0 +1,47 @@ +--- +title: "extra" +output: html_document +date: "2023-10-10" +--- + + +Create the pmid to grant file map. Drop the duplicates found and store in synapse as reference for later updates. +Write the pmid data frame out to a tab-separated text file for loading later +```{r} +pmid_map <- dat %>% select("pubmed_id", "grant", "program", "study") + +write.table(pmid_map, file.path(root_dir, 'publications_pmid_list.txt'), row.names = FALSE, sep = "\t") + +file <- File(path = file.path(root_dir, 'publications_pmid_list.txt'), parent = 'syn52227310') + +file <- synStore(file) +``` + +```{r} +## Delete old publication report table rows +pub_report_table <- "syn51209321" +current_table <- syn$tableQuery(glue::glue("SELECT * FROM {pub_report_table}")) +``` + +```{r} +syn$delete(current_table) # delete current rows + +## Update table rows +temp_table <- tempfile() +write_csv(pubs, temp_table, na = "") + +``` + +```{r} +new_table <- synapse$Table(pub_report_table, temp_table) +syn$store(new_table) + +## Query to force table index to rebuild +syn$tableQuery(glue::glue("SELECT ROW_ID FROM {pub_report_table}")) +``` + +Query the publications table to force an update. 
+```{r query, message=FALSE, echo=FALSE} +pub_table <- "syn51407023" +syn$tableQuery(glue::glue("SELECT * FROM {pub_table} LIMIT 1")) +```