Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

compare changes from ELITE fork #20

Open
wants to merge 27 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
ec675bd
first pass editing scripts
nlee-sage May 1, 2023
48aa81c
Created Synapse login file and hardcoded variables. Updated vignettes
nlee-sage May 1, 2023
03bbde2
Merge branch 'master' of github.com:nlee-sage/porTools
nlee-sage May 9, 2023
0f5aacd
Updated vignettes and scripts
nlee-sage May 9, 2023
c2d7a2d
Merge branch 'master' of github.com:nlee-sage/porTools
nlee-sage May 9, 2023
e236d77
Updates to query-pubmed-grants to push new files
nlee-sage Aug 3, 2023
8fb631d
updated path, functions and multi-annotations
nlee-sage Oct 10, 2023
cd5710b
Created working script for grant numbers
nlee-sage Oct 10, 2023
6ed772b
updated workflow and synapse login
nlee-sage Oct 10, 2023
c78f070
Updated query pubmed file
nlee-sage Oct 11, 2023
4e4d153
Update update-publications.yaml
nlee-sage Oct 10, 2023
c8b53b1
Update update-publications.yaml
nlee-sage Oct 10, 2023
42e79d1
Update update-publications.yaml
nlee-sage Oct 11, 2023
80f81a2
Update update-publications.yaml
nlee-sage Oct 11, 2023
a191b03
Update update-publications.yaml
nlee-sage Oct 11, 2023
d488224
Update update-publications.yaml
nlee-sage Oct 11, 2023
2690b62
Update update-publications.yaml
nlee-sage Oct 11, 2023
e44746e
Update update-publications.yaml
nlee-sage Oct 11, 2023
773588e
Update query-pubmed-grants.R
nlee-sage Oct 11, 2023
354647f
Update query-pubmed-grants.R
nlee-sage Oct 11, 2023
6b556db
Update query-pubmed-grants.R
nlee-sage Oct 11, 2023
a2f6ead
Updated query pubmed file
nlee-sage Oct 11, 2023
32c64cb
add synapser dependency
nlee-sage Oct 11, 2023
da6fdfe
updated workflow and script
nlee-sage Oct 12, 2023
b7577ac
Update update-publications.yaml
nlee-sage Oct 13, 2023
883a3da
fixed query script
nlee-sage Apr 2, 2024
4bab1b6
Merge pull request #4 from eliteportal/dev
nlee-sage Apr 2, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,27 +1,29 @@
name: "Update AD Publications"
name: "Update Publications"

on:
schedule:
- cron: "0 0 1 * *"
workflow_dispatch:

env:
RETICULATE_AUTOCONFIGURE: 'FALSE'
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

jobs:
update-ad-publications:
update-publications:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2

- uses: r-lib/actions/setup-r@master
- uses: r-lib/actions/setup-r@v2
with:
r-version: '4.0'
r-version: '4.1.3'

- name: Query dependencies
run: |
install.packages('remotes')
# synapser is served from Sage Bionetworks' RAN repository, so the repos must be given explicitly
install.packages("synapser", repos = c("http://ran.synapse.org", "https://cloud.r-project.org"))
saveRDS(remotes::dev_package_deps(dependencies = TRUE), "depends.Rds", version = 2)
shell: Rscript {0}

Expand All @@ -48,11 +50,11 @@ jobs:
run: |
Rscript -e "reticulate::py_discover_config()"
Rscript -e "reticulate::py_install(c('pandas', 'numpy', 'boto3', 'synapseclient'), pip = TRUE)"

- name: Install porTools
run: |
Rscript -e "remotes::install_github('Sage-Bionetworks/porTools')"
Rscript -e "remotes::install_github('eliteportal/publication_scraper')"

- name: Query PubMed and upload results
run: |
Rscript ./inst/scripts/update-publications-ad.R --grant_table syn17024229 --parent syn20463015 --pub_table syn20448807 --auth_token ${{ secrets.SYNAPSE_PAT }}
Rscript ./inst/scripts/query-pubmed-grants.R --grant_table syn51209786 --parent syn51400816 --pub_table syn51407023 --auth_token ${{ secrets.SYNAPSE_PAT }}
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,6 @@
.Rhistory
.RData
.Ruserdata
.synapseConfig
.DS_Store
publications_pmid_list.txt
12 changes: 12 additions & 0 deletions R/global-hard-coded-variables.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# IDs of Synapse folders and tables that are used throughout the package for gathering grant IDs and uploading annotations and entities.
# Edit the following for the relevant project
# NOTE(review): the scripts that source this file read these names as globals, so renaming any of them breaks those scripts.

# NOTE(review): absolute, machine-specific path — presumably the local checkout of this repository; confirm before running elsewhere.
root_dir <- "~/Documents/Projects/ELITE/ELITE-porTools"
sid_project <- "syn27229419"
sid_studies_table <- "syn51210771" # portal studies table (target of the final update in curate-portal-studies-table-pec.R)
sid_studies_fv <- "syn51523775" # studies view for portal
sid_projects_table <- "syn51209786" # ELITE Portal Projects Table
sid_pub_table <- "syn51407023" # same ID the update-publications workflow passes as --pub_table
sid_pub_folder <- "syn51317180"
sid_people_table <- "syn51209684" # portal people table (read and rewritten by curate-portal-people-table-pec.R)
sid_pmid_file <- "syn52227331"
2 changes: 2 additions & 0 deletions R/md-converter.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Convert vignette to R script
# knitr::purl() extracts the R code chunks of the given R Markdown file into a plain .R script.
# NOTE(review): the path is absolute and machine-specific. Because this file sits under R/,
# the call presumably executes whenever the package sources are loaded — confirm that is intended.
knitr::purl("~/Documents/Projects/ELITE/ELITE-porTools/vignettes/query-pubmed-grants.Rmd")
4 changes: 2 additions & 2 deletions R/pubmed.R
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ pub_query <- function(pub_pmids_list) {
names(pub_summary_list) <- names(pub_pmids_list)

# collapse list of dataframes into a single df
dplyr::bind_rows(pub_summary_list, .id = "grantSerialNumber")
dplyr::bind_rows(pub_summary_list, .id = "result")
}

#' Parse Summary Obj
Expand Down Expand Up @@ -214,7 +214,7 @@ make_entity_name <- function(dat){
# Need to leave space for year and pubmed ID
# Arbitrarily set to 200 characters
short_name <- stringr::str_trunc(
glue::glue("{first_author} {dat$fulljournalname}"),
glue::glue("{first_author} {dat$journal}"),
width = 200
)

Expand Down
24 changes: 24 additions & 0 deletions R/setup_env.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# setup env
# Installs (when missing) and attaches the packages the scripts rely on.

# Bootstrap packages: librarian has to be installed before librarian::shelf()
# can be called below. knitr is installed here but not attached.
packages <- c("librarian", "knitr")

# Install bootstrap packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (!all(installed_packages)) {
  install.packages(packages[!installed_packages])
}

# install.packages("synapser", repos=c("http://ran.synapse.org", "http://cran.fhcrc.org"))

# Attach the working set of packages. The order is kept as it was so that
# function masking between packages does not change; each package is listed
# exactly once (reticulate used to appear twice).
librarian::shelf(
  optparse,
  rentrez,
  rmarkdown,
  reticulate,
  janitor,
  dplyr,
  readr,
  stringr,
  easyPubMed
)
25 changes: 25 additions & 0 deletions R/synapseLogin.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
library("optparse")

# Command-line interface: a single optional --auth_token argument.
# nolint start
option_list <- list(
  make_option(
    "--auth_token",
    help = "Synapse Personal Access Token. If not given, assumes local .synapseConfig.",
    type = "character",
    default = NA,
    action = "store"
  )
)

opts <- parse_args(OptionParser(option_list = option_list))
# nolint end

## Python Synapse client (through reticulate) and its table helper module
synapseclient <- reticulate::import("synapseclient")
syntab <- reticulate::import("synapseclient.table")

## Create the client and log in: without a token, call login() with no
## arguments; otherwise authenticate with the supplied token.
syn <- synapseclient$Synapse()
if (is.na(opts$auth_token)) {
  syn$login()
} else {
  syn$login(authToken = opts$auth_token)
}
1 change: 1 addition & 0 deletions R/text-cleaning.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ remove_unacceptable_characters <- function(text) {
conv <- gsub(",", "", conv)
conv <- gsub("\\]", "", conv)
conv <- gsub("\\[", "", conv)
conv <- gsub("=", "-", conv)
return(conv)
}
#' Clean up funky text
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,9 @@ Sage portals require content management of publications, people, data, studies a
[[[[work in-progress]]]]

`devtools::install_github('Sage-Bionetworks/porTools')`


## Updates
**2023-10-10**
- If one grant serial number overlaps with another (for example, `UH2AG064706` and `UH3AG064706`), a different call must be made to get the search results, and the previously developed functions do not work.
- Found the NIH library for R is much faster than python
15 changes: 9 additions & 6 deletions inst/scripts/curate-portal-people-table-pec.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
library(tidyverse)
library(purrr)
synapseclient <- reticulate::import("synapseclient")
syntab <- reticulate::import("synapseclient.table")
syn <- synapseclient$Synapse()
syn$login()

# Login to synapse
# Sourcing this script creates the `syn` client and the `syntab` module used below.
source("~/Projects/ELITE/porTools/R/synapseLogin.R")

### Hard coded variables
# Defines the sid_* Synapse IDs (e.g. sid_people_table) used below.
# The file in this repository is named R/global-hard-coded-variables.R.
# NOTE(review): the directory is an absolute, machine-specific path — confirm it matches the local checkout.
source("~/Projects/ELITE/porTools/R/global-hard-coded-variables.R")

## functions
update_synapse_table <- function(table_id, update_df, syn, syntab) {
Expand All @@ -14,6 +16,7 @@ update_synapse_table <- function(table_id, update_df, syn, syntab) {
update_rows <- syntab$Table(table_id, tmpfile)
syn$store(update_rows)
}

make_df <- function(list, column_name) {
df <- tibble::enframe(list) %>%
tidyr::unnest(cols = c(value), keep_empty = TRUE)
Expand All @@ -22,7 +25,7 @@ make_df <- function(list, column_name) {
df
}
###
people <- read_csv(syn$tableQuery("Select * from syn22096112")$filepath)
# Download the portal people table and read the resulting CSV.
# `$filepath` must be taken from the tableQuery() result, not from the glue string.
people <- read_csv(syn$tableQuery(glue::glue("SELECT * from {sid_people_table}"))$filepath) # table to portal - people
team <- syn$getTeamMembers("3323356")
list <- reticulate::iterate(team)
member <- map(list, ~.$get("member"))
Expand Down Expand Up @@ -57,4 +60,4 @@ update <- update %>% mutate_all(function(x) ifelse(is.na(x),"",x))

update$ROW_ID <- ""

update_synapse_table("syn22096112", update, syn, syntab)
update_synapse_table(sid_people_table, update, syn, syntab)
87 changes: 58 additions & 29 deletions inst/scripts/curate-portal-studies-table-pec.R
Original file line number Diff line number Diff line change
@@ -1,40 +1,44 @@
library(tidyverse)
synapseclient <- reticulate::import("synapseclient")
syntab <- reticulate::import("synapseclient.table")
syn <- synapseclient$Synapse()
syn$login()

# Login to synapse
source("~/Projects/ELITE/porTools/R/synapseLogin.R")

# Once study folders are annotated, this script will find those annotations and merge them
# into the studies table that creates the study cards in the portal.

### Hard coded variables
# Defines the sid_* Synapse IDs (e.g. sid_studies_table, sid_studies_fv) used below.
# The file in this repository is named R/global-hard-coded-variables.R.
# NOTE(review): the directory is an absolute, machine-specific path — confirm it matches the local checkout.
source("~/Projects/ELITE/porTools/R/global-hard-coded-variables.R")

### functions
coalesceJoin <- function(x, y,
by = NULL, suffix = c(".x", ".y"),
join = dplyr::left_join, ...) {
coalesceJoin <- function(x,
y,
by = NULL,
suffix = c(".x", ".y"),
join = dplyr::left_join,
...) {
joined <- join(x, y, by = by, suffix = suffix, ...)
# names of desired output
cols <- union(names(x), names(y))

to_coalesce <- names(joined)[!names(joined) %in% cols]
suffix_used <- suffix[ifelse(endsWith(to_coalesce, suffix[1]), 1, 2)]
suffix_used <-
suffix[ifelse(endsWith(to_coalesce, suffix[1]), 1, 2)]
# remove suffixes and de-duplicate
to_coalesce <- unique(substr(
to_coalesce,
1,
nchar(to_coalesce) - nchar(suffix_used)
))

coalesced <- purrr::map_dfc(to_coalesce, ~dplyr::coalesce(
joined[[paste0(.x, suffix[1])]],
joined[[paste0(.x, suffix[2])]]
))
to_coalesce <- unique(substr(to_coalesce,
1,
nchar(to_coalesce) - nchar(suffix_used)))

coalesced <- purrr::map_dfc(to_coalesce, ~ dplyr::coalesce(joined[[paste0(.x, suffix[1])]],
joined[[paste0(.x, suffix[2])]]))

names(coalesced) <- to_coalesce

dplyr::bind_cols(joined, coalesced)[cols]
}

update_synapse_table <- function(table_id, update_df, syn, syntab) {
current_rows <- syn$tableQuery(glue::glue("SELECT * FROM {table_id}"))
current_rows <-
syn$tableQuery(glue::glue("SELECT * FROM {table_id}"))
syn$delete(current_rows)
tmpfile <- fs::file_temp("rows.csv")
write_csv(update_df, tmpfile)
Expand All @@ -45,32 +49,57 @@ update_synapse_table <- function(table_id, update_df, syn, syntab) {

# update studies table
# force view to rebuild
trigger <- syn$tableQuery("select * from syn21990011")
# Query the studies file view (sid_studies_fv) so that it rebuilds; the result itself is not used.
trigger <- syn$tableQuery(glue::glue("SELECT * FROM {sid_studies_fv}"))

table <- dccvalidator::get_synapse_table("syn21783965", syn)
fv <- dccvalidator::get_synapse_table("syn21990011", syn)
# Current contents of the portal studies table (sid_studies_table), the same
# table that is rewritten by update_synapse_table() at the end of this script.
table <- dccvalidator::get_synapse_table(sid_studies_table, syn)

# Why do we need a file view?
fv <- dccvalidator::get_synapse_table(sid_studies_fv, syn) # studies view for portal

# Parse rows from file view that contain annotations to be captured in the
# PEC studies table
to_update <- fv[!is.na(fv$studyDescription),]
to_update <- fv[!is.na(fv$studyDescription), ]

to_update <- rename(to_update, key = id,
studyName = name)

table <- rename(table, key = study)

# join on synId
updated <- coalesceJoin(to_update, table, by = "key", join = full_join)
updated <-
coalesceJoin(to_update, table, by = "key", join = full_join)

# change to required schema
updated <- rename(updated, study = key)

# NAs must be changed to empty strings
dat <- updated %>% mutate_all(function(x) ifelse(is.na(x),"",x))
dat <- updated %>% mutate_all(function(x)
ifelse(is.na(x), "", x))

# order cards alphabetically
dat <- dat[order(dat$studyName),]
dat <- dat[order(dat$studyName), ]

#order schema
dat <- dplyr::select(dat, studyType, isModelSystem, numberOfIndividuals, species, study, studyDescription, studyName, nucleicAcidSource, contributingInstitution, dataTypes, diagnosis, grants, phase, methods, relatedStudies, tissue)

update_synapse_table("syn21783965", dat, syn, syntab)
dat <-
dplyr::select(
dat,
studyType,
isModelSystem,
numberOfIndividuals,
species,
study,
studyDescription,
studyName,
nucleicAcidSource,
contributingInstitution,
dataTypes,
diagnosis,
grants,
phase,
methods,
relatedStudies,
tissue
)

# update the portal studies table
update_synapse_table(sid_studies_table, dat, syn, syntab)
Loading