-
Notifications
You must be signed in to change notification settings - Fork 16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PspA figure implementation #93
base: main
Are you sure you want to change the base?
Conversation
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've moved things around a bit to at least clean up towards a working function. It's clearly not quite there yet. I hope this helps as a start.
I need a little help, @the-mayer @epbrenner -- thanks!
domains_rename <- read_delim("data/acc_files/domains_rename.txt", | ||
delim="\t", col_names=TRUE) | ||
|
||
# domains_ignore <- read_delim("data/acc_files/domains_ignore.txt", | ||
# delim="\t", col_names=T) | ||
|
||
domains_keep <- read_delim("data/acc_files/domains_keep.txt", | ||
delim="\t", col_names=T) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
domains_rename <- read_delim("data/acc_files/domains_rename.txt", | |
delim="\t", col_names=TRUE) | |
# domains_ignore <- read_delim("data/acc_files/domains_ignore.txt", | |
# delim="\t", col_names=T) | |
domains_keep <- read_delim("data/acc_files/domains_keep.txt", | |
delim="\t", col_names=T) |
# Replace SIG+Toastrack with TM+Toastrack | ||
all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack") | ||
all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack") | ||
|
||
all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack") | ||
all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack") | ||
|
||
# Replace SIG+DUF4178 with TM+4178 | ||
all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178") | ||
all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178") | ||
|
||
all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178") | ||
all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178") | ||
|
||
# Replace SIG+PspA with TM+PspA | ||
all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA") | ||
all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA") | ||
|
||
all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA") | ||
all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA") | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
# Replace SIG+Toastrack with TM+Toastrack | |
all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack") | |
all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack") | |
all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack") | |
all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack") | |
# Replace SIG+DUF4178 with TM+4178 | |
all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178") | |
all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178") | |
all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178") | |
all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178") | |
# Replace SIG+PspA with TM+PspA | |
all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA") | |
all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA") | |
all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA") | |
all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA") |
# Convert Sig+Snf7 to TM+Snf7 | ||
all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7") | ||
all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7") | ||
all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7") | ||
all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7") | ||
|
||
|
||
## Write cleaned up file | ||
# write_tsv(all_cln, "data/rawdata_tsv/all_clean_combined_20210329.txt") | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
# Convert Sig+Snf7 to TM+Snf7 | |
all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7") | |
all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7") | |
all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7") | |
all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7") | |
## Write cleaned up file | |
# write_tsv(all_cln, "data/rawdata_tsv/all_clean_combined_20210329.txt") |
psp_fig4 <- function(job_dir) { | ||
# FIXME: figure out if all_raw can be read from the job folder | ||
all_w_extra <- read_tsv(file="data/rawdata_tsv/all_with_extrapspasnf7.tsv") | ||
# all_w_extra$GI | ||
all_raw <- all_w_extra | ||
|
||
# similar to the Rmd, we run the cleanup function on all_raw to produce prot | ||
prot <- data_cleanup(all_raw) | ||
|
||
# the rest of the figure generation follows as-is | ||
prot$Lineage.reduced = prot$Lineage %>% str_replace(pattern = "^eukaryota.*", replacement = "eukaryota") %>% str_replace(pattern = "^viruses.*", replacement = "viruses") | ||
|
||
prot$Lineage = prot$Lineage.reduced | ||
|
||
nopspa <- prot %>% | ||
filter_by_doms(column="DomArch", | ||
doms_remove=c("PspA", "PspA(s)", "Snf7"), | ||
ignore.case=T) | ||
|
||
# Include only DAs ≥ min.cutoff | ||
nopspa_tc <- nopspa %>% total_counts(cutoff=100) | ||
|
||
nopspa_tc <- nopspa_tc %>% filter(totalcount>=50) | ||
|
||
cutoff_perc <- 100 - nopspa_tc$CumulativePercent[nrow(nopspa_tc)] | ||
|
||
stlin_nops <- stacked_lin_plot(prot=nopspa, column="DomArch", | ||
cutoff=cutoff_perc, | ||
label.size=20, | ||
legend.position=c(0.6, 0.40), legend.text.size=20, | ||
legend.size=1) | ||
|
||
# res = stacked_lin_plot(prot=nopspa, column="DomArch", | ||
# cutoff=cutoff_perc, | ||
# label.size=20, | ||
# legend.position=c(0.6, 0.40), legend.text.size=20, | ||
# legend.size=1) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need to fix path (connection between job_dir, file_path, raw_file, port) -- what it reads | first few lines need to be addressed.
Removed PSP-specific parts.
psp_fig4 <- function(job_dir) { | |
# FIXME: figure out if all_raw can be read from the job folder | |
all_w_extra <- read_tsv(file="data/rawdata_tsv/all_with_extrapspasnf7.tsv") | |
# all_w_extra$GI | |
all_raw <- all_w_extra | |
# similar to the Rmd, we run the cleanup function on all_raw to produce prot | |
prot <- data_cleanup(all_raw) | |
# the rest of the figure generation follows as-is | |
prot$Lineage.reduced = prot$Lineage %>% str_replace(pattern = "^eukaryota.*", replacement = "eukaryota") %>% str_replace(pattern = "^viruses.*", replacement = "viruses") | |
prot$Lineage = prot$Lineage.reduced | |
nopspa <- prot %>% | |
filter_by_doms(column="DomArch", | |
doms_remove=c("PspA", "PspA(s)", "Snf7"), | |
ignore.case=T) | |
# Include only DAs ≥ min.cutoff | |
nopspa_tc <- nopspa %>% total_counts(cutoff=100) | |
nopspa_tc <- nopspa_tc %>% filter(totalcount>=50) | |
cutoff_perc <- 100 - nopspa_tc$CumulativePercent[nrow(nopspa_tc)] | |
stlin_nops <- stacked_lin_plot(prot=nopspa, column="DomArch", | |
cutoff=cutoff_perc, | |
label.size=20, | |
legend.position=c(0.6, 0.40), legend.text.size=20, | |
legend.size=1) | |
# res = stacked_lin_plot(prot=nopspa, column="DomArch", | |
# cutoff=cutoff_perc, | |
# label.size=20, | |
# legend.position=c(0.6, 0.40), legend.text.size=20, | |
# legend.size=1) | |
plotLineageBarStack <- function(job_dir) { | |
# FIXME # figure out if all_raw can be read from the job folder | |
all_w_extra <- read_tsv(file="data/rawdata_tsv/all_with_extrapspasnf7.tsv") | |
all_raw <- all_w_extra | |
# cleanup function on all_raw to produce prot | |
# FIXME # Check if this is needed | |
prot <- data_cleanup(all_raw) | |
# rename lineages | |
prot$Lineage.reduced = prot$Lineage %>% | |
str_replace(pattern = "^eukaryota.*", replacement = "eukaryota") %>% | |
str_replace(pattern = "^viruses.*", replacement = "viruses") | |
prot$Lineage = prot$Lineage.reduced | |
# Include only DAs ≥ min.cutoff | |
prot_cutoff <- prot %>% total_counts(cutoff=100) | |
prot_cutoff <- prot_cutoff %>% filter(totalcount>=50) | |
cutoff_perc <- 100 - prot_cutoff$CumulativePercent[nrow(prot_cutoff)] | |
stacked_lin <- stacked_lin_plot(prot= prot, column="DomArch", | |
cutoff=cutoff_perc, | |
label.size=20, | |
legend.position=c(0.6, 0.40), legend.text.size=20, | |
legend.size=1) | |
# legend.position=c(0.6, 0.40), legend.text.size=20, | ||
# legend.size=1) | ||
|
||
return(stlin_nops) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
return(stlin_nops) | |
return(stacked_lin) |
# FA: writing out the file commented out in favor of returning the object | ||
|
||
# Decrease width, increase height, increase font size | ||
# | ||
# ggsave("stackedLinPlot_50tc.png", stlin_nops, | ||
# width=15, | ||
# height=18, | ||
# dpi=300) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
agreed
# FA: writing out the file commented out in favor of returning the object | |
# Decrease width, increase height, increase font size | |
# | |
# ggsave("stackedLinPlot_50tc.png", stlin_nops, | |
# width=15, | |
# height=18, | |
# dpi=300) |
# Cleanup GenContext | ||
# Calls reverse_operons | ||
all_cln <- all_cln %>% | ||
cleanup_gencontext(domains_rename=domains_rename, | ||
repeat2s=FALSE) | ||
|
||
all_cln$GenContext.repeats <- all_cln$GenContext | ||
|
||
all_cln <- all_cln %>% | ||
cleanup_gencontext(domains_rename=domains_rename, | ||
repeat2s=T) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No GenContext yet | @epbrenner
# Cleanup GenContext | |
# Calls reverse_operons | |
all_cln <- all_cln %>% | |
cleanup_gencontext(domains_rename=domains_rename, | |
repeat2s=FALSE) | |
all_cln$GenContext.repeats <- all_cln$GenContext | |
all_cln <- all_cln %>% | |
cleanup_gencontext(domains_rename=domains_rename, | |
repeat2s=T) | |
# FIXME # Not using genomic context parts of the script, yet | |
# Cleanup GenContext | |
# Calls reverseOperons | |
# all_cln <- all_cln %>% | |
# cleanup_gencontext(domains_rename=domains_rename, | |
# repeat2s=FALSE) | |
# | |
# all_cln$GenContext.repeats <- all_cln$GenContext | |
# | |
# all_cln <- all_cln %>% | |
# cleanup_gencontext(domains_rename=domains_rename, | |
# repeat2s=T) |
all_cln <- all_cln %>% | ||
cleanup_domarch(domains_rename=domains_rename, | ||
domains_keep=NULL, # filter applied to only ClustName for now. | ||
domains_ignore=NULL, #!! should check and remove soon! | ||
repeat2s=FALSE, | ||
remove_tails=F, #new! check below if it works! | ||
remove_empty=F) #new! needed? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
need to check with new function names | @the-mayer
removing PSP-specific code
need logical checks | @epbrenner
all_cln <- all_cln %>% | |
cleanup_domarch(domains_rename=domains_rename, | |
domains_keep=NULL, # filter applied to only ClustName for now. | |
domains_ignore=NULL, #!! should check and remove soon! | |
repeat2s=FALSE, | |
remove_tails=F, #new! check below if it works! | |
remove_empty=F) #new! needed? | |
all_cln <- all_cln %>% | |
cleanup_domarch(domains_rename=domains_rename, | |
repeat2s=FALSE) | |
# remove_tails=F, # FIXME # keep if it exists & works for DomArch | |
# remove_empty=F) # FIXME # keep if it exists & works for DomArch | |
all_cln$DomArch.repeats <- all_cln$DomArch | ||
|
||
# Removing duplicate AccNum w/ different DomArchs | ||
all_cln$DomArch.uncompressed <- all_cln$DomArch.repeats | ||
# !! repeat2s: deprecation notice for funs & list | ||
all_cln <- repeat2s(all_cln, "DomArch.uncompressed", | ||
excluded_prots=c("PspC", "LiaI-LiaF-TM")) | ||
|
||
# Extract unique rows | ||
all_cln <- all_cln %>% distinct() | ||
# Pick longer of the duplicated AccNum DomArchs | ||
all_cln <- all_cln %>% pick_longer_duplicate("DomArch.uncompressed") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The idea is to clean up DomArch names and address repeats
[seeing Fig4, PSP should give a fair idea of the outcome] | @epbrenner
repeat2s -- what's the new name/functionality? needs checking | @the-mayer
Rm, PSP-specific code
all_cln$DomArch.repeats <- all_cln$DomArch | |
# Removing duplicate AccNum w/ different DomArchs | |
all_cln$DomArch.uncompressed <- all_cln$DomArch.repeats | |
# !! repeat2s: deprecation notice for funs & list | |
all_cln <- repeat2s(all_cln, "DomArch.uncompressed", | |
excluded_prots=c("PspC", "LiaI-LiaF-TM")) | |
# Extract unique rows | |
all_cln <- all_cln %>% distinct() | |
# Pick longer of the duplicated AccNum DomArchs | |
all_cln <- all_cln %>% pick_longer_duplicate("DomArch.uncompressed") | |
all_cln$DomArch.repeats <- all_cln$DomArch | |
# Removing duplicate AccNum w/ different DomArchs | |
all_cln$DomArch.uncompressed <- all_cln$DomArch.repeats | |
# !! repeat2s: deprecation notice for funs & list | |
all_cln <- repeat2s(all_cln, "DomArch.uncompressed") | |
# Extract unique rows | |
all_cln <- all_cln %>% distinct() | |
# Pick longer of the duplicated AccNum DomArchs | |
all_cln <- all_cln %>% | |
pick_longer_duplicate("DomArch.uncompressed") |
library(tidyverse) | ||
library(data.table) | ||
|
||
# source("/data/research/jravilab/molevol_scripts/R/cleanup.R") # in package | ||
# source("/data/research/jravilab/molevol_scripts/R/summarize.R") # in package | ||
# source("/data/research/jravilab/molevol_scripts/R/plotting.R") # in package | ||
# source("/data/research/jravilab/molevol_scripts/R/reverse_operons.R") # in package | ||
# source("/data/research/jravilab/molevol_scripts/R/networks_domarch.R") # in package |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
avoiding hard-coded paths & library calls
library(tidyverse) | |
library(data.table) | |
# source("/data/research/jravilab/molevol_scripts/R/cleanup.R") # in package | |
# source("/data/research/jravilab/molevol_scripts/R/summarize.R") # in package | |
# source("/data/research/jravilab/molevol_scripts/R/plotting.R") # in package | |
# source("/data/research/jravilab/molevol_scripts/R/reverse_operons.R") # in package | |
# source("/data/research/jravilab/molevol_scripts/R/networks_domarch.R") # in package |
Description
This PR contains a very work-in-progress implementation of figure 4 from the PspA paper, https://journals.asm.org/doi/epub/10.1128/msystems.00847-23. The PR
is intended as a starting point, so you don't have to go and find the original code from the papers, but it definitely needs a lot of work! See issue #90 for more details.
Closes #90.
What kind of change(s) are included?
Checklist
Please ensure that all boxes are checked before indicating that this pull request is ready for review.