-
Notifications
You must be signed in to change notification settings - Fork 1
DESeq2 with salmon
Meg Staton edited this page Nov 15, 2022
·
4 revisions
Get files
scp -r isaac:/lustre/isaac/proj/UTK0208/rnaseq/analysis/mstaton1/3_salmon/\*quant .
You really only need quant.sf for each sample but this keeps directory structure in order which makes R code easier
Fix problem with "blomming"
mv EarlyBlommingRep1_quant EarlyBloomingRep1_quant
Get gff3 file to build your transcript to gene conversion data:
scp -r isaac:/lustre/isaac/proj/UTK0208/rnaseq/raw_data/Ppersica_298_v2.1.gene_exons.gff3 .
Move over into R
# folder with counts file
setwd("/Users/margaretstaton/Desktop/PrunusDormancy/salmon")
#BiocManager::install("tximport")
library("tximport")
#BiocManager::install("GenomicFeatures")
library("GenomicFeatures")
library( "DESeq2")
library("pheatmap")
library("RColorBrewer")
library("ggplot2")
##-------------------------------------------------
##------- get tx2gene
##-------------------------------------------------
txdb <- makeTxDbFromGFF("Ppersica_298_v2.1.gene_exons.gff3",
"gff3",
"Phytozome",
"Prunus persica")
k <- keys(txdb, keytype = "TXNAME")
tx2gene <- select(txdb, k, "GENEID", "TXNAME")
##-------------------------------------------------
##------- import salmon counts
##-------------------------------------------------
files <- c("./EarlyBloomingRep1_quant/quant.sf",
"./EarlyBloomingRep2_quant/quant.sf",
"./LateBloomingRep1_quant/quant.sf",
"./LateBloomingRep2_quant/quant.sf")
txi <- tximport(files, type = "salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM")
# set up sample table
sampleFiles <- grep("Rep",list.files(),value=TRUE)
sampleGenotype <- sub("(.*ming).*","\\1",sampleFiles)
sampleName <- sub("(.*mingRep[0-9]+).*","\\1",sampleFiles)
sampleTable <- data.frame(sampleName = sampleName,
fileName = sampleFiles,
genotype = sampleGenotype)
sampleTable$genotype <- factor(sampleTable$genotype)
# then below...
ddsTXI <- DESeqDataSetFromTximport(txi, sampleTable, ~genotype)
##-------------------------------------------------
##--------- now we can get to deseq2, similar code as we used before for STAR
##-------------------------------------------------
##Prefiltering very lowly expressed reads, which will make everything else run faster
keepTXI <- rowSums(counts(ddsTXI)) >= 10
ddsTXI <- ddsTXI[keepTXI,]
ddsTXI
ddsTXI <- DESeq(ddsTXI)
resTXI <- results(ddsTXI)
resTXI
# Check dispersion plot
plotDispEsts(ddsTXI)
#Now we actually need to transform the data again. From the vignette: "In order to test for differential expression, we operate on raw counts and use discrete distributions as described in the previous section on differential expression. However for other downstream analyses – e.g. for visualization or clustering – it might be useful to work with transformed versions of the count data."
# Lets go with rlog.
rldTXI <- rlog(ddsTXI, blind=FALSE)
#Sample to sample distances
sampleDistsTXI <- dist(t(assay(rldTXI)))
sampleDistMatrixTXI <- as.matrix(sampleDistsTXI)
rownames(sampleDistMatrixTXI) <- rldTXI$genotype
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrixTXI,
clustering_distance_rows=sampleDistsTXI,
clustering_distance_cols=sampleDistsTXI,
col=colors)
#PCA
plotPCA(rldTXI, intgroup=c("genotype"))
# Lets go back to investigating our results
summary(resTXI)
# For easy access to the genes of interest, let's create a dataframe with only the genes with an adjusted p value less than 0.05.
res_sigTXI <- as.data.frame(resTXI[ which(resTXI$padj < 0.05),])
head(res_sigTXI)
dim(res_sigTXI)
#Lets sort the dataframe so we can see the most significant genes:
res_sigTXI <- res_sigTXI[order(res_sigTXI$padj),]
#To see the 10 most significant genes, we can do a head and request 10 lines:
head(res_sigTXI, n=10)
#Add gene annotation
# You can add gene annotation to your data frame, making it easier to see what each gene is. You'll need to download the annotation file
# read annotation info file in
annotat <- read.csv("../Ppersica_298_v2.1.annotation_info.txt", header = TRUE, comment.char = '', sep='\t')
# keep first row for each unique gene in locusName, removing transcript isoforms
annotat <- annotat[!duplicated(annotat[,2]),]
head(annotat)
# Merge annotation content to DEGs
res.annoTXI <- merge(res_sigTXI, annotat, by.x = "row.names", by.y = "locusName", all.x = TRUE, all.y = FALSE)
## write annotated results to table
write.table(res.annoTXI, file="DEG_sig.salmonTXI.txt", sep = "\t", row.names = F)