Skip to content

DESeq2 with salmon

Meg Staton edited this page Nov 15, 2022 · 4 revisions

Get files

scp -r isaac:/lustre/isaac/proj/UTK0208/rnaseq/analysis/mstaton1/3_salmon/\*quant .

You really only need quant.sf for each sample but this keeps directory structure in order which makes R code easier

Fix problem with "blomming"

 mv EarlyBlommingRep1_quant EarlyBloomingRep1_quant

Get gff3 file to build your transcript to gene conversion data:

scp -r isaac:/lustre/isaac/proj/UTK0208/rnaseq/raw_data/Ppersica_298_v2.1.gene_exons.gff3 .

Move over into R

# folder with counts file
setwd("/Users/margaretstaton/Desktop/PrunusDormancy/salmon")

#BiocManager::install("tximport")
library("tximport")
#BiocManager::install("GenomicFeatures")
library("GenomicFeatures")
library( "DESeq2")
library("pheatmap")
library("RColorBrewer")
library("ggplot2")

##-------------------------------------------------
##------- get tx2gene
##-------------------------------------------------

txdb <- makeTxDbFromGFF("Ppersica_298_v2.1.gene_exons.gff3",
                "gff3",
                "Phytozome",
                "Prunus persica")
k <- keys(txdb, keytype = "TXNAME")
tx2gene <- select(txdb, k, "GENEID", "TXNAME")

##-------------------------------------------------
##------- import salmon counts
##-------------------------------------------------

files <- c("./EarlyBloomingRep1_quant/quant.sf", 
           "./EarlyBloomingRep2_quant/quant.sf",
           "./LateBloomingRep1_quant/quant.sf",
           "./LateBloomingRep2_quant/quant.sf")
txi <- tximport(files, type = "salmon", tx2gene=tx2gene, countsFromAbundance = "lengthScaledTPM")

# set up sample table
sampleFiles <- grep("Rep",list.files(),value=TRUE)
sampleGenotype <- sub("(.*ming).*","\\1",sampleFiles)
sampleName <- sub("(.*mingRep[0-9]+).*","\\1",sampleFiles)
sampleTable <- data.frame(sampleName = sampleName,
                          fileName = sampleFiles,
                          genotype = sampleGenotype)
sampleTable$genotype <- factor(sampleTable$genotype)

# then below...
ddsTXI <- DESeqDataSetFromTximport(txi, sampleTable, ~genotype)

##-------------------------------------------------
##--------- now we can get to deseq2, similar code as we used before for STAR
##-------------------------------------------------

##Prefiltering very lowly expressed reads, which will make everything else run faster

keepTXI <- rowSums(counts(ddsTXI)) >= 10
ddsTXI <- ddsTXI[keepTXI,]
ddsTXI

ddsTXI <- DESeq(ddsTXI)
resTXI <- results(ddsTXI)
resTXI

# Check dispersion plot

plotDispEsts(ddsTXI)

#Now we actually need to transform the data again. From the vignette: "In order to test for differential expression, we operate on raw counts and use discrete distributions as described in the previous section on differential expression. However for other downstream analyses – e.g. for visualization or clustering – it might be useful to work with transformed versions of the count data."

# Lets go with rlog.

rldTXI <- rlog(ddsTXI, blind=FALSE)

#Sample to sample distances

sampleDistsTXI <- dist(t(assay(rldTXI)))
sampleDistMatrixTXI <- as.matrix(sampleDistsTXI)
rownames(sampleDistMatrixTXI) <- rldTXI$genotype
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrixTXI,
         clustering_distance_rows=sampleDistsTXI,
         clustering_distance_cols=sampleDistsTXI,
         col=colors)
#PCA

plotPCA(rldTXI, intgroup=c("genotype"))

# Lets go back to investigating our results

summary(resTXI)

# For easy access to the genes of interest, let's create a dataframe with only the genes with an adjusted p value less than 0.05.

res_sigTXI <- as.data.frame(resTXI[ which(resTXI$padj < 0.05),])
head(res_sigTXI)
dim(res_sigTXI)

#Lets sort the dataframe so we can see the most significant genes:

res_sigTXI <- res_sigTXI[order(res_sigTXI$padj),]

#To see the 10 most significant genes, we can do a head and request 10 lines:

head(res_sigTXI, n=10)

#Add gene annotation
# You can add gene annotation to your data frame, making it easier to see what each gene is. You'll need to download the annotation file

# read annotation info file in
annotat <- read.csv("../Ppersica_298_v2.1.annotation_info.txt", header = TRUE, comment.char = '', sep='\t')

# keep first row for each unique gene in locusName, removing transcript isoforms
annotat <- annotat[!duplicated(annotat[,2]),]
head(annotat)

# Merge annotation content to DEGs
res.annoTXI <- merge(res_sigTXI, annotat, by.x = "row.names", by.y = "locusName", all.x = TRUE, all.y = FALSE)
## write annotated results to table
write.table(res.annoTXI, file="DEG_sig.salmonTXI.txt", sep = "\t", row.names = F)
Clone this wiki locally