Skip to content

Test 3 Question 2

Meg Staton edited this page Dec 11, 2020 · 2 revisions

test3 - q2

STAR - index and map

Indexing failed with 2 cores, get 3:

qsub -I -A ACF-UTK0138 -q debug -l walltime=1:00:00,nodes=1:ppn=3,advres=epp622.13776065

Index command:

/lustre/haven/proj/UTK0138/software/STAR-2.7.6a/bin/Linux_x86_64/STAR \
--runMode genomeGenerate \
--genomeDir STAR_idx \
--genomeFastaFiles Ppersica_298_v2.0.fa \
--runThreadN 1 \
--sjdbGTFfile Ppersica_298_v2.1.gene_exons.gff3 \
--sjdbGTFtagExonParentTranscript Parent \
--sjdbOverhang 50

Make filenames file:

ls ../*fq > filenames.txt

Run qsub task array job:

#PBS -S /bin/bash
#PBS -A ACF-UTK0138
#PBS -l nodes=1:ppn=2,walltime=01:00:00
#PBS -l advres=epp622.13776065
#PBS -N rna-map
#PBS -m abe
#PBS -M [email protected]
#PBS -t 1-12

cd $PBS_O_WORKDIR

infile=$(sed -n -e "${PBS_ARRAYID} p" filenames.txt)

samplename=$(basename $infile | sed 's/.fq//')

/lustre/haven/proj/UTK0138/software/STAR-2.7.6a/bin/Linux_x86_64/STAR \
--genomeDir ../STAR_idx \
--readFilesIn $infile \
--runThreadN 2 \
--outFileNamePrefix $samplename \
--outSAMtype BAM SortedByCoordinate

Count:

module load htseq
module load pysam

for f in *bam
do
    samplename=$(basename $f | sed 's/Aligned.sortedByCoord.out.bam//g')
    echo $base
    htseq-count \
    --format=bam \
    --order=pos \
    --stranded=no \
    --type=gene \
    --idattr=ID \
    $f \
    ../Ppersica_298_v2.1.gene_exons.gff3 \
    >${samplename}.counts.txt
done

Transfer to local computer and run R code:

library( "DESeq2" )
library("pheatmap")
library("RColorBrewer")
library("ggplot2")

setwd("/Users/margaretstaton/Desktop/test3")

## Set Up
sampleTable <- read.table("sampleTable.txt", header=TRUE, colClasses=c("character", "character", "factor", "factor"))
dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, directory = '.', design = ~ genotype + time + genotype:time)

## Filter low count genes
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

## Relevel so the base time is 400 (ie up-regulated will report up regulation FROM 400 TO 800)
dds$time <- relevel(dds$time, ref = "400")

## get dds object
dds <- DESeq(dds)
resultsNames(dds)

###--------
## Get results for each factor
## Try out two methods - results with the name factor and results with a contrast

# Time
resTime = results(dds, name="time_800_vs_400")
summary(resTimeContrast,alpha=0.05)

resTimeContrast = results(dds, contrast=c("time",'400','800'))
summary(resTimeContrast,alpha=0.05)

summary(resTime,alpha=0.05)

# Genotype
resGT = results(dds, name="genotype_LB_vs_EB")
summary(resGT,alpha=0.05)

resGTContrast = results(dds, contrast=c("genotype",'EB','LB'))
summary(resGTContrast,alpha=0.05)

# Interaction
resInt = results(dds, name="genotypeLB.time800")
summary(resInt,alpha=0.05)

###--------
# Plots

plotDispEsts(dds)

rld <- rlog(dds, blind=FALSE)

sampleDists <- dist(t(assay(rld)))
sampleDistMatrix <- as.matrix(sampleDists)
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)

plotPCA(rld, intgroup=c("genotype", "time"))

pcaData <- plotPCA(rld, intgroup=c("genotype", "time"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=time, shape=genotype)) +
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed()