-
Notifications
You must be signed in to change notification settings - Fork 0
/
practical.Rmd
433 lines (240 loc) · 15.5 KB
/
practical.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
---
title: "practical-Biodata.ptCrashCourses.Rmd"
output:
  html_document:
    toc: true
    toc_float: true
---
```{r setup, include=FALSE}
# Default options applied to every chunk of this report: show the code,
# hide messages and warnings, print data frames as plain text and centre figures
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  paged.print = FALSE,
  fig.align = "center"
)
# Time stamp taken at the start of the report; it is subtracted from the end
# time in the last section to report the total running time
startTime <- Sys.time()
```
## 16S rRNA gene amplicon - upstream data analysis
<br>
Participant:
Contact (e-mail):
Day: 31 January 2020
Place: [IGC](http://www.igc.gulbenkian.pt/), Oeiras, Portugal
<br>
#### Import *dada2* packages (and dependencies)
```{r install and import packages, message=FALSE, warning=FALSE, paged.print=FALSE}
### Install and load the required packages
## Option 1 - install 'dada2' from CRAN (reported not to work for R version 3.6.2)
# NOTE(review): 'dada2' is normally distributed through Bioconductor - confirm whether a CRAN install is possible at all
#install.packages(pkgs = "dada2") # remove the '#' if you don't have 'dada2' installed
## Option 2 - install 'dada2' from its GitHub repository
#install.packages("devtools") # 'devtools' has to be installed first; it is then used to install 'dada2'
# if the 'devtools' installation fails on an Ubuntu OS, run the following command in your terminal (other OS may need other commands):
#sudo apt install libfontconfig1-dev libcairo2-dev libcurl4-openssl-dev libssl-dev libxml2-dev
#
#
#devtools::install_github("benjjneb/dada2", ref="v1.14") # installs 'dada2' version '1.14' from the 'benjjneb' GitHub repo
library(dada2) # attach the 'dada2' package
packageVersion("dada2") # print the version of 'dada2' that was attached
```
#### Set seed
```{r set seed, message=FALSE, warning=FALSE, paged.print=FALSE}
### Fix the random number generator state so that results are reproducible
# any integer works; '1024' is an arbitrary choice
set.seed(seed = 1024)
```
#### Define the directory of 16S rRNA gene amplicon fastq files
```{r set path to NGS files, message=FALSE, warning=FALSE, paged.print=FALSE}
### Relative path to the directory holding the 16S rRNA gene amplicon fastq files
# './' means 'in the current directory'; 'MiSeq_SOP' is the folder that was downloaded
fastqPath <- "./MiSeq_SOP"
# show every file found under 'fastqPath'
list.files(path = fastqPath)
```
```{r set path to fwd and rev fastq}
### Build the full paths to the forward (R1) and reverse (R2) fastq files
# the '.' in the pattern is escaped so that it matches a literal dot instead of any character;
# sort() keeps forward and reverse files in the same sample order
fastqFwdPath <- sort(x = list.files(path = fastqPath, pattern = "_R1_001\\.fastq", full.names = TRUE)) # fwd fastq file paths
fastqRevPath <- sort(x = list.files(path = fastqPath, pattern = "_R2_001\\.fastq", full.names = TRUE)) # rev fastq file paths
### Extract the sample names: the text before the first "_" of each forward file name
# vapply() always returns a character vector, even when no fastq file was found
# (sapply() would silently return an empty list in that case)
sampleNames <- vapply(
  X = strsplit(x = basename(path = fastqFwdPath), split = "_"),
  FUN = `[`,
  FUN.VALUE = character(1),
  1
)
sampleNames # print the sample names
```
```{r check fwd and rev fastq file lists}
### Check that the forward and reverse fastq file lists correspond (i.e. are sorted alike)
# load the helper functions that ship with the course
source("./scripts/biodataPtCrashCourse.R")
## compareSampleNames(): course helper that compares the sample names of the fwd and rev file lists
# arguments:
#   fwdPathList / revPathList - vectors with the fwd and rev fastq file paths
#   splitSymbol - symbol used to split the file names (here "_")
#   pickElement - which piece to keep after splitting (here the 1st, which holds the sample name)
compareSampleNames(
  fwdPathList = fastqFwdPath,
  revPathList = fastqRevPath,
  splitSymbol = "_",
  pickElement = 1
)
```
### (0) Fastq quality control
```{r fastQC, fig.height = 8, fig.width=13}
### Plot the per-base quality profiles of the raw reads
# To also save the plots as pdf files, remove the first '#' of the 'dir.create()', 'pdf()' and 'dev.off()' lines below
#dir.create("fastQC_plots") # creates the folder "fastQC_plots" that will hold the pdf files
#pdf(file = "fastQC_plots/fwd_fastqc_plots.pdf", width = 13, height = 8) # opens the pdf file for the forward plots
# quality profiles of the forward fastq files
plotQualityProfile(fastqFwdPath)
#dev.off() # closes the pdf file
#pdf(file = "fastQC_plots/rev_fastqc_plots.pdf", width = 13, height = 8) # same as above, for the reverse fastq files
# quality profiles of the reverse fastq files
plotQualityProfile(fastqRevPath)
#dev.off()
```
```{r filter and trim reads, message=FALSE, warning=FALSE, paged.print=FALSE}
### Filter and trim reads
## relative paths of the filtered fwd and rev files (created by 'filterAndTrim()' below),
## each one named after its sample
filtFastqFwdPath <- setNames(
  object = file.path(fastqPath, "filtered", paste0(sampleNames, "_fwd_filt.fastq.gz")),
  nm = sampleNames
)
filtFastqRevPath <- setNames(
  object = file.path(fastqPath, "filtered", paste0(sampleNames, "_rev_filt.fastq.gz")),
  nm = sampleNames
)
## filter and trim the fwd and rev fastq files, writing the filtered reads
## in compressed ('.gz') format to the paths defined above
filterTrimReads <- filterAndTrim(
  fwd = fastqFwdPath,
  filt = filtFastqFwdPath,
  rev = fastqRevPath,
  filt.rev = filtFastqRevPath,
  truncLen = c(240,160),
  maxEE = 2,
  truncQ = 2,
  maxN = 0,
  rm.phix = TRUE,
  compress = TRUE,
  verbose = FALSE,
  multithread = TRUE
)
# render the resulting matrix as a formatted table (typing 'filterTrimReads' would print it plainly)
knitr::kable(filterTrimReads)
```
### (2) Estimate error rates
```{r learn error rates}
### Learn the error rates from the filtered reads
# forward reads
errFwd <- learnErrors(
  fls = filtFastqFwdPath,
  multithread = TRUE
)
# reverse reads
errRev <- learnErrors(
  fls = filtFastqRevPath,
  multithread = TRUE
)
```
```{r plot error rates}
## Plot the learned error rates ('nominalQ = TRUE' is passed to both calls)
# forward reads
plotErrors(
  dq = errFwd,
  nominalQ = TRUE
)
# reverse reads
plotErrors(
  dq = errRev,
  nominalQ = TRUE
)
```
### (3) Dereplicate reads into unique sequences
```{r dereplication}
### Dereplicate the filtered reads into unique sequences
# forward reads - input is the vector of paths to the filtered fwd fastq files
derepFwd <- derepFastq(
  fls = filtFastqFwdPath
)
# reverse reads - input is the vector of paths to the filtered rev fastq files
derepRev <- derepFastq(
  fls = filtFastqRevPath
)
```
```{r plot unique - abundance, fig.width=18}
### Plot the number of unique sequences next to the total number of sequences per sample
# load the helper functions that ship with the course
source("./scripts/biodataPtCrashCourse.R")
## forward reads: 'countUniqueAndTotalFromDerepObjcList()' (from 'biodataPtCrashCourse.R')
## returns a list with two vectors, the no. of 'unique' and of 'total' sequences
seqDerepFwd <- countUniqueAndTotalFromDerepObjcList(deRepObj = derepFwd)
uniqDerepFwd <- seqDerepFwd$unique # no. of unique fwd sequences
total <- seqDerepFwd$total # no. of total sequences
## reverse reads: only the unique counts are kept, the totals being
## the same as the forward totals retrieved above
seqDerepRev <- countUniqueAndTotalFromDerepObjcList(derepRev)
uniqDerepRev <- seqDerepRev$unique # no. of unique rev sequences
## stack the 3 vectors into a matrix of three rows (the row names come from the variable names)
derepMtxAbund <- rbind(total, uniqDerepFwd, uniqDerepRev)
## grouped barplot: one group of three bars per sample
barplot(
  height = derepMtxAbund,
  main = "Unique/dereplicated sequences (absolute abundance)",
  xlab = "Samples",
  ylab = "Unique sequences (absolute abundance)",
  col = c("gray","blue", "lightblue"),
  legend.text = rownames(derepMtxAbund),
  beside = TRUE
)
```
### (4) Denoise unique sequences (and exclude singletons)
```{r Denoising, message=FALSE, warning=FALSE, paged.print=FALSE}
### Denoise the dereplicated reads using the learned error rates
# forward reads
dadaFwd <- dada(
  derep = derepFwd,
  err = errFwd,
  multithread = TRUE
)
# reverse reads
dadaRev <- dada(
  derep = derepRev,
  err = errRev,
  multithread = TRUE
)
# remove the '#' below to print the denoising results
#dadaFwd
#dadaRev
```
### (5) Merge denoised forward and reverse reads
```{r Merge PE reads, message=FALSE, warning=FALSE, paged.print=FALSE}
### Merge the denoised forward and reverse reads of each pair
mergePE <- mergePairs(
  dadaF = dadaFwd,
  derepF = derepFwd,
  dadaR = dadaRev,
  derepR = derepRev,
  verbose = TRUE
)
```
### (6) Construct an ASV table
```{r build a sequence ASV table}
### Build the ASV table from the merged reads
asvTbl <- makeSequenceTable(
  samples = mergePE
)
```
```{r histogram sequence length}
# Frequency of each ASV length:
#   getSequences() - dada2 function that retrieves the sequences of the ASV table
#   nchar()        - counts the length of each sequence
#   table()        - counts how often each length occurs
histSeqLen <- table(
  nchar(
    getSequences(asvTbl)
  )
)
```
### (7) Remove chimeras
```{r remove chimeras}
### Remove chimeric sequences from the ASV table (method "consensus")
asvTblNoChim <- removeBimeraDenovo(
  unqs = asvTbl,
  method = "consensus",
  multithread = TRUE,
  verbose = TRUE
)
```
```{r summarize seqs}
### Summarize the no. of sequences kept in each pipeline step
# getN(): total no. of reads in an object, i.e. the sum of the abundances returned by `getUniques(x)`
getN <- function(x) sum(getUniques(x))
## build a matrix with the sequences kept in each pipeline step (one row per sample)
# vapply() is used instead of sapply() so that each column is guaranteed to be a numeric vector
summaryTblSeq <- cbind(
  filterTrimReads,                   # initial reads and filtered/trimmed reads
  vapply(dadaFwd, getN, numeric(1)), # denoised fwd sequences
  vapply(dadaRev, getN, numeric(1)), # denoised rev sequences
  vapply(mergePE, getN, numeric(1)), # merged PE sequences
  rowSums(asvTblNoChim)              # non-chimeric sequences
)
## rename the column and row names
colnames(summaryTblSeq) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(summaryTblSeq) <- sampleNames
## create a second summary table with one extra column holding the sample names
summaryTblSeq2 <- cbind("Samples" = sampleNames, summaryTblSeq)
## create the 'output' folder only if it does not exist yet, so that
## running this chunk again does not raise an "already exists" warning
if (!dir.exists("output")) {
  dir.create("output")
}
write.table(x = summaryTblSeq2, file = "./output/summaryTblSeq.tsv", sep = "\t", row.names = FALSE)
knitr::kable(summaryTblSeq)
```
```{r plot no. seqs pipeline - abs. no., fig.width=18}
### Grouped barplot with the absolute no. of sequences kept in each step
# transpose so that the pipeline steps are the rows (bars) and the samples the columns (groups)
summaryTblSeqTrans <- t(summaryTblSeq)
barplot(
  height = summaryTblSeqTrans,
  main = "Absolute no. of sequences kept through the pipeline",
  ylab = "Absolute no. of sequences",
  xlab = "Samples",
  col = c("gray", "#EFF3FF", "#BDD7E7", "#6BAED6", "#3182BD", "#08519C"),
  legend.text = rownames(summaryTblSeqTrans),
  beside = TRUE
)
```
```{r plot no. seqs pipeline - perc. no.}
### Grouped barplot with the percentage of sequences kept in each step
# divide every row (sample) by its value in the first column and multiply by 100;
# sweep() keeps the samples-by-steps matrix shape even when there is a single sample,
# whereas apply() over the columns would collapse a one-row table to a plain vector
summaryTblSeqPerc <- sweep(x = summaryTblSeq, MARGIN = 1, STATS = summaryTblSeq[, 1], FUN = "/") * 100
# transpose so that the pipeline steps are the rows (bars) and the samples the columns (groups)
summaryTblSeqPercTrans <- t(summaryTblSeqPerc)
barplot(
  height = summaryTblSeqPercTrans,
  main = "Percentage of sequences kept through the pipeline",
  ylab = "Percentage of sequences (%)",
  xlab = "Samples",
  col = c("gray", "#EFF3FF", "#BDD7E7", "#6BAED6", "#3182BD", "#08519C"),
  legend.text = rownames(summaryTblSeqPercTrans),
  beside = TRUE
)
```
### (8) Assign taxonomy
```{r assign taxonomy to ASVs}
### Taxonomic classification of the non-chimeric ASVs
## assign taxonomy against the SILVA NR training set (version 132)
taxTbl <- assignTaxonomy(
  seqs = asvTblNoChim,
  refFasta = "./database/silva_nr_v132_train_set.fa.gz",
  multithread = TRUE
)
## add the species level to the previous assignment, using the SILVA species assignment file (version 132)
taxTbl <- addSpecies(
  taxtab = taxTbl,
  refFasta = "./database/silva_species_assignment_v132.fa.gz"
)
```
```{r save asv and tax tables}
### Save the ASV and taxonomy tables as R objects (.rds)
# useful if you only need these two tables later: they can be re-imported with
# 'readRDS()' instead of repeating the whole tutorial
# ASV table (without chimeras)
saveRDS(
  object = asvTblNoChim,
  file = "./output/asvTblNoChim.rds"
)
# taxonomy of the ASVs
saveRDS(
  object = taxTbl,
  file = "./output/taxTbl.rds"
)
```
```{r Format ASV and Tax tables}
## keep the trackability of your ASVs
# the relabelling below relies on the taxonomy rows and the ASV table columns holding
# the same sequences in the same order, so stop early if that is not the case
stopifnot(identical(rownames(taxTbl), colnames(asvTblNoChim)))
# add a new column "ASV" with the new ASV labels/ids ("ASV_1", "ASV_2", ...) to the taxonomy table
taxTbl2 <- cbind(taxTbl, "ASV" = paste0("ASV_", seq_len(nrow(taxTbl))))
# substitute the DNA sequences in the row names by the new ids; the column is picked
# by name ("ASV") so that this works whatever the number of taxonomic ranks
rownames(taxTbl2) <- taxTbl2[, "ASV"]
## retrieve the DNA sequences, labelled with the new ids
uniquesToFasta(asvTblNoChim, "./output/asvFastaDNASequences.fasta", ids = taxTbl2[, "ASV"])
## do the same for the ASV table (with the distribution)
asvTblNoChim2 <- asvTblNoChim # copy ASV table
colnames(asvTblNoChim2) <- taxTbl2[, "ASV"] # substitute the DNA sequences in the column names by the new ids
asvTblNoChim2 <- t(asvTblNoChim2) # transpose: ASVs become rows and samples become columns
asvTblNoChim2 <- as.data.frame(asvTblNoChim2)
asvTblNoChim2[,"ASV_ID"] <- rownames(asvTblNoChim2)
# put "ASV_ID" as the first column and drop the mock community ("Mock"); the columns are
# selected by name instead of by position so that this works for any number of samples
sampleCols <- setdiff(colnames(asvTblNoChim2), c("ASV_ID", "Mock"))
asvTblNoChim2 <- asvTblNoChim2[, c("ASV_ID", sampleCols)]
## Let's save these 2 R objs
write.table(x = taxTbl2, file = "./output/taxTbl.txt", sep = "\t", row.names = FALSE, quote = FALSE) # save taxonomy table
write.table(x = asvTblNoChim2, file = "./output/asvTblNoChim.txt", sep = "\t", row.names = FALSE, quote = FALSE) # save ASV table
```
```{r ASV and Tax table compatible with Biom}
## Put the taxonomy in a format that can later be converted to biom
# load the helper functions that ship with the course
source("./scripts/biodataPtCrashCourse.R")
# 'tax2biom()' is a course helper; its result replaces 'taxTbl2'
taxTbl2 <- tax2biom(taxTbl2)
## Join the ASV and taxonomy tables into one
# the first column of 'taxTbl2' (the "ID") is left out because 'asvTblNoChim2' already holds that information
asvTaxTbl <- cbind(
  asvTblNoChim2,
  "taxonomy" = taxTbl2[,-1]
)
# save the joined ASV-taxonomy table as a tab-delimited text file
write.table(
  x = asvTaxTbl,
  file = "./output/asvTaxTbl.txt",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
```
```{r Import and edit metadata file}
### Import the metadata and save it as a csv file
# read the design file (its first line holds the column names)
metadata <- read.table(
  file = "./MiSeq_SOP/mouse.time.design",
  header = TRUE
)
# rename the two columns and use the values of the first one as row names
colnames(metadata) <- c("SampleID", "Condition")
rownames(metadata) <- metadata[[1]]
# write the table, row names included, without quoting
write.csv(
  x = metadata,
  file = "./output/metadata.csv",
  quote = FALSE,
  row.names = TRUE
)
```
### (9) Evaluate accuracy and contamination (with the *mock community*)
```{r assess the dada2 accuracy}
### Assess the DADA2 accuracy with the mock community
# NOTE(review): the sample is taken from 'asvTbl' (before chimera removal), not from 'asvTblNoChim' - confirm this is intended
mockAsvTbl <- asvTbl["Mock",] # retrieve the sample "Mock" from the ASV table
mockAsvTbl <- sort(x = mockAsvTbl[mockAsvTbl > 0], decreasing = TRUE) # keep only the ASVs with abundance higher than 0, most abundant first
refMockSeq <- getSequences(file.path(fastqPath, "HMP_MOCK.v35.fasta")) # import the reference mock sequences from "HMP_MOCK.v35.fasta"
# count how many of our mock ASVs are found within at least one reference sequence:
#   'fixed = TRUE' treats each ASV as a literal string rather than as a regular expression
#   vapply() always returns a logical vector, so the sum is 0 (not an error) when no ASV is left
compareRefAsvMock <- sum(
  vapply(
    X = names(mockAsvTbl),
    FUN = function(x) any(grepl(x, refMockSeq, fixed = TRUE)),
    FUN.VALUE = logical(1)
  )
)
```
### Convert the ASV table text file into a biom (Biological Observation Matrix) file
```{r ASV table to biom format}
## Convert the tab-delimited ASV table (with taxonomy) into a biom file
# load the helper functions that ship with the course
source("./scripts/biodataPtCrashCourse.R")
# 'convertTab2Biom()' is a course helper: 'inFile' is the text table written before, 'outFile' the biom file to create
convertTab2Biom(
  inFile = "./output/asvTaxTbl.txt",
  outFile = "./output/asvTable.biom"
)
```
#### Estimate the running time of R
```{r Time, message=FALSE, warning=FALSE, paged.print=FALSE}
# Time stamp taken at the end of the report
endTime <- Sys.time()
# running time: elapsed time between 'startTime' (set in the setup chunk) and 'endTime'
difftime(time1 = endTime, time2 = startTime)
```
#### R packages and versions used in this course
```{r References, message=FALSE, warning=FALSE, paged.print=FALSE}
# Print the R version, the platform and the attached/loaded packages with their versions
sessionInfo()
```