-
Notifications
You must be signed in to change notification settings - Fork 0
/
script01_exp-design.sh
266 lines (216 loc) · 12.2 KB
/
script01_exp-design.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#!/bin/bash
# EXPERIMENTAL DESIGN: SELECTING THE NON-ANTARCTIC SPONGE MICROBIOME DATASET FROM THE SPONGE MICROBIOME PROJECT (SMP) DATA AND STRUCTURING THE METADATA TABLE #
# ----------------------------------------------------------------------------------------------------------------------------------------------------------- #
## This code must be executed by lines or statements, do not execute it in one go.
## The metadata related to the SMP samples ("SraRunInfo.csv" and "SraRunTable.txt" files) and the SRA accesion list was downloaded from the SRA download options and the Run Selector (NCBI).
#--- Isolate SRA and BioSample IDs:
cut -f1,26 tab-SraRunInfo.csv
cut -f1 tab-SraRunInfo.csv > biosample-ID.txt # save BioSample ID list
#--- Download BioSample full reports:
while IFS= read -r line
do
wget https://www.ncbi.nlm.nih.gov/biosample/$line?report=full\&format=text -O $line.txt
done < biosample-ID.txt
#--- Create a customized metadata table from the BioSample reports:
echo -e "BioSample\tSample type\tLife stage\tHost health\tHost\tHost name\tCollection date\tGeographic location\tSampling site\tLatitude\tLongitude\tWater temperature\tDepth" >> tmp.out
for file in *.txt
do
a=$( grep -o -m1 "\w*SAMEA\w*" $file )
b=$( grep "sample type=\"" $file )
c=$( grep "life stage=\"" $file )
d=$( grep "host health state=\"" $file )
e=$( grep "host=" $file )
f=$( grep "host common name=" $file )
g=$( grep "collection" $file )
h=$( grep -m1 "geographic location" $file )
i=$( grep "sampling_site=" $file )
j=$( grep "latitude=" $file )
k=$( grep "longitude=" $file )
l=$( grep "water_temperature_degrees_c=" $file )
m=$( grep "depth=" $file )
echo -e "$a\t$b\t$c\t$d\t$e\t$f\t$g\t$h\t$i\t$j\t$k\t$l\t$m" >> tmp.out
done
#--- Concatenate the cutomized metadata table with desired columns of the "SraRunInfo.csv" file:
echo -e "SRA ID\tBases\tAverage length\tSize(mb)\tBiosample verif." >> tmp2.out
while IFS= read -r line
do
grep $line SraRunInfo.csv | cut -d "," -f1,5,7,8,26 --output-delimiter=$'\t' >> tmp2.out
done < biosample-ID.out
paste -d "\t" tmp.out tmp2.out > metadata-SMP.tsv
## The cutomized metadata table was uploaded to Google Docs. There, the table was filtered by sample type as "sponge tissue", host health as "healthy", life stage as "adult" or "sissing_not provided", and depth as "<50m". Also, we discarded samples displaying the columns "host" and "host name" other than the host scientific name (e.g. "freshwater metagenome", "seawater", "sponges"). Ulva sp. was removed as it is classified as an algae according to NCBI Taxonomy. The "host y "host name" columns were merged as a new column identifiying the sponge species. The filtered table was saved as "metadata-SMP-filt-tmp.tsv".
## We realized many sponge orders reported did not coincide with the sponge species reported. Many sponge orders did not agree the World Porifera database (https://www.marinespecies.org/porifera/) or the NCBI taxonomy database (https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi) for the given species. Therefore, we corrected the assignments of sponge order according to the sponge species reported (trusting the appropiate morphological sponge classification during the experimental procedures of the SMP preceding analyses at the host species level).
#--- Search the sponge order for each sponge species reported using the NCBI taxonomy (FTP download, /pub/taxonomy/taxdump_archive directory, file "rankedlineage.dmp", accessed on 01-08-2021):
echo -e "Order" > tmp3.out
while IFS= read -r line
do
grep -P -m1 "\t$line\t" rankedlineage.dmp | cut -d "|" -f6 | tr -d "\t" >> tmp3.out # orders
grep -P -m1 "\t$line\t" rankedlineage.dmp >> tmp4.out # complete assignments
#echo -e "$line\t\t$(grep -P -m1 "\t$line\t" rankedlineage.dmp | cut -d "|" -f6 | tr -d "\t")" >> tmp4.out # To know the provenance of empty rows
done < list-species-SMP-filt.txt
# NOTE: Some uncorrect names of sponge species in the NCBI taxonomy assingments were corrected according to the World Porifera database.
## The new assigments and non-assigned orders from NCBI were verified manually using the World Porifera database. Verification was also applied for the taxonomy of the published Antarctic sponge samples.
#--- Paste new sponge order column with the
paste -d "\t" metadata-SMP-filt-tmp.tsv tmp4.out > metadata-SMP-filt.tsv
#--- Filter the customized metadata table by sponge orders matching the taxonomy of the available Antarctic sponges:
echo -e "BioSample\tSample type\tLife stage\tHost health\tSpecies\tOrder\tCollection date\tGeographic location\tSampling site\tLatitude\tLongitude\tWater temperature\tDepth\tSRA ID\tBases\tAverage length\tSize(mb)\tBiosample verif." > desired-orders.tsv
while IFS= read -r line
do
grep -w $line metadata-SMP-filt.tsv >> desired-orders.tsv
done < list-orders.txt
## The customized metadata table "metadata-SMP-filt.tsv" was then classified according to the marine environment, realm, province, and ecoregion using the coordinates tables provided by the Marine Ecoregions of the World (MEOW, Spalding et al., 2007, https://doi.org/10.1641/B570707) of the R package "meowR" (see Rscript "script2_metadata.R").
#--- Choose randomly 5 sponge species replicates per environment.
for envi in trop temp
do
echo -e "BioSample\tSample type\tLife stage\tHost health\tSpecies\tOrder\tCollection date\tGeographic location\tSampling site\tLatitude\tLongitude\tWater temperature\tDepth\tSRA ID\tBases\tAverage length\tSize(mb)\tBiosample verif." > $envi.tsv
while IFS= read -r line
do
grep "$line" $envi-metadata-SMP.tsv | rl -c5 >> $envi.tsv
done < list-$envi.txt
done
# NOTE: For species with less than 5 replicates, the maximum number of replicates available was reptrieved.
#--- Verify sponge genera and sponge species matching the Antarctic sponges taxa in each environment-classified table:
# For Species:
while IFS= read -r line
do
if grep -n "$line" *.tsv | cut -f1,5 >> verif-desired-species.csv
then
echo "$line found" >> tmp5-species.out
else
echo "$line not found" >> tmp5-species.out
fi
done < desired-species.txt
grep "not found" tmp5-species.out
grep -c "not found" tmp5-species.out
# For genus:
while IFS= read -r line
do
if grep -n -w "$line" *.tsv | cut -f1,5 >> verif-desired-genera.csv
then
echo "$line found" >> tmp6-genera.out
else
echo "$line not found" >> tmp6-genera.out
fi
done < desired-genera.txt
grep "not found" tmp6-genera.out
grep -c "not found" tmp6-genera.out
#--- List the SRA IDs by environment-classified table:
cut -f14 trop.tsv > trop-sra-IDs.txt
cut -f14 temp.tsv > temp-sra-IDs.txt
#--- Download the datasets of the resulting customized metadata tables using the SRA IDs:
for file in trop temp
do
while IFS= read -r line
do
prefetch -s $line >> $file-size-SRA-datasets.txt
prefetch $line -O $file-SRA-datasets
cd $file-SRA-datasets/
fasterq-dump $line.sra -x -O . >> $file-details-SRA-datasets.txt
echo "----------------------------------------------------" >> $file-details-SRA-datasets.txt
cd ..
done < $file-sra-IDs.txt
done
#--- Create a file with the IDs, environemnt type, and Illumina sequencer (sequencing run):
for file in trop temp
do
while IFS= read -r line
do
a=$(grep $line tab-SraRunInfo.csv | cut -f20)
b=$(cat SraRunTable.tsv | cut -d "," -f37,38 | tr "," "\t")
echo -e "$line\t$file\t$a\t$b" >> TS-by-seq-runs.txt
done < $file-sra-IDs.txt
done
## IMPORTANT: The metadata related to Antarctic sponges was prepared manually and independently.
#--- Download SRA dataset from Sacristán-Soriano et al., 2020 (Antarctic sponge dataset):
while IFS= read -r line
do
#prefetch -s $line >> size-SRA-datasets.txt
#prefetch $line -O SRA-datasets
cd SRA-datasets/
fasterq-dump $line.sra -x -O . >> details-SRA-datasets.txt
echo "----------------------------------------------------" >> details-SRA-datasets.txt
cd ..
done < SraAccList.txt
## The datasets of the rest of Antarctic sponges were retrieved from our lab repository for practicity, though they were publicly available in SRA, with the exception of the newly generated dataset (1821 sequencing run).
## The Antarctic, tropical, and temperate sponge tables were merged in a single metadata one (metadata-sponges.tsv). The complete taxonomic assignments were added manually to the customized metadata table in LibreOffice. The sequencing run/lanes IDs were codified in the metadada from the "SraRunTable.tsv" as follows: "h2000" or "h2500" depending on the Illumina Hiseq sequencing platform used, followed by "l" from the word lane and the number of the lane used. The sequencing run IDs of Antarctic sponge datasets were codified according to the sampling year or publication year as the identifier.
## IMPORTANT: Only sponge samples matching the taxonomy of the available Antarctic sponges at least at the family level were kept at this point, the rest was manually removed from the selected dataset and metadata ("metadata-sponges.tsv" file).Then, the dataset (fastq files) was divided in different folders according to the sequencing run ID.
#--- Copy fastq files from a list with the IDs classified by sequencing run to a folder (only for non-Antarctic sponges; the Antarctic sponge files were divided in different folders manually):
for num in h2000l7 h2000l8 h2500l3 h2500l4 h2500l5 h2500l7
do
for file in $(cat ./TS-hiseq$num.txt)
do
cp "$file" ./TS-hiseq$num/
done
done
#--- Replace sequecing run IDs in r scripts:
for num in h2000l7 h2000l8 h2500l3 h2500l4 h2500l5 h2500l7 2013cgb 2014 2015 2020ss 1821
do
cp dada2-qc-2013.r dada2-qc-$num.r
cp dada2-dn-2013.r dada2-dn-$num.r
#sed -i "s/demux/sponges/g" dada2-qc-$num.r
sed -i "s/2013/$num/g" dada2-qc-$num.r
sed -i "s/2013/$num/g" dada2-dn-$num.r
done
#--- Tranform fastq files into required format of demultiplexed single-end reads to use QIIME2 software:
array=( sponges-h2000l7 sponges-h2000l8 sponges-h2500l3 sponges-h2500l4 sponges-h2500l5 sponges-h2500l6 sponges-h2500l7 )
a=1
for file in "${array[@]}"
do
cd $file/
for i in *.fastq.gz
do
rename "s/_L001/_${a}_L001/g" $i
#echo "hola_${i}_$a"
a=$((a+1))
done
cd ../
done
#--- Use QIIME2 software to visualize quality plots:
array=( h2000l7 h2000l8 h2500l3 h2500l4 h2500l5 h2500l7 ) #single-end
for folder in "${array[@]}"
do
qiime tools import \
--type 'SampleData[SequencesWithQuality]' \
--input-path sponges-$folder \
--input-format CasavaOneEightSingleLanePerSampleDirFmt \
--output-path sponges-$folder.qza >> info-qplots.txt 2>> error-qplots.txt
qiime demux summarize \
--i-data sponges-$folder.qza \
--o-visualization summarize-qc-$folder.qzv >> info-qplots.txt 2>> error-qplots.txt
done
#--- Use QIIME2 software to visualize quality plots:
array=( 2013 2013cgb 2014 2015 1821 2020ss ) #paired-end
for folder in "${array[@]}"
do
qiime tools import \
--type EMPPairedEndSequences \
--input-path sponges-$folder \
--output-path imported-$folder.qza >> info-qplots.txt 2>> error-qplots.txt
qiime demux emp-paired \
--m-barcodes-file metadata-$folder.tsv \
--m-barcodes-column barcode-sequence \
--p-rev-comp-mapping-barcodes \
--i-seqs imported-$folder.qza \
--o-per-sample-sequences sponges-$folder.qza \
--o-error-correction-details dm-details-$folder.qza >> info-qplots.txt 2>> error-qplots.txt
qiime demux summarize \
--i-data sponges-$folder.qza \
--o-visualization summarize-qc-$folder.qzv >> info-qplots.txt 2>> error-qplots.txt
done
#--- Remove primers with Cutadapt (only for sequencing run IDs 1821 and 2020s):
for file in *_L001_R1_001.fastq.gz
do
cutadapt -g GTGYCAGCMGCCGCGGTAA -o ./outs/out-$file $file --match-read-wildcards -O 19 2>> details-cutadapt.log
done
#--- Obtain sequence length distribution of fastq files:
array=( h2000l7 h2000l8 h2500l3 h2500l4 h2500l5 h2500l6 h2500l7 2013 2013cgb 2014 2015 1821 2020ss )
for folder in "${array[@]}"
do
awk 'NR%4 == 2 {lengths[length($0)]++ ; counter++} END {for (l in lengths) {print l, lengths[l]}; print "total reads: " counter}' ./sponges-$folder/*.fastq > ./read-lengths/read-length-distrib-$folder.csv
done
#--- Search host type in the metadata of Lurgi et al., 2019 from species name to join the metadata of the study:
while IFS= read -r line
do
a=$(grep -m1 "$line" Lurgi2019-suppl-data.csv | cut -f15)
echo -e "$line\t$a" >> host-type.txt
done < TS-by-sponge-species.txt
## From this part, the datasets were processed in R environment (Rscripts).