-
Notifications
You must be signed in to change notification settings - Fork 0
/
Kelp-transcriptomics.Rmd
123 lines (84 loc) · 3.32 KB
/
Kelp-transcriptomics.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
---
title: "Kelp-transcriptomics"
output: html_notebook
---
#Read trimming and cleaning
```{r}
#check quality
mkdir -p 0.1.fastqc
conda activate fastqc
fastqc HK-7_1_1.clean.fq.gz HK-7_1_2.clean.fq.gz HK-7_2_1.clean.fq.gz HK-7_2_2.clean.fq.gz \
HKB-7_1_1.clean.fq.gz HKB-7_1_2.clean.fq.gz HKB-7_2_1.clean.fq.gz HKB-7_2_2.clean.fq.gz \
NK-7_1_1.clean.fq.gz NK-7_1_2.clean.fq.gz NK-7_2_1.clean.fq.gz NK-7_2_2.clean.fq.gz \
NKB-7_1_1.clean.fq.gz NKB-7_1_2.clean.fq.gz NKB-7_2_1.clean.fq.gz NKB-7_2_2.clean.fq.gz \
-o 0.1.fastqc -t 30
conda activate multiqc
multiqc 0.1.fastqc \
-o 0.1.fastqc/multiqc
```
```{r}
#remove rRNA SEQUENCES
mkdir -p 0.1.fastq/0.2.decontam/
R1s=`ls 0.1.fastq/*.1.clean.fq.gz | python -c 'import sys; print(" ".join([x.strip() for x in sys.stdin.readlines()]))'`
R2s=`ls 0.1.fastq/*.2.clean.fq.gz | python -c 'import sys; print(" ".join([x.strip() for x in sys.stdin.readlines()]))'`
python ~/0.scripts/decontaminate.reference.py \
--R1 $R1s \
--R2 $R2s \
--reference ~/database/sortmerna/sortmerna-4.3.6/smr_v4.3_sensitive_db_rfam_seeds.fasta \
--output 0.1.fastq/0.2.decontam/0.1.rrna \
--threads 50 --verbose
#rename files if required
cd 0.1.fastq/0.2.decontam/0.1.rrna
for file in *1.clean.fq.R1.fq.gz; do mv "$file" "${file/.1.clean.fq.R1.fq.gz/.R1.fq.gz}"; done
for file in *1.clean.fq.R2.fq.gz; do mv "$file" "${file/.1.clean.fq.R2.fq.gz/.R2.fq.gz}"; done
cd -
```
#Align with S.japoinica genome
#https://ftp.ncbi.nlm.nih.gov/genomes/genbank/protozoa/Saccharina_japonica/latest_assembly_versions/GCA_008828725.1_ASM882872v1/GCA_008828725.1_ASM882872v1_genomic.fna.gz
#GenBank assembly accession: GCA_008828725.1
#align with reference genome
```{r}
R1s=`ls 0.1.fastq/0.2.decontam/0.1.rrna/*.R1.fq.gz | python -c 'import sys; print(" ".join([x.strip() for x in sys.stdin.readlines()]))'`
R2s=`ls 0.1.fastq/0.2.decontam/0.1.rrna/*.R2.fq.gz | python -c 'import sys; print(" ".join([x.strip() for x in sys.stdin.readlines()]))'`
#check files
echo $R1s
echo $R2s
#align with reference genome GCA_008828725.1
mkdir -p 0.1.fastq/0.2.decontam/0.3.align.to.ref
python ~/0.scripts/align.to.reference.py \
--R1 $R1s \
--R2 $R2s \
--reference GCA_008828725.1_ASM882872v1.fixed.fasta \
--output 0.1.fastq/0.2.decontam/0.3.align.to.ref/ \
--threads 30 --verbose
#rename files if required
cd 0.1.fastq/0.2.decontam/0.3.align.to.ref/
for file in *.R1.fq.R1.fq.gz; do mv "$file" "${file/.R1.fq.R1.fq.gz/.R1.fq.gz}"; done
for file in *.R1.fq.R2.fq.gz; do mv "$file" "${file/.R1.fq.R2.fq.gz/.R2.fq.gz}"; done
cd -
```
#seq2fun
```{r}
mkdir -p 0.3.seq2fun/aligned.to.kelp/
cd 0.3.seq2fun/
~/soft/Seq2Fun/bin/seq2fun --sampletable sample.tsv \
--tfmi ~/soft/Seq2Fun/database/plants/plants_v2.0.fmi \
--genemap ~/soft/Seq2Fun/database/plants/plants_annotation_v2.0.txt \
-w 50 \
--profiling \
--dbDir ~/soft/Seq2Fun/database \
--verbose
cd -
/home/mcs/soft/Seq2Fun/database/algae/algae_v2.0.fmi
#with algae dtabase
cd 0.3.seq2fun/aligned.to.kelp/
~/soft/Seq2Fun/bin/seq2fun --sampletable sample.tsv \
--tfmi ~/soft/Seq2Fun/database/algae/algae_v2.0.fmi \
--genemap ~/soft/Seq2Fun/database/algae/algae_v2.0.txt \
-w 30 \
--profiling \
--dbDir ~/soft/Seq2Fun/database \
--verbose
cd -
#process in expressanalyst webserver
```