forked from darcyabjones/pclust
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pgenome.nf
executable file
·129 lines (102 loc) · 2.6 KB
/
pgenome.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env nextflow
/*
vim: syntax=groovy
-*- mode: groovy;-*-
*/
def helpMessage() {
log.info"""
=================================
pclust/pgenome
=================================
Usage:
abaaab
Mandatory Arguments:
--genomes
--gffs
--clusters
--proteindb description
Options:
--non-existant description
Outputs:
""".stripIndent()
}
if (params.help){
helpMessage()
exit 0
}
params.gffs = false
if ( params.gffs ) {
gffs = Channel
.fromPath( params.gffs )
.map { [it.baseName, it] }
}
params.genomes = Channel
.fromPath( params.genomes )
.map { [it.baseName, it] }
clusters = Channel.fromPath( params.clusters )
seq = Channel.fromPath( params.proteindb )
/*
* Construct genomes database
*/
process createGenomeSequenceDB {
label 'mmseqs'
publishDir "genomes"
tag { label }
input:
set val(label), file(fasta) from combinedGenomeFasta
output:
set val(label), file("${label}") into genomeSeq
"""
mkdir -p "${label}"
mmseqs createdb "${fasta}" "${label}/db" --dont-split-seq-by-len
"""
}
/*
* Search for the clusters in the original genome sequences.
*/
process searchGenomes {
label 'mmseqs'
publishDir "genomes"
tag { label }
input:
set val(label), file("genome") from genomeSeq
file "clusters" from clusters
file "seqs" from seq
output:
set val(label), file("${label}.tsv") into searchResults
"""
mkdir -p tmp
# Create profiles for each cluster.
mmseqs result2profile \
seqs/db \
seqs/db \
clusters/db \
profile \
--threads ${task.cpus}
# Search profile vs genome
# Search parameters are slightly more conservative than default.
mmseqs search \
profile \
genome/db \
result \
tmp \
--threads ${task.cpus} \
--realign \
--gap-open 15 \
--gap-extend 2 \
--cov-mode 2 \
--rescore-mode 2 \
--min-length 20 \
--orf-start-mode 1 \
--use-all-table-starts true
# Extract matches from results
mmseqs convertalis \
profile \
genome/db \
result "${label}.tsv" \
--threads ${task.cpus} \
--format-mode 0 \
--format-output "query target evalue qcov tcov gapopen pident nident mismatch raw bits qstart qend tstart tend qlen tlen alnlen cigar qframe tframe"
sed -i '1i query\ttarget\tevalue\tqcov\ttcov\tgapopen\tpident\tnident\tmismatch\traw\tbits\tqstart\tqend\ttstart\ttend\tqlen\ttlen\talnlen\tcigar\tqframe\ttframe' profile_matches.tsv
"""
}