Skip to content

Commit

Permalink
Merge pull request #11 from johnne/devel
Browse files Browse the repository at this point in the history
Devel
  • Loading branch information
johnne authored Feb 8, 2022
2 parents ca28f41 + cbce2e3 commit fe08f80
Show file tree
Hide file tree
Showing 20 changed files with 494 additions and 39,589 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
recursive-include src/coidb/ Snakefile config*.yaml
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ dependencies:
- tqdm
- pandas
- snakemake
- seqkit
- importlib_resources
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[build-system]
requires = [
"setuptools>=42",
"wheel"
]
build-backend = "setuptools.build_meta"
9 changes: 0 additions & 9 deletions requirements.txt

This file was deleted.

35 changes: 35 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
[metadata]
name = coidb
version = 0.4.0
author = John Sundh
author_email = [email protected]
description = Workflow for downloading and formatting COI database
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/johnne/coidb
project_urls =
Bug Tracker = https://github.com/johnne/coidb/issues
classifiers =
Programming Language :: Python :: 3
License :: OSI Approved :: MIT License
Operating System :: OS Independent

[options]
package_dir =
= src
packages = find:
python_requires = >=3.6
include_package_data = True
install_requires =
snakemake
biopython
tqdm
pandas
importlib_resources

[options.packages.find]
where = src

[options.entry_points]
console_scripts =
coidb = coidb.__main__:main
47 changes: 0 additions & 47 deletions setup.py

This file was deleted.

107 changes: 77 additions & 30 deletions src/coidb/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,68 +7,113 @@ singularity: "docker://continuumio/miniconda3:4.9.2"
# Validate config
validate(config, "config.schema.yaml")

nrows = None
if config["testing"]["nrows"] > 0:
nrows = config["testing"]["nrows"]

localrules: coidb, download, filter, clean, format

wildcard_constraints:
textfile = "occurrences.txt|dna.txt|Taxon.tsv",
zipfile = "bold.zip|backbone.zip"

textfile_dict = {'Taxon.tsv': 'backbone.zip',
'occurrences.txt': 'bold.zip',
'dna.txt': 'bold.zip'}

rule coidb:
input: expand("bold_clustered.{w}.fasta", w=["assignTaxonomy", "addSpecies"])

rule download:
rule download_zipfile:
"""
Download zipfile with database sequences + info
"""
output:
"bold.zip",
"bold_bins.zip"
"{zipfile}"
log:
"logs/download.log"
"logs/download.{zipfile}.log"
params:
url=config["database"]["url"],
tax_url=config["database"]["tax_url"]
url = lambda wildcards: config["database"][wildcards.zipfile]
shell:
"""
curl -L -o $TMPDIR/bold.zip {params.url} > {log} 2>&1
curl -L -o $TMPDIR/bold_bins.zip {params.tax_url} >> {log} 2>&1
mv $TMPDIR/bold.zip {output[0]}
mv $TMPDIR/bold_bins.zip {output[1]}
curl -L -o $TMPDIR/{wildcards.zipfile} {params.url} > {log} 2>&1
mv $TMPDIR/{wildcards.zipfile} {output[0]}
"""

rule extract:
rule download:
input:
"bold.zip",
"bold_bins.zip"
textfile_dict.values()

rule extract_zipfile:
input:
lambda wildcards: textfile_dict[wildcards.textfile]
output:
"bold_info.tsv",
"bold_seqs.txt",
"bold_taxa.txt"
"{textfile}"
log:
"logs/extract.log"
"logs/extract.{textfile}.log"
shell:
"""
unzip -o -d $TMPDIR/ {input[0]} occurrences.txt dna.txt >> {log} 2>&1
unzip -o -d $TMPDIR/ {input[1]} taxon.txt >> {log} 2>&1
mv $TMPDIR/occurrences.txt {output[0]}
mv $TMPDIR/dna.txt {output[1]}
mv $TMPDIR/taxon.txt {output[2]}
f=$(unzip -l {input[0]} | grep -w {output[0]} | rev | cut -f1 -d ' ' | rev)
unzip -o -d $TMPDIR/ {input[0]} $f >> {log} 2>&1
mv $TMPDIR/$f {output[0]}
"""

rule filter:

rule extract:
input:
textfile_dict.keys()

rule filter_data:
"""
Extract record ids based on taxonomy
Filter the BOLD data to genes and taxa of interest
This also keeps only records with BOLD: ids
"""
input:
"bold_info.tsv",
"bold_seqs.txt",
"bold_taxa.txt"
"occurrences.txt",
"dna.txt",
"Taxon.tsv"
output:
info = "bold_info_filtered.tsv",
fasta = "bold_filtered.fasta",
fasta = "bold.fasta",
params:
genes = config["database"]["gene"],
phyla = config["database"]["phyla"],
tmpf = "$TMPDIR/bold_filtered.fasta"
filter_taxa = config["database"]["taxa"],
filter_rank = config["database"]["rank"],
ranks = config["database"]["ranks"],
tmpf = "$TMPDIR/bold_filtered.fasta",
nrows = nrows
script:
"scripts/common.py"

rule remove_non_standard:
input:
"bold.fasta"
output:
"bold_filtered.fasta"
log:
"logs/remove_non_standard.log"
params:
tmpfile = "$TMPDIR/bold_seqkit_cleaned.fasta",
ids = "$TMPDIR/bold_non_standard_ids.txt",
fastafile = "$TMPDIR/bold_filtered.fasta"
shell:
"""
exec &> {log}
# Remove gap characters, then remove leading and trailing 'N'
seqkit seq -g {input} | seqkit replace -s -r "" -p "N+$" | seqkit replace -s -r "" -p "^N+" > {params.tmpfile}
# Now remove records whose sequences still contain non-standard DNA characters
seqkit grep -s -r -p "[^ACGTacgt]+" {params.tmpfile} | seqkit seq -i | grep ">" | sed 's/>//g' > {params.ids}
seqkit grep -v -f {params.ids} {params.tmpfile} > {params.fastafile}
mv {params.fastafile} {output[0]}
seqkit stats {input[0]} {params.tmpfile} {output[0]}
"""

rule filter:
input:
"bold_info_filtered.tsv",
"bold_filtered.fasta"

rule cluster:
"""
Cluster the filtered fasta file using vsearch
Expand Down Expand Up @@ -114,5 +159,7 @@ rule format:
output:
assignTaxaFasta = "bold_clustered.assignTaxonomy.fasta",
addSpeciesFasta = "bold_clustered.addSpecies.fasta"
params:
ranks = config["database"]["ranks"]
script:
"scripts/common.py"
59 changes: 49 additions & 10 deletions src/coidb/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,47 +4,86 @@ description: schema for config parameters
type: object

properties:
testing:
type: object
default:
nrows: 0
nrows:
type: integer
description: For testing purposes, set number of rows to load with pandas
default: 0
primers:
type: object
default:
forward: ["CCHGAYATRGCHTTYCCHCG"]
reverse: ["CDGGRTGNCCRAARAAYCA"]
forward:
type: array
description: Forward primer sequence(s)
default: ["CCHGAYATRGCHTTYCCHCG"]
items:
type: string
reverse:
type: array
description: Reverse primer sequence(s)
default: ["CDGGRTGNCCRAARAAYCA"]
items:
type: string

database:
type: object
default:
pid: 1.0
url: "https://hosted-datasets.gbif.org/ibol/ibol.zip"
tax_url: "https://hosted-datasets.gbif.org/ibol/ibol_bins_2021_02_08.zip"
pid: 0.99
bold.zip: "https://hosted-datasets.gbif.org/ibol/ibol.zip"
backbone.zip: "https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip"
gene:
- "COI-5P"
phyla: []
- COI-5P
taxa: []
rank: "phylum"
ranks:
- kingdom
- phylum
- class
- order
- family
- genus
- species
properties:
pid:
type: number
description: Id threshold for clustering database with vsearch
default: 0.99
maximum: 1.0
minimum: 0.0
url:
bold.zip:
type: string
default: "https://hosted-datasets.gbif.org/ibol/ibol.zip"
description: URL to zipfile with database sequences + info
tax_url:
backbone.zip:
type: string
description: URL to zipfile with taxonomic information for BINS
default: "https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip"
description: URL to GBIF taxonomy backbone
gene:
type: array
description: Genes of interest to extract from database
default:
- "COI-5P"
items:
type: string
phyla:
taxa:
type: array
description: Taxa of interest to extract from database
default: []
items:
type: string
rank:
type: string
description: Rank at which to filter using the taxa above
default: "phylum"
ranks:
type: array
description: Phyla of interest to extract from database
description: Ranks to include in taxonomic info
default: ["kingdom", "phylum", "class", "order", "family", "genus", "species"]
items:
type: string
enum: ["kingdom", "phylum", "class", "order", "family", "genus", "species"]
23 changes: 17 additions & 6 deletions src/coidb/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,24 @@ primers:
- "CDGGRTGNCCRAARAAYCA"
database:
# Percent identity to cluster seqs in the database by
pid: 1.0
pid: 0.99
# url to download info and sequence files from
url: "https://hosted-datasets.gbif.org/ibol/ibol.zip"
bold.zip: "https://hosted-datasets.gbif.org/ibol/ibol.zip"
# url to download zip file with 'taxon.txt' file
tax_url: "https://hosted-datasets.gbif.org/ibol/ibol_bins_2021_02_08.zip"
bold_bins.zip: "https://hosted-datasets.gbif.org/ibol/ibol_bins_2021_02_08.zip"
backbone.zip: "https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip"
# gene of interest (will be used to filter sequences)
gene:
- "COI-5P"
# phyla of interest (omit this in order to include all phyla)
phyla: []
- COI-5P
# taxa of interest (omit this in order to include all taxa)
taxa: []
# rank at which to filter
rank: "phylum"
ranks:
- kingdom
- phylum
- class
- order
- family
- genus
- species
Loading

0 comments on commit fe08f80

Please sign in to comment.