Merge pull request #11 from johnne/devel

Devel
biodiversitydata-se · Feb 8, 2022 · fe08f80 · fe08f80
2 parents ca28f41 + cbce2e3
commit fe08f80
Show file tree

Hide file tree

Showing 20 changed files with 494 additions and 39,589 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include src/coidb/ Snakefile config*.yaml
diff --git a/environment.yml b/environment.yml
@@ -10,4 +10,5 @@ dependencies:
   - tqdm
   - pandas
   - snakemake
+  - seqkit
   - importlib_resources
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+requires = [
+    "setuptools>=42",
+    "wheel"
+]
+build-backend = "setuptools.build_meta"
diff --git a/requirements.txt b/requirements.txt
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,35 @@
+[metadata]
+name = coidb
+version = 0.4.0
+author = John Sundh
+author_email = [email protected]
+description = Workflow for downloading and formatting COI database
+long_description = file: README.md
+long_description_content_type = text/markdown
+url = https://github.com/johnne/coidb
+project_urls =
+    Bug Tracker = https://github.com/johnne/coidb/issues
+classifiers =
+    Programming Language :: Python :: 3
+    License :: OSI Approved :: MIT License
+    Operating System :: OS Independent
+
+[options]
+package_dir =
+    = src
+packages = find:
+python_requires = >=3.6
+include_package_data = True
+install_requires =
+    snakemake
+    biopython
+    tqdm
+    pandas
+    importlib_resources
+
+[options.packages.find]
+where = src
+
+[options.entry_points]
+console_scripts =
+    coidb = coidb.__main__:main
diff --git a/setup.py b/setup.py
diff --git a/src/coidb/Snakefile b/src/coidb/Snakefile
@@ -7,68 +7,113 @@ singularity: "docker://continuumio/miniconda3:4.9.2"
 # Validate config
 validate(config, "config.schema.yaml")
 
+nrows = None
+if config["testing"]["nrows"] > 0:
+    nrows = config["testing"]["nrows"]
+
 localrules: coidb, download, filter, clean, format
 
+wildcard_constraints:
+    textfile = "occurrences.txt|dna.txt|Taxon.tsv",
+    zipfile = "bold.zip|backbone.zip"
+
+textfile_dict = {'Taxon.tsv': 'backbone.zip',
+                 'occurrences.txt': 'bold.zip',
+                 'dna.txt': 'bold.zip'}
+
 rule coidb:
     input: expand("bold_clustered.{w}.fasta", w=["assignTaxonomy", "addSpecies"])
 
-rule download:
+rule download_zipfile:
     """
     Download zipfile with database sequences + info
     """
     output:
-        "bold.zip",
-        "bold_bins.zip"
+        "{zipfile}"
     log:
-        "logs/download.log"
+        "logs/download.{zipfile}.log"
     params:
-        url=config["database"]["url"],
-        tax_url=config["database"]["tax_url"]
+        url = lambda wildcards: config["database"][wildcards.zipfile]
     shell:
         """
-        curl -L -o $TMPDIR/bold.zip {params.url} > {log} 2>&1
-        curl -L -o $TMPDIR/bold_bins.zip {params.tax_url} >> {log} 2>&1
-        mv $TMPDIR/bold.zip {output[0]}
-        mv $TMPDIR/bold_bins.zip {output[1]}
+        curl -L -o $TMPDIR/{wildcards.zipfile} {params.url} > {log} 2>&1
+        mv $TMPDIR/{wildcards.zipfile} {output[0]}
         """
 
-rule extract:
+rule download:
     input:
-        "bold.zip",
-        "bold_bins.zip"
+        textfile_dict.values()
+
+rule extract_zipfile:
+    input:
+        lambda wildcards: textfile_dict[wildcards.textfile]
     output:
-        "bold_info.tsv",
-        "bold_seqs.txt",
-        "bold_taxa.txt"
+        "{textfile}"
     log:
-        "logs/extract.log"
+        "logs/extract.{textfile}.log"
     shell:
         """
-        unzip -o -d $TMPDIR/ {input[0]} occurrences.txt dna.txt >> {log} 2>&1
-        unzip -o -d $TMPDIR/ {input[1]} taxon.txt >> {log} 2>&1
-        mv $TMPDIR/occurrences.txt {output[0]}
-        mv $TMPDIR/dna.txt {output[1]}
-        mv $TMPDIR/taxon.txt {output[2]}  
+        f=$(unzip -l {input[0]} | grep -w {output[0]} | rev | cut -f1 -d ' ' | rev)
+        unzip -o -d $TMPDIR/ {input[0]} $f >> {log} 2>&1
+        mv $TMPDIR/$f {output[0]}
         """
 
-rule filter:
+
+rule extract:
+    input:
+        textfile_dict.keys()
+
+rule filter_data:
     """
-    Extract record ids based on taxonomy
+    Filter the BOLD data to genes and taxa of interest
+    
+    This also keeps only records with BOLD: ids
     """
     input:
-        "bold_info.tsv",
-        "bold_seqs.txt",
-        "bold_taxa.txt"
+        "occurrences.txt",
+        "dna.txt",
+        "Taxon.tsv"
     output:
         info = "bold_info_filtered.tsv",
-        fasta = "bold_filtered.fasta",
+        fasta = "bold.fasta",
     params:
         genes = config["database"]["gene"],
-        phyla = config["database"]["phyla"],
-        tmpf = "$TMPDIR/bold_filtered.fasta"
+        filter_taxa = config["database"]["taxa"],
+        filter_rank = config["database"]["rank"],
+        ranks = config["database"]["ranks"],
+        tmpf = "$TMPDIR/bold_filtered.fasta",
+        nrows = nrows
     script:
         "scripts/common.py"
 
+rule remove_non_standard:
+    input:
+        "bold.fasta"
+    output:
+        "bold_filtered.fasta"
+    log:
+        "logs/remove_non_standard.log"
+    params:
+        tmpfile = "$TMPDIR/bold_seqkit_cleaned.fasta",
+        ids = "$TMPDIR/bold_non_standard_ids.txt",
+        fastafile = "$TMPDIR/bold_filtered.fasta"
+    shell:
+        """
+        exec &> {log} 
+        # Remove gap characters, then remove leading and trailing 'N'
+        seqkit seq -g {input} | seqkit replace -s -r "" -p "N+$" | seqkit replace -s -r "" -p "^N+" > {params.tmpfile}
+        # Now remove ids still containing non standard DNA chars
+        seqkit grep -s -r -p "[^ACGTacgt]+" {params.tmpfile} | seqkit seq -i | grep ">" | sed 's/>//g' > {params.ids}
+        seqkit grep -v -f {params.ids} {params.tmpfile} > {params.fastafile}
+        mv {params.fastafile} {output[0]}
+        seqkit stats {input[0]} {params.tmpfile} {output[0]}
+        """
+
+rule filter:
+    input:
+        "bold_info_filtered.tsv",
+        "bold_filtered.fasta"
+
 rule cluster:
     """
     Cluster the filtered fasta file using vsearch
@@ -114,5 +159,7 @@ rule format:
     output:
         assignTaxaFasta = "bold_clustered.assignTaxonomy.fasta",
         addSpeciesFasta = "bold_clustered.addSpecies.fasta"
+    params:
+        ranks = config["database"]["ranks"]
     script:
         "scripts/common.py"
diff --git a/src/coidb/config.schema.yaml b/src/coidb/config.schema.yaml
@@ -4,47 +4,86 @@ description: schema for config parameters
 type: object
 
 properties:
+  testing:
+    type: object
+    default:
+      nrows: 0
+    nrows:
+      type: integer
+      description: For testing purposes, set number of rows to load with pandas
+      default: 0
   primers:
     type: object
+    default:
+      forward: ["CCHGAYATRGCHTTYCCHCG"]
+      reverse: ["CDGGRTGNCCRAARAAYCA"]
     forward:
       type: array
       description: Forward primer sequence(s)
+      default: ["CCHGAYATRGCHTTYCCHCG"]
       items:
         type: string
     reverse:
       type: array
       description: Reverse primer sequence(s)
+      default: ["CDGGRTGNCCRAARAAYCA"]
       items:
         type: string
 
   database:
     type: object
     default:
-      pid: 1.0
-      url: "https://hosted-datasets.gbif.org/ibol/ibol.zip"
-      tax_url: "https://hosted-datasets.gbif.org/ibol/ibol_bins_2021_02_08.zip"
+      pid: 0.99
+      bold.zip: "https://hosted-datasets.gbif.org/ibol/ibol.zip"
+      backbone.zip: "https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip"
       gene:
-        - "COI-5P"
-      phyla: []
+        - COI-5P
+      taxa: []
+      rank: "phylum"
+      ranks:
+        - kingdom
+        - phylum
+        - class
+        - order
+        - family
+        - genus
+        - species
     properties:
       pid:
         type: number
         description: Id threshold for clustering database with vsearch
+        default: 0.99
         max: 1.0
         min: 0.0
-      url:
+      bold.zip:
         type: string
+        default: "https://hosted-datasets.gbif.org/ibol/ibol.zip"
         description: URL to zipfile with database sequences + info
-      tax_url:
+      backbone.zip:
         type: string
-        description: URL to zipfile with taxonomic information for BINS
+        default: "https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip"
+        description: URL to GBIF taxonomy backbone
       gene:
         type: array
         description: Genes of interest to extract from database
+        default:
+          - "COI-5P"
         items:
           type: string
-      phyla:
+      taxa:
+        type: array
+        description: Taxa of interest to extract from database
+        default: []
+        items:
+          type: string
+      rank:
+        type: string
+        description: Rank at which to filter using the taxa above
+        default: "phylum"
+      ranks:
         type: array
-        description: Phyla of interest to extract from database
+        description: Ranks to include in taxonomic info
+        default: ["kingdom", "phylum", "class", "order", "family", "genus", "species"]
         items:
           type: string
+          enum: ["kingdom", "phylum", "class", "order", "family", "genus", "species"]
diff --git a/src/coidb/config.yaml b/src/coidb/config.yaml
@@ -5,13 +5,24 @@ primers:
         - "CDGGRTGNCCRAARAAYCA"
 database:
     # Percent identity to cluster seqs in the database by
-    pid: 1.0
+    pid: 0.99
     # url to download info and sequence files from
-    url: "https://hosted-datasets.gbif.org/ibol/ibol.zip"
+    bold.zip: "https://hosted-datasets.gbif.org/ibol/ibol.zip"
     # url to download zip file with 'taxon.txt' file
-    tax_url: "https://hosted-datasets.gbif.org/ibol/ibol_bins_2021_02_08.zip"
+    bold_bins.zip: "https://hosted-datasets.gbif.org/ibol/ibol_bins_2021_02_08.zip"
+    backbone.zip: "https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip"
     # gene of interest (will be used to filter sequences)
     gene:
-        - "COI-5P"
-    # phyla of interest (omit this in order to include all phyla)
-    phyla: []
+        - COI-5P
+    # taxa of interest (omit this in order to include all taxa)
+    taxa: []
+    # rank at which to filter
+    rank: "phylum"
+    ranks:
+        - kingdom
+        - phylum
+        - class
+        - order
+        - family
+        - genus
+        - species