Merge pull request #6 from johnne/devel

Devel
biodiversitydata-se · Dec 7, 2021 · 956a865 · 956a865
2 parents 57af3f7 + 7e872d9
commit 956a865
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 14 deletions.
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="coidb",
-    version="0.2.4",
+    version="0.3.0",
     author="John Sundh",
     url="https://github.com/NBISweden/coidb/",
     description="Workflow for downloading and formatting COI database",

diff --git a/src/coidb/Snakefile b/src/coidb/Snakefile
@@ -17,24 +17,38 @@ rule download:
     Download zipfile with database sequences + info
     """
     output:
-        "bold_info.tsv",
-        "bold_seqs.txt",
-        "bold_taxa.txt"
+        "bold.zip",
+        "bold_bins.zip"
     log:
         "logs/download.log"
     params:
-        url = config["database"]["url"],
-        tax_url = config["database"]["tax_url"]
+        url=config["database"]["url"],
+        tax_url=config["database"]["tax_url"]
     shell:
         """
         curl -L -o $TMPDIR/bold.zip {params.url} > {log} 2>&1
-        unzip -o -d $TMPDIR/ $TMPDIR/bold.zip occurrences.txt sequence.txt >> {log} 2>&1
         curl -L -o $TMPDIR/bold_bins.zip {params.tax_url} >> {log} 2>&1
-        unzip -o -d $TMPDIR/ $TMPDIR/bold_bins.zip taxon.txt >> {log} 2>&1
+        mv $TMPDIR/bold.zip {output[0]}
+        mv $TMPDIR/bold_bins.zip {output[1]}
+        """
+
+rule extract:
+    input:
+        "bold.zip",
+        "bold_bins.zip"
+    output:
+        "bold_info.tsv",
+        "bold_seqs.txt",
+        "bold_taxa.txt"
+    log:
+        "logs/extract.log"
+    shell:
+        """
+        unzip -o -d $TMPDIR/ {input[0]} occurrences.txt dna.txt >> {log} 2>&1
+        unzip -o -d $TMPDIR/ {input[1]} taxon.txt >> {log} 2>&1
         mv $TMPDIR/occurrences.txt {output[0]}
-        mv $TMPDIR/sequence.txt {output[1]}
+        mv $TMPDIR/dna.txt {output[1]}
         mv $TMPDIR/taxon.txt {output[2]}  
-        rm $TMPDIR/bold.zip $TMPDIR/bold_bins.zip
         """
 
 rule filter:

diff --git a/src/coidb/scripts/common.py b/src/coidb/scripts/common.py
@@ -20,6 +20,9 @@ def write_seqs(seq_df, outfile, tmpfile):
     ranks = ["phylum", "class", "order", "family", "genus", "species", "bold_id"]
     old_index = []
     new_index = []
+    # Filter out sequences with non-DNA characters
+    seq_df, dropped_ids = filter_non_standard(seq_df)
+    sys.stderr.write(f"{dropped_ids} sequences dropped\n")
     # Sort sequences by BOLD IDs
     sys.stderr.write("Sorting sequences by BOLD IDs\n")
     seq_df = seq_df.sort_values("bold_id")
@@ -30,9 +33,6 @@ def write_seqs(seq_df, outfile, tmpfile):
                               desc=f"Writing sequences to temporary directory",
                               unit=" seqs"):
             seq = seq_df.loc[record_id, "seq"]
-            seq = seq.replace("-", "").strip("N")
-            if "N" in seq:
-                continue
             desc = ";".join([seq_df.loc[record_id, x] for x in ranks])
             fhout.write(f">{record_id} {desc}\n{seq}\n")
     sys.stderr.write(f"Moving {tmpfile} to {outfile}\n")
@@ -77,6 +77,26 @@ def extract_species_name(value):
     return sp_name
 
 
+def filter_non_standard(df):
+    """
+    Removes sequences with non-standard nucleotides
+
+    :param df: Dataframe with fasta sequences
+    :return: Dataframe with sequences with non-standard characters removed
+    """
+    drop_ids = []
+    for record_id in tqdm(df.index, unit=" records",
+                          desc="removing non-standard nucleotide seqs"):
+        seq = df.loc[record_id, "seq"]
+        seq = seq.replace("-", "").strip("N")
+        letters = set([x for x in seq])
+        for l in letters:
+            if l not in ["A", "C", "G", "T"]:
+                drop_ids.append(record_id)
+                break
+    return df.drop(drop_ids), len(drop_ids)
+
+
 def filter(sm):
     genes = sm.params.genes
     phyla = sm.params.phyla
@@ -99,7 +119,7 @@ def filter(sm):
     # Read fasta
     sys.stderr.write(f"Reading fasta file {sm.input[1]}\n")
     seqs = pd.read_csv(sm.input[1], header=None, sep="\t", index_col=0,
-                       names=["record_id", "gene", "seq"])
+                       names=["record_id", "gene", "seq"], usecols=[0,1,2])
     sys.stderr.write(f"{seqs.shape[0]} sequences read\n")
     if len(genes) > 0:
         # Filter fasta to gene(s)