Skip to content

Commit

Permalink
Merge pull request #6 from johnne/devel
Browse files Browse the repository at this point in the history
Devel
  • Loading branch information
johnne authored Dec 7, 2021
2 parents 57af3f7 + 7e872d9 commit 956a865
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 14 deletions.
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="coidb",
version="0.2.4",
version="0.3.0",
author="John Sundh",
url="https://github.com/NBISweden/coidb/",
description="Workflow for downloading and formatting COI database",
Expand Down
32 changes: 23 additions & 9 deletions src/coidb/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,38 @@ rule download:
Download zipfile with database sequences + info
"""
output:
"bold_info.tsv",
"bold_seqs.txt",
"bold_taxa.txt"
"bold.zip",
"bold_bins.zip"
log:
"logs/download.log"
params:
url = config["database"]["url"],
tax_url = config["database"]["tax_url"]
url=config["database"]["url"],
tax_url=config["database"]["tax_url"]
shell:
"""
curl -L -o $TMPDIR/bold.zip {params.url} > {log} 2>&1
unzip -o -d $TMPDIR/ $TMPDIR/bold.zip occurrences.txt sequence.txt >> {log} 2>&1
curl -L -o $TMPDIR/bold_bins.zip {params.tax_url} >> {log} 2>&1
unzip -o -d $TMPDIR/ $TMPDIR/bold_bins.zip taxon.txt >> {log} 2>&1
mv $TMPDIR/bold.zip {output[0]}
mv $TMPDIR/bold_bins.zip {output[1]}
"""

rule extract:
input:
"bold.zip",
"bold_bins.zip"
output:
"bold_info.tsv",
"bold_seqs.txt",
"bold_taxa.txt"
log:
"logs/extract.log"
shell:
"""
unzip -o -d $TMPDIR/ {input[0]} occurrences.txt dna.txt >> {log} 2>&1
unzip -o -d $TMPDIR/ {input[1]} taxon.txt >> {log} 2>&1
mv $TMPDIR/occurrences.txt {output[0]}
mv $TMPDIR/sequence.txt {output[1]}
mv $TMPDIR/dna.txt {output[1]}
mv $TMPDIR/taxon.txt {output[2]}
rm $TMPDIR/bold.zip $TMPDIR/bold_bins.zip
"""

rule filter:
Expand Down
28 changes: 24 additions & 4 deletions src/coidb/scripts/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ def write_seqs(seq_df, outfile, tmpfile):
ranks = ["phylum", "class", "order", "family", "genus", "species", "bold_id"]
old_index = []
new_index = []
# Filter out sequences with non-DNA characters
seq_df, dropped_ids = filter_non_standard(seq_df)
sys.stderr.write(f"{dropped_ids} sequences dropped\n")
# Sort sequences by BOLD IDs
sys.stderr.write("Sorting sequences by BOLD IDs\n")
seq_df = seq_df.sort_values("bold_id")
Expand All @@ -30,9 +33,6 @@ def write_seqs(seq_df, outfile, tmpfile):
desc=f"Writing sequences to temporary directory",
unit=" seqs"):
seq = seq_df.loc[record_id, "seq"]
seq = seq.replace("-", "").strip("N")
if "N" in seq:
continue
desc = ";".join([seq_df.loc[record_id, x] for x in ranks])
fhout.write(f">{record_id} {desc}\n{seq}\n")
sys.stderr.write(f"Moving {tmpfile} to {outfile}\n")
Expand Down Expand Up @@ -77,6 +77,26 @@ def extract_species_name(value):
return sp_name


def filter_non_standard(df):
"""
Removes sequences with non-standard nucleotides
:param df: Dataframe with fasta sequences
:return: Dataframe with sequences with non-standard characters removed
"""
drop_ids = []
for record_id in tqdm(df.index, unit=" records",
desc="removing non-standard nucleotide seqs"):
seq = df.loc[record_id, "seq"]
seq = seq.replace("-", "").strip("N")
letters = set([x for x in seq])
for l in letters:
if l not in ["A", "C", "G", "T"]:
drop_ids.append(record_id)
break
return df.drop(drop_ids), len(drop_ids)


def filter(sm):
genes = sm.params.genes
phyla = sm.params.phyla
Expand All @@ -99,7 +119,7 @@ def filter(sm):
# Read fasta
sys.stderr.write(f"Reading fasta file {sm.input[1]}\n")
seqs = pd.read_csv(sm.input[1], header=None, sep="\t", index_col=0,
names=["record_id", "gene", "seq"])
names=["record_id", "gene", "seq"], usecols=[0,1,2])
sys.stderr.write(f"{seqs.shape[0]} sequences read\n")
if len(genes) > 0:
# Filter fasta to gene(s)
Expand Down

0 comments on commit 956a865

Please sign in to comment.