From b387c79a78ed96b984f2b1af9bb7af248a1d07e3 Mon Sep 17 00:00:00 2001 From: John Huddleston Date: Mon, 24 May 2021 13:48:42 -0700 Subject: [PATCH] Skip redundant combine and deduplicate script Now that the sequence sanitizer script has to handle duplicates anyway, we no longer need to pipe its output to a separate script that does the same thing. This commit also pipes output from the sanitizer script to xz to speed up compression. --- scripts/combine-and-dedup-fastas.py | 76 ---------------------- workflow/snakemake_rules/main_workflow.smk | 9 +-- 2 files changed, 3 insertions(+), 82 deletions(-) delete mode 100644 scripts/combine-and-dedup-fastas.py diff --git a/scripts/combine-and-dedup-fastas.py b/scripts/combine-and-dedup-fastas.py deleted file mode 100644 index 179618c28..000000000 --- a/scripts/combine-and-dedup-fastas.py +++ /dev/null @@ -1,76 +0,0 @@ -import argparse -from augur.io import open_file, read_sequences, write_sequences -from Bio import SeqIO -import hashlib -import sys -import textwrap - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="Combine and dedup FASTAs", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - parser.add_argument('--input', type=str, nargs="+", metavar="FASTA", required=True, help="input FASTAs") - parser.add_argument('--warn-about-duplicates', action="store_true", help="warn the user about duplicate strains instead of exiting with an error. The output will include the first occurrence of a duplicate sequence.") - parser.add_argument('--output', type=str, metavar="FASTA", required=True, help="output FASTA") - args = parser.parse_args() - - sequence_hash_by_name = {} - duplicate_strains = set() - - counter = 0 - with open_file(args.output, "w") as output_handle: - # Stream sequences from all input files into a single output file, - # skipping duplicate records (same strain and sequence) and noting - # mismatched sequences for the same strain name. In order to - # prefer the latter files, we have to reverse the order of the - # files. - for record in read_sequences(*reversed(args.input)): - counter += 1 - if counter % 10000 == 0: - print(f"Processed {counter} records") - - # Hash each sequence and check whether another sequence with the - # same name already exists and if the hash is different. - sequence_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest() - if record.name in sequence_hash_by_name: - # If the hashes differ (multiple entries with the same - # strain name but different sequences), we keep the first - # sequence and add the strain to a list of duplicates to - # report at the end. - if sequence_hash_by_name.get(record.name) != sequence_hash: - duplicate_strains.add(record.name) - - # If the current strain has been seen before, don't write - # out its sequence again. - continue - - sequence_hash_by_name[record.name] = sequence_hash - write_sequences(record, output_handle) - - if len(duplicate_strains) > 0: - error_mode = "ERROR" - exit_code = 1 - - if args.warn_about_duplicates: - error_mode = "WARNING" - exit_code = 0 - - print( - f"{error_mode}: Detected the following duplicate input strains with different sequences:", - file=sys.stderr - ) - for strain in duplicate_strains: - print(textwrap.indent(strain, " "), file=sys.stderr) - - if not args.warn_about_duplicates: - print( - "Use the `--warn-about-duplicates` flag, if you prefer to accept", - "the first occurrence of duplicate sequences based on the order", - "of the given input sequences.", - file=sys.stderr - ) - - sys.exit(exit_code) diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 720f39b04..0b4a05ccc 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -349,19 +349,16 @@ rule combine_sequences_for_subsampling: "benchmarks/combine_sequences_for_subsampling.txt" conda: config["conda_environment"] params: - warn_about_duplicates="--warn-about-duplicates" if config.get("combine_sequences_for_subsampling", {}).get("warn_about_duplicates") else "", + error_on_duplicate_strains="--error-on-duplicate-strains" if not config.get("combine_sequences_for_subsampling", {}).get("warn_about_duplicates") else "", strain_prefixes=config["strip_strain_prefixes"], shell: """ python3 scripts/sanitize_sequences.py \ --sequences {input} \ --strip-prefixes {params.strain_prefixes:q} \ + {params.error_on_duplicate_strains} \ --output /dev/stdout \ - | python3 scripts/combine-and-dedup-fastas.py \ - --input /dev/stdin \ - {params.warn_about_duplicates} \ - --output /dev/stdout \ - | xz -2 > {output} + | xz -c -2 > {output} """ rule index_sequences: