From b387c79a78ed96b984f2b1af9bb7af248a1d07e3 Mon Sep 17 00:00:00 2001
From: John Huddleston <huddlej@gmail.com>
Date: Mon, 24 May 2021 13:48:42 -0700
Subject: [PATCH] Skip redundant combine and deduplicate script

Now that the sequence sanitizer script has to handle duplicates anyway,
we no longer need to pipe its output to a separate script that does the
same thing. This commit also pipes output from the sanitizer script to
xz to speed up compression.
---
 scripts/combine-and-dedup-fastas.py        | 76 ----------------------
 workflow/snakemake_rules/main_workflow.smk |  9 +--
 2 files changed, 3 insertions(+), 82 deletions(-)
 delete mode 100644 scripts/combine-and-dedup-fastas.py

diff --git a/scripts/combine-and-dedup-fastas.py b/scripts/combine-and-dedup-fastas.py
deleted file mode 100644
index 179618c28..000000000
--- a/scripts/combine-and-dedup-fastas.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import argparse
-from augur.io import open_file, read_sequences, write_sequences
-from Bio import SeqIO
-import hashlib
-import sys
-import textwrap
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description="Combine and dedup FASTAs",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument('--input', type=str,  nargs="+", metavar="FASTA", required=True, help="input FASTAs")
-    parser.add_argument('--warn-about-duplicates', action="store_true", help="warn the user about duplicate strains instead of exiting with an error. The output will include the first occurrence of a duplicate sequence.")
-    parser.add_argument('--output', type=str, metavar="FASTA", required=True, help="output FASTA")
-    args = parser.parse_args()
-
-    sequence_hash_by_name = {}
-    duplicate_strains = set()
-
-    counter = 0
-    with open_file(args.output, "w") as output_handle:
-        # Stream sequences from all input files into a single output file,
-        # skipping duplicate records (same strain and sequence) and noting
-        # mismatched sequences for the same strain name.  In order to
-        # prefer the latter files, we have to reverse the order of the
-        # files.
-        for record in read_sequences(*reversed(args.input)):
-            counter += 1
-            if counter % 10000 == 0:
-                print(f"Processed {counter} records")
-
-            # Hash each sequence and check whether another sequence with the
-            # same name already exists and if the hash is different.
-            sequence_hash = hashlib.sha256(str(record.seq).encode("utf-8")).hexdigest()
-            if record.name in sequence_hash_by_name:
-                # If the hashes differ (multiple entries with the same
-                # strain name but different sequences), we keep the first
-                # sequence and add the strain to a list of duplicates to
-                # report at the end.
-                if sequence_hash_by_name.get(record.name) != sequence_hash:
-                    duplicate_strains.add(record.name)
-
-                # If the current strain has been seen before, don't write
-                # out its sequence again.
-                continue
-
-            sequence_hash_by_name[record.name] = sequence_hash
-            write_sequences(record, output_handle)
-
-    if len(duplicate_strains) > 0:
-        error_mode = "ERROR"
-        exit_code = 1
-
-        if args.warn_about_duplicates:
-            error_mode = "WARNING"
-            exit_code = 0
-
-        print(
-            f"{error_mode}: Detected the following duplicate input strains with different sequences:",
-            file=sys.stderr
-        )
-        for strain in duplicate_strains:
-            print(textwrap.indent(strain, "    "), file=sys.stderr)
-
-        if not args.warn_about_duplicates:
-            print(
-                "Use the `--warn-about-duplicates` flag, if you prefer to accept",
-                "the first occurrence of duplicate sequences based on the order",
-                "of the given input sequences.",
-                file=sys.stderr
-            )
-
-        sys.exit(exit_code)
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 720f39b04..0b4a05ccc 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -349,19 +349,16 @@ rule combine_sequences_for_subsampling:
         "benchmarks/combine_sequences_for_subsampling.txt"
     conda: config["conda_environment"]
     params:
-        warn_about_duplicates="--warn-about-duplicates" if config.get("combine_sequences_for_subsampling", {}).get("warn_about_duplicates") else "",
+        error_on_duplicate_strains="--error-on-duplicate-strains" if not config.get("combine_sequences_for_subsampling", {}).get("warn_about_duplicates") else "",
         strain_prefixes=config["strip_strain_prefixes"],
     shell:
         """
         python3 scripts/sanitize_sequences.py \
                 --sequences {input} \
                 --strip-prefixes {params.strain_prefixes:q} \
+                {params.error_on_duplicate_strains} \
                 --output /dev/stdout \
-        | python3 scripts/combine-and-dedup-fastas.py \
-            --input /dev/stdin \
-            {params.warn_about_duplicates} \
-            --output /dev/stdout \
-        | xz -2 > {output}
+                | xz -c -2 > {output}
         """
 
 rule index_sequences: