From 57a9ca85064c9e321490339f3cc68650b3ec49ae Mon Sep 17 00:00:00 2001
From: Victor Lin <13424970+victorlin@users.noreply.github.com>
Date: Mon, 26 Aug 2024 22:29:06 -0700
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20merge=20sequences?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 augur/merge.py                                | 52 ++++++++++++++++++-
 tests/functional/merge/cram/merge-sequences.t | 38 ++++++++++++++
 2 files changed, 88 insertions(+), 2 deletions(-)
 create mode 100644 tests/functional/merge/cram/merge-sequences.t

diff --git a/augur/merge.py b/augur/merge.py
index bf98968d8..1ce3a9495 100644
--- a/augur/merge.py
+++ b/augur/merge.py
@@ -81,21 +81,30 @@ def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("merge", help=first_line(__doc__))
 
     input_group = parser.add_argument_group("inputs", "options related to input")
-    input_group.add_argument("--metadata", nargs="+", action="extend", required=True, metavar="NAME=FILE", help="Required. Metadata table names and file paths. Names are arbitrary monikers used solely for referring to the associated input file in other arguments and in output column names. Paths must be to seekable files, not unseekable streams. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
+    # FIXME: allow metadata and/or sequences. require at least one.
+    input_group.add_argument("--metadata", nargs="+", action="extend", required=False, metavar="NAME=FILE", help="Required. Metadata table names and file paths. Names are arbitrary monikers used solely for referring to the associated input file in other arguments and in output column names. Paths must be to seekable files, not unseekable streams. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
+    input_group.add_argument("--sequences", nargs="+", action="extend", required=False, metavar="FILE", help="Sequence files. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
 
     input_group.add_argument("--metadata-id-columns", default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, metavar="COLUMN", help=f"Possible metadata column names containing identifiers, considered in the order given. Columns will be considered for all metadata tables. Only one ID column will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_ID_COLUMNS))})" + SKIP_AUTO_DEFAULT_IN_HELP)
     input_group.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, metavar="CHARACTER", help=f"Possible field delimiters to use for reading metadata tables, considered in the order given. Delimiters will be considered for all metadata tables. Only one delimiter will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_DELIMITERS))})" + SKIP_AUTO_DEFAULT_IN_HELP)
 
     output_group = parser.add_argument_group("outputs", "options related to output")
-    output_group.add_argument('--output-metadata', required=True, metavar="FILE", help="Required. Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
+    # FIXME: allow metadata and/or sequences. require at least one.
+    # FIXME: ensure outputs are paired with respective inputs
+    output_group.add_argument('--output-metadata', required=False, metavar="FILE", help="Required. Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
+    output_group.add_argument('--output-sequences', required=False, metavar="FILE", help="Required. Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
     output_group.add_argument('--quiet', action="store_true", default=False, help="Suppress informational and warning messages normally written to stderr. (default: disabled)" + SKIP_AUTO_DEFAULT_IN_HELP)
 
     return parser
 
 
 def run(args):
+    # FIXME: when both metadata and sequences are specified, ensure they are
+    # paired properly (same sequence IDs)
     if args.metadata:
         merge_metadata(args)
+    if args.sequences:
+        merge_sequences(args)
 
 
 def merge_metadata(args):
@@ -248,6 +257,45 @@ def merge_metadata(args):
             print_info(f"WARNING: Skipped deletion of {db_path} due to error, but you may want to clean it up yourself (e.g. if it's large).")
 
 
+# FIXME: return a list of arguments and don't use shell
+def cat(filepath: str):
+    cat = "cat"
+
+    if filepath.endswith(".gz"):
+        cat = "gzcat"
+    if filepath.endswith(".xz"):
+        cat = "xzcat"
+    if filepath.endswith(".zst"):
+        cat = "zstdcat"
+
+    return f"{cat} {filepath}"
+
+
+def merge_sequences(args):
+    # Parse --sequences arguments
+    if not len(args.sequences) >= 2:
+        raise AugurError(f"At least two sequence inputs are required for merging.")
+
+    # FIXME: error if there are any duplicates within a single sequence file
+
+    # Reversed because seqkit rmdup keeps the first entry but this command
+    # should keep the last entry.
+    # FIXME: don't use shell. just using it to get a sense of feasibility.
+    cat_processes = (f"<({cat(filepath)})" for filepath in reversed(args.sequences))
+    shell_cmd = f"cat {' '.join(cat_processes)} | seqkit rmdup"
+    print_debug(F"running shell command {shell_cmd!r}")
+    process = subprocess.Popen(shell_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE)
+
+    # FIXME: handle `-` better
+    output = process.communicate()[0]
+    if args.output_sequences == "-":
+        sys.stdout.write(output.decode())
+    else:
+        with open(args.output, "w") as f:
+            f.write(output.decode())
+
+
+# FIXME: do this for seqkit too
 def sqlite3(*args, **kwargs):
     """
     Internal helper for invoking ``sqlite3``, the SQLite CLI program.
diff --git a/tests/functional/merge/cram/merge-sequences.t b/tests/functional/merge/cram/merge-sequences.t
new file mode 100644
index 000000000..a26b1030f
--- /dev/null
+++ b/tests/functional/merge/cram/merge-sequences.t
@@ -0,0 +1,38 @@
+SETUP
+
+  $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
+  $ export AUGUR_DEBUG=1
+
+Merge sequences without metadata
+
+  $ cat >x.fasta <<~~
+  > >seq1
+  > ATCG
+  > >seq2
+  > GCTA
+  > >seq3
+  > TCGA
+  > ~~
+
+  $ cat >y.fasta <<~~
+  > >seq3
+  > ATCG
+  > >seq4
+  > GCTA
+  > ~~
+
+  $ ${AUGUR} merge \
+  >   --sequences x.fasta y.fasta \
+  >   --output-sequences - > merged.fasta
+  running shell command 'cat <(cat y.fasta) <(cat x.fasta) | seqkit rmdup'
+  [INFO]\x1b[0m 1 duplicated records removed (esc)
+
+  $ cat merged.fasta
+  >seq3
+  ATCG
+  >seq4
+  GCTA
+  >seq1
+  ATCG
+  >seq2
+  GCTA