From 868818de515cf06bed9807c114b192edb1accaff Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Tue, 16 Jul 2024 12:24:46 -0700 Subject: [PATCH] augur merge Support generalized merging of two or more metadata tables. A long desired command. Behaviour is based on much discussion with the team and bespoke implementations like ncov's combine_metadata.py. Implementation requirements include handling inputs of arbitrary size (i.e. without needing to read any dataset fully into memory) and handling more than two inputs. --- CHANGES.md | 2 + augur/__init__.py | 1 + augur/merge.py | 296 ++++++++++++++++++++++++++++ docs/api/developer/augur.merge.rst | 7 + docs/api/developer/augur.rst | 1 + docs/usage/cli/cli.rst | 1 + docs/usage/cli/merge.rst | 9 + tests/functional/merge/cram/merge.t | 182 +++++++++++++++++ 8 files changed, 499 insertions(+) create mode 100644 augur/merge.py create mode 100644 docs/api/developer/augur.merge.rst create mode 100644 docs/usage/cli/merge.rst create mode 100644 tests/functional/merge/cram/merge.t diff --git a/CHANGES.md b/CHANGES.md index c16ff138d..c4a23b2d5 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,7 @@ ### Features +* A new command, `augur merge`, now allows for generalized merging of two or more metadata tables. [#1563][] (@tsibley) * Two new commands, `augur read-file` and `augur write-file`, now allow external programs to do i/o like Augur by piping from/to these new commands. They provide handling of compression formats and newlines consistent with the rest of Augur. [#1562][] (@tsibley) ### Bug Fixes @@ -12,6 +13,7 @@ [#1561]: https://github.com/nextstrain/augur/pull/1561 [#1562]: https://github.com/nextstrain/augur/pull/1562 +[#1563]: https://github.com/nextstrain/augur/pull/1563 diff --git a/augur/__init__.py b/augur/__init__.py index 7b4f2066b..a5c9535c2 100644 --- a/augur/__init__.py +++ b/augur/__init__.py @@ -21,6 +21,7 @@ command_strings = [ "parse", "curate", + "merge", "index", "filter", "mask", diff --git a/augur/merge.py b/augur/merge.py new file mode 100644 index 000000000..554cd99c0 --- /dev/null +++ b/augur/merge.py @@ -0,0 +1,296 @@ +""" +Merge two or more metadata tables into one. + +Tables must be given unique names to identify them in the output and are +merged in the order given. + +Rows are joined by id (e.g. "strain" or "name" or other +--metadata-id-columns), and ids must be unique within an input table (i.e. +tables cannot contain duplicate ids). All rows are output, even if they +appear in only a single table. + +Columns are combined by name, either extending the combined table with a new +column or overwriting values in an existing column. For columns appearing in +more than one table, non-empty values on the right hand side overwrite values +on the left hand side. The first table's id column name is used as the output +id column name. + +One generated column per input table is appended to the end of the output +table to identify the source of each row's data. Column names are generated +as "__source_metadata_{NAME}" where "{NAME}" is the table name given to +--metadata. Values in each column are 1 or 0 for present or absent in that +input table. + +Metadata tables of arbitrary size can be handled, limited only by available +disk space. Tables are never loaded entirely into memory. The transient disk +space required is approximately the sum of the uncompressed size of the inputs. 
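
The column-combining rule above boils down to "rightmost non-empty value wins"
for each cell. A minimal Python sketch of just that rule, checked against the
values used in the cram tests further down (illustrative only; the command
itself performs the merge in SQL, not row by row in Python):

    def merged_value(values_left_to_right):
        """Return the last non-empty value, or the empty string if there is none."""
        result = ""
        for value in values_left_to_right:
            if value not in (None, ""):
                result = value
        return result

    # Mirrors the BASIC USAGE test below: Y's non-empty "c" value overwrites
    # X's, while Y's empty "b" value leaves X's value in place.
    assert merged_value(["X2c", "Y2c"]) == "Y2c"
    assert merged_value(["X2b", ""]) == "X2b"
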
+""" +import os +import subprocess +import sys +from functools import reduce +from itertools import starmap +from shlex import quote as shquote +from shutil import which +from tempfile import mkstemp +from textwrap import dedent +from typing import Iterable, Tuple, TypeVar + +from augur.argparse_ import ExtendOverwriteDefault +from augur.errors import AugurError +from augur.io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, Metadata +from augur.io.print import print_err +from augur.utils import first_line + + +T = TypeVar('T') + + +class NamedMetadata(Metadata): + name: str + """User-provided descriptive name for this metadata file.""" + + table_name: str + """Generated SQLite table name for this metadata file, based on *name*.""" + + def __init__(self, name: str, *args, **kwargs): + super().__init__(*args, **kwargs) + self.name = name + self.table_name = f"metadata_{self.name}" + + def __repr__(self): + return f"" + + +def register_parser(parent_subparsers): + parser = parent_subparsers.add_parser("merge", help=first_line(__doc__)) + + input_group = parser.add_argument_group("inputs", "options related to input") + input_group.add_argument("--metadata", nargs="+", action="extend", required=True, metavar="NAME=FILE", help="metadata files with assigned names") + + input_group.add_argument("--metadata-id-columns", default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, metavar="COLUMN", help="names of possible metadata columns containing identifier information, ordered by priority. Only one ID column will be inferred.") + input_group.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, metavar="CHARACTER", help="delimiters to accept when reading a metadata file. Only one delimiter will be inferred.") + + output_group = parser.add_argument_group("outputs", "options related to output") + output_group.add_argument('--output-metadata', required=True, metavar="FILE", help="merged metadata as TSV") + output_group.add_argument('--quiet', action="store_true", default=False, help="suppress informational messages on stderr") + + return parser + + +def run(args): + print_info = print_err if not args.quiet else lambda *_: None + + # Parse --metadata arguments + if not len(args.metadata) >= 2: + raise AugurError(f"At least two metadata inputs are required for merging.") + + if unnamed := [repr(x) for x in args.metadata if "=" not in x or x.startswith("=")]: + raise AugurError(dedent(f"""\ + All metadata inputs must be assigned a name, e.g. with NAME=FILE. + + The following inputs were missing a name: + + {indented_list(unnamed, ' ' + ' ')} + """)) + + metadata = [name_path.split("=", 1) for name_path in args.metadata] + + if duplicate_names := [repr(name) for name, count + in count_unique(name for name, _ in metadata) + if count > 1]: + raise AugurError(dedent(f"""\ + Metadata input names must be unique. + + The following names were used more than once: + + {indented_list(duplicate_names, ' ' + ' ')} + """)) + + + # Infer delimiters and id columns + metadata = [ + NamedMetadata(name, path, args.metadata_delimiters, args.metadata_id_columns) + for name, path in metadata] + + + # Locate how to re-invoke ourselves (_this_ specific Augur). + if sys.executable: + augur = f"{shquote(sys.executable)} -m augur" + else: + # A bit unusual we don't know our own Python executable, but assume we + # can access ourselves as the ``augur`` command. 
+ augur = f"augur" + + + # Work with a temporary, on-disk SQLite database under a name we control so + # we can access it from multiple (serial) processes. + db_fd, db_path = mkstemp(prefix="augur-merge-", suffix=".sqlite") + os.close(db_fd) + + # Clean up database file by default + delete_db = True + + # Track columns as we see them, in order. The first metadata's id column + # is always the first output column of the merge, so insert it now. + output_id_column = metadata[0].id_column + output_columns = { output_id_column: [] } + + try: + # Read all metadata files into a SQLite db + for m in metadata: + # All other metadata reading in Augur (i.e. via the csv module) + # uses Python's "universal newlines"¹ definition and accepts \n, + # \r\n, and \r as newlines interchangably (even mixed within the + # same file!). We accomplish the same behaviour here with SQLite's + # less flexible newline handling by relying on the universal + # newline translation of `augur read-file`. + # -trs, 24 July 2024 + # + # ¹ + newline = os.linesep + + print_info(f"Reading {m.name!r} metadata from {m.path!r}…") + sqlite3(db_path, + f'.mode csv', + f'.separator {sqlite_quote_dot(m.delimiter)} {sqlite_quote_dot(newline)}', + f'.import {sqlite_quote_dot(f"|{augur} read-file {shquote(m.path)}")} {sqlite_quote_dot(m.table_name)}', + + f'create unique index {sqlite_quote_id(f"{m.table_name}_id")} on {sqlite_quote_id(m.table_name)}({sqlite_quote_id(m.id_column)});', + + # + f'pragma optimize;') + + # We're going to use Metadata.columns to generate the select + # statement, so ensure it matches what SQLite's .import created. + assert m.columns == (table_columns := sqlite3_table_columns(db_path, m.table_name)), \ + f"{m.columns!r} == {table_columns!r}" + + # Track which columns appear in which metadata inputs, preserving + # the order of both. + for column in m.columns: + # Match different id column names in different metadata files + # since they're logically equivalent. + output_column = output_id_column if column == m.id_column else column + + output_columns.setdefault(output_column, []) + output_columns[output_column] += [(m.table_name, column)] + + + # Construct query to produce merged metadata. + select_list = [ + # Output metadata columns coalesced across input metadata columns + *(f"""coalesce({', '.join(f"nullif({x}, '')" for x in starmap(sqlite_quote_id, reversed(input_columns)))}, null) as {sqlite_quote_id(output_column)}""" + for output_column, input_columns in output_columns.items()), + + # Source columns + *(f"""{sqlite_quote_id(m.table_name, m.id_column)} is not null as {sqlite_quote_id(f'__source_metadata_{m.name}')}""" + for m in metadata)] + + from_list = [ + sqlite_quote_id(metadata[0].table_name), + *(f"full outer join {sqlite_quote_id(m.table_name)} on {sqlite_quote_id(m.table_name, m.id_column)} in ({', '.join(sqlite_quote_id(m.table_name, m.id_column) for m in reversed(preceding))})" + for m, preceding in [(m, metadata[:i]) for i, m in enumerate(metadata[1:], 1)])] + + # Take some small pains to make the query readable since it makes + # debugging and development easier. Note that backslashes aren't + # allowed inside f-string expressions, hence the *newline* variable. + newline = '\n' + query = dedent(f"""\ + select + {(',' + newline + ' ').join(select_list)} + from + {(newline + ' ').join(from_list)} + ; + """) + + + # Write merged metadata as export from SQLite db. + # + # Assume TSV like nearly all other extant --output-metadata options. 
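
The query assembled above can be exercised on its own. A standalone sketch
using Python's sqlite3 module rather than the sqlite3 CLI, with a cut-down
pair of tables loosely modelled on the cram-test inputs (assumed names and
values; like `augur merge` itself, it requires SQLite >= 3.39 for FULL OUTER
JOIN):

    import sqlite3

    db = sqlite3.connect(":memory:")
    db.executescript("""
        create table "metadata_X" ("strain" text, "b" text, "c" text);
        create table "metadata_Y" ("strain" text, "b" text, "c" text);
        insert into "metadata_X" values ('one', 'X1b', 'X1c'), ('two', 'X2b', 'X2c');
        insert into "metadata_Y" values ('two', '', 'Y2c'), ('three', 'Y3b', 'Y3c');
    """)

    # Same shape as the generated query: rightmost non-empty value wins per
    # column, plus one 0/1 source column per input table.
    rows = db.execute("""
        select
            coalesce(nullif("metadata_Y"."strain", ''), nullif("metadata_X"."strain", ''), null) as "strain",
            coalesce(nullif("metadata_Y"."b", ''), nullif("metadata_X"."b", ''), null) as "b",
            coalesce(nullif("metadata_Y"."c", ''), nullif("metadata_X"."c", ''), null) as "c",
            "metadata_X"."strain" is not null as "__source_metadata_X",
            "metadata_Y"."strain" is not null as "__source_metadata_Y"
        from
            "metadata_X"
            full outer join "metadata_Y" on "metadata_Y"."strain" in ("metadata_X"."strain")
    """).fetchall()

    for row in rows:
        print(row)
    # Expected rows (order may vary):
    #   ('one', 'X1b', 'X1c', 1, 0)
    #   ('two', 'X2b', 'Y2c', 1, 1)
    #   ('three', 'Y3b', 'Y3c', 0, 1)
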
+ print_info(f"Merging metadata and writing to {args.output_metadata!r}…") + sqlite3(db_path, + f'.mode tabs', + f'.headers on', + f'.once {sqlite_quote_dot(f"|{augur} write-file {shquote(args.output_metadata)}")}', + query) + + except SQLiteError as err: + delete_db = False + raise AugurError(str(err)) from err + + finally: + if delete_db: + os.unlink(db_path) + else: + print_info(f"WARNING: Skipped deletion of {db_path} due to error, but you may want to clean it up yourself (e.g. if it's large).") + + +def sqlite3(*args, **kwargs): + """ + Internal helper for invoking ``sqlite3``, the SQLite CLI program. + """ + sqlite3 = os.environ.get("SQLITE3", which("sqlite3")) + + if not sqlite3: + raise AugurError(dedent(f"""\ + Unable to find the program `sqlite3`. Is it installed? + + In order to use `augur merge`, the SQLite 3 CLI (version ≥3.39) + must be installed separately. It is typically provided by a + Nextstrain runtime. + """)) + + proc = subprocess.run([sqlite3, "-batch", *args], encoding="utf-8", text=True, **kwargs) + + try: + proc.check_returncode() + except subprocess.CalledProcessError as err: + raise SQLiteError(f"sqlite3 invocation failed") from err + + return proc + + +class SQLiteError(Exception): + pass + + +def sqlite3_table_columns(db_path, table: str) -> Iterable[str]: + return sqlite3(db_path, f"select name from pragma_table_info({sqlite_quote_id(table)})", capture_output=True).stdout.splitlines(); + + +def sqlite_quote_id(*xs): + """ + Quote a SQLite identifier. + + + + >>> sqlite_quote_id('foo bar') + '"foo bar"' + >>> sqlite_quote_id('table name', 'column name') + '"table name"."column name"' + >>> sqlite_quote_id('weird"name') + '"weird""name"' + """ + return '.'.join('"' + x.replace('"', '""') + '"' for x in xs) + + +def sqlite_quote_dot(x): + """ + Quote a SQLite CLI dot-command argument. + + + """ + return '"' + x.replace('\\', '\\\\').replace('"', '\\"') + '"' + + +def count_unique(xs: Iterable[T]) -> Iterable[Tuple[T, int]]: + # Using reduce() with a dict because it preserves input order, unlike + # itertools.groupby(), which requires a sort. Preserving order is a nice + # property for the user since we generate an error message with this. + # -trs, 24 July 2024 + yield from reduce(lambda counts, x: {**counts, x: counts.get(x, 0) + 1}, xs, counts := {}).items() + + +def indented_list(xs, prefix): + return f"\n{prefix}".join(xs) diff --git a/docs/api/developer/augur.merge.rst b/docs/api/developer/augur.merge.rst new file mode 100644 index 000000000..e3f404969 --- /dev/null +++ b/docs/api/developer/augur.merge.rst @@ -0,0 +1,7 @@ +augur.merge module +================== + +.. automodule:: augur.merge + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/developer/augur.rst b/docs/api/developer/augur.rst index 8d611731b..b778fff5b 100644 --- a/docs/api/developer/augur.rst +++ b/docs/api/developer/augur.rst @@ -41,6 +41,7 @@ Submodules augur.index augur.lbi augur.mask + augur.merge augur.parse augur.read_file augur.reconstruct_sequences diff --git a/docs/usage/cli/cli.rst b/docs/usage/cli/cli.rst index 06dfccf21..ca559654e 100644 --- a/docs/usage/cli/cli.rst +++ b/docs/usage/cli/cli.rst @@ -11,6 +11,7 @@ We're in the process of adding examples and more extensive documentation for eac parse curate/index + merge index filter mask diff --git a/docs/usage/cli/merge.rst b/docs/usage/cli/merge.rst new file mode 100644 index 000000000..0e59650ff --- /dev/null +++ b/docs/usage/cli/merge.rst @@ -0,0 +1,9 @@ +=========== +augur merge +=========== + +.. 
argparse:: + :module: augur + :func: make_parser + :prog: augur + :path: merge diff --git a/tests/functional/merge/cram/merge.t b/tests/functional/merge/cram/merge.t new file mode 100644 index 000000000..2c5ec770a --- /dev/null +++ b/tests/functional/merge/cram/merge.t @@ -0,0 +1,182 @@ +SETUP + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}" + + +BASIC USAGE + +Full outer join like behaviour, column coalescing/overwriting, and recording of +each row's source file(s) in extra columns. + + $ cat >x.tsv <<~~ + > strain a b c + > one X1a X1b X1c + > two X2a X2b X2c + > ~~ + + $ cat >y.tsv <<~~ + > strain b c f e d + > two Y2c Y2f Y2e Y2d + > three Y3f Y3e Y3d + > ~~ + + $ ${AUGUR} merge \ + > --metadata X=x.tsv Y=y.tsv \ + > --output-metadata - --quiet | tsv-pretty + strain a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 + +More than two inputs. + + $ cat >z.tsv <<~~ + > strain g c + > one Z1g + > two Z2g Z2c + > three Z3g + > ~~ + + $ ${AUGUR} merge \ + > --metadata X=x.tsv Y=y.tsv Z=z.tsv \ + > --output-metadata - --quiet | tsv-pretty + strain a b c f e d g __source_metadata_X __source_metadata_Y __source_metadata_Z + one X1a X1b X1c Z1g 1 0 1 + two X2a X2b Z2c Y2f Y2e Y2d Z2g 1 1 1 + three Y3f Y3e Y3d Z3g 0 1 1 + +Supports Augur's standard id column detection. Note that the first file's id +column name (e.g. "name" here) is used as the output id column name, per +Augur's convention of preserving the input id column name. + + $ sed '1s/^strain/name/g' < x.tsv > x-name-column.tsv + $ ${AUGUR} merge \ + > --metadata X=x-name-column.tsv Y=y.tsv \ + > --output-metadata - --quiet | tsv-pretty + name a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 + + $ sed '1s/^strain/name/g' < y.tsv > y-name-column.tsv + $ ${AUGUR} merge \ + > --metadata X=x.tsv Y=y-name-column.tsv \ + > --output-metadata - --quiet | tsv-pretty + strain a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 + +Supports --metadata-id-columns. + + $ sed '1s/^strain/id/g' < x.tsv > x-id-column.tsv + $ ${AUGUR} merge \ + > --metadata X=x-id-column.tsv Y=y.tsv \ + > --metadata-id-columns id strain \ + > --output-metadata - --quiet | tsv-pretty + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 + +Supports Augur's standard delimiter detection. + + $ sed 's/\t/,/g' < x.tsv > x.csv + $ ${AUGUR} merge \ + > --metadata X=x.csv Y=y.tsv \ + > --output-metadata - --quiet | tsv-pretty + strain a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 + +Supports --metadata-delimiters. + + $ sed 's/\t/|/g' < x.tsv > x.txt + $ ${AUGUR} merge \ + > --metadata X=x.txt Y=y.tsv \ + > --metadata-delimiters '|' $'\t' \ + > --output-metadata - --quiet | tsv-pretty + strain a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 + +Supports Augur's standard accepted compression formats. 
+ + $ xz < x.tsv > x.tsv.xz + $ zstd < y.tsv > y.tsv.zst + $ ${AUGUR} merge \ + > --metadata X=x.tsv.xz Y=y.tsv.zst \ + > --output-metadata - --quiet | tsv-pretty + strain a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 + + +OFF THE BEATEN PATH + +Metadata names are only the part before the first '='. + + $ cp x.tsv x=first.tsv + $ ${AUGUR} merge \ + > --metadata X=x=first.tsv Y=y.tsv \ + > --output-metadata /dev/null + Reading 'X' metadata from 'x=first.tsv'… + Reading 'Y' metadata from 'y.tsv'… + Merging metadata and writing to '/dev/null'… + + +ERROR HANDLING + +At least two metadata inputs are required. + + $ ${AUGUR} merge \ + > --metadata X=x.tsv \ + > --output-metadata - + ERROR: At least two metadata inputs are required for merging. + [2] + +Metadata names are required. + + $ ${AUGUR} merge \ + > --metadata x.tsv =y.tsv \ + > --output-metadata - + ERROR: All metadata inputs must be assigned a name, e.g. with NAME=FILE. + + The following inputs were missing a name: + + 'x.tsv' + '=y.tsv' + + [2] + +Metadata names must be unique. + + $ ${AUGUR} merge \ + > --metadata data=x.tsv data=y.tsv \ + > --output-metadata - + ERROR: Metadata input names must be unique. + + The following names were used more than once: + + 'data' + + [2] + +Duplicates. + + $ cat >dups.tsv <<~~ + > strain a b c + > one 1a 1b 1c + > one 2a 2b 2c + > ~~ + $ ${AUGUR} merge \ + > --metadata dups=dups.tsv Y=y.tsv \ + > --output-metadata /dev/null + Reading 'dups' metadata from 'dups.tsv'… + Error: stepping, UNIQUE constraint failed: metadata_dups.strain (19) + WARNING: Skipped deletion of */augur-merge-*.sqlite due to error, but you may want to clean it up yourself (e.g. if it's large). (glob) + ERROR: sqlite3 invocation failed + [2]
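
The UNIQUE constraint failure above comes from the unique index created on
each input table's id column at import time. A minimal standalone sketch of
that behaviour using Python's sqlite3 module (the command itself shells out to
the sqlite3 CLI instead; table and column names mirror the test above):

    import sqlite3

    db = sqlite3.connect(":memory:")
    db.execute('create table "metadata_dups" ("strain" text, "a" text)')
    db.execute('create unique index "metadata_dups_id" on "metadata_dups"("strain")')

    db.execute('insert into "metadata_dups" values (?, ?)', ("one", "1a"))
    try:
        # A second row with the same id violates the unique index, just like
        # the duplicate "one" rows in dups.tsv do during .import.
        db.execute('insert into "metadata_dups" values (?, ?)', ("one", "2a"))
    except sqlite3.IntegrityError as err:
        print(err)  # UNIQUE constraint failed: metadata_dups.strain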