Skip to content

Commit

Permalink
🚧 merge sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin committed Aug 27, 2024
1 parent ebe2011 commit 57a9ca8
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 2 deletions.
52 changes: 50 additions & 2 deletions augur/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,21 +81,30 @@ def register_parser(parent_subparsers):
parser = parent_subparsers.add_parser("merge", help=first_line(__doc__))

input_group = parser.add_argument_group("inputs", "options related to input")
input_group.add_argument("--metadata", nargs="+", action="extend", required=True, metavar="NAME=FILE", help="Required. Metadata table names and file paths. Names are arbitrary monikers used solely for referring to the associated input file in other arguments and in output column names. Paths must be to seekable files, not unseekable streams. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
# FIXME: allow metadata and/or sequences. require at least one.
input_group.add_argument("--metadata", nargs="+", action="extend", required=False, metavar="NAME=FILE", help="Required. Metadata table names and file paths. Names are arbitrary monikers used solely for referring to the associated input file in other arguments and in output column names. Paths must be to seekable files, not unseekable streams. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
input_group.add_argument("--sequences", nargs="+", action="extend", required=False, metavar="FILE", help="Sequence files. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)

input_group.add_argument("--metadata-id-columns", default=DEFAULT_ID_COLUMNS, nargs="+", action=ExtendOverwriteDefault, metavar="COLUMN", help=f"Possible metadata column names containing identifiers, considered in the order given. Columns will be considered for all metadata tables. Only one ID column will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_ID_COLUMNS))})" + SKIP_AUTO_DEFAULT_IN_HELP)
input_group.add_argument("--metadata-delimiters", default=DEFAULT_DELIMITERS, nargs="+", action=ExtendOverwriteDefault, metavar="CHARACTER", help=f"Possible field delimiters to use for reading metadata tables, considered in the order given. Delimiters will be considered for all metadata tables. Only one delimiter will be inferred for each table. (default: {' '.join(map(shquote_humanized, DEFAULT_DELIMITERS))})" + SKIP_AUTO_DEFAULT_IN_HELP)

output_group = parser.add_argument_group("outputs", "options related to output")
output_group.add_argument('--output-metadata', required=True, metavar="FILE", help="Required. Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
# FIXME: allow metadata and/or sequences. require at least one.
# FIXME: ensure outputs are paired with respective inputs
output_group.add_argument('--output-metadata', required=False, metavar="FILE", help="Required. Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
output_group.add_argument('--output-sequences', required=False, metavar="FILE", help="Required. Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP)
output_group.add_argument('--quiet', action="store_true", default=False, help="Suppress informational and warning messages normally written to stderr. (default: disabled)" + SKIP_AUTO_DEFAULT_IN_HELP)

return parser


def run(args):
# FIXME: when both metadata and sequences are specified, ensure they are
# paired properly (same sequence IDs)
if args.metadata:
merge_metadata(args)
if args.sequences:
merge_sequences(args)


def merge_metadata(args):
Expand Down Expand Up @@ -248,6 +257,45 @@ def merge_metadata(args):
print_info(f"WARNING: Skipped deletion of {db_path} due to error, but you may want to clean it up yourself (e.g. if it's large).")


# FIXME: return a list of arguments and don't use shell
def cat(filepath: str):
cat = "cat"

if filepath.endswith(".gz"):
cat = "gzcat"
if filepath.endswith(".xz"):
cat = "xzcat"
if filepath.endswith(".zst"):
cat = "zstdcat"

return f"{cat} {filepath}"


def merge_sequences(args):
# Parse --sequences arguments
if not len(args.sequences) >= 2:
raise AugurError(f"At least two sequence inputs are required for merging.")

# FIXME: error if there are any duplicates within a single sequence file

# Reversed because seqkit rmdup keeps the first entry but this command
# should keep the last entry.
# FIXME: don't use shell. just using it to get a sense of feasibility.
cat_processes = (f"<({cat(filepath)})" for filepath in reversed(args.sequences))
shell_cmd = f"cat {' '.join(cat_processes)} | seqkit rmdup"
print_debug(F"running shell command {shell_cmd!r}")
process = subprocess.Popen(shell_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE)

# FIXME: handle `-` better
output = process.communicate()[0]
if args.output_sequences == "-":
sys.stdout.write(output.decode())
else:
with open(args.output, "w") as f:
f.write(output.decode())


# FIXME: do this for seqkit too
def sqlite3(*args, **kwargs):
"""
Internal helper for invoking ``sqlite3``, the SQLite CLI program.
Expand Down
38 changes: 38 additions & 0 deletions tests/functional/merge/cram/merge-sequences.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
SETUP

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../bin/augur}"
$ export AUGUR_DEBUG=1

Merge sequences without metadata

$ cat >x.fasta <<~~
> >seq1
> ATCG
> >seq2
> GCTA
> >seq3
> TCGA
> ~~

$ cat >y.fasta <<~~
> >seq3
> ATCG
> >seq4
> GCTA
> ~~

$ ${AUGUR} merge \
> --sequences x.fasta y.fasta \
> --output-sequences - > merged.fasta
running shell command 'cat <(cat y.fasta) <(cat x.fasta) | seqkit rmdup'
[INFO]\x1b[0m 1 duplicated records removed (esc)

$ cat merged.fasta
>seq3
ATCG
>seq4
GCTA
>seq1
ATCG
>seq2
GCTA

0 comments on commit 57a9ca8

Please sign in to comment.