From 54ebcf3a863bbbc6ec181cb2e8ddf556df44f197 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Thu, 11 Jul 2024 11:35:43 -0700 Subject: [PATCH] augur {read,write}-file Add commands to read and write files using Augur's conventions. This allows external programs to do i/o like Augur by piping from/to `augur read-file` or `augur write-file`. In some simple testing, the overhead of passing text i/o thru Python vs. not is minimal for our use cases and worth the cost of consistent compression and newline handling. I'll be using this to allow SQLite to read/write files like Augur. --- CHANGES.md | 5 ++ augur/__init__.py | 2 + augur/read_file.py | 73 +++++++++++++++++++++++++ augur/write_file.py | 54 ++++++++++++++++++ docs/api/developer/augur.read_file.rst | 7 +++ docs/api/developer/augur.rst | 2 + docs/api/developer/augur.write_file.rst | 7 +++ docs/usage/cli/cli.rst | 2 + docs/usage/cli/read-file.rst | 9 +++ docs/usage/cli/write-file.rst | 9 +++ 10 files changed, 170 insertions(+) create mode 100644 augur/read_file.py create mode 100644 augur/write_file.py create mode 100644 docs/api/developer/augur.read_file.rst create mode 100644 docs/api/developer/augur.write_file.rst create mode 100644 docs/usage/cli/read-file.rst create mode 100644 docs/usage/cli/write-file.rst diff --git a/CHANGES.md b/CHANGES.md index d780d3e3d..c16ff138d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,11 +2,16 @@ ## __NEXT__ +### Features + +* Two new commands, `augur read-file` and `augur write-file`, now allow external programs to do i/o like Augur by piping from/to these new commands. They provide handling of compression formats and newlines consistent with the rest of Augur. [#1562][] (@tsibley) + ### Bug Fixes * Embedded newlines in quoted field values of metadata files are now properly handled. 
[#1561][] (@tsibley) [#1561]: https://github.com/nextstrain/augur/pull/1561 +[#1562]: https://github.com/nextstrain/augur/pull/1562 diff --git a/augur/__init__.py b/augur/__init__.py index c6662b426..7b4f2066b 100644 --- a/augur/__init__.py +++ b/augur/__init__.py @@ -42,6 +42,8 @@ "version", "import_", "measurements", + "read_file", + "write_file", ] COMMANDS = [importlib.import_module('augur.' + c) for c in command_strings] diff --git a/augur/read_file.py b/augur/read_file.py new file mode 100644 index 000000000..6b9e45892 --- /dev/null +++ b/augur/read_file.py @@ -0,0 +1,73 @@ +r""" +Read a file like Augur, with transparent optimized decompression and universal newlines. + +Input is read from the given file path, as the compression format detection +requires a seekable stream. The given path may be "-" to explicitly read from +stdin, but no decompression will be done. + +Output is always to stdout. + +Universal newline translation is always performed, so \n, \r\n, and \r in the +input are all translated to the system's native newlines (e.g. \n on Unix, \r\n +on Windows) in the output. +""" +import io +import os +import signal +import sys +from shutil import copyfileobj + +from .io.file import open_file +from .utils import first_line + + +# The buffer size used by xopen() (which underlies open_file()), which notes: +# 128KB [KiB] buffer size also used by cat, pigz etc. It is faster than the 8K +# [KiB] default. +BUFFER_SIZE = max(io.DEFAULT_BUFFER_SIZE, 128 * 1024) + +SIGPIPE = getattr(signal, "SIGPIPE", None) + + +def register_parser(parent_subparsers): + parser = parent_subparsers.add_parser("read-file", help=first_line(__doc__)) + parser.add_argument("path", metavar="PATH", help="path to file") + return parser + + +def run(args): + with open_file(args.path, "rt", newline=None) as f: + # It's tempting to want to splice(2) here, but it turns out to make + # little sense. 
Firstly, the availability of splice(2) is Linux, + # Python ≥3.10, and one of the files needs to be a pipe. The chance of + # all of those together is slim-to-none, particularly because even in + # the common case of xopen() reading from a pipe—the stdout of an + # external decompression process—we can't use that pipe directly + # because xopen() always buffers the first block of the file into + # Python.¹ Secondly, we want universal newline handling—so that + # callers get behaviour consistent with the rest of Augur—and that + # rules out splice(2). + # + # Copying the data thru Python instead of with splice(2) seems fast + # enough in some quick trials with large files (e.g. against `zstd` + # directly), and the bottlenecks in pipelines using this command will + # often not be this command's i/o. + # -trs, 11 July 2024, updated 24 July 2024 + # + # ¹ + + # Handle SIGPIPE, which Python converts to BrokenPipeError, gracefully + # and like most Unix programs. See also + # . + try: + copyfileobj(f, sys.stdout, BUFFER_SIZE) + + # Force a flush so if SIGPIPE is going to happen it happens now. + sys.stdout.flush() + except BrokenPipeError: + # Avoid errors from Python automatically flushing stdout on exit. + devnull = os.open(os.devnull, os.O_WRONLY) + os.dup2(devnull, sys.stdout.fileno()) + + # Return conventional exit status for "killed by SIGPIPE" on Unix. + return 128 + SIGPIPE if SIGPIPE else 1 diff --git a/augur/write_file.py b/augur/write_file.py new file mode 100644 index 000000000..27955ec25 --- /dev/null +++ b/augur/write_file.py @@ -0,0 +1,54 @@ +r""" +Write a file like Augur, with transparent optimized compression and universal newlines. + +Input is always from stdin. + +Output is to the given file path, as the compression format detection requires +it. The given path may be "-" to explicitly write to stdout, but no +compression will be done.
+ +Universal newline translation is always performed, so \n, \r\n, and \r in the +input are all translated to the system's native newlines (e.g. \n on Unix, \r\n +on Windows) in the output. +""" +import io +import sys +from shutil import copyfileobj + +from .io.file import open_file +from .utils import first_line + + +# The buffer size used by xopen() (which underlies open_file()), which notes: +# 128KB [KiB] buffer size also used by cat, pigz etc. It is faster than the 8K +# [KiB] default. +BUFFER_SIZE = max(io.DEFAULT_BUFFER_SIZE, 128 * 1024) + + +def register_parser(parent_subparsers): + parser = parent_subparsers.add_parser("write-file", help=first_line(__doc__)) + parser.add_argument("path", metavar="PATH", help="path to file") + return parser + + +def run(args): + with open_file(args.path, "wt", newline=None) as f: + # It's tempting to want to splice(2) here, but it turns out to make + # little sense. Firstly, the availability of splice(2) is Linux, + # Python ≥3.10, and one of the files needs to be a pipe. The chance of + # all of those together is slim-to-none, particularly because even in + # the common case of xopen() reading from a pipe—the stdout of an + # external decompression process—we can't use that pipe directly + # because xopen() always buffers the first block of the file into + # Python.¹ Secondly, we want universal newline handling—so that + # callers get behaviour consistent with the rest of Augur—and that + # rules out splice(2). + # + # Copying the data thru Python instead of with splice(2) seems fast + # enough in some quick trials with large files (e.g. against `zstd` + # directly), and the bottlenecks in pipelines using this command will + # often not be this command's i/o. 
+ # -trs, 11 July 2024, updated 24 July 2024 + # + # ¹ + copyfileobj(sys.stdin, f, BUFFER_SIZE) diff --git a/docs/api/developer/augur.read_file.rst b/docs/api/developer/augur.read_file.rst new file mode 100644 index 000000000..97b09a3b5 --- /dev/null +++ b/docs/api/developer/augur.read_file.rst @@ -0,0 +1,7 @@ +augur.read\_file module +======================= + +.. automodule:: augur.read_file + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/developer/augur.rst b/docs/api/developer/augur.rst index b2d4dc512..8d611731b 100644 --- a/docs/api/developer/augur.rst +++ b/docs/api/developer/augur.rst @@ -42,6 +42,7 @@ Submodules augur.lbi augur.mask augur.parse + augur.read_file augur.reconstruct_sequences augur.refine augur.sequence_traits @@ -55,3 +56,4 @@ Submodules augur.validate augur.validate_export augur.version + augur.write_file diff --git a/docs/api/developer/augur.write_file.rst b/docs/api/developer/augur.write_file.rst new file mode 100644 index 000000000..f938cbf5f --- /dev/null +++ b/docs/api/developer/augur.write_file.rst @@ -0,0 +1,7 @@ +augur.write\_file module +======================== + +.. automodule:: augur.write_file + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/usage/cli/cli.rst b/docs/usage/cli/cli.rst index 39ff466f0..06dfccf21 100644 --- a/docs/usage/cli/cli.rst +++ b/docs/usage/cli/cli.rst @@ -32,3 +32,5 @@ We're in the process of adding examples and more extensive documentation for eac version import measurements + read-file + write-file diff --git a/docs/usage/cli/read-file.rst b/docs/usage/cli/read-file.rst new file mode 100644 index 000000000..916909dcb --- /dev/null +++ b/docs/usage/cli/read-file.rst @@ -0,0 +1,9 @@ +=============== +augur read-file +=============== + +.. 
argparse:: + :module: augur + :func: make_parser + :prog: augur + :path: read-file diff --git a/docs/usage/cli/write-file.rst b/docs/usage/cli/write-file.rst new file mode 100644 index 000000000..2a0d06127 --- /dev/null +++ b/docs/usage/cli/write-file.rst @@ -0,0 +1,9 @@ +================ +augur write-file +================ + +.. argparse:: + :module: augur + :func: make_parser + :prog: augur + :path: write-file