diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..46d5b8e
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,15 @@
+# EditorConfig is awesome: http://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+end_of_line = lf
+insert_final_newline = true
+charset = utf-8
+indent_style = space
+indent_size = 4
+
+[*.{yml,yaml}]
+indent_style = space
+indent_size = 4
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..74d81cc
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+*.smk linguist-language=Python
+Snakefile linguist-language=Python
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 0000000..f0c554b
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,18 @@
+name: docs
+on:
+ workflow_dispatch:
+ push:
+ paths:
+ - 'docs/**'
+
+jobs:
+ deploy:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - uses: actions/setup-python@v2
+ with:
+ python-version: 3.9
+ - run: pip install --upgrade pip
+ - run: pip install -r docs/requirements.txt
+ - run: mkdocs gh-deploy --force
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
new file mode 100644
index 0000000..ad50293
--- /dev/null
+++ b/.github/workflows/main.yaml
@@ -0,0 +1,35 @@
+name: tests
+
+on:
+ workflow_dispatch:
+ push:
+ branches:
+ - master
+ - main
+ pull_request:
+    branches-ignore: []
+
+jobs:
+ Dry_Run_and_Lint:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - uses: docker://snakemake/snakemake:v5.24.2
+ - name: Dry Run with test data
+ run: |
+ docker run -v $PWD:/opt2 snakemake/snakemake:v5.24.2 \
+ /opt2/mpox-seek run --input \
+ /opt2/.tests/WT_S1_0.fastq.gz /opt2/.tests/WT_S1_1.fastq.gz \
+ /opt2/.tests/WT_S2_0.fastq.gz /opt2/.tests/WT_S2_1.fastq.gz \
+ /opt2/.tests/WT_S3_1.fastq.gz /opt2/.tests/WT_S3_2.fastq.gz \
+ /opt2/.tests/WT_S3_3.fastq.gz /opt2/.tests/WT_S3_4.fastq.gz \
+ /opt2/.tests/WT_S4.fastq.gz /opt2/.tests/WT_S5.fastq.gz \
+ --output /opt2/output --mode local --dry-run
+ - name: View the pipeline config file
+ run: |
+ echo "Generated config file for pipeline...." && cat $PWD/output/config.json
+ - name: Lint Workflow
+ continue-on-error: true
+ run: |
+ docker run -v $PWD:/opt2 snakemake/snakemake:v5.24.2 snakemake --lint -s /opt2/output/workflow/Snakefile -d /opt2/output || \
+ echo 'There may have been a few warnings or errors. Please read through the log to determine if its harmless.'
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a403fea
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,171 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+site/
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# DS_Store
+.DS_Store
+._*
+**/.DS_Store
+**/._*
+
+.snakemake*
+**/.snakemake*
+.venv
+.venv/*
+
+# Pipeline Results or Output
+results/
+output/
+tmp/
+scratch/
+
+# mkdocs documentation
+site/
+
+# Pipeline generated files or directories
+.tests/*/
+.snakemake/
+
+# Cached Java directories
+.oracle_jre_usage/
+.java/
+
+# GNU Parallel
+.parallel/
+
+# Temp files
+*.tmp
+**/*.tmp
+
+# Test script and test
+# output directories
+test.sh
+test_*/
+tmp_*/
diff --git a/.tests/WT_S1_0.fastq.gz b/.tests/WT_S1_0.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S1_1.fastq.gz b/.tests/WT_S1_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S2_0.fastq.gz b/.tests/WT_S2_0.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S2_1.fastq.gz b/.tests/WT_S2_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S3_1.fastq.gz b/.tests/WT_S3_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S3_2.fastq.gz b/.tests/WT_S3_2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S3_3.fastq.gz b/.tests/WT_S3_3.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S3_4.fastq.gz b/.tests/WT_S3_4.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S4.fastq.gz b/.tests/WT_S4.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S5.fastq.gz b/.tests/WT_S5.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..9820e20
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,9 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [0.1.0-beta] - 2024-03-29
+### Start
+ - Created scaffold from [nanite](https://github.com/OpenOmics/nanite) for building the pipeline
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..20c61ad
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 OpenOmics
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000..6e8bf73
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/config/cluster.json b/config/cluster.json
new file mode 100644
index 0000000..8412731
--- /dev/null
+++ b/config/cluster.json
@@ -0,0 +1,9 @@
+{
+ "__default__": {
+ "threads": "2",
+ "mem": "8g",
+ "partition": "norm",
+ "gres": "lscratch:32",
+ "time": "0-04:00:00"
+ }
+}
diff --git a/config/config.json b/config/config.json
new file mode 100644
index 0000000..7056c94
--- /dev/null
+++ b/config/config.json
@@ -0,0 +1,4 @@
+{
+ "options": {
+ }
+}
diff --git a/config/containers.json b/config/containers.json
new file mode 100644
index 0000000..490572b
--- /dev/null
+++ b/config/containers.json
@@ -0,0 +1,5 @@
+{
+ "images": {
+ "mpox-seek": "docker://skchronicles/mpox:v0.1.0"
+ }
+}
diff --git a/config/genome.json b/config/genome.json
new file mode 100644
index 0000000..43eddde
--- /dev/null
+++ b/config/genome.json
@@ -0,0 +1,5 @@
+{
+ "references": {
+
+ }
+}
diff --git a/config/install.json b/config/install.json
new file mode 100644
index 0000000..40a4e4a
--- /dev/null
+++ b/config/install.json
@@ -0,0 +1,7 @@
+{
+ "install": {
+ "resource_bundle": {
+
+ }
+ }
+}
diff --git a/config/modules.json b/config/modules.json
new file mode 100644
index 0000000..e685230
--- /dev/null
+++ b/config/modules.json
@@ -0,0 +1,7 @@
+{
+ "modules": {
+ },
+ "conda":{
+ "mpox-seek": "workflow/envs/mpox.yaml"
+ }
+}
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 0000000..2f30bcd
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,35 @@
+## Steps for Building Docker Images
+
+Directly below are instructions for building an image using the provided Dockerfile:
+
+```bash
+# See listing of images on computer
+docker image ls
+
+# Build from Dockerfile
+docker build --no-cache -f example.dockerfile --tag=example:v0.1.0 .
+
+# Testing, take a peek inside
+docker run -ti example:v0.1.0 /bin/bash
+
+# Updating Tag before pushing to DockerHub
+docker tag example:v0.1.0 skchronicles/example:v0.1.0
+docker tag example:v0.1.0 skchronicles/example # latest
+
+# Check out new tag(s)
+docker image ls
+
+# Push new tagged image to DockerHub
+docker push skchronicles/example:v0.1.0
+docker push skchronicles/example:latest
+```
+
+### Other Recommended Steps
+
+Scan your image for known vulnerabilities:
+
+```bash
+docker scan example:v0.1.0
+```
+
+> **Please Note**: Any references to `skchronicles` should be replaced with your username if you would also like to push the image to a non-org account.
diff --git a/docker/mpox-seek/Dockerfile b/docker/mpox-seek/Dockerfile
new file mode 100644
index 0000000..63390d2
--- /dev/null
+++ b/docker/mpox-seek/Dockerfile
@@ -0,0 +1,99 @@
+# Base image for mpox-seek,
+# uses Ubuntu Jammy (LTS)
+FROM ubuntu:22.04
+
+# Dependencies of mpox-seek:
+# - nanofilt/2.8.0 # from src, installed: 2.8.0
+# @requires: python3
+# @requires: python3-biopython
+# @requires: python3-pandas
+# - minimap2/2.24 # apt-get, installed: 2.24
+# - kronatools/2.8.1 # from src, installed: 2.8.1
+# @requires: curl
+LABEL maintainer=kuhnsa@nih.gov
+
+############### INIT ################
+# Create Container filesystem specific
+# working directory and opt directories
+# to avoid collisions with the host's
+# filesystem, i.e. /opt and /data
+RUN mkdir -p /opt2 && mkdir -p /data2
+WORKDIR /opt2
+
+# Set time zone to US east coast
+ENV TZ=America/New_York
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime \
+ && echo $TZ > /etc/timezone
+
+############### SETUP ################
+# This section installs system packages
+# required for your project. If you need
+# extra system packages add them here.
+RUN apt-get update \
+ && apt-get -y upgrade \
+ && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+ build-essential \
+ gawk \
+ git \
+ gzip \
+ locales \
+ make \
+ unzip \
+ wget \
+ zlib1g-dev \
+ # kronatools dependencies
+ curl \
+ perl \
+ cpanminus \
+ # nanofilt dependencies
+ python3 \
+ python3-pip \
+ python3-biopython \
+ python3-pandas \
+ && apt-get clean && apt-get purge \
+ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# Set the locale
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+# Make python3 the default interpreter
+# and install Python Packages
+# and install nanofilt/2.8.0
+RUN ln -sf /usr/bin/python3 /usr/bin/python
+RUN pip3 install --upgrade pip \
+ && pip3 install argparse \
+ && pip3 install nanofilt==2.8.0
+
+############### INSTALL ################
+# Install any bioinformatics tools
+# available with apt-get on Ubuntu/22.04
+RUN apt-get update \
+ && apt-get -y upgrade \
+ && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+ # minimap2/2.24
+ minimap2 \
+ # samtools/1.13
+ samtools \
+ && apt-get clean && apt-get purge \
+ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+############### MANUAL ################
+# Install kronatools/2.8.1 manually,
+# not available via apt-get on Ubuntu 22.04.
+# Dependencies already satisfied:
+# https://github.com/marbl/Krona/releases
+RUN wget https://github.com/marbl/Krona/releases/download/v2.8.1/KronaTools-2.8.1.tar \
+ && tar -xvf KronaTools-2.8.1.tar \
+ && rm KronaTools-2.8.1.tar \
+ && cd KronaTools-2.8.1/ \
+ && ./install.pl
+WORKDIR /opt2
+
+################ POST #################
+# Add the Dockerfile to the image, fix
+# permissions, and export environment
+# variables
+ADD Dockerfile /opt2/mpox-seek.dockerfile
+RUN chmod -R a+rX /opt2
+ENV PATH="/opt2:$PATH"
+WORKDIR /data2
\ No newline at end of file
diff --git a/docker/mpox-seek/README.md b/docker/mpox-seek/README.md
new file mode 100644
index 0000000..a263ebd
--- /dev/null
+++ b/docker/mpox-seek/README.md
@@ -0,0 +1,35 @@
+## Steps for Building Docker Images
+
+Directly below are instructions for building an image using the provided Dockerfile:
+
+```bash
+# See listing of images on computer
+docker image ls
+
+# Build from Dockerfile
+docker build --no-cache -f Dockerfile --tag=mpox-seek:v0.1.0 .
+
+# Testing, take a peek inside
+docker run -ti mpox-seek:v0.1.0 /bin/bash
+
+# Updating Tag before pushing to DockerHub
+docker tag mpox-seek:v0.1.0 skchronicles/mpox-seek:v0.1.0
+docker tag mpox-seek:v0.1.0 skchronicles/mpox-seek # latest
+
+# Check out new tag(s)
+docker image ls
+
+# Push new tagged image to DockerHub
+docker push skchronicles/mpox-seek:v0.1.0
+docker push skchronicles/mpox-seek:latest
+```
+
+### Other Recommended Steps
+
+Scan your image for known vulnerabilities:
+
+```bash
+docker scan mpox-seek:v0.1.0
+```
+
+> **Please Note**: Any references to `skchronicles` should be replaced with your username if you would also like to push the image to a non-org account.
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..aaf7278
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,33 @@
+# Build documentation
+
+> **Please Note:** When a commit is pushed to the `docs/` directory, it triggers a [github actions workflow](https://github.com/OpenOmics/mpox-seek/actions) to build the static-site and push it to the gh-pages branch.
+
+### Installation
+```bash
+# Clone the Repository
+git clone https://github.com/OpenOmics/mpox-seek.git
+cd mpox-seek/
+# Create a virtual environment
+python3 -m venv .venv
+# Activate the virtual environment
+. .venv/bin/activate
+# Update pip
+pip install --upgrade pip
+# Download Dependencies
+pip install -r docs/requirements.txt
+```
+
+### Preview while editing
+MkDocs includes a previewing server, so you can view your updates live as you write your documentation. The server will automatically rebuild the site upon editing and saving a file.
+```bash
+# Activate the virtual environment
+. .venv/bin/activate
+# Start serving your documentation
+mkdocs serve
+```
+
+### Build static site
+Once you are content with your changes, you can build the static site:
+```bash
+mkdocs build
+```
diff --git a/docs/assets/favicon/favicon.ico b/docs/assets/favicon/favicon.ico
new file mode 100644
index 0000000..e85006a
Binary files /dev/null and b/docs/assets/favicon/favicon.ico differ
diff --git a/docs/assets/icons/doc-book.svg b/docs/assets/icons/doc-book.svg
new file mode 100644
index 0000000..10ced62
--- /dev/null
+++ b/docs/assets/icons/doc-book.svg
@@ -0,0 +1,9 @@
+
+
diff --git a/docs/css/extra.css b/docs/css/extra.css
new file mode 100644
index 0000000..f522c9a
--- /dev/null
+++ b/docs/css/extra.css
@@ -0,0 +1,24 @@
+@keyframes heart {
+ 0%, 40%, 80%, 100% {
+ transform: scale(1);
+ }
+ 20%, 60% {
+ transform: scale(1.15);
+ }
+}
+
+.heart {
+ animation: heart 1500ms infinite;
+}
+
+[data-md-color-scheme="slate"] {
+ --md-primary-fg-color: #1A1B23DE;
+ --md-typeset-a-color: #b1b9ed;
+}
+
+.md-typeset .admonition.custom-grid-button,
+.md-typeset details.custom-grid-button {
+ border-color: var(--md-code-bg-color);
+ border-width: 2px;
+ width: 45%;
+}
diff --git a/docs/examples.md b/docs/examples.md
new file mode 100644
index 0000000..cdcdda5
--- /dev/null
+++ b/docs/examples.md
@@ -0,0 +1,3 @@
+# Examples
+
+This page is under construction. Please come back later for more information!
diff --git a/docs/faq/questions.md b/docs/faq/questions.md
new file mode 100644
index 0000000..89d90f6
--- /dev/null
+++ b/docs/faq/questions.md
@@ -0,0 +1,4 @@
+# Frequently Asked Questions
+
+This page is still under construction. If you need immediate help, please [open an issue](https://github.com/OpenOmics/mpox-seek/issues) on Github!
+
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..c21b3e4
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,77 @@
+# mpox-seek 🔬
+
+**Targeted ONT Pipeline for Monkeypox**
+
+This is the home of the pipeline, mpox-seek. mpox-seek is a streamlined Oxford Nanopore pipeline for targeted monkeypox sequencing.
+
+## Overview
+Welcome to mpox-seek's documentation! This guide is the main source of documentation for users that are getting started with the [Monkeypox Nanopore Pipeline](https://github.com/OpenOmics/mpox-seek/).
+
+The **`./mpox-seek`** pipeline is composed of several inter-related sub commands used to set up and run the pipeline across different systems. Each of the available sub commands performs a different function:
+
+
+
+!!! inline custom-grid-button ""
+
+    [mpox-seek run](usage/run.md)
+ Run the mpox-seek pipeline with your input files.
+
+!!! inline custom-grid-button ""
+
+    [mpox-seek unlock](usage/unlock.md)
+    Unlocks a previous run's output directory.
+
+
+
+
+
+
+!!! inline custom-grid-button ""
+
+    [mpox-seek install](usage/install.md)
+ Download remote reference files locally.
+
+
+!!! inline custom-grid-button ""
+
+    [mpox-seek cache](usage/cache.md)
+ Cache remote software containers locally.
+
+
+
+**mpox-seek** is a streamlined viral metagenomics pipeline to align, collapse, and visualize targeted monkeypox samples. It relies on technologies like [Singularity<sup>1</sup>](https://singularity.lbl.gov/) to maintain the highest level of reproducibility. The pipeline consists of a series of data processing and quality-control steps orchestrated by [Snakemake<sup>2</sup>](https://snakemake.readthedocs.io/en/stable/), a flexible and scalable workflow management system, to submit jobs to a cluster.
+
+The pipeline is compatible with data generated from [Oxford Nanopore sequencing Technologies](https://nanoporetech.com/). As input, it accepts a set of gzipped FastQ files and can be run locally on a compute instance or on-premise using a cluster. A user can define the method or mode of execution. The pipeline can submit jobs to a cluster using a job scheduler like SLURM (more coming soon!). A hybrid approach ensures the pipeline is accessible to all users.
+
+Before getting started, we highly recommend reading through the [usage](usage/run.md) section of each available sub command.
+
+For more information about issues or troubleshooting a problem, please check out our [FAQ](faq/questions.md) prior to [opening an issue on Github](https://github.com/OpenOmics/mpox-seek/issues).
+
+## Contribute
+
+This site is a living document, created for and by members like you. mpox-seek is maintained by the members of OpenOmics and is improved by continuous feedback! We encourage you to contribute new content and make improvements to existing content via pull request to our [GitHub repository :octicons-heart-fill-24:{ .heart }](https://github.com/OpenOmics/mpox-seek).
+
+## References
+**1.** Kurtzer GM, Sochat V, Bauer MW (2017). Singularity: Scientific containers for mobility of compute. PLoS ONE 12(5): e0177459.
+**2.** Koster, J. and S. Rahmann (2018). "Snakemake-a scalable bioinformatics workflow engine." Bioinformatics 34(20): 3600.
diff --git a/docs/license.md b/docs/license.md
new file mode 100644
index 0000000..3f321ba
--- /dev/null
+++ b/docs/license.md
@@ -0,0 +1,21 @@
+# MIT License
+
+*Copyright (c) 2022 OpenOmics*
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000..dbe98a1
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,34 @@
+babel>=2.9.1
+click==7.1.2
+future==0.18.2
+gitdb==4.0.5
+GitPython==3.1.7
+htmlmin==0.1.12
+importlib-metadata>=3.10
+Jinja2==2.11.3
+joblib==0.16.0
+jsmin==3.0.0
+livereload==2.6.1
+lunr==0.5.8
+Markdown==3.2.2
+MarkupSafe==1.1.1
+mkdocs>=1.3.0
+mkdocs-awesome-pages-plugin==2.2.1
+mkdocs-git-revision-date-localized-plugin==0.7
+mkdocs-material
+mkdocs-material-extensions
+mkdocs-minify-plugin==0.3.0
+mkdocs-redirects==1.0.1
+nltk>=3.6.6
+pygments>=2.12
+pymdown-extensions
+pytz==2020.1
+PyYAML>=5.4
+regex
+six==1.15.0
+smmap==3.0.4
+tornado==6.0.4
+tqdm==4.48.2
+zipp==3.1.0
+mkdocs-git-revision-date-plugin
+mike
diff --git a/docs/setup.md b/docs/setup.md
new file mode 100644
index 0000000..0a02a26
--- /dev/null
+++ b/docs/setup.md
@@ -0,0 +1,177 @@
+## Dependencies
+
+!!! note inline end "Requirements"
+ **Using Singularity**: `singularity>=3.5` `snakemake>=6.0`
+
+ **Using Conda or Mamba**: `conda/mamba` `snakemake>=6.0`
+
+[Snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html) must be installed on the target system. Snakemake is a workflow manager that orchestrates each step of the pipeline. The second dependency, i.e. [singularity](https://singularity.lbl.gov/all-releases) OR [conda/mamba](https://github.com/conda-forge/miniforge#mambaforge), handles the downloading/installation of any remaining software dependencies.
+
+By default, the pipeline will utilize singularity; however, the `--use-conda` option of the [run](usage/run.md) sub command can be provided to use conda/mamba instead of singularity. If possible, we recommend using singularity over conda for reproducibility; however, it is worth noting that singularity and conda produce identical results for this pipeline.
+
+If you are running the pipeline on Windows, please use the [Windows Subsystem for Linux (WSL)](https://learn.microsoft.com/en-us/windows/wsl/install). Singularity can be installed on WSL following these [instructions](https://www.blopig.com/blog/2021/09/using-singularity-on-windows-with-wsl2/).
+
+You can check to see if mpox-seek's software requirements are met by running:
+```bash
+# Check if dependencies
+# are already installed
+which snakemake || echo 'Error: snakemake is not installed.'
+which singularity \
+ || which conda \
+ || which mamba \
+    || echo 'Error: singularity, conda, or mamba is not installed.'
+```
+
+
+## Installation
+
+Please ensure the software dependencies listed above are satisfied before getting started with this section. Also, please ensure each of the software dependencies listed above are in your `$PATH`. You can re-run the same command above to ensure each of the required dependencies are in your `$PATH`.
+
+You can install mpox-seek locally with the following command:
+```bash
+# Clone mpox-seek from Github
+git clone https://github.com/OpenOmics/mpox-seek.git
+# Change your working directory
+cd mpox-seek/
+# Get usage information
+./mpox-seek -h
+```
+
+## Offline mode
+
+The `mpox-seek` pipeline can be run in an offline mode where external requests are not made at runtime. This requires downloading and caching any remote resources or software containers (if using singularity) ahead of time. Please note that if you are running the pipeline on Biowulf, you do NOT need to run these next steps. These instructions are for users running the pipeline outside of the Biowulf cluster, i.e. on another cluster or locally on a laptop.
+
+#### Download resource bundle
+
+To download the pipeline's resource bundle, please run the following command:
+```bash
+# Dry-run download of the resource bundle
+./mpox-seek install --ref-path /data/$USER/refs \
+ --force --threads 4 --dry-run
+# Download the resource bundle
+./mpox-seek install --ref-path /data/$USER/refs \
+ --force --threads 4
+```
+
+Please remember the path provided to the `--ref-path` option above. During the download process, a new child directory called `mpox-seek` will be created. The path to this directory should be provided to the `--resource-bundle` option of the [run sub command](usage/run.md). For more information, please see the documentation for the [install sub command](usage/install.md).
+
+#### Cache software containers
+
+This next step is only applicable for singularity users. If you are using conda/mamba instead of singularity, you can skip over this section. To cache remote software containers, please run the following command:
+```bash
+# Dry run to see what will
+# be pulled from DockerHub
+./mpox-seek cache --sif-cache /data/$USER/cache --dry-run
+# Cache software containers
+./mpox-seek cache --sif-cache /data/$USER/cache
+```
+
+Please remember the path provided to the `--sif-cache` option above; you will need to provide this path to the `--sif-cache` option of the [run sub command](usage/run.md). For more information, please see the documentation for the [cache sub command](usage/cache.md).
+
+#### Cache conda environment
+
+This next step is only applicable to conda/mamba users. If you are using singularity instead of conda/mamba, you can skip over this section. By default, when the `--use-conda` option is
+provided, a conda environment will be built on the fly. Building a conda environment can be slow, and it also makes external requests, so you will need internet access. With that being said, it may make sense to create/cache the conda environment once and re-use it. To cache/create mpox-seek's conda environment, please run the following command:
+```bash
+# Create a conda/mamba env
+# called mpox-seek, you only
+# need to run this once
+# on your computer/cluster
+mamba env create -f workflow/envs/mpox-seek.yaml
+```
+
+Running the command above will create a named conda/mamba environment called `mpox-seek`. Now you can provide `--conda-env-name mpox-seek` to the [run sub command](usage/run.md). This will ensure conda/mamba is run in an offline-like mode where no external requests are made at runtime. It will use the local, named conda environment instead of building a new environment on the fly.
+
+## TLDR
+
+Here is everything you need to quickly get started. This set of instructions assumes you have snakemake and (singularity or conda) already [installed on your target system](#dependencies), and that both are in your `$PATH`.
+
+Following the example below, please replace `--input .tests/*.gz` with your input ONT FastQ files. If you are running the pipeline on Windows, please use the Windows Subsystem for Linux (WSL).
+
+!!! note "Quick Start"
+ === "Other system + singularity offline mode"
+        These instructions are for users/admins setting up the pipeline to run outside of Biowulf in an offline mode. The pipeline can be run in an offline mode with singularity for users that do not have internet access. This mode is useful for researchers running the pipeline _in the field_ on a local laptop running linux.
+
+        In this example, we will cache/download remote resources in our `$HOME` directory, but please feel free to point to any other location on your computer or target system. You will need about 4 GB of free disk space for the download.
+ ```bash
+ # Clone mpox-seek from Github
+ git clone https://github.com/OpenOmics/mpox-seek.git
+ # Change your working directory
+ cd mpox-seek/
+ # Get usage information
+ ./mpox-seek -h
+ # Download resource bundle
+ ./mpox-seek install --ref-path $HOME/refs --force --threads 4
+ # Cache software containers
+ ./mpox-seek cache --sif-cache $HOME/SIFs
+ # Dry run mpox-seek pipeline
+ ./mpox-seek run --input .tests/*.gz --output tmp_01/ \
+ --resource-bundle $HOME/refs/mpox-seek \
+ --sif-cache $HOME/SIFs --mode local \
+ --dry-run
+ # Run mpox-seek pipeline
+ # in offline-mode
+ ./mpox-seek run --input .tests/*.gz --output tmp_01/ \
+ --resource-bundle $HOME/refs/mpox-seek \
+ --sif-cache $HOME/SIFs --mode local
+ ```
+
+ === "Other system + conda offline mode"
+        These instructions are for users/admins setting up the pipeline outside of Biowulf. The pipeline can be run in an offline mode with conda/mamba for users that do not have internet access. This mode is useful for researchers running the pipeline _in the field_ on a local laptop running linux, macOS, or [Windows Subsystem for Linux (WSL)](https://learn.microsoft.com/en-us/windows/wsl/install).
+
+        In this example, we will download the resource bundle in our `$HOME` directory, but please feel free to point to any other location on your computer or target system. You will need about 4 GB of free disk space for the download.
+ ```bash
+ # Clone mpox-seek from Github
+ git clone https://github.com/OpenOmics/mpox-seek.git
+ # Change your working directory
+ cd mpox-seek/
+ # Get usage information
+ ./mpox-seek -h
+ # Download resource bundle
+ ./mpox-seek install --ref-path $HOME/refs --force --threads 4
+ # Cache conda environment,
+ # creates a local conda env
+ # called mpox-seek
+ mamba env create -f workflow/envs/mpox-seek.yaml
+ # Dry run mpox-seek pipeline
+ ./mpox-seek run --input .tests/*.gz --output tmp_01/ \
+ --resource-bundle $HOME/refs/mpox-seek \
+ --mode local --conda-env-name mpox-seek \
+ --use-conda --dry-run
+ # Run mpox-seek pipeline
+ # with conda/mamba in
+ # offline-mode
+ ./mpox-seek run --input .tests/*.gz --output tmp_01/ \
+ --resource-bundle $HOME/refs/mpox-seek \
+ --mode local --conda-env-name mpox-seek \
+ --use-conda
+ ```
+
+ === "Biowulf"
+        If you are running the pipeline on Biowulf, you do NOT need to download the resource bundle. These reference files already exist on Biowulf, and the pipeline is set up to automatically use them as needed. Also, we have already cached all of the pipeline's software containers here: `/data/OpenOmics/SIFs/`. If you are on Biowulf, you can `module load` the required dependencies.
+
+        Whenever the pipeline is provided with the `--sif-cache` option, it is run in an offline mode. We always recommend providing `--sif-cache /data/OpenOmics/SIFs/` when running the pipeline on Biowulf. This avoids issues related to DockerHub request limits if multiple users are concurrently running the pipeline on the cluster.
+ ```bash
+ # Grab an interactive node,
+ # do not run on head node!
+ srun -N 1 -n 1 --time=1:00:00 --mem=8gb --cpus-per-task=2 --pty bash
+ module purge
+ module load singularity snakemake
+ # Clone mpox-seek from Github
+ git clone https://github.com/OpenOmics/mpox-seek.git
+ # Change your working directory
+ cd mpox-seek/
+ # Get usage information
+ ./mpox-seek -h
+ # Dry run mpox-seek pipeline
+ ./mpox-seek run --input .tests/*.gz --output tmp_01/ \
+ --sif-cache /data/OpenOmics/SIFs/ \
+ --mode slurm \
+ --dry-run
+ # Run mpox-seek pipeline
+ # on Biowulf cluster
+ ./mpox-seek run --input .tests/*.gz --output tmp_01/ \
+ --sif-cache /data/OpenOmics/SIFs/ \
+ --mode slurm
+ ```
+
diff --git a/docs/usage/cache.md b/docs/usage/cache.md
new file mode 100644
index 0000000..7eb0743
--- /dev/null
+++ b/docs/usage/cache.md
@@ -0,0 +1,76 @@
+# mpox-seek cache
+
+## 1. About
+The `mpox-seek` executable is composed of several inter-related sub commands. Please see `mpox-seek -h` for all available options.
+
+This part of the documentation describes options and concepts for the mpox-seek cache sub command in more detail.
+
+With minimal configuration, the **`cache`** sub command enables you to cache remote software containers from [Dockerhub](https://hub.docker.com/u/skchronicles). Caching remote software containers allows the pipeline to run in an offline mode where no requests are made. The cache sub command can also be used to pull our pre-built software container onto a new cluster or target system.
+
+These containers are normally pulled onto the filesystem when the pipeline runs; however, due to network issues or DockerHub pull rate limits, it may make sense to pull the resources once so a shared cache can be created. It is worth noting that a singularity cache cannot normally be shared across users. Singularity strictly enforces that a cache is owned by the user. To get around this issue, the cache sub command can be used to create local SIFs on the filesystem from images on DockerHub. The path of these locally cached SIFs can be passed to the run sub command's `--sif-cache` option.
+
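+For reference, caching an image is roughly equivalent to manually pulling it from DockerHub into a local SIF file. Below is a minimal sketch of that manual step, using the image defined in *config/containers.json*; the SIF filename shown is illustrative, not the pipeline's exact naming convention.
+```bash
+# Pull the DockerHub image into a local SIF file
+# (roughly what the cache sub command automates)
+singularity pull /data/$USER/cache/mpox_v0.1.0.sif docker://skchronicles/mpox:v0.1.0
+```
+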
+Caching software containers is fast and easy! In its most basic form, mpox-seek cache only has *one required input*.
+
+## 2. Synopsis
+```text
+$ ./mpox-seek cache [--help] [--dry-run] \
+ --sif-cache SIF_CACHE
+```
+
+The synopsis for each command shows its parameters and their usage. Optional parameters are shown in square brackets.
+
+A user **must** provide a directory to cache remote Docker images via the `--sif-cache` argument. Once the caching pipeline has completed, the local SIF cache can be passed to the `--sif-cache` option of the mpox-seek run sub command. This enables the pipeline to run in an offline mode.
+
+You can always use the `-h` option for information on a specific command.
+
+### 2.1 Required Arguments
+
+`--sif-cache SIF_CACHE`
+
+> **Path where a local cache of SIFs will be stored.**
+> *type: path*
+>
+> Any images defined in *config/containers.json* will be pulled into the local filesystem. The path provided to this option can be passed to the `--sif-cache` option of the mpox-seek run sub command. This allows the pipeline to run in an offline mode where no requests are made to external sources, which is useful for avoiding network issues or DockerHub pull rate limits. Please see mpox-seek run for more information.
+>
+> ***Example:*** `--sif-cache /data/$USER/cache`
+
+### 2.2 Options
+
+Each of the following arguments are optional and do not need to be provided.
+
+ `-h, --help`
+> **Display Help.**
+> *type: boolean flag*
+>
+> Shows command's synopsis, help message, and an example command
+>
+> ***Example:*** `--help`
+
+---
+ `--dry-run`
+> **Dry run the pipeline.**
+> *type: boolean flag*
+>
+> Only displays what software containers will be cached locally. Does not execute anything!
+>
+> ***Example:*** `--dry-run`
+
+## 3. Example
+```bash
+# Step 0.) Grab an interactive node (do not run on head node)
+srun -N 1 -n 1 --time=12:00:00 -p interactive --mem=8gb --cpus-per-task=4 --pty bash
+module purge
+module load singularity snakemake
+
+# Step 1.) Dry run to see what will be pulled
+./mpox-seek cache --sif-cache /data/$USER/cache \
+ --dry-run
+
+# Step 2.) Cache remote resources locally.
+# This command will NOT automatically submit
+# a job to the cluster. As so, we recommend
+# submitting this next command to the cluster
+# as a job. Download speeds will vary so it
+# is best to set the wall time to a few hours.
+./mpox-seek cache --sif-cache /data/$USER/cache
+```
diff --git a/docs/usage/install.md b/docs/usage/install.md
new file mode 100644
index 0000000..fa426fb
--- /dev/null
+++ b/docs/usage/install.md
@@ -0,0 +1,114 @@
+# mpox-seek install
+
+## 1. About
+The `mpox-seek` executable is composed of several inter-related sub commands. Please see `mpox-seek -h` for all available options.
+
+This part of the documentation describes options and concepts for the mpox-seek install sub command in more detail.
+
+With minimal configuration, the **`install`** sub command enables you to download the pipeline's resource bundle locally. This is necessary when setting up the pipeline on a new target system or cluster.
+
+The pipeline uses a set of reference files to process the data. These reference files are required and need to be available on the local file system prior to execution. This command can be used to download any required reference files of the pipeline.
+
+Since most resource bundles are very large, we recommend using multiple threads to pull reference files concurrently. The resource bundle can be very large, so please ensure you have sufficient disk space prior to running this sub command.
+
+**Please Note:** The resource bundle requires about 2 GB of available disk space. If you are running the pipeline on the Biowulf cluster, you do *NOT* need to download the pipeline's resource bundle. It is already accessible to all HPC users. This sub command is for users running the pipeline outside of the Biowulf cluster.
+
+Downloading the resource bundle is fast and easy! In its most basic form, mpox-seek install only has *one required input*.
+
+## 2. Synopsis
+```text
+$ mpox-seek install [--help] [--dry-run] \
+ [--force] [--threads] \
+ --ref-path REF_PATH
+```
+
+The synopsis for each command shows its parameters and their usage. Optional parameters are shown in square brackets.
+
+A user **must** provide an output directory for the reference file download via the `--ref-path` argument. Once the download of the resource bundle has completed, a new child directory called mpox-seek will be created. This new directory will contain all of the pipeline's required reference files. The path to this new directory can be passed to the `--resource-bundle` option of the mpox-seek run sub command. This allows users outside of Biowulf to run the pipeline.
+
+You can always use the `-h` option for information on a specific command.
+
+### 2.1 Required Arguments
+
+`--ref-path REF_PATH`
+
+> **Path where the resource bundle will be downloaded.**
+> *type: path*
+>
+> Any resources defined in 'config/install.json' will be pulled onto the local filesystem. After the files have been downloaded, a new directory with the name `mpox-seek` will be created. It contains all the required reference files of the pipeline. The path to this new directory can be passed to the run sub command's `--resource-bundle` option. Please see the run sub command for more information.
+>
+> ***Example:*** `--ref-path /data/$USER/refs`
+
+### 2.2 Options
+
+Each of the following arguments are optional and do not need to be provided.
+
+ `-h, --help`
+> **Display Help.**
+> *type: boolean flag*
+>
+> Shows command's synopsis, help message, and an example command
+>
+> ***Example:*** `--help`
+
+---
+ `--dry-run`
+> **Dry run the pipeline.**
+> *type: boolean flag*
+>
+> Displays what remote resources would be pulled. Does not execute anything!
+>
+> ***Example:*** `--dry-run`
+
+---
+ `--force`
+> **Force downloads all files.**
+> *type: boolean flag*
+>
+> By default, any files that do not exist locally are pulled; however, if a previous instance of an install did not exit gracefully, it may be necessary to forcefully re-download all the files.
+>
+> ***Example:*** `--force`
+
+---
+ `--threads`
+> **Number of threads to use for concurrent file downloads.**
+> *type: int*
+> *default: 2*
+>
+> Max number of threads to use for concurrent file downloads.
+>
+> ***Example:*** `--threads 12`
+
+## 3. Example
+```bash
+# Step 0.) Grab an interactive node,
+# do not run on head node!
+srun -N 1 -n 1 --time=12:00:00 -p interactive --mem=24gb --cpus-per-task=12 --pty bash
+module purge
+module load singularity snakemake
+
+# Step 1.) Dry-run download of the resource bundle
+ mpox-seek install --ref-path /data/$USER/refs \
+ --force \
+ --dry-run \
+ --threads 12
+
+# Step 2.) Download the resource bundle,
+# This command will NOT automatically submit
+# a job to the cluster. As so, we recommend
+# submitting this next command to the cluster
+# as a job. Download speeds will vary so it
+# is best to set the wall time to 2 days.
+mpox-seek install --ref-path /data/$USER/refs \
+ --force \
+ --threads 12
+
+# Checkout the downloaded files
+cd /data/$USER/refs
+tree mpox-seek
+# mpox-seek/
+# ├── kronatax_1222
+# │  └── taxonomy.tab
+# └── NCBI
+# └── viral_genomes_taxid.fa
+```
diff --git a/docs/usage/run.md b/docs/usage/run.md
new file mode 100644
index 0000000..6b73d06
--- /dev/null
+++ b/docs/usage/run.md
@@ -0,0 +1,215 @@
+# mpox-seek run
+
+## 1. About
+The `mpox-seek` executable is composed of several inter-related sub commands. Please see `mpox-seek -h` for all available options.
+
+This part of the documentation describes options and concepts for the mpox-seek run sub command in more detail. With minimal configuration, the **`run`** sub command enables you to start running the mpox-seek pipeline.
+
+Setting up the mpox-seek pipeline is fast and easy! In its most basic form, mpox-seek run only has *two required inputs*.
+
+## 2. Synopsis
+```text
+$ mpox-seek run [--help] \
+ [--dry-run] [--job-name JOB_NAME] [--mode {slurm,local}] \
+ [--sif-cache SIF_CACHE] [--singularity-cache SINGULARITY_CACHE] \
+ [--silent] [--threads THREADS] [--tmp-dir TMP_DIR] \
+ [--resource-bundle RESOURCE_BUNDLE] [--use-conda] \
+ [--conda-env-name CONDA_ENV_NAME] \
+ [--quality-filter QUALITY_FILTER] \
+ --input INPUT [INPUT ...] \
+ --output OUTPUT
+```
+
+The synopsis for each command shows its arguments and their usage. Optional arguments are shown in square brackets.
+
+A user **must** provide a list of FastQ files (globbing is supported) to analyze via the `--input` argument and an output directory to store results via the `--output` argument.
+
+You can always use the `-h` option for information on a specific command.
+
+### 2.1 Required arguments
+
+Each of the following arguments are required. Failure to provide a required argument will result in a non-zero exit-code.
+
+ `--input INPUT [INPUT ...]`
+> **Input Oxford Nanopore FastQ files(s).**
+> *type: file(s)*
+>
+> One or more FastQ files can be provided. From the command-line, each input file should be separated by a space. Globbing is supported! This makes selecting FastQ files easy. Input FastQ files should always be gzipped. If a sample has multiple FastQ files for different barcodes, the pipeline expects each barcoded FastQ file to end with the following extension: `_N.fastq.gz`, where `N` is a number. Internally, the pipeline will concatenate each of these FastQ files prior to processing the data. Here is an example of an input sample with multiple barcode sequences: `S1_0.fastq.gz`, `S1_1.fastq.gz`, `S1_2.fastq.gz`, `S1_3.fastq.gz`. Given this barcoded sample, the pipeline will create the following concatenated FastQ file: `S1.fastq.gz`.
+>
+> ***Example:*** `--input .tests/*.fastq.gz`
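+>
+> For illustration only, the internal concatenation step is conceptually equivalent to running the following on the barcoded example above:
+> ```bash
+> # Illustrative only: merge barcoded FastQ files into one
+> # per-sample FastQ (gzip files can be concatenated directly)
+> cat S1_0.fastq.gz S1_1.fastq.gz S1_2.fastq.gz S1_3.fastq.gz > S1.fastq.gz
+> ```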
+
+---
+ `--output OUTPUT`
+> **Path to an output directory.**
+> *type: path*
+>
+> This location is where the pipeline will create all of its output files, also known as the pipeline's working directory. If the provided output directory does not exist, it will be created automatically.
+>
+> ***Example:*** `--output /data/$USER/mpox-seek_out`
+
+### 2.2 Analysis options
+
+Each of the following arguments are optional, and do not need to be provided.
+
+ `--quality-filter QUALITY_FILTER`
+> **Quality score filter.**
+> *type: int*
+> *default: 8*
+>
+> This option filters reads on a minimum average quality score. Any reads with an average minimum quality score less than this threshold will be removed. The default average minimum quality filter is set to 8.
+>
+> ***Example:*** `--quality-filter 10`
+
+### 2.3 Orchestration options
+
+Each of the following arguments are optional, and do not need to be provided.
+
+ `--dry-run`
+> **Dry run the pipeline.**
+> *type: boolean flag*
+>
+> Displays what steps in the pipeline remain or will be run. Does not execute anything!
+>
+> ***Example:*** `--dry-run`
+
+---
+ `--silent`
+> **Silence standard output.**
+> *type: boolean flag*
+>
+> Reduces the amount of information directed to standard output when submitting the master job to the job scheduler. Only the job id of the master job is returned.
+>
+> ***Example:*** `--silent`
+
+---
+ `--mode {slurm,local}`
+> **Execution Method.**
+> *type: string*
+> *default: slurm*
+>
+> Defines the mode or method of execution. Valid mode options include: slurm or local.
+>
+> ***slurm***
+> The slurm execution method will submit jobs to the [SLURM workload manager](https://slurm.schedmd.com/). It is recommended running mpox-seek in this mode as execution will be significantly faster in a distributed environment. This is the default mode of execution.
+>
+> ***local***
+> Local executions will run serially on the compute instance. This is useful for testing, debugging, or when a user does not have access to a high performance computing environment. If this option is not provided, it will default to a local execution mode.
+>
+> ***Example:*** `--mode slurm`
+
+---
+ `--job-name JOB_NAME`
+> **Set the name of the pipeline's master job.**
+> *type: string*
+> *default: pl:mpox-seek*
+>
+> When submitting the pipeline to a job scheduler, like SLURM, this option allows you to set the name of the pipeline's master job. By default, the name of the pipeline's master job is set to "pl:mpox-seek".
+>
+> ***Example:*** `--job-name pl_id-42`
+
+---
+ `--singularity-cache SINGULARITY_CACHE`
+> **Overrides the $SINGULARITY_CACHEDIR environment variable.**
+> *type: path*
+> *default: `--output OUTPUT/.singularity`*
+>
+> Singularity will cache image layers pulled from remote registries. This ultimately speeds up the process of pulling an image from DockerHub if an image layer already exists in the singularity cache directory. By default, the cache is set to the value provided to the `--output` argument. Please note that this cache cannot be shared across users. Singularity strictly enforces that you own the cache directory and will return a non-zero exit code if you do not own the cache directory! See the `--sif-cache` option to create a shareable resource.
+>
+> ***Example:*** `--singularity-cache /data/$USER/.singularity`
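+>
+> For illustration, this option corresponds to Singularity's cache environment variable; setting it is comparable to exporting:
+> ```bash
+> # Illustrative only: the environment variable this option overrides
+> export SINGULARITY_CACHEDIR=/data/$USER/.singularity
+> ```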
+
+---
+ `--sif-cache SIF_CACHE`
+> **Path where a local cache of SIFs are stored.**
+> *type: path*
+>
+> Uses a local cache of SIFs on the filesystem. This SIF cache can be shared across users if permissions are set correctly. If a SIF does not exist in the SIF cache, the image will be pulled from Dockerhub and a warning message will be displayed. The `mpox-seek cache` subcommand can be used to create a local SIF cache. Please see `mpox-seek cache` for more information. This command is extremely useful for avoiding DockerHub pull rate limits. It also removes any potential errors that could occur due to network issues or DockerHub being temporarily unavailable. We recommend running mpox-seek with this option whenever possible.
+>
+> ***Example:*** `--sif-cache /data/$USER/SIFs`
+
+---
+ `--threads THREADS`
+> **Max number of threads for each process.**
+> *type: int*
+> *default: 2*
+>
+> Max number of threads for each process. This option is more applicable when running the pipeline with `--mode local`. It is recommended setting this value to the maximum number of CPUs available on the host machine.
+>
+> ***Example:*** `--threads 12`
+
+---
+ `--tmp-dir TMP_DIR`
+> **Path on the filesystem for writing temporary files.**
+> *type: path*
+> *default: `/lscratch/$SLURM_JOBID`*
+>
+> Path on the file system for writing temporary output files. By default, the temporary directory is set to '/lscratch/$SLURM_JOBID' for backwards compatibility with the NIH's Biowulf cluster; however, if you are running the pipeline on another cluster, this option will need to be specified. Ideally, this path should point to a dedicated location on the filesystem for writing tmp files. On many systems, this location is set to somewhere in /scratch. If you need to inject a variable into this string that should NOT be expanded, please quote this option's value in single quotes.
+>
+> ***Example:*** `--tmp-dir /scratch/$USER/`
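+>
+> For illustration, single quotes keep a variable from being expanded by your shell so it can be resolved later at runtime (hypothetical value):
+> ```bash
+> # Hypothetical: $SLURM_JOBID is passed through literally, not expanded by your shell
+> --tmp-dir '/scratch/$SLURM_JOBID'
+> ```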
+
+---
+ `--resource-bundle RESOURCE_BUNDLE`
+> **Path to a resource bundle downloaded with the install sub command.**
+> *type: path*
+>
+> The resource bundle contains the set of required reference files for processing any data. The path provided to this option will be the path to the `mpox-seek` directory that was created when running the install sub command. Please see the install sub command for more information about downloading the pipeline's resource bundle.
+>
+> ***Example:*** `--resource-bundle /data/$USER/refs/mpox-seek`
+
+---
+ `--use-conda`
+> **Use Conda/mamba instead of Singularity.**
+> *type: boolean flag*
+>
+> Use Conda/Mamba instead of Singularity. By default, the pipeline uses singularity for handling required software dependencies. This option overrides that behavior, and it will use Conda/mamba instead of Singularity. The use of Singularity and Conda is mutually exclusive. Please note that conda and mamba must be in your $PATH prior to running the pipeline. This option will build a conda environment on the fly prior to the pipeline's execution. As such, this step requires internet access. To run mpox-seek in an offline mode with conda, please see the `--conda-env-name` option below.
+>
+> ***Example:*** `--use-conda`
+
+---
+ `--conda-env-name CONDA_ENV_NAME`
+> **Use an existing conda environment.**
+> *type: str*
+>
+> Use an existing conda environment. This option allows mpox-seek to run with conda in an offline mode. If you are using conda without this option, the pipeline will build a conda environment on the fly prior to its execution. Building a conda environment can sometimes be slow, as it downloads dependencies from the internet, so it may make sense to build it once and re-use it. This will also allow you to use conda/mamba in an offline mode. If you have already built a named conda environment with the supplied yaml file, then you can directly use it with this option. Please provide the name of the conda environment that was specifically built for the mpox-seek pipeline.
+>
+> To create a reusable conda/mamba environment with the name `mpox-seek`, please run the following mamba command:
+> ```bash
+> # Creates a reusable conda
+> # environment called mpox-seek
+> mamba env create -f workflow/envs/mpox-seek.yaml
+> ```
+
+> ***Example:*** `--conda-env-name mpox-seek`
+
+### 2.4 Miscellaneous options
+Each of the following arguments are optional, and do not need to be provided.
+
+ `-h, --help`
+> **Display Help.**
+> *type: boolean flag*
+>
+> Shows command's synopsis, help message, and an example command
+>
+> ***Example:*** `--help`
+
+## 3. Example
+```bash
+# Step 1.) Grab an interactive node,
+# do not run on head node!
+srun -N 1 -n 1 --time=1:00:00 --mem=8gb --cpus-per-task=2 --pty bash
+module purge
+module load singularity snakemake
+
+# Step 2A.) Dry-run the pipeline
+./mpox-seek run --input .tests/*.fastq.gz \
+ --output /data/$USER/output \
+ --mode slurm \
+ --dry-run
+
+# Step 2B.) Run the mpox-seek pipeline
+# The slurm mode will submit jobs to
+# the cluster. It is recommended running
+# the pipeline in this mode.
+./mpox-seek run --input .tests/*.fastq.gz \
+ --output /data/$USER/output \
+ --mode slurm
+```
\ No newline at end of file
diff --git a/docs/usage/unlock.md b/docs/usage/unlock.md
new file mode 100644
index 0000000..6e3271c
--- /dev/null
+++ b/docs/usage/unlock.md
@@ -0,0 +1,56 @@
+# mpox-seek unlock
+
+## 1. About
+The `mpox-seek` executable is composed of several inter-related sub commands. Please see `mpox-seek -h` for all available options.
+
+This part of the documentation describes options and concepts for the mpox-seek unlock sub command in more detail. With minimal configuration, the **`unlock`** sub command enables you to unlock a pipeline output directory.
+
+If the pipeline fails ungracefully, it may be required to unlock the working directory before proceeding again. Snakemake will inform a user when it may be necessary to unlock a working directory with an error message stating: `Error: Directory cannot be locked`.
+
+Please verify that the pipeline is not running before running this command. If the pipeline is currently running, the workflow manager will report that the working directory is locked. This is the default behavior of snakemake, and it is normal. Do NOT run this command if the pipeline is still running! Please kill the master job and its child jobs prior to running this command.
+
+Unlocking the mpox-seek pipeline's output directory is fast and easy! In its most basic form, mpox-seek unlock only has *one required input*.
+
+## 2. Synopsis
+```text
+$ ./mpox-seek unlock [-h] --output OUTPUT
+```
+
+The synopsis for this command shows its parameters and their usage. Optional parameters are shown in square brackets.
+
+A user **must** provide an output directory to unlock via the `--output` argument. After running the unlock sub command, you can resume the pipeline from where it left off by re-running it.
+
+You can always use the `-h` option for information on a specific command.
+
+### 2.1 Required Arguments
+
+ `--output OUTPUT`
+> **Output directory to unlock.**
+> *type: path*
+>
+> Path to a previous run's output directory. This will remove a lock on the working directory. Please verify that the pipeline is not running before running this command.
+>
+> ***Example:*** `--output /data/$USER/mpox-seek_out`
+
+### 2.2 Options
+
+Each of the following arguments are optional and do not need to be provided.
+
+ `-h, --help`
+> **Display Help.**
+> *type: boolean*
+>
+> Shows command's synopsis, help message, and an example command
+>
+> ***Example:*** `--help`
+
+
+## 3. Example
+```bash
+# Step 0.) Grab an interactive node (do not run on head node)
+srun -N 1 -n 1 --time=12:00:00 -p interactive --mem=8gb --cpus-per-task=4 --pty bash
+module purge
+module load singularity snakemake
+
+# Step 1.) Unlock a pipeline output directory
+mpox-seek unlock --output /data/$USER/output
+```
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..6ec52bf
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,109 @@
+# Project Information
+site_name: mpox-seek
+site_author: Skyler Kuhn
+site_description: >-
+ An awesome targeted ONT Pipeline for Monkeypox
+
+# Repository
+repo_name: OpenOmics/mpox-seek
+repo_url: https://github.com/OpenOmics/mpox-seek
+edit_uri: https://github.com/OpenOmics/mpox-seek/edit/main/docs/
+
+# Extra
+extra_css:
+ - css/extra.css
+
+# Copyright
+copyright: Copyright © 2024 OpenOmics
+
+# Configuration
+theme:
+ name: material
+ features:
+ - navigation.tabs
+ - navigation.top
+ - toc.integrate
+ palette:
+ # Palette toggle for light mode
+ - scheme: default
+ toggle:
+ icon: material/lightbulb-on
+ name: Switch to dark mode
+ # Palette toggle for dark mode
+ - scheme: slate
+ toggle:
+ icon: material/weather-night
+ name: Switch to light mode
+ logo: assets/icons/doc-book.svg
+ favicon: assets/favicon/favicon.ico
+
+# Plugins
+plugins:
+ - search
+ - git-revision-date
+ - minify:
+ minify_html: true
+
+# Customization
+extra:
+ social:
+ - icon: fontawesome/solid/users
+ link: https://ncbr.ncifcrf.gov/
+ - icon: fontawesome/brands/github
+ link: https://github.com/OpenOmics
+ - icon: fontawesome/brands/docker
+ link: https://hub.docker.com/u/skchronicles
+ version:
+ provider: mike
+
+# Extensions
+markdown_extensions:
+ - markdown.extensions.md_in_html
+ - markdown.extensions.admonition
+ - markdown.extensions.attr_list
+ - markdown.extensions.def_list
+ - markdown.extensions.footnotes
+ - markdown.extensions.meta
+ - markdown.extensions.toc:
+ permalink: true
+ - pymdownx.arithmatex:
+ generic: true
+ - pymdownx.betterem:
+ smart_enable: all
+ - pymdownx.caret
+ - pymdownx.critic
+ - pymdownx.details
+ - pymdownx.emoji:
+ emoji_index: !!python/name:materialx.emoji.twemoji
+ emoji_generator: !!python/name:materialx.emoji.to_svg
+ - pymdownx.highlight
+ - pymdownx.inlinehilite
+ - pymdownx.keys
+ - pymdownx.magiclink:
+ repo_url_shorthand: true
+ user: squidfunk
+ repo: mkdocs-material
+ - pymdownx.mark
+ - pymdownx.smartsymbols
+ - pymdownx.snippets:
+ check_paths: true
+ - pymdownx.superfences
+ - pymdownx.tabbed:
+ alternate_style: true
+ - pymdownx.tasklist:
+ custom_checkbox: true
+ - pymdownx.tilde
+
+# Page Tree
+nav:
+ - About: index.md
+ - Setup: setup.md
+ - Commands:
+ - mpox-seek run: usage/run.md
+ - mpox-seek unlock: usage/unlock.md
+ - mpox-seek install: usage/install.md
+ - mpox-seek cache: usage/cache.md
+ - Examples: examples.md
+ - FAQ:
+ - General Questions: faq/questions.md
+ - License: license.md
diff --git a/mpox-seek b/mpox-seek
new file mode 100755
index 0000000..6c9fce3
--- /dev/null
+++ b/mpox-seek
@@ -0,0 +1,1058 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+
+"""
+ABOUT: This is the main entry for the pipeline.
+REQUIRES:
+ - python>=3.6
+ - snakemake (recommended>=6.0.0 and <8.0.0)
+ - singularity (recommended==latest)
+DISCLAIMER:
+ PUBLIC DOMAIN NOTICE
+ NIAID Collaborative Bioinformatics Resource (NCBR)
+ National Institute of Allergy and Infectious Diseases (NIAID)
+This software/database is a "United States Government Work" under
+the terms of the United States Copyright Act. It was written as
+part of the author's official duties as a United States Government
+employee and thus cannot be copyrighted. This software is freely
+available to the public for use.
+Although all reasonable efforts have been taken to ensure the
+accuracy and reliability of the software and data, NCBR do not and
+cannot warrant the performance or results that may be obtained by
+using this software or data. NCBR and NIH disclaim all warranties,
+express or implied, including warranties of performance,
+merchantability or fitness for any particular purpose.
+Please cite the author and NIH resources like the "Biowulf Cluster"
+in any work or product based on this material.
+USAGE:
+ $ mpox-seek [OPTIONS]
+EXAMPLE:
+ $ mpox-seek run --input *.fastq.gz --output output/
+"""
+
+# Python standard library
+from __future__ import print_function
+from shutil import unpack_archive
+import sys, os, subprocess, re, json, textwrap
+
+# 3rd party imports from pypi
+import argparse # part of the standard library since python 2.7/3.2, 3rd party package for older versions
+
+# Local imports
+from src import version
+from src.run import init, setup, bind, dryrun, runner
+from src.download import main as installer
+from src.shells import bash
+from src.utils import (
+ Colors,
+ cat,
+ check_cache,
+ err,
+ exists,
+ fatal,
+ permissions,
+ require
+)
+
+
+# Pipeline Metadata
+__version__ = version
+__authors__ = 'Skyler Kuhn'
+__email__ = 'skyler.kuhn@nih.gov'
+__home__ = os.path.dirname(os.path.abspath(__file__))
+_name = os.path.basename(sys.argv[0])
+_description = 'A Streamlined ONT Monkeypox Pipeline'
+
+
+def unlock(sub_args):
+ """Unlocks a previous runs output directory. If snakemake fails ungracefully,
+ it maybe required to unlock the working directory before proceeding again.
+ This is rare but it does occasionally happen. Maybe worth add a --force
+ option to delete the '.snakemake/' directory in the future.
+ @param sub_args :
+ Parsed arguments for unlock sub-command
+ """
+ print("Unlocking the pipeline's output directory...")
+ outdir = sub_args.output
+
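+    # This is roughly equivalent to manually running the following
+    # command from within the output directory (assuming snakemake
+    # is available on your $PATH):
+    #   snakemake --unlock --cores 1 --configfile=config.json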
+ try:
+ unlock_output = subprocess.check_output([
+ 'snakemake', '--unlock',
+ '--cores', '1',
+ '--configfile=config.json'
+ ], cwd = outdir,
+ stderr=subprocess.STDOUT)
+ except subprocess.CalledProcessError as e:
+ # Unlocking process returned a non-zero exit code
+ sys.exit("{}\n{}".format(e, e.output))
+
+ print("Successfully unlocked the pipeline's working directory!")
+
+
+def run(sub_args):
+ """Initialize, setup, and run the pipeline.
+    Calls init() to create the output directory and copy over pipeline resources,
+    setup() to create the pipeline config file, dryrun() to ensure there are no issues
+    before running the pipeline, and finally runner() to execute the Snakemake workflow.
+ @param sub_args :
+ Parsed arguments for run sub-command
+ """
+ # Step 0. Check for required dependencies
+    # The pipeline has only two requirements:
+ # (snakemake and singularity) OR --use-conda
+ # (snakemake and conda)
+ if sub_args.use_conda:
+ require(['snakemake', 'conda'], ['snakemake', 'conda'])
+ else:
+ require(['snakemake', 'singularity'], ['snakemake', 'singularity'])
+
+ # Step 1. Initialize working directory,
+ # copy over required resources to run
+ # the pipeline
+ git_repo = __home__
+ input_files = init(
+ repo_path = git_repo,
+ output_path = sub_args.output,
+ links = sub_args.input
+ )
+
+    # Step 2. Set up the pipeline for execution,
+    # dynamically create the config.json file
+    # from user inputs and base config templates.
+    # Also overrides the path of any reference
+    # files, from their default location in the
+    # OpenOmics shared group area, to the base
+    # path provided by the user via the option
+    # --resource-bundle PATH
+ config = setup(sub_args,
+ ifiles = input_files,
+ repo_path = git_repo,
+ output_path = sub_args.output,
+ resource_bundle = sub_args.resource_bundle
+ )
+
+ # Step 3. Resolve docker/singularity bind
+ # paths from the config file.
+ bindpaths = bind(
+ sub_args,
+ config = config
+ )
+
+ config['bindpaths'] = bindpaths
+
+ # Step 4. Save config to output directory
+ with open(os.path.join(sub_args.output, 'config.json'), 'w') as fh:
+ json.dump(config, fh, indent = 4, sort_keys = True)
+
+ # Optional Step: Dry-run pipeline
+ if sub_args.dry_run:
+ # Dryrun pipeline
+ dryrun_output = dryrun(outdir = sub_args.output) # python3 returns byte-string representation
+ print("\nDry-running {} pipeline:\n{}".format(_name, dryrun_output.decode("utf-8")))
+ sys.exit(0)
+
+    # Step 5. Orchestrate pipeline execution,
+    # run the pipeline locally on a compute node
+    # for debugging purposes, or submit the master
+    # job to the job scheduler, SLURM, and create
+    # a logging file
+ if not exists(os.path.join(sub_args.output, 'logfiles')):
+ # Create directory for logfiles
+ os.makedirs(os.path.join(sub_args.output, 'logfiles'))
+ if sub_args.mode == 'local':
+ log = os.path.join(sub_args.output, 'logfiles', 'snakemake.log')
+ else:
+ log = os.path.join(sub_args.output, 'logfiles', 'master.log')
+ logfh = open(log, 'w')
+ mjob = runner(mode = sub_args.mode,
+ outdir = sub_args.output,
+ # additional_bind_paths = all_bind_paths,
+ alt_cache = sub_args.singularity_cache,
+ threads = int(sub_args.threads),
+ jobname = sub_args.job_name,
+ submission_script=os.path.join(__home__, 'src', 'run.sh'),
+ logger = logfh,
+ additional_bind_paths = ",".join(bindpaths),
+ tmp_dir = sub_args.tmp_dir,
+ )
+
+ # Step 6. Wait for subprocess to complete,
+ # this is blocking and not asynchronous
+ if not sub_args.silent:
+ print("\nRunning {} pipeline in '{}' mode...".format(_name, sub_args.mode))
+ mjob.wait()
+ logfh.close()
+
+ # Step 7. Relay information about submission
+ # of the master job or the exit code of the
+ # pipeline that ran in local mode
+ if sub_args.mode == 'local':
+ if int(mjob.returncode) == 0:
+ print('{} pipeline has successfully completed'.format(_name))
+ else:
+ fatal('{} pipeline failed. Please see error(s) above for more information.'.format(_name))
+ elif sub_args.mode == 'slurm':
+ jobid = open(os.path.join(sub_args.output, 'logfiles', 'mjobid.log')).read().strip()
+ if not sub_args.silent:
+ if int(mjob.returncode) == 0:
+ print('Successfully submitted master job: ', end="")
+ else:
+ fatal('Error occurred when submitting the master job.')
+ print(jobid)
+
+
+def install(sub_args):
+ """Downloads resource bundle locally in parallel chunks.
+ Reference files will be pulled from chunks defined in
+ 'config/install.json' onto the local filesystem. This
+ function is a wrapper to 'src/download.py'. Please see
+ that script for more information.
+ @param sub_args :
+        Parsed arguments for install sub-command
+ """
+ # Read in config file for install
+ with open(
+ os.path.join(__home__, 'config', 'install.json')
+ ) as fh:
+ install_config = json.load(fh)
+
+ # Try to install any missing targets
+ download_links = []
+ md5_checksums = []
+ for target in install_config['install']:
+ download_links = list(install_config['install'][target].keys())
+ md5_checksums = list(install_config['install'][target].values())
+ # Set missing required options
+ # for src/download.py, need to
+ # pass links, MD5 checksums and
+ # the output directory
+ sub_args.input = download_links
+ sub_args.md5 = md5_checksums
+ sub_args.output = sub_args.ref_path
+ # Pass options to download.py
+ installer(sub_args)
+
+ # Concatenate the locally
+    # downloaded file chunks to
+ # restore tarball and then
+ # extract the archive
+ if not sub_args.dry_run:
+ for target in install_config['install']:
+ # Gather all chunks
+ download_links = list(install_config['install'][target].keys())
+ local_chunks = [
+ os.path.join(sub_args.ref_path, f.split('/')[-1])
+ for f in download_links
+ ]
+ # Restore the tarball
+ print('Merging chunks... {0}'.format(','.join(local_chunks)))
+ tarball = cat(
+ local_chunks,
+ os.path.join(
+ sub_args.ref_path,
+ 'merged_chunks.tar.gz'
+ )
+ )
+ # Delete local chunks
+ # to reduce diskspace
+ # footprint
+ for f in local_chunks:
+ try:
+ os.remove(f)
+ except OSError:
+ err('Warning: failed to remove local download chunk... {}'.format(f))
+ # Extract the tarball
+ print('Extracting tarball... {0}'.format(tarball))
+ unpack_archive(tarball, sub_args.ref_path)
+ # Delete tarball to
+ # reduce diskspace
+ # footprint
+ try:
+ os.remove(tarball)
+ except OSError:
+                err('Warning: failed to remove resource bundle tarball... {}'.format(tarball))
+
+
+def cache(sub_args):
+ """Caches remote software containers stored on DockerHub.
+    Local SIFs will be created from images defined in 'config/containers.json'.
+ @param sub_args :
+        Parsed arguments for cache sub-command
+ """
+ # Check for dependencies
+ require(['singularity'], ['singularity'])
+ sif_cache = sub_args.sif_cache
+    # Get absolute PATH to templates in the mpox-seek git repo
+ repo_path = os.path.dirname(os.path.abspath(__file__))
+ images = os.path.join(repo_path, 'config','containers.json')
+
+ # Create image cache
+ if not exists(sif_cache):
+        # SIF cache directory does not exist on the filesystem
+ os.makedirs(sif_cache)
+ elif exists(sif_cache) and os.path.isfile(sif_cache):
+        # Provided path for the SIF cache already exists as a file
+ raise OSError("""\n\tFatal: Failed to create provided sif cache directory!
+ User provided --sif-cache PATH already exists on the filesystem as a file.
+        Please run {} cache again with a different --sif-cache PATH.
+ """.format(_name)
+ )
+
+ # Check if local SIFs already exist on the filesystem
+ with open(images, 'r') as fh:
+ data = json.load(fh)
+
+ pull = []
+ for image, uri in data['images'].items():
+ sif = os.path.join(sif_cache, '{}.sif'.format(os.path.basename(uri).replace(':', '_')))
+ if not exists(sif):
+            # If the local SIF does not exist in the cache, print a warning
+ # and default to pulling from URI in config/containers.json
+ print('Image will be pulled from "{}".'.format(uri), file=sys.stderr)
+ pull.append(uri)
+
+ if not pull:
+ # Nothing to do!
+        print('Singularity image cache is already up to date!')
+ else:
+ # There are image(s) that need to be pulled
+ if not sub_args.dry_run:
+ # container cache script: src/cache.sh
+ # Quote user provided values to avoid shell injections
+ username = os.environ.get('USER', os.environ.get('USERNAME'))
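+            # The assembled command resembles the following (values are illustrative):
+            #   src/cache.sh local -s '<sif_cache>' -i '<uri_1>,<uri_2>' \
+            #       -t '<sif_cache>/<username>/.singularity/'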
+ exitcode = bash(
+ str(os.path.join(repo_path, 'src', 'cache.sh')) +
+ ' local ' +
+ " -s '{}' ".format(sif_cache) +
+ " -i '{}' ".format(','.join(pull)) +
+ " -t '{0}/{1}/.singularity/' ".format(sif_cache, username)
+ )
+ # Check exitcode of caching script
+ if exitcode != 0:
+ fatal('Fatal: Failed to pull all containers. Please try again!')
+        print('Done: successfully pulled all software containers!')
+
+
+def parsed_arguments(name, description):
+ """Parses user-provided command-line arguments. Requires argparse and textwrap
+ package. argparse was added to standard lib in python 3.5 and textwrap was added
+ in python 3.5. To create custom help formatting for subparsers a docstring is
+ used create the help message for required options. argparse does not support named
+ subparser groups, which is normally what would be used to accomphish this reformatting.
+ As so, the help message for require options must be suppressed. If a new required arg
+ is added to a subparser, it must be added to the docstring and the usage statement
+ also must be updated.
+ @param name :
+ Name of the pipeline or command-line tool
+ @param description :
+ Short description of pipeline or command-line tool
+ """
+ # Add styled name and description
+ c = Colors
+ styled_name = "{0}{1}{2}mpox-seek{3}".format(c.bold, c.bg_black, c.cyan, c.end)
+ description = "{0}{1}{2}".format(c.bold, description, c.end)
+
+ # Create a top-level parser
+ parser = argparse.ArgumentParser(description = '{}: {}'.format(styled_name, description))
+
+    # Adding version information
+ parser.add_argument('--version', action = 'version', version='%(prog)s {}'.format(__version__))
+
+ # Create sub-command parser
+ subparsers = parser.add_subparsers(help='List of available sub-commands')
+
+ # Sub-parser for the "run" sub-command
+ # Grouped sub-parser arguments are currently
+ # not supported: https://bugs.python.org/issue9341
+ # Here is a work around to create more useful help message for named
+ # options that are required! Please note: if a required arg is added the
+ # description below should be updated (i.e. update usage and add new option)
+ required_run_options = textwrap.dedent("""\
+ {0}: {1}
+
+ {3}{4}Synopsis:{5}
+ $ {2} run [--help] \\
+ [--dry-run] [--job-name JOB_NAME] [--mode {{slurm,local}}] \\
+ [--sif-cache SIF_CACHE] [--singularity-cache SINGULARITY_CACHE] \\
+ [--silent] [--threads THREADS] [--tmp-dir TMP_DIR] \\
+ [--resource-bundle RESOURCE_BUNDLE] [--use-conda] \\
+ [--conda-env-name CONDA_ENV_NAME] \\
+ [--quality-filter QUALITY_FILTER] \\
+ --input INPUT [INPUT ...] \\
+ --output OUTPUT
+
+ Optional arguments are shown in square brackets above.
+
+ {3}{4}Description:{5}
+        To run this streamlined Oxford Nanopore monkeypox pipeline with your raw
+        data, please provide a space-separated list of FastQ files (globbing is
+        supported) and an output directory to store results.
+
+ {3}{4}Required arguments:{5}
+ --input INPUT [INPUT ...]
+ Input Oxford Nanopore FastQ file(s) to process. One
+ or more FastQ files can be provided. Multiple input
+                        FastQ files should be separated by a space. Globbing
+ for multiple files is also supported.
+ Example: --input .tests/*.fastq.gz
+ --output OUTPUT
+ Path to an output directory. This location is where
+ the pipeline will create all of its output files, also
+ known as the pipeline's working directory. If the user
+ provided working directory has not been initialized,
+ it will be created automatically.
+ Example: --output /data/$USER/output
+
+ {3}{4}Analysis options:{5}
+ --quality-filter QUALITY_FILTER
+ Filter reads on a minimum average quality score. The
+ default average minimum quality filter is set to 8.
+ Example: --quality-filter 8
+
+ {3}{4}Orchestration options:{5}
+ --mode {{slurm,local}}
+                        Method of execution. Valid options for this mode
+                        include: local or slurm. Additional modes of execution
+                        are coming soon, default: slurm.
+ Here is a brief description of each mode:
+                        • local: uses local method of execution. local runs
+                        will run serially on a compute instance. This is useful
+                        for testing, debugging, or when a user does not have
+ access to a high performance computing environment.
+ If this option is not provided, it will default to a
+ slurm mode of execution.
+ • slurm: uses slurm execution backend. This method
+ will submit jobs to a cluster using sbatch. It is
+ recommended running the pipeline in this mode as it
+ will be significantly faster.
+ Example: --mode slurm
+ --job-name JOB_NAME
+ Overrides the name of the pipeline's master job. When
+ submitting the pipeline to a jobscheduler, this option
+ overrides the default name of the master job. This can
+ be useful for tracking the progress or status of a run,
+ default: pl:{2}.
+ Example: --job-name {2}_03-14.1592
+ --dry-run
+ Does not execute anything. Only displays what steps in
+ the pipeline remain or will be run.
+ Example: --dry-run
+ --silent
+                        Silence standard output. This reduces the amount
+ of information displayed to standard output when the
+ master job is submitted to the job scheduler. Only the
+ job id of the master job is returned.
+ Example: --silent
+ --singularity-cache SINGULARITY_CACHE
+ Overrides the $SINGULARITY_CACHEDIR variable. Images
+ from remote registries are cached locally on the file
+ system. By default, the singularity cache is set to:
+ '/path/to/output/directory/.singularity/'. Please note
+ that this cache cannot be shared across users.
+ Example: --singularity-cache /data/$USER
+ --sif-cache SIF_CACHE
+                        Path where a local cache of SIFs is stored. This cache
+ can be shared across users if permissions are properly
+ setup. If a SIF does not exist in the SIF cache, the
+ image will be pulled from Dockerhub. {2} cache
+ sub command can be used to create a local SIF cache.
+ Please see {2} cache for more information.
+ Example: --sif-cache /data/$USER/sifs/
+ --threads THREADS
+ Max number of threads for local mode processes. It is
+                        recommended to set this value to the maximum number
+ of CPUs available on the host machine, default: 2.
+                        Example: --threads 16
+ --tmp-dir TMP_DIR
+ Path on the file system for writing temporary output
+ files. By default, the temporary directory is set to
+ '/lscratch/$SLURM_JOBID' for backwards compatibility
+ with the NIH's Biowulf cluster; however, if you are
+ running the pipeline on another cluster, this option
+ will need to be specified. Ideally, this path should
+ point to a dedicated location on the filesystem for
+ writing tmp files. On many systems, this location is
+ set to somewhere in /scratch. If you need to inject a
+ variable into this string that should NOT be expanded,
+                        please quote this option's value in single quotes.
+ Example: --tmp-dir '/scratch/$USER/'
+ --resource-bundle RESOURCE_BUNDLE
+ Path to a resource bundle downloaded with the install
+ sub command. The resource bundle contains the set of
+ required reference files for processing any data. The
+ path provided to this option will be the path to the
+ {2} directory that was created when running the
+ {2} install sub command.
+ Example: --resource-bundle /data/$USER/refs/{2}
+ --use-conda
+ Use Conda/Mamba instead of Singularity. By default,
+ the pipeline uses singularity for handling required
+                        software dependencies. This option overrides that
+                        behavior, and it will use Conda or Mamba instead. Please
+ note that conda or mamba must be in your $PATH prior
+ to running the pipeline. This option will build an env
+                        on the fly prior to the pipeline's execution. As such,
+ this step requires internet access. To run {2} in
+ an offline mode with conda, please see the option
+ below.
+ Example: --use-conda
+ --conda-env-name CONDA_ENV_NAME
+ Use an existing conda environment. This option allows
+ {2} to run with conda in an offline mode. If you are
+ using conda without this option, the pipeline will build
+ a conda environment on the fly prior to the pipeline's
+ execution. Building a conda environment can sometimes be
+ slow as it downloads dependencies from the internet, so
+ it may make sense to build it once and re-use it. If you
+ have already built a named conda environment with the
+ supplied yaml file, then you can directly use it with
+ this option. Please provide the name of the conda env
+ that was specifically built for the {2} pipeline. To
+ create a reusable conda/mamba environment with the
+ name '{2}', please run the following mamba command:
+ 'mamba env create -f workflow/envs/mpox-seek.yaml'.
+ Example: --conda-env-name mpox-seek
+
+    {3}{4}Misc Options:{5}
+ -h, --help Show usage information, help message, and exit.
+ Example: --help
+ """.format(styled_name, description, name, c.bold, c.url, c.end, c.italic))
+
+ # Display example usage in epilog
+ run_epilog = textwrap.dedent("""\
+ {2}{3}Example:{4}
+ # Step 1.) Grab an interactive node,
+ # do not run on head node!
+ srun -N 1 -n 1 --time=1:00:00 --mem=8gb --cpus-per-task=2 --pty bash
+ module purge
+ module load singularity snakemake
+
+ # Step 2A.) Dry-run the pipeline
+ ./{0} run --input .tests/*.fastq.gz \\
+ --output /data/$USER/output \\
+ --mode slurm \\
+ --dry-run
+
+ # Step 2B.) Run the {0} pipeline
+ # The slurm mode will submit jobs to
+ # the cluster. It is recommended running
+ # the pipeline in this mode.
+ ./{0} run --input .tests/*.fastq.gz \\
+ --output /data/$USER/output \\
+ --mode slurm
+
+ {2}{3}Version:{4}
+ {1}
+ """.format(name, __version__, c.bold, c.url, c.end))
+
+    # Suppressing help message of required args to overcome no sub-parser named groups
+ subparser_run = subparsers.add_parser('run',
+ help = 'Run the {} pipeline with input files.'.format(name),
+ usage = argparse.SUPPRESS,
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description = required_run_options,
+ epilog = run_epilog,
+ add_help=False
+ )
+
+ # Required Arguments
+ # Input FastQ files
+ subparser_run.add_argument(
+ '--input',
+ # Check if the file exists and if it is readable
+ type = lambda file: permissions(parser, file, os.R_OK),
+ required = True,
+ nargs = '+',
+ help = argparse.SUPPRESS
+ )
+
+ # Output Directory, i.e
+ # working directory
+ subparser_run.add_argument(
+ '--output',
+ type = lambda option: os.path.abspath(os.path.expanduser(option)),
+ required = True,
+ help = argparse.SUPPRESS
+ )
+
+ # Analysis Arguments
+ # Minimum average read quality filter
+ subparser_run.add_argument(
+ '--quality-filter',
+ type = int,
+ required = False,
+ default = 8,
+ help = argparse.SUPPRESS
+ )
+
+ # Optional Arguments
+ # Add custom help message
+ subparser_run.add_argument(
+ '-h', '--help',
+ action='help',
+ help=argparse.SUPPRESS
+ )
+
+ # Orchestration Options
+ # Execution Method, run locally
+ # on a compute node or submit to
+ # a supported job scheduler, etc.
+ subparser_run.add_argument(
+ '--mode',
+ type = str,
+ required = False,
+ default = "slurm",
+ choices = ['slurm', 'local'],
+ help = argparse.SUPPRESS
+ )
+
+ # Name of master job
+ subparser_run.add_argument(
+ '--job-name',
+ type = str,
+ required = False,
+ default = 'pl:{}'.format(name),
+ help = argparse.SUPPRESS
+ )
+
+ # Dry-run, does not execute the
+ # workflow, prints what steps remain
+ subparser_run.add_argument(
+ '--dry-run',
+ action = 'store_true',
+ required = False,
+ default = False,
+ help = argparse.SUPPRESS
+ )
+
+ # Silent output mode
+ subparser_run.add_argument(
+ '--silent',
+ action = 'store_true',
+ required = False,
+ default = False,
+ help = argparse.SUPPRESS
+ )
+
+ # Singularity cache directory,
+ # default uses output directory
+ subparser_run.add_argument(
+ '--singularity-cache',
+ type = lambda option: check_cache(parser, os.path.abspath(os.path.expanduser(option))),
+ required = False,
+ help = argparse.SUPPRESS
+ )
+
+ # Local SIF cache directory,
+ # default pull from Dockerhub
+ subparser_run.add_argument(
+ '--sif-cache',
+ type = lambda option: os.path.abspath(os.path.expanduser(option)),
+ required = False,
+ help = argparse.SUPPRESS
+ )
+
+ # Base directory to write
+ # temporary/intermediate files
+ subparser_run.add_argument(
+ '--tmp-dir',
+ type = str,
+ required = False,
+ default = '/lscratch/$SLURM_JOBID/',
+ help = argparse.SUPPRESS
+ )
+
+ # Number of threads for the
+    # pipeline's main process.
+ # This is only applicable for
+ # local rules or when running
+ # in local mode.
+ subparser_run.add_argument(
+ '--threads',
+ type = int,
+ required = False,
+ default = 2,
+ help = argparse.SUPPRESS
+ )
+
+ # Output Directory of downloaded
+ # resource bundle, see the install
+ # sub command for more information
+ # on how to download any required
+ # references files locally.
+ subparser_run.add_argument(
+ '--resource-bundle',
+ type = lambda option: os.path.abspath(os.path.expanduser(option)),
+ required = False,
+ default = None,
+ help = argparse.SUPPRESS
+ )
+
+ # Use Conda instead of singularity,
+ # will build an env on the fly
+ subparser_run.add_argument(
+ '--use-conda',
+ action = 'store_true',
+ required = False,
+ default = False,
+ help = argparse.SUPPRESS
+ )
+
+ # Use an existing conda env,
+ # runs conda in offline mode
+ subparser_run.add_argument(
+ '--conda-env-name',
+ type = str,
+ required = False,
+ default = '',
+ help = argparse.SUPPRESS
+ )
+
+ # Sub-parser for the "unlock" sub-command
+ # Grouped sub-parser arguments are currently
+ # not supported: https://bugs.python.org/issue9341
+ # Here is a work around to create more useful help message for named
+ # options that are required! Please note: if a required arg is added the
+ # description below should be updated (i.e. update usage and add new option)
+ required_unlock_options = textwrap.dedent("""\
+ {0}: {1}
+
+ {3}{4}Synopsis:{5}
+ $ {2} unlock [-h] --output OUTPUT
+
+ Optional arguments are shown in square brackets above.
+
+ {3}{4}Description:{5}
+        If the pipeline fails ungracefully, it may be required to unlock
+ the working directory before proceeding again. Please verify that
+ the pipeline is not running before running this command. If the
+ pipeline is still running, the workflow manager will report the
+ working directory is locked. This is normal behavior. Do NOT run
+ this command if the pipeline is still running.
+
+ {3}{4}Required arguments:{5}
+ --output OUTPUT Path to a previous run's output directory
+ to unlock. This will remove a lock on the
+ working directory. Please verify that the
+ pipeline is not running before running
+ this command.
+ Example: --output /data/$USER/output
+
+ {3}{4}Misc Options:{5}
+ -h, --help Show usage information, help message,
+ and exit.
+ Example: --help
+ """.format(styled_name, description, name, c.bold, c.url, c.end))
+
+ # Display example usage in epilog
+ unlock_epilog = textwrap.dedent("""\
+ {2}{3}Example:{4}
+ # Unlock output directory of pipeline
+ {0} unlock --output /data/$USER/output
+
+ {2}{3}Version:{4}
+ {1}
+ """.format(name, __version__, c.bold, c.url, c.end))
+
+    # Suppressing help message of required args to overcome no sub-parser named groups
+ subparser_unlock = subparsers.add_parser(
+ 'unlock',
+        help = "Unlocks a previous run's output directory.",
+ usage = argparse.SUPPRESS,
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description = required_unlock_options,
+ epilog = unlock_epilog,
+ add_help = False
+ )
+
+ # Required Arguments
+ # Output Directory (analysis working directory)
+ subparser_unlock.add_argument(
+ '--output',
+ type = str,
+ required = True,
+ help = argparse.SUPPRESS
+ )
+
+ # Add custom help message
+ subparser_unlock.add_argument(
+ '-h', '--help',
+ action='help',
+ help=argparse.SUPPRESS
+ )
+
+
+ # Sub-parser for the "install" sub-command
+ # Grouped sub-parser arguments are
+ # not supported: https://bugs.python.org/issue9341
+ # Here is a work around to create more useful help message for named
+ # options that are required! Please note: if a required arg is added the
+ # description below should be updated (i.e. update usage and add new option)
+ required_install_options = textwrap.dedent("""\
+ {0}: {1}
+
+ {3}{4}Synopsis:{5}
+ $ {2} install [-h] [--dry-run] \\
+ [--force] [--threads] \\
+ --ref-path REF_PATH
+
+ Optional arguments are shown in square brackets above.
+
+ {3}{4}Description:{5}
+ The pipeline uses a set of reference files to process data.
+ These reference files are required and need to be available on
+ the local file system prior to execution. This command can be
+ used to download the pipeline's required reference files.
+
+ Please Note: The resource bundle requires about 2GB of avail-
+ able disk space. If you are running the pipeline on the Biowulf
+ cluster, you do NOT need to download the pipeline's resource
+ bundle. It is already accessible to all HPC users.
+
+ {3}{4}Required arguments:{5}
+ --ref-path REF_PATH Path where the resource bundle will be
+                        downloaded. Any resources defined in the
+ 'config/install.json' will be pulled on
+ to the local filesystem. After the files
+ have been downloaded, a new directory
+ with the name {2} will be created.
+ It contains all the required reference
+ files of the pipeline. The path to this
+ new directory can be passed to the run
+ sub command's --resource-bundle option.
+ Please see the run sub command for more
+ information.
+ Example: --ref-path /data/$USER/refs
+
+ {3}{4}Orchestration options:{5}
+ --dry-run Does not execute anything. Only displays
+ what remote resources would be pulled.
+ Example: --dry-run
+
+ --force Force downloads all files. By default, any
+ files that do not exist locally are pulled;
+ however if a previous instance of an install
+ did not exit gracefully, it may be necessary
+ to forcefully re-download all the files.
+ Example: --force
+
+ --threads THREADS Number of threads to use for concurrent file
+ downloads, default: 2.
+ Example: --threads 12
+
+ {3}{4}Misc Options:{5}
+ -h, --help Show usage information, help message,
+ and exits.
+ Example: --help
+
+ """.format(styled_name, description, name, c.bold, c.url, c.end))
+
+ # Display example usage in epilog
+ install_epilog = textwrap.dedent("""\
+ {2}{3}Example:{4}
+ # Dry-run download of the resource bundle
+ {0} install --ref-path /data/$USER/ref \\
+ --force \\
+ --dry-run \\
+ --threads 12
+
+ # Download the resource bundle
+ {0} install --ref-path /data/$USER/ref \\
+ --force \\
+ --threads 12
+
+ {2}{3}Version:{4}
+ {1}
+ """.format(name, __version__, c.bold, c.url, c.end))
+
+    # Suppressing help message of required args
+ # to overcome no sub-parser named groups
+ subparser_install = subparsers.add_parser(
+ 'install',
+ help = 'Download reference files locally.',
+ usage = argparse.SUPPRESS,
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description = required_install_options,
+ epilog = install_epilog,
+ add_help = False
+ )
+
+ # Required Arguments
+ # Output Directory where file will be downloaded
+ subparser_install.add_argument(
+ '--ref-path',
+ type = lambda option: os.path.abspath(os.path.expanduser(option)),
+ required = True,
+ help = argparse.SUPPRESS
+ )
+
+ # Optional Arguments
+ # Dry-run install command,
+ # does not pull any remote resources,
+ # just shows what will be pulled
+ subparser_install.add_argument(
+ '--dry-run',
+ action = 'store_true',
+ required = False,
+ default = False,
+ help=argparse.SUPPRESS
+ )
+
+ # Forces downloading of all files
+ subparser_install.add_argument(
+ '--force',
+ action = 'store_true',
+ required = False,
+ default = False,
+ help=argparse.SUPPRESS
+ )
+
+ # Number of threads for concurrent downloads
+ subparser_install.add_argument(
+ '--threads',
+ type = int,
+ required = False,
+ default = 2,
+ help = argparse.SUPPRESS
+ )
+
+ # Add custom help message
+ subparser_install.add_argument(
+ '-h', '--help',
+ action='help',
+ help=argparse.SUPPRESS
+ )
+
+
+ # Sub-parser for the "cache" sub-command
+ # Grouped sub-parser arguments are
+ # not supported: https://bugs.python.org/issue9341
+ # Here is a work around to create more useful help message for named
+ # options that are required! Please note: if a required arg is added the
+ # description below should be updated (i.e. update usage and add new option)
+ required_cache_options = textwrap.dedent("""\
+ {0}: {1}
+
+ {3}Synopsis:{4} Cache software containers locally.
+ $ {2} cache [-h] [--dry-run] \\
+ --sif-cache SIF_CACHE
+
+ Optional arguments are shown in square brackets above.
+
+ {3}Description:{4}
+ Create a local cache of software containers on DockerHub.
+ These containers are normally pulled when the pipeline runs;
+ however, due to network issues or DockerHub pull rate limits,
+ it may make sense to pull the resources once so a shared cache
+ can be created. It is worth noting that a singularity cache
+ cannot normally be shared across users. Singularity strictly
+ enforces that a cache is owned by the user. To get around this
+ issue, the cache subcommand can be used to create local SIFs
+ on the filesystem from images on DockerHub.
+
+ {3}Required arguments:{4}
+ --sif-cache SIF_CACHE
+ Path where a local cache of SIFs will be
+ stored. Images defined in containers.json
+ will be pulled into the local filesystem.
+ The path provided to this option can be
+ passed to the --sif-cache option of the
+ run sub command. Please see {2} run
+ sub command for more information.
+ Example: --sif-cache /data/$USER/cache
+
+ {3}Orchestration options:{4}
+ --dry-run Does not execute anything. Only displays
+ what remote resources would be pulled.
+ Example: --dry-run
+
+ {3}Misc Options:{4}
+ -h, --help Show usage information, help message,
+ and exits.
+ Example: --help
+ """.format(styled_name, description, name, c.bold, c.end))
+
+ # Display example usage in epilog
+ cache_epilog = textwrap.dedent("""\
+ {2}Example:{3}
+ # See what software containers
+ # will be cached locally
+ {0} cache --dry-run --sif-cache /data/$USER/cache
+
+ # Cache software containers
+ {0} cache --sif-cache /data/$USER/cache
+
+    {2}Version:{3}
+ {1}
+ """.format(name, __version__, c.bold, c.end))
+
+    # Suppressing help message of required args
+ # to overcome no sub-parser named groups
+ subparser_cache = subparsers.add_parser(
+ 'cache',
+ help = 'Cache software containers locally.',
+ usage = argparse.SUPPRESS,
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description = required_cache_options,
+ epilog = cache_epilog,
+ add_help = False
+ )
+
+ # Required Arguments
+ # Output Directory (analysis working directory)
+ subparser_cache.add_argument(
+ '--sif-cache',
+ type = lambda option: os.path.abspath(os.path.expanduser(option)),
+ required = True,
+ help = argparse.SUPPRESS
+ )
+
+ # Optional Arguments
+ # Dry-run cache command (do not pull any remote resources)
+ subparser_cache.add_argument(
+ '--dry-run',
+ action = 'store_true',
+ required = False,
+ default = False,
+ help=argparse.SUPPRESS
+ )
+
+ # Add custom help message
+ subparser_cache.add_argument(
+ '-h', '--help',
+ action='help',
+ help=argparse.SUPPRESS
+ )
+
+ # Define handlers for each sub-parser
+ subparser_run.set_defaults(func = run)
+ subparser_unlock.set_defaults(func = unlock)
+ subparser_install.set_defaults(func = install)
+ subparser_cache.set_defaults(func = cache)
+
+ # Parse command-line args
+ args = parser.parse_args()
+ return args
+
+
+def main():
+
+ # Sanity check for usage
+ if len(sys.argv) == 1:
+ # Nothing was provided
+ fatal('Invalid usage: {} [-h] [--version] ...'.format(_name))
+
+ # Collect args for sub-command
+ args = parsed_arguments(
+ name = _name,
+ description = _description
+ )
+
+ # Display version information
+ err('{} ({})'.format(_name, __version__))
+
+ # Mediator method to call sub-command's set handler function
+ args.func(args)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/resources/README.md b/resources/README.md
new file mode 100644
index 0000000..2ae24af
--- /dev/null
+++ b/resources/README.md
@@ -0,0 +1,5 @@
+### Resources
+
+This folder, `resources/`, is meant to contain all resources necessary for running the workflow. These can be small reference files, such as reference sequences or small databases. This directory also contains utility scripts or wrappers to help facilitate running the pipeline.
+
+Whenever feasible, they can also be downloaded programmatically via rules defined in the pipeline.
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..0a5233c
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1,16 @@
+import os, sys
+# Makes relative imports work in Python 3.6
+# without the need of '.' before the name of the
+# package or py file.
+# Allows for consistent syntax of relative imports
+# across python2 and python3.
+here = os.path.dirname(os.path.realpath(__file__))
+sys.path.append(here)
+
+# Ground source of truth for version information
+try:
+ # Import from root of project directory
+ version = open(os.path.join(here, 'VERSION'), 'r').readlines()[0].strip()
+except IOError:
+ # When namespace is __main__
+ version = open(os.path.join(here, '..', 'VERSION'), 'r').readlines()[0].strip()
diff --git a/src/bundle.sh b/src/bundle.sh
new file mode 100755
index 0000000..a4dadb7
--- /dev/null
+++ b/src/bundle.sh
@@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+usage="Usage: $0 "
+
+function err() { cat <<< "$@" 1>&2; }
+function fatal() { cat <<< "$@" 1>&2; err "$usage"; exit 1; }
+function abspath() { readlink -f "$1"; }
+function bundle() { tar -hczvf "$1" "$2"; }
+
+
+function check() {
+ # Checks command-line usage for required positional arguments.
+    # @INPUT $1 = Path on the filesystem to archive into a tarball
+    # @INPUT $2 = Name of the output tarball to create
+ # @CALLS fatal() with incorrect usage
+
+ die=false
+ # Path to archive
+ if [ -z "${1:-}" ]; then
+ die=true
+ err "Error: Failed to provide directory to archive."
+ fi
+ # Output tarball name
+ if [ -z "${2:-}" ]; then
+ die=true
+ err "Error: Failed to output file name for archive."
+ fi
+ if $die; then
+ fatal "Fatal: Please try again after providing the required arguments!"
+ fi
+}
+
+
+function chunk() {
+ # Splits resulting tarball into N 15MB chunks
+    # @INPUT $1 = Name of the output tarball
+ # @CALLS fatal() if provided a non-supported archive
+
+ # Strip archive file extension,
+ # common tarball extensions: .tar.gz or .tgz
+ prefix=''
+    if [ -f "$1" ] ; then
+ case $1 in
+ *.tar.gz) prefix="${1%.tar.gz}" ;;
+ *.tgz) prefix="${1%.tgz}" ;;
+ esac
+ else
+        fatal "'$1' is not a supported file type"
+ fi
+
+    # Split file into N 15MB chunk files
+ split --numeric-suffixes=1 -b 15M "$1" "${prefix}_chunk-"
+
+ # Calculate MD5 of all the chunks
+ md5sum "${prefix}_chunk-"* > "${prefix}_chunks.md5"
+}
+
+
+function main() {
+ # Checks for required positional
+ # command line arguments
+ check "${1:-}" "${2:-}"
+
+ # Converts any relative paths to
+    # absolute paths, creates missing
+ # output directories as needed,
+ # runs tar command in parent dir
+ # of the provided resource bundle
+ # path.
+ archive_dir=$(abspath "$1")
+ archive=$(basename "${archive_dir%/}")
+ parent_dir=$(dirname "$archive_dir")
+ output_dir=$(dirname "$2")
+ mkdir -p "$output_dir"
+ output_dir=$(abspath "$output_dir")
+
+ cd "$parent_dir"
+
+ # Create archive as a tarball
+ echo "Creating tarball... $2"
+ bundle "$2" "$archive"
+
+ # Splitting tarball into N
+ # chunks for fast parallel
+ # downloading of large
+ # resource bundles
+ echo "Chunking tarball... $2"
+ chunk "$2"
+}
+
+
+main "$@"
diff --git a/src/cache.sh b/src/cache.sh
new file mode 100755
index 0000000..069b4fa
--- /dev/null
+++ b/src/cache.sh
@@ -0,0 +1,231 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+function usage() { cat << EOF
+cache.sh: Wrapper script for caching remote software containers.
+USAGE:
+  cache.sh [OPTIONS] -s <SIF_CACHE> -i <IMAGE_URIS>
+
+SYNOPSIS:
+ This main process dictates how subsequent software containers are
+pulled onto the cluster's local filesystem. cache.sh pulls containers
+from Dockerhub locally. Docker images are converted on the fly into
+singularity image format.
+ The main entry point of the pipeline calls this wrapper script.
+As such, this script can also be used manually to by-pass the pipeline
+for a previously failed cache or for debugging purposes.
+
+Required Positional Argument:
+ [1] MODE [Type: Str] Defines the mode of execution. More methods
+ can be added later. Valid mode options include:
+ a) local: uses singularity and local compute.
+
+Required Arguments:
+ -s, --sif-cache [Type: Path] Path to output directory to cache
+ software containers, i.e. SIFs.
+
+ -i, --image-uris [Type: Str] Image(s) to pull from Dockerhub.
+                               Multiple images are separated by
+ a comma.
+OPTIONS:
+ -t, --tmp-dir [Type: Path] Path to singularity temp directory.
+ Singularity uses this directory when
+ images are pulled from DockerHub and
+                               converted into SIFs. If not provided,
+ the location to the temp dir will
+ default to the following location:
+ "/tmp/$USER/SIFs/.singularity/".
+
+ -h, --help [Type: Bool] Displays usage and help information.
+
+Example:
+ $ cache.sh local \\
+ -s $PWD/$USER/SIFs \\
+ -t $PWD/$USER/SIFs/.singularity \\
+ -i 'docker://nciccbr/ccbr_arriba_2.0.0:v0.0.1,docker://nciccbr/ccbr_rna:v0.0.1'
+Version:
+ 0.2.0
+EOF
+}
+
+
+# Functions
+function err() { cat <<< "$@" 1>&2; }
+function fatal() { cat <<< "$@" 1>&2; usage; exit 1; }
+function abspath() { readlink -e "$1"; }
+function parser() {
+ # Adds parsed command-line args to GLOBAL $Arguments associative array
+ # + KEYS = short_cli_flag ("j", "o", ...)
+ # + VALUES = parsed_user_value ("MasterJobName" "/scratch/hg38", ...)
+ # @INPUT "$@" = user command-line arguments
+ # @CALLS check() to see if the user provided all the required arguments
+
+ while [[ $# -gt 0 ]]; do
+ key="$1"
+ case $key in
+ -h | --help) usage && exit 0;;
+ -s | --sif-cache) provided "$key" "${2:-}"; Arguments["s"]="$2"; shift; shift;;
+ -i | --image-uris) provided "$key" "${2:-}"; Arguments["i"]="$2"; shift; shift;;
+ -t | --tmp-dir) provided "$key" "${2:-}"; Arguments["t"]="$2"; shift; shift;;
+ -* | --*) err "Error: Failed to parse unsupported argument: '${key}'."; usage && exit 1;;
+ *) err "Error: Failed to parse unrecognized argument: '${key}'. Do any of your inputs have spaces?"; usage && exit 1;;
+ esac
+ done
+
+ # Check for required args
+ check
+}
+
+
+function provided() {
+ # Checks to see if the argument's value exists
+ # @INPUT $1 = name of user provided argument
+ # @INPUT $2 = value of user provided argument
+ # @CALLS fatal() if value is empty string or NULL
+
+ if [[ -z "${2:-}" ]]; then
+ fatal "Fatal: Failed to provide value to '${1}'!";
+ fi
+}
+
+
+function check(){
+ # Checks to see if user provided required arguments
+ # @INPUTS $Arguments = Global Associative Array
+ # @CALLS fatal() if user did NOT provide all the $required args
+
+ # List of required arguments
+ local required=("s" "i")
+ #echo -e "Provided Required Inputs"
+ for arg in "${required[@]}"; do
+ value=${Arguments[${arg}]:-}
+ if [[ -z "${value}" ]]; then
+ fatal "Failed to provide all required args.. missing ${arg}"
+ fi
+ done
+}
+
+
+function retry() {
+ # Tries to run a cmd 5 times before failing
+ # If a command is successful, it will break out of attempt loop
+    # Failed attempts are padded with the following exponential
+ # back-off strategy {4, 16, 64, 256, 1024} in seconds
+    # @INPUTS "$@" = cmd to run
+ # @CALLS fatal() if command cannot be run in 5 attempts
+ local n=1
+ local max=5
+ local attempt=true # flag for while loop
+ while $attempt; do
+ # Attempt command and break if successful
+ "$@" && attempt=false || {
+ # Try again up to 5 times
+ if [[ $n -le $max ]]; then
+ err "Command failed: $@"
+ delay=$(( 4**$n ))
+ err "Attempt: ${n}/${max}. Trying again in ${delay} seconds!\n"
+ sleep $delay;
+ ((n++))
+ else
+ fatal "Fatal: the command has failed after max attempts!"
+ fi
+ }
+ done
+}
+
+
+function _pull(){
+ # Caches a remote image from DockerHub
+ # INPUT $1 = Snakemake Mode of execution
+ # INPUT $2 = Cache output directory
+ # INPUT $3 = Singularity temp directory
+ # INPUT $4 = Images to pull from DockerHub
+
+ # Check if singularity in $PATH
+ # If not, try to module load singularity as a last resort
+ command -V singularity &> /dev/null || {
+ command -V module &> /dev/null &&
+ module purge && module load singularity
+ } || fatal "Fail to find or load 'singularity', not installed on target system."
+
+ # Execution method, currently pulls
+ # from local compute, in the future
+ # options can be added to submit a
+ # to different job schedulers, like
+ # PBS or SLURM, etc
+ executor=${1}
+
+    # Go to the pipeline output directory
+ # Create a local singularity cache in output directory
+    # cache can be re-used instead of re-pulling from DockerHub every time
+ cd "$2" && export SINGULARITY_CACHEDIR="${3}"
+
+    # unsetting XDG_RUNTIME_DIR to avoid some unsightly but harmless warnings
+ unset XDG_RUNTIME_DIR
+
+ # Run the workflow with specified executor
+ case "$executor" in
+ local)
+ # Create directory for logfiles
+ for image in ${4//,/$'\t'}; do
+ # Try to pull image from URI with 5 max attempt
+ echo "Singularity pulling ${image}"
+ retry singularity pull -F ${image}
+ done
+ ;;
+ *) echo "${executor} is not available." && \
+ fatal "Failed to provide valid execution backend: ${executor}. Please use local."
+ ;;
+ esac
+}
+
+
+function main(){
+ # Parses args and pulls remote resources
+ # @INPUT "$@" = command-line arguments
+ # @CALLS pull()
+
+ if [ $# -eq 0 ]; then usage; exit 1; fi
+
+ # Associative array to store parsed args
+ declare -Ag Arguments
+
+ # Positional Argument for Executor
+ case $1 in
+ local) Arguments["e"]="$1";;
+ -h | --help | help) usage && exit 0;;
+ -* | --*) err "Error: Failed to provide required positional argument: ."; usage && exit 1;;
+ *) err "Error: Failed to provide valid positional argument. '${1}' is not supported. Valid option(s) are local"; usage && exit 1;;
+ esac
+
+ # Parses remaining user provided command-line arguments
+ parser "${@:2}" # Remove first item of list
+ mkdir -p "${Arguments[s]}"
+ cache=$(abspath "${Arguments[s]}")
+ dockers="${Arguments[i]}"
+
+ # Setting defaults for non-required arguments
+ tmp="${Arguments[t]:-/tmp/$USER/SIFs/.singularity/}"
+    # Sanitize single quoted input, SLURM_JOB_ID may be unset outside of a slurm job
+    tmp=$(echo "$tmp" | sed "s/\${SLURM_JOB_ID}/${SLURM_JOB_ID:-}/g" | sed "s/\$SLURM_JOB_ID/${SLURM_JOB_ID:-}/g")
+ mkdir -p "$tmp"
+ tmp=$(abspath "${tmp}")
+
+ # Print cli options prior to running
+ echo -e "cache.sh \t$(date)"
+ echo -e "Running with the following parameters:"
+ for key in "${!Arguments[@]}"; do echo -e "\t${key}\t${Arguments["$key"]}"; done
+
+ # Pull software containers into SIF cache
+ # Cache remote image from DockerHub
+ # INPUT $1 = Snakemake Mode of execution
+ # INPUT $2 = Cache output directory
+ # INPUT $3 = Singularity temp directory
+ # INPUT $4 = Images to pull from DockerHub
+ _pull "${Arguments[e]}" "$cache" "$tmp" "$dockers"
+
+}
+
+
+# Main: check usage, parse args, and run pipeline
+main "$@"
\ No newline at end of file
diff --git a/src/download.py b/src/download.py
new file mode 100755
index 0000000..7aa0ac3
--- /dev/null
+++ b/src/download.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+
+# Python standard library
+from __future__ import print_function
+from concurrent.futures import ThreadPoolExecutor
+import time, argparse, os, sys, shutil
+
+# Third-party pypi packages
+# pip install as needed
+import requests
+
+# Local imports
+from utils import (
+ fatal,
+ exists,
+ err,
+ md5sum
+)
+
+
+# Constants
+__version__ = 'v0.1.0'
+__author__ = 'Skyler Kuhn'
+
+# Functions
+def retry(times=5, exceptions=(Exception)):
+ """
+ Decorator to retry running a function. Retries the wrapped function
+    N times with an exponential backoff strategy. A tuple of Exceptions
+ can be passed that trigger a retry attempt. When times is equal to
+ 4 the back-off strategy will be {4, 16, 64, 256} seconds. Calls fatal
+ if the function cannot be run within the defined number of times.
+ @param times :
+ The number of times to repeat the wrapped function,
+ default: 5
+ @param exceptions tuple():
+ Tuple of Python Exceptions that will trigger a retry attempt,
+ default: (Exception)
+ @return