From f1f633c3e86eb066bc7964eebc029e6727bfe798 Mon Sep 17 00:00:00 2001 From: kapsakcj Date: Fri, 27 Oct 2023 09:32:13 -0400 Subject: [PATCH 1/2] adds dockerfile for pangolin 4.3.1 and pdata 1.23.1. also adds tool-specific readme and updates the main README.md table --- README.md | 2 +- pangolin/4.3.1-pdata-1.23.1/Dockerfile | 165 +++++++++++++++++++++++++ pangolin/4.3.1-pdata-1.23.1/README.md | 61 +++++++++ 3 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 pangolin/4.3.1-pdata-1.23.1/Dockerfile create mode 100644 pangolin/4.3.1-pdata-1.23.1/README.md diff --git a/README.md b/README.md index a64c99cc2..bc6dfe473 100644 --- a/README.md +++ b/README.md @@ -189,7 +189,7 @@ To learn more about the docker pull rate limits and the open source software pro | [NCBI table2asn](https://hub.docker.com/r/staphb/ncbi-table2asn)
[![docker pulls](https://badgen.net/docker/pulls/staphb/ncbi-table2asn)](https://hub.docker.com/r/staphb/ncbi-table2asn) | | [https://www.ncbi.nlm.nih.gov/genbank/table2asn/](https://www.ncbi.nlm.nih.gov/genbank/table2asn/)
[https://ftp.ncbi.nlm.nih.gov/asn1-converters/versions/2022-06-14/by_program/table2asn/](https://ftp.ncbi.nlm.nih.gov/asn1-converters/versions/2022-06-14/by_program/table2asn/) | | [OrthoFinder](https://hub.docker.com/r/staphb/OrthoFinder)
[![docker pulls](https://badgen.net/docker/pulls/staphb/orthofinder)](https://hub.docker.com/r/staphb/orthofinder) | | https://github.com/davidemms/OrthoFinder | | [Panaroo](https://hub.docker.com/r/staphb/panaroo)
[![docker pulls](https://badgen.net/docker/pulls/staphb/panaroo)](https://hub.docker.com/r/staphb/panaroo) | | https://github.com/gtonkinhill/panaroo | -| [Pangolin](https://hub.docker.com/r/staphb/pangolin)
[![docker pulls](https://badgen.net/docker/pulls/staphb/pangolin)](https://hub.docker.com/r/staphb/pangolin) |
Click to see Pangolin v4.2 and older versions! **Pangolin version & pangoLEARN data release date** **Pangolin version & pangolin-data version**
**Pangolin version & pangolin-data version** | https://github.com/cov-lineages/pangolin
https://github.com/cov-lineages/pangoLEARN
https://github.com/cov-lineages/pango-designation
https://github.com/cov-lineages/scorpio
https://github.com/cov-lineages/constellations
https://github.com/cov-lineages/lineages (archived)
https://github.com/hCoV-2019/pangolin (archived) | +| [Pangolin](https://hub.docker.com/r/staphb/pangolin)
[![docker pulls](https://badgen.net/docker/pulls/staphb/pangolin)](https://hub.docker.com/r/staphb/pangolin) |
Click to see Pangolin v4.2 and older versions! **Pangolin version & pangoLEARN data release date** **Pangolin version & pangolin-data version**
**Pangolin version & pangolin-data version** | https://github.com/cov-lineages/pangolin
https://github.com/cov-lineages/pangoLEARN
https://github.com/cov-lineages/pango-designation
https://github.com/cov-lineages/scorpio
https://github.com/cov-lineages/constellations
https://github.com/cov-lineages/lineages (archived)
https://github.com/hCoV-2019/pangolin (archived) | | [parallel-perl](https://hub.docker.com/r/staphb/parallel-perl)
[![docker pulls](https://badgen.net/docker/pulls/staphb/parallel-perl)](https://hub.docker.com/r/staphb/parallel-perl) | | https://www.gnu.org/software/parallel | | [pasty](https://hub.docker.com/r/staphb/pasty)
[![docker pulls](https://badgen.net/docker/pulls/staphb/pasty)](https://hub.docker.com/r/staphb/pasty) | | https://github.com/rpetit3/pasty | | [pbptyper](https://hub.docker.com/r/staphb/pbptyper)
[![docker pulls](https://badgen.net/docker/pulls/staphb/pbptyper)](https://hub.docker.com/r/staphb/pbptyper) | | https://github.com/rpetit3/pbptyper | diff --git a/pangolin/4.3.1-pdata-1.23.1/Dockerfile b/pangolin/4.3.1-pdata-1.23.1/Dockerfile new file mode 100644 index 000000000..19f89a341 --- /dev/null +++ b/pangolin/4.3.1-pdata-1.23.1/Dockerfile @@ -0,0 +1,165 @@ +FROM mambaorg/micromamba:1.5.1 as app + +# build and run as root users since micromamba image has 'mambauser' set as the $USER +USER root +# set workdir to default for building; set to /data at the end +WORKDIR / + +# ARG variables only persist during build time +# had to include the v for some of these due to GitHub tags. +# using pangolin-data github tag, NOT what is in the GH release title "v1.2.133" +ARG PANGOLIN_VER="v4.3.1" +ARG PANGOLIN_DATA_VER="v1.23.1" +ARG SCORPIO_VER="v0.3.19" +ARG CONSTELLATIONS_VER="v0.1.12" +ARG USHER_VER="0.6.2" + +# metadata labels +LABEL base.image="mambaorg/micromamba:1.5.1" +LABEL dockerfile.version="1" +LABEL software="pangolin" +LABEL software.version=${PANGOLIN_VER} +LABEL description="Conda environment for Pangolin. Pangolin: Software package for assigning SARS-CoV-2 genome sequences to global lineages." 
+LABEL website="https://github.com/cov-lineages/pangolin" +LABEL license="GNU General Public License v3.0" +LABEL license.url="https://github.com/cov-lineages/pangolin/blob/master/LICENSE.txt" +LABEL maintainer="Curtis Kapsak" +LABEL maintainer.email="kapsakcj@gmail.com" + +# install dependencies; cleanup apt garbage +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget \ + ca-certificates \ + git \ + procps \ + bsdmainutils && \ + apt-get autoclean && rm -rf /var/lib/apt/lists/* + +# get the pangolin repo +RUN wget "https://github.com/cov-lineages/pangolin/archive/${PANGOLIN_VER}.tar.gz" && \ + tar -xf ${PANGOLIN_VER}.tar.gz && \ + rm -v ${PANGOLIN_VER}.tar.gz && \ + mv -v pangolin-* pangolin + +# set the environment; PATH is unnecessary here, but leaving anyways. It's reset later in dockerfile +ENV PATH="$PATH" \ + LC_ALL=C.UTF-8 + +# modify environment.yml to pin specific versions during install +# create the conda environment using modified environment.yml +RUN sed -i "s|usher.*|usher=${USHER_VER}|" /pangolin/environment.yml && \ + sed -i "s|scorpio.git|scorpio.git@${SCORPIO_VER}|" /pangolin/environment.yml && \ + sed -i "s|pangolin-data.git|pangolin-data.git@${PANGOLIN_DATA_VER}|" /pangolin/environment.yml && \ + sed -i "s|constellations.git|constellations.git@${CONSTELLATIONS_VER}|" /pangolin/environment.yml && \ + micromamba create -n pangolin -y -f /pangolin/environment.yml + +# so that mamba/conda env is active when running below commands +ENV ENV_NAME="pangolin" +ARG MAMBA_DOCKERFILE_ACTIVATE=1 + +WORKDIR /pangolin + +# run pip install step (--no-cache-dir keeps pip's download cache out of the image layer); download optional pre-computed assignment hashes for UShER (useful for running on large batches of samples) +# best to skip using the assignment-cache if running on one sample for speed +# print versions +RUN pip install --no-cache-dir . 
&& \ + pangolin --add-assignment-cache && \ + micromamba clean -a -y && \ + mkdir /data && \ + pangolin --all-versions && \ + usher --version + +WORKDIR /data + +# hardcode pangolin executable into the PATH variable +ENV PATH="${PATH}:/opt/conda/envs/pangolin/bin/" + +# default command is to pull up help options for pangolin; can be overridden of course +CMD ["pangolin", "-h"] + +# new base for testing +FROM app as test + +# so that mamba/conda env is active when running below commands +ENV ENV_NAME="pangolin" +ARG MAMBA_DOCKERFILE_ACTIVATE=1 + +# test on test sequences supplied with Pangolin code +RUN pangolin /pangolin/pangolin/test/test_seqs.fasta --analysis-mode usher -o /data/test_seqs-output-pusher && \ + column -t -s, /data/test_seqs-output-pusher/lineage_report.csv + +# test functionality of assignment-cache option +RUN pangolin --use-assignment-cache /pangolin/pangolin/test/test_seqs.fasta + +# download B.1.1.7 genome from Utah +ADD https://raw.githubusercontent.com/StaPH-B/docker-builds/master/tests/SARS-CoV-2/SRR13957123.consensus.fa /test-data/SRR13957123.consensus.fa + +# test on a B.1.1.7 genome +RUN pangolin /test-data/SRR13957123.consensus.fa --analysis-mode usher -o /test-data/SRR13957123-pusher && \ + column -t -s, /test-data/SRR13957123-pusher/lineage_report.csv + + # install unzip for unzipping zip archive from NCBI; apt lists are removed in the same layer +RUN apt-get update && apt-get install -y --no-install-recommends unzip && rm -rf /var/lib/apt/lists/* + +# install ncbi datasets tool (pre-compiled binary); place in $PATH +RUN wget https://ftp.ncbi.nlm.nih.gov/pub/datasets/command-line/LATEST/linux-amd64/datasets && \ + chmod +x datasets && \ + mv -v datasets /usr/local/bin + +# download assembly for a BA.1 from Florida (https://www.ncbi.nlm.nih.gov/biosample?term=SAMN29506515 and https://www.ncbi.nlm.nih.gov/nuccore/ON924087) +# run pangolin in usher analysis mode +RUN datasets download virus genome accession ON924087.1 --filename ON924087.1.zip && \ + unzip ON924087.1.zip && rm ON924087.1.zip && \ + mv 
-v ncbi_dataset/data/genomic.fna ON924087.1.genomic.fna && \ + rm -vr ncbi_dataset/ README.md && \ + pangolin ON924087.1.genomic.fna --analysis-mode usher -o ON924087.1-usher && \ + column -t -s, ON924087.1-usher/lineage_report.csv + +# test specific for new lineage, XBB.1.16, introduced in pangolin-data v1.19 +# using this assembly: https://www.ncbi.nlm.nih.gov/nuccore/2440446687 +# biosample here: https://www.ncbi.nlm.nih.gov/biosample?term=SAMN33060589 +# one of the samples included in initial pango-designation here: https://github.com/cov-lineages/pango-designation/issues/1723 +RUN datasets download virus genome accession OQ381818.1 --filename OQ381818.1.zip && \ + unzip OQ381818.1.zip && rm OQ381818.1.zip && \ + mv -v ncbi_dataset/data/genomic.fna OQ381818.1.genomic.fna && \ + rm -vr ncbi_dataset/ README.md && \ + pangolin OQ381818.1.genomic.fna --analysis-mode usher -o OQ381818.1-usher && \ + column -t -s, OQ381818.1-usher/lineage_report.csv + +# testing another XBB.1.16, trying to test scorpio functionality. Want pangolin to NOT assign lineage based on pango hash match. 
+# this test runs as expected, uses scorpio to check for constellation of mutations, then assign using PUSHER placement +RUN datasets download virus genome accession OR177999.1 --filename OR177999.1.zip && \ +unzip OR177999.1.zip && rm OR177999.1.zip && \ +mv -v ncbi_dataset/data/genomic.fna OR177999.1.genomic.fna && \ +rm -vr ncbi_dataset/ README.md && \ +pangolin OR177999.1.genomic.fna --analysis-mode usher -o OR177999.1-usher && \ +column -t -s, OR177999.1-usher/lineage_report.csv + + ## test for BA.2.86 + # virus identified in MI: https://www.ncbi.nlm.nih.gov/nuccore/OR461132.1 +RUN datasets download virus genome accession OR461132.1 --filename OR461132.1.zip && \ +unzip OR461132.1.zip && rm OR461132.1.zip && \ +mv -v ncbi_dataset/data/genomic.fna OR461132.1.genomic.fna && \ +rm -vr ncbi_dataset/ README.md && \ +pangolin OR461132.1.genomic.fna --analysis-mode usher -o OR461132.1-usher && \ +column -t -s, OR461132.1-usher/lineage_report.csv + + ## test for JN.2 (BA.2.86 sublineage) JN.2 is an alias of B.1.1.529.2.86.1.2 + # NY CDC Quest sample: https://www.ncbi.nlm.nih.gov/nuccore/OR598183 +RUN datasets download virus genome accession OR598183.1 --filename OR598183.1.zip && \ +unzip OR598183.1.zip && rm OR598183.1.zip && \ +mv -v ncbi_dataset/data/genomic.fna OR598183.1.genomic.fna && \ +rm -vr ncbi_dataset/ README.md && \ +pangolin OR598183.1.genomic.fna --analysis-mode usher -o OR598183.1-usher && \ +column -t -s, OR598183.1-usher/lineage_report.csv + +## test for JQ.1 (BA.2.86.3 sublineage); JQ.1 is an alias of B.1.1.529.2.86.3.1 +# THANK YOU ERIN AND UPHL!! 
https://www.ncbi.nlm.nih.gov/nuccore/OR716684 +# this test is important due to the fact that this lineage was included in the UShER tree, despite being designated after the pangolin-designation 1.23 release +# it previously caused an error/bug in pangolin, but is now fixed +RUN datasets download virus genome accession OR716684.1 --filename OR716684.1.zip && \ +unzip OR716684.1.zip && rm OR716684.1.zip && \ +mv -v ncbi_dataset/data/genomic.fna OR716684.1.genomic.fna && \ +rm -vr ncbi_dataset/ README.md && \ +pangolin OR716684.1.genomic.fna --analysis-mode usher -o OR716684.1-usher && \ +column -t -s, OR716684.1-usher/lineage_report.csv \ No newline at end of file diff --git a/pangolin/4.3.1-pdata-1.23.1/README.md b/pangolin/4.3.1-pdata-1.23.1/README.md new file mode 100644 index 000000000..80028d6c0 --- /dev/null +++ b/pangolin/4.3.1-pdata-1.23.1/README.md @@ -0,0 +1,61 @@ +# pangolin docker image + +Main tool : [pangolin](https://github.com/cov-lineages/pangolin) + +Full documentation: [https://cov-lineages.org/resources/pangolin.html](https://cov-lineages.org/resources/pangolin.html) + +Phylogenetic Assignment of Named Global Outbreak LINeages + +Additional tools: + +- [pangolin-data](https://github.com/cov-lineages/pangolin-data) 1.23.1 +- [pangolin-assignment](https://github.com/cov-lineages/pangolin-assignment) 1.23.1 +- [minimap2](https://github.com/lh3/minimap2) 2.26-r1175 +- [usher](https://github.com/yatisht/usher) 0.6.2 +- [faToVcf](https://github.com/yatisht/usher) 448 +- [scorpio](https://github.com/cov-lineages/scorpio) 0.3.19 +- [constellations](https://github.com/cov-lineages/constellations) 0.1.12 +- [gofasta](https://github.com/virus-evolution/gofasta) 1.2.1 +- [mafft](https://mafft.cbrc.jp/alignment/software/) 7.520 +- python 3.8.18 + +## Rare bug warning :warning: + +This docker image contains `usher` version 0.6.2. 
This version of usher has a bug that can cause some lineage A samples to be assigned to A.* sublineages or even B or B.* sublineages. + +If you are running `pangolin` on early 2020 sequences that may be lineage A, it is highly recommended to use the assignment cache (and upgrade to usher 0.6.3 when it is available) [See the pangolin-assignment v1.23 release page for more details](https://github.com/cov-lineages/pangolin-assignment/releases/tag/v1.23 + +We plan to upgrade `usher` to >=0.6.3 in the next release of this docker image. + +## pangoLEARN deprecation + +As of pangolin version 4.3, pangoLEARN mode has been deprecated. [More info can be found here on the v4.3 release page.](https://github.com/cov-lineages/pangolin/releases/tag/v4.3) + +> If `--analysis-mode fast` or `--analysis-mode pangolearn` is given, pangolin v4.3 will print out a warning and use UShER mode instead, unless `--datadir` is also given specifying a directory with pangoLEARN model files. The next release of pangolin-data (v1.20) will no longer include the model files which have not been updated since v1.18. + +This docker image contains `pangolin-data` v1.23.1. The pangoLEARN model has not been updated since pangolin-data version 1.18. Only the underlying UShER tree/protobuf file will be maintained for the foreseeable future. + +**Please use the UShER mode of pangolin if you want to stay up-to-date with the most recent lineages.** [See pangolin-data release notes here for more details](https://github.com/cov-lineages/pangolin-data/releases) + +## Example Usage + +```bash +# run Pangolin in the default mode (usher). 
Can optionally supply --analysis-mode usher +$ pangolin /pangolin/pangolin/test/test_seqs.fasta -o /data/test_seqs-output-pusher + +# view the output CSV +$ column -t -s, /data/test_seqs-output-pusher/lineage_report.csv +taxon lineage conflict ambiguity_score scorpio_call scorpio_support scorpio_conflict scorpio_notes version pangolin_version scorpio_version constellation_version is_designated qc_status qc_notes note +India seq B.1.617.1 0.0 B.1.617.1-like 1.0 0.0 scorpio call: Alt alleles 11; Ref alleles 0; Amb alleles 0; Oth alleles 0 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.02 Usher placements: B.1.617.1(1/1) +b117 B.1.1.7 0.0 Alpha (B.1.1.7-like) 0.91 0.04 scorpio call: Alt alleles 21; Ref alleles 1; Amb alleles 1; Oth alleles 0 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.02 Usher placements: B.1.1.7(2/2) +outgroup_A A 0.0 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.02 Usher placements: A(1/1) +issue_57_torsten_seq Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map +This_seq_has_6000_Ns_in_18000_bases Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map +This_seq_has_no_seq Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map +This_seq_is_too_short Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail Ambiguous_content:0.9 +This_seq_has_lots_of_Ns Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail Ambiguous_content:0.98 +This_seq_is_literally_just_N Unassigned PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False fail failed to map +Japan_seq B 0.0 PANGO-v1.16 4.1.3 0.3.17 v0.1.10 True pass Ambiguous_content:0.02 Assigned from designation hash. +USA_seq B.1.314 0.0 PANGO-v1.16 4.1.3 0.3.17 v0.1.10 True pass Ambiguous_content:0.02 Assigned from designation hash. 
+Unassigned_omicron_seq BA.1 0.0 Probable Omicron (BA.1-like) 0.71 0.08 scorpio call: Alt alleles 42; Ref alleles 5; Amb alleles 9; Oth alleles 3 PUSHER-v1.16 4.1.3 0.3.17 v0.1.10 False pass Ambiguous_content:0.03 Usher placements: BA.1(1/1) +``` From 99bdc19538c0d6173220e40d4fb8ae2527060923 Mon Sep 17 00:00:00 2001 From: kapsakcj Date: Fri, 27 Oct 2023 09:47:49 -0400 Subject: [PATCH 2/2] fix link on the last 2 pangolin readme's --- pangolin/4.3.1-pdata-1.23.1/README.md | 2 +- pangolin/4.3.1-pdata-1.23/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pangolin/4.3.1-pdata-1.23.1/README.md b/pangolin/4.3.1-pdata-1.23.1/README.md index 80028d6c0..073661a3a 100644 --- a/pangolin/4.3.1-pdata-1.23.1/README.md +++ b/pangolin/4.3.1-pdata-1.23.1/README.md @@ -23,7 +23,7 @@ Additional tools: This docker image contains `usher` version 0.6.2. This version of usher has a bug that can cause some lineage A samples to be assigned to A.* sublineages or even B or B.* sublineages. -If you are running `pangolin` on early 2020 sequences that may be lineage A, it is highly recommended to use the assignment cache (and upgrade to usher 0.6.3 when it is available) [See the pangolin-assignment v1.23 release page for more details](https://github.com/cov-lineages/pangolin-assignment/releases/tag/v1.23 +If you are running `pangolin` on early 2020 sequences that may be lineage A, it is highly recommended to use the assignment cache (and upgrade to usher 0.6.3 when it is available) [See the pangolin-assignment v1.23 release page for more details](https://github.com/cov-lineages/pangolin-assignment/releases/tag/v1.23) We plan to upgrade `usher` to >=0.6.3 in the next release of this docker image. 
diff --git a/pangolin/4.3.1-pdata-1.23/README.md b/pangolin/4.3.1-pdata-1.23/README.md index 2b1f5dd60..80297285c 100644 --- a/pangolin/4.3.1-pdata-1.23/README.md +++ b/pangolin/4.3.1-pdata-1.23/README.md @@ -23,7 +23,7 @@ Additional tools: This docker image contains `usher` version 0.6.2. This version of usher has a bug that can cause some lineage A samples to be assigned to A.* sublineages or even B or B.* sublineages. -If you are running `pangolin` on early 2020 sequences that may be lineage A, it is highly recommended to use the assignment cache (and upgrade to usher 0.6.3 when it is available) [See the pangolin-assignment v1.23 release page for more details](https://github.com/cov-lineages/pangolin-assignment/releases/tag/v1.23 +If you are running `pangolin` on early 2020 sequences that may be lineage A, it is highly recommended to use the assignment cache (and upgrade to usher 0.6.3 when it is available) [See the pangolin-assignment v1.23 release page for more details](https://github.com/cov-lineages/pangolin-assignment/releases/tag/v1.23) We plan to upgrade `usher` to >=0.6.3 in the next release of this docker image.