Kmer count + Dimensionality reduction #40

Merged: 97 commits, Apr 4, 2024

Commits
25d5d64
Add kmer count modules
weaglesBio Oct 16, 2023
f3bedf9
Add to main workflow
weaglesBio Oct 17, 2023
a311d1e
Add kmer count modules
weaglesBio Oct 16, 2023
af871fe
Add to main workflow
weaglesBio Oct 17, 2023
1b51964
Add parameters
weaglesBio Oct 25, 2023
e29612f
Merge changes
weaglesBio Oct 25, 2023
3e67c80
Handle one line csv
weaglesBio Oct 30, 2023
aacc15c
Add kmer count modules
weaglesBio Oct 16, 2023
ca54275
Combine embeddings csv
weaglesBio Nov 6, 2023
1e9852b
Fix combine
weaglesBio Nov 17, 2023
927b1a3
Adding organellar blast subworkflow
DLBPointon Oct 9, 2023
0e7850a
Updates
DLBPointon Oct 13, 2023
5c6b6db
Completing organelle blast, modified python script to accept arrayLis…
DLBPointon Oct 13, 2023
fb15428
Black linting
DLBPointon Oct 13, 2023
e5526fd
Uncommenting BLAST for testing
DLBPointon Oct 13, 2023
bac7913
Updates
DLBPointon Oct 18, 2023
b36c554
Updates for Organella Blast output checking
DLBPointon Oct 19, 2023
c7da079
Local update for diamond blast
DLBPointon Oct 19, 2023
b2e23a2
Updated to filter out empties
DLBPointon Oct 19, 2023
5bc2f95
Fixes for blast subworkflows
DLBPointon Oct 26, 2023
a761e62
Added a blast module that does not rely on makeblastdb
DLBPointon Oct 26, 2023
65614f3
Fixes to better allow running on github
DLBPointon Oct 26, 2023
c60721d
Updating modules
DLBPointon Nov 1, 2023
f9e65d2
Updating modules and patches
DLBPointon Nov 1, 2023
154b185
Updating pipeline to reflect changes
DLBPointon Nov 1, 2023
009f1e3
Generalising the Blast module for FULL databases as well as local mak…
DLBPointon Nov 2, 2023
8c57157
Prettier linting
DLBPointon Nov 2, 2023
193f480
Adding tracedir to schema
DLBPointon Nov 2, 2023
05e25b5
Update script based on recomendation
DLBPointon Nov 6, 2023
79305c5
Black Linting
DLBPointon Nov 6, 2023
99a7aa5
testing
DLBPointon Nov 9, 2023
1654931
Updating
DLBPointon Nov 9, 2023
8925cac
add coverage
yumisims Oct 30, 2023
e49331f
add samtools merge
yumisims Oct 30, 2023
b2ed42d
add merged
yumisims Oct 30, 2023
bbb05d9
put in condition for different read type
yumisims Nov 2, 2023
a784236
re-written samtools_depth_average_coverage.py
yumisims Nov 2, 2023
4e71157
re-written samtools_depth_average_coverage.py
yumisims Nov 2, 2023
fb7bf44
re-written samtools_depth_average_coverage.py
yumisims Nov 2, 2023
6f0c412
amended gc_content.py to comprehension form
yumisims Nov 2, 2023
5e2ad8e
added change to samtools_depth_average_coverage.nf
yumisims Nov 3, 2023
d39d99d
black
yumisims Nov 3, 2023
033043f
change main workflow
yumisims Nov 3, 2023
f1025fb
remove space
yumisims Nov 3, 2023
d62859e
changed github test yaml
yumisims Nov 3, 2023
5343545
add barcode to ci
yumisims Nov 4, 2023
b95e351
change in se mapping
yumisims Nov 4, 2023
42c49a0
change config
yumisims Nov 4, 2023
5208ab1
change config
yumisims Nov 4, 2023
c0a5a03
changed bedtool
yumisims Nov 6, 2023
6baec5c
changed grabfile wildcard
yumisims Nov 6, 2023
3b8d867
changed grabfile wildcard
yumisims Nov 6, 2023
84b50c3
changed grabfile wildcard
yumisims Nov 6, 2023
e754440
done
yumisims Nov 6, 2023
a386866
done
yumisims Nov 6, 2023
4600cdc
added ncbi id
yumisims Nov 6, 2023
af67f69
change software version'
yumisims Nov 6, 2023
029f1f6
refine bedtools and other scripts
yumisims Nov 7, 2023
2808cba
Replacing grep with awk, grep caused errors with empty products
DLBPointon Nov 10, 2023
036c014
Updating organellar blast based on discussion with Eerik and @yumisims
DLBPointon Nov 10, 2023
f05f05b
Updated container from @yumisims and tested
DLBPointon Nov 14, 2023
3b50c01
Black formatting
DLBPointon Nov 14, 2023
b61cadf
Black Formatting
DLBPointon Nov 14, 2023
4401fb6
Updates based on comments from @ea10
DLBPointon Nov 16, 2023
efdc828
Correction and removal of view statement
DLBPointon Nov 16, 2023
65e87c8
Black linting for python script
DLBPointon Nov 16, 2023
1a8b635
Add parameters
weaglesBio Oct 25, 2023
2958627
Add kmer count modules
weaglesBio Oct 16, 2023
0bee610
Add to main workflow
weaglesBio Oct 17, 2023
e40e4ce
Handle one line csv
weaglesBio Oct 30, 2023
40ce6a4
Merge branch 'dev' into kmer_count
weaglesBio Nov 17, 2023
34385bb
dp24 suggested changes
weaglesBio Nov 27, 2023
5cfe9ad
Updates to fix bugs, add params and get kmer analysis running
DLBPointon Feb 23, 2024
0fdc0f3
Merge branch 'dev' into kmer_count
DLBPointon Feb 23, 2024
891964e
linting fixes
DLBPointon Feb 27, 2024
336a6c8
fixes
DLBPointon Feb 27, 2024
aeeabb0
linting
DLBPointon Feb 27, 2024
63f3c34
linting
DLBPointon Feb 27, 2024
e37baa4
linting
DLBPointon Feb 27, 2024
5f5ccf0
barcode was wrong in test
DLBPointon Feb 27, 2024
ebc1d5a
Fixed container import, added conda recipe and corrected version output
DLBPointon Feb 29, 2024
87e8b8d
Updated container, added custom version information for the umap moddule
DLBPointon Feb 29, 2024
25bb22b
/tmp/ was being used, changed to custom cache dirs to script
DLBPointon Feb 29, 2024
e75d030
updating the main script, minor stuff
DLBPointon Feb 29, 2024
4978ec2
Changes for testing
DLBPointon Mar 1, 2024
affd380
Updating the vecscreen value, apparently changed from correct value i…
DLBPointon Mar 1, 2024
e6550bf
Updating data
DLBPointon Apr 4, 2024
4e8d53d
Updating test files, test data, formatting and logic change
DLBPointon Apr 4, 2024
f36ed3f
Add pre-fetch for the containers in ascc with nf-download
DLBPointon Apr 4, 2024
77e08e6
Fix CI and Black formatting
DLBPointon Apr 4, 2024
858fefb
Black Formatting
DLBPointon Apr 4, 2024
74a74f6
CI fix
DLBPointon Apr 4, 2024
b9745c7
CI fix add aptainer
DLBPointon Apr 4, 2024
d31813c
kill lint for file exist
DLBPointon Apr 4, 2024
a9fe2c8
Lint fix
DLBPointon Apr 4, 2024
f2c8e71
Update resources
DLBPointon Apr 4, 2024
b43f082
Merge branch 'kmer_count' of https://github.com/sanger-tol/ascc into …
DLBPointon Apr 4, 2024
Changes from all commits
57 changes: 47 additions & 10 deletions .github/workflows/ci.yml
@@ -10,6 +10,8 @@ on:

env:
NXF_ANSI_LOG: false
NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity
NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity

concurrency:
group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
@@ -24,9 +26,15 @@ jobs:
strategy:
matrix:
NXF_VER:
- "23.04.0"
- "22.10.1"
- "latest-everything"
steps:
- name: Get branch names
# Pulls the names of current branches in repo
# steps.branch-names.outputs.current_branch is used later and returns the name of the branch the PR is made FROM not to
id: branch-names
uses: tj-actions/branch-names@v8

- name: Check out pipeline code
uses: actions/checkout@v3

@@ -35,10 +43,34 @@
with:
version: "${{ matrix.NXF_VER }}"

- name: Set up Singularity
run: |
mkdir -p $NXF_SINGULARITY_CACHEDIR
mkdir -p $NXF_SINGULARITY_LIBRARYDIR

- name: Setup apptainer
uses: eWaterCycle/setup-apptainer@main

- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"

- name: Install nf-core
run: |
pip install nf-core

- name: NF-Core Download - download singularity containers
# Forcibly download repo on active branch and download SINGULARITY containers into the CACHE dir if not found
# Must occur after singularity install or will crash trying to dl containers
# Zip up this fresh download and run the checked out version
run: |
nf-core download sanger-tol/ascc --revision ${{ steps.branch-names.outputs.current_branch }} --compress none -d --force --outdir sanger-ascc --container-cache-utilisation amend --container-system singularity

- name: Download test data
# Download A fungal test data set that is full enough to show some real output.
run: |
curl https://tolit.cog.sanger.ac.uk/test-data/resources/ascc/asccTinyTest.tar.gz | tar xzf -
curl https://tolit.cog.sanger.ac.uk/test-data/resources/ascc/asccTinyTest_V2.tar.gz | tar xzf -

- name: Download the NCBI taxdump database
run: |
@@ -48,11 +80,11 @@
- name: Download the FCS-gx database
run: |
mkdir FCS_gx
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.taxa.tsv -O FCS_gx/all.taxa.tsv
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxi -O FCS_gx/all.gxi
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxs -O FCS_gx/all.gxs
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.meta.jsonl -O FCS_gx/all.meta.jsonl
wget -c https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.blast_div.tsv.gz -O FCS_gx/all.blast_div.tsv.gz
wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.taxa.tsv -O FCS_gx/all.taxa.tsv
wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxi -O FCS_gx/all.gxi
wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.gxs -O FCS_gx/all.gxs
wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.meta.jsonl -O FCS_gx/all.meta.jsonl
wget -cq https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.blast_div.tsv.gz -O FCS_gx/all.blast_div.tsv.gz

- name: Download the BUSCO lineage database
run: |
@@ -72,7 +104,12 @@
- name: Download the pacbio barcode
run: |
mkdir pacbio_barcode
wget -O pacbio_barcode/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip -c https://www.pacb.com/wp-content/uploads/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip && cd pacbio_barcode && unzip SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip && mv SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta pacbio_adaptors.fa && rm -rf SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip __MACOSX && cd ..
wget -O pacbio_barcode/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip -c https://www.pacb.com/wp-content/uploads/SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip
cd pacbio_barcode
unzip SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip
mv SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta pacbio_adaptors.fa
rm -rf SMRTbell_Barcoded_Adapter_Plate_3.0_bc2001-bc2096.fasta_.zip __MACOSX
cd ../

- name: Download the subset of Diamond database
run: |
@@ -84,9 +121,9 @@
mkdir vecscreen
curl -L https://ftp.ncbi.nlm.nih.gov/blast/db/v4/16SMicrobial_v4.tar.gz | tar -C vecscreen -xzf -

- name: Run pipeline with test data
- name: Singularity - Run FULL pipeline with test data
# TODO nf-core: You can customise CI pipeline run tests as required
# For example: adding multiple test runs with different parameters
# Remember that you can parallelise this by using strategy.matrix
run: |
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
nextflow run ${GITHUB_WORKSPACE} -profile test,singularity --outdir ./results --steps ALL
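
For reference, the same container pre-fetch and test run can be reproduced outside CI. The sketch below mirrors the workflow steps above; the revision name, cache directory and output paths are assumptions rather than values taken from the workflow file.

export NXF_SINGULARITY_CACHEDIR=$PWD/.singularity    # same cache dir the workflow exports
mkdir -p "$NXF_SINGULARITY_CACHEDIR"
# Pre-fetch the Singularity containers for the branch under test (revision is a placeholder)
nf-core download sanger-tol/ascc --revision dev --compress none -d --force \
    --outdir sanger-ascc --container-cache-utilisation amend --container-system singularity
# Run the checked-out pipeline with the test profile, as the CI job does
nextflow run . -profile test,singularity --outdir ./results --steps ALL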
16 changes: 7 additions & 9 deletions .nf-core.yml
@@ -1,20 +1,18 @@
repository_type: pipeline
lint:
files_exist:
- CODE_OF_CONDUCT.md
- assets/nf-core-ascc_logo_light.png
- docs/images/nf-core-ascc_logo_light.png
- docs/images/nf-core-ascc_logo_dark.png
- .github/ISSUE_TEMPLATE/config.yml
- .github/workflows/awstest.yml
- .github/workflows/awsfulltest.yml
- conf/igenomes.config
files_exist: false
files_unchanged:
- CODE_OF_CONDUCT.md
- assets/nf-core-ascc_logo_light.png
- docs/images/nf-core-ascc_logo_light.png
- docs/images/nf-core-ascc_logo_dark.png
- .github/ISSUE_TEMPLATE/bug_report.yml
- .github/workflows/branch.yml
- .github/CONTRIBUTING.md
- .github/PULL_REQUEST_TEMPLATE.md
- .github/workflows/linting_comment.yml
- assets/email_template.html
- pyproject.toml
- LICENSE
- .github/workflows/linting.yml
- lib/NfcoreTemplate.groovy
2 changes: 1 addition & 1 deletion README.md
@@ -1,6 +1,6 @@
[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)

[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/)
[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/)
[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
21 changes: 12 additions & 9 deletions assets/github_testing/test.yaml
@@ -1,15 +1,17 @@
assembly_path: /home/runner/work/ascc/ascc/asccTinyTest/assembly/Pyoeliiyoelii17XNL_assembly.fa
assembly_title: asccTinyTest
pacbio_barcodes: /home/runner/work/ascc/ascc/pacbio_barcode/pacbio_adaptors.fa
pacbio_multiplexing_barcode_names: "bc2008,bc2009"
reads_path: /home/runner/work/ascc/ascc/asccTinyTest/pacbio
assembly_path: /home/runner/work/ascc/ascc/asccTinyTest_V2/assembly/pyoelii_tiny_testfile_with_adapters.fa
assembly_title: asccTinyTest_V2
reads_path: /home/runner/work/ascc/ascc/asccTinyTest_V2/pacbio/
reads_type: "hifi"
pacbio_barcodes: /home/runner/work/ascc/ascc/pacbio_barcode/pacbio_adaptors.fa
pacbio_multiplexing_barcode_names: "bc2001,bc2009"
sci_name: "Plasmodium yoelii yoelii 17XNL"
taxid: 352914
mito_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
plastid_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa
mito_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
plastid_fasta_path: /home/runner/work/ascc/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa
kmer_len: 7
## Below this point will need updating as more subworkflows are built
dimensionality_reduction_methods: "pca,random_trees"
# all available methods
# "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf"
nt_database: /home/runner/work/ascc/ascc/NT_database/
nt_database_prefix: 18S_fungal_sequences
nt_kraken_db_path: /home/runner/work/ascc/ascc/kraken2/kraken2
@@ -20,7 +22,8 @@ busco_lineages_folder: /home/runner/work/ascc/ascc/busco_database/lineages
fcs_gx_database_path: /home/runner/work/ascc/ascc/FCS_gx/
diamond_uniprot_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
diamond_nr_database_path: /home/runner/work/ascc/ascc/diamond/UP000000212_1234679_tax.dmnd
vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen
vecscreen_database_path: /home/runner/work/ascc/ascc/vecscreen/
seqkit:
sliding: 6000
window: 100000
n_neighbours: 13
3 changes: 3 additions & 0 deletions assets/static-args.yaml
@@ -0,0 +1,3 @@
kmer_size: 7
n_neighbors_setting: 13
autoencoder_epochs_count: -1
16 changes: 10 additions & 6 deletions assets/test.yaml
@@ -1,14 +1,17 @@
assembly_path: /lustre/scratch123/tol/teams/tola/users/ea10/pipeline_testing/20231114_pyoelii_vecscreen/ref/PlasmoDB-58_Pyoeliiyoelii17XNL_Genome_with_adapters2_fh2.fasta
assembly_title: asccTinyTest
reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/
assembly_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/assembly/pyoelii_tiny_testfile_with_adapters.fa
assembly_title: asccTinyTest_V2
reads_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/pacbio/
reads_type: "hifi"
pacbio_barcodes: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/pacbio_adaptors.fa
pacbio_multiplexing_barcode_names: "bc2008,bc2009"
sci_name: "Plasmodium yoelii yoelii 17XNL"
taxid: 352914
mito_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
plastid_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa
mito_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_mitochondrion_ncbi.fa
plastid_fasta_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/asccTinyTest_V2/organellar/Pyoeliiyoelii17XNL_apicoplast_ncbi.fa
kmer_len: 7
dimensionality_reduction_methods: "pca,random_trees"
# all available methods
# "pca,umap,t-sne,isomap,lle_standard,lle_hessian,lle_modified,mds,se,random_trees,kernel_pca,pca_svd,autoencoder_sigmoid,autoencoder_linear,autoencoder_selu,autoencoder_relu,nmf"
nt_database: /data/blastdb/Supported/NT/202308/dbv4/
nt_database_prefix: nt
nt_kraken_db_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/nt/nt
@@ -17,9 +20,10 @@ ncbi_taxonomy_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdu
ncbi_rankedlineage_path: /lustre/scratch123/tol/teams/tola/users/ea10/databases/taxdump/rankedlineage.dmp
busco_lineages_folder: /lustre/scratch123/tol/resources/busco/data/v5/2021-08-27/lineages
fcs_gx_database_path: /lustre/scratch124/tol/projects/asg/sub_projects/ncbi_decon/0.4.0/gxdb
vecscreen_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/vecscreen_database
vecscreen_database_path: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/vecscreen/
diamond_uniprot_database_path: /lustre/scratch123/tol/teams/tola/users/ea10/ascc_databases/uniprot/uniprot_reference_proteomes_with_taxonnames.dmnd
diamond_nr_database_path: /lustre/scratch123/tol/resources/nr/latest/nr.dmnd
seqkit:
sliding: 100000
window: 6000
n_neighbours: 13
4 changes: 2 additions & 2 deletions bin/VSlistTo1HitPerLine.py
@@ -5,8 +5,8 @@

This script converts the VecScreen text list output to one line giving the coordinates for each vector segment in the format:
VecScreen_Category ID_string start_position end_position
The default is to report Strong, Moderate, and Weak matches and also segments of Suspect Origin. Reporting of any category can be suppressed by including
--skip_reporting_suspect_hits, --skip_reporting_weak_hits, --skip_reporting_moderate_hits or --skip_reporting_strong_hits on the command line.
The default is to report Strong, Moderate, and Weak matches and also segments of Suspect Origin. Reporting of any category can be suppressed by including
--skip_reporting_suspect_hits, --skip_reporting_weak_hits, --skip_reporting_moderate_hits or --skip_reporting_strong_hits on the command line.
"No hits" will be reported for any Query sequence that had no matches in any of the selected categories, unless --skip_reporting_no_hits is included on the command line.
VecScreen errors will be reported unless --skip_reporting_errors is included on the command line.
Usage:
41 changes: 41 additions & 0 deletions bin/get_kmers_counts.py
@@ -0,0 +1,41 @@
#!/usr/bin/env python3
"""
Script for counting kmer frequencies per sequence in a FASTA file
Output (STDOUT): kmer counts as a CSV table
Developed by Eerik Aunin ([email protected])
"""

import argparse
import general_purpose_functions as gpf
import kcounter
from collections import OrderedDict
import pandas as pd


def main(fasta_path, out_path, kmer_size):
fasta_data = gpf.read_fasta_in_chunks(fasta_path)
nucleotides_collection = list()
for header, seq in fasta_data:
seq = seq.upper()
seq_len = len(seq)
nucleotides_dict = kcounter.count_kmers(seq, kmer_size, canonical_kmers=True)
relative_counts_dict = OrderedDict()
relative_counts_dict["header"] = header
relative_counts_dict["seq_len"] = seq_len
for kmer in nucleotides_dict:
kmer_relative_count = nucleotides_dict[kmer] / seq_len
relative_counts_dict[kmer] = kmer_relative_count
nucleotides_collection.append(relative_counts_dict)
df = pd.DataFrame(nucleotides_collection)
df = df.fillna(0)
df.to_csv(out_path, index=False)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("-v", "--version", action="version", version="1.0")
parser.add_argument("fasta_path", type=str, help="Path to input FASTA file")
parser.add_argument("out_path", type=str, help="Path for output CSV file")
parser.add_argument("--kmer_size", type=int, help="kmer size (bp). Default: 7", default=7)
args = parser.parse_args()
main(args.fasta_path, args.out_path, args.kmer_size)
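
A usage sketch for this script (the FASTA and CSV file names are placeholders, not files shipped with the pipeline):

# Count canonical 7-mers per sequence and write length-normalised frequencies to CSV
python3 get_kmers_counts.py assembly.fa kmer_counts.csv --kmer_size 7
# The output has one row per sequence: header, seq_len, then one column per kmer,
# each value being the kmer count divided by the sequence length (see main() above).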